String processing

Jonie / 2018-02-04

grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,

     fixed = FALSE, useBytes = FALSE, invert = FALSE)


grepl(pattern, x, ignore.case = FALSE, perl = FALSE,

      fixed = FALSE, useBytes = FALSE)


sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,

    fixed = FALSE, useBytes = FALSE)


gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,

     fixed = FALSE, useBytes = FALSE)


regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,

        fixed = FALSE, useBytes = FALSE)


gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,

         fixed = FALSE, useBytes = FALSE)


regexec(pattern, text, ignore.case = FALSE, perl = FALSE,

        fixed = FALSE, useBytes = FALSE)

        


td<-c("ab","cabefabkldab","cedab")

grep # 返回向量中包含指定子串（只要包含即可）或匹配指定 pattern 的元素的下标。

> grep("a",td)

[1] 1 2 3

> grep(".*ab\\w*ab\\w*",td)

[1] 2


grepl # 返回逻辑向量，表示向量中每个元素是否包含指定子串（只要包含即可）或匹配指定 pattern。

> grepl("a",td)

[1] TRUE TRUE TRUE

> grepl(".*ab\\w*ab\\w*",td)

[1] FALSE  TRUE FALSE


sub # 将 td 的每个元素中第一个匹配到的 a 替换为 xxx。

> sub("a","xxx",td)

[1] "xxxb"           "cxxxbefabkldab" "cedxxxb" 


gsub # 将 td 的每个元素中所有匹配到的 a 都替换为 xxx。

> gsub("a","xxx",td)

[1] "xxxb"               "cxxxbefxxxbkldxxxb" "cedxxxb" 


regexpr # 查找 td 的每个元素中第一个匹配 ab 的开始位置及匹配长度（match.length）；无匹配时二者均为 -1。

gregexpr # 查找 td 的每个元素中所有匹配 ab 的开始位置及各自的匹配长度，结果为列表（td 的每个元素对应列表中的一项）；无匹配时为 -1。

> regexpr("ab",td,perl=T)

[1] 1 2 4

attr(,"match.length")

[1] 2 2 2

attr(,"useBytes")

[1] TRUE

> gregexpr("ab",td,perl=T)

[[1]]

[1] 1

attr(,"match.length")

[1] 2

attr(,"useBytes")

[1] TRUE


[[2]]

[1]  2  6 11

attr(,"match.length")

[1] 2 2 2

attr(,"useBytes")

[1] TRUE


[[3]]

[1] 4

attr(,"match.length")

[1] 2

attr(,"useBytes")

[1] TRUE


> regexpr(".*ab\\w*ab\\w*",td)

[1] -1  1 -1

attr(,"match.length")

[1] -1 12 -1

attr(,"useBytes")

[1] TRUE


> gregexpr("ab\\w{3}ab\\w*",td)

[[1]]

[1] -1

attr(,"match.length")

[1] -1

attr(,"useBytes")

[1] TRUE


[[2]]

[1] 6

attr(,"match.length")

[1] 7

attr(,"useBytes")

[1] TRUE


[[3]]

[1] -1

attr(,"match.length")

[1] -1

attr(,"useBytes")

[1] TRUE



# Sample URL used by the regexec()/regmatches() examples below.
x <- "http://stat.umn.edu:80/xyz"

# regexec() reports the start position and length of the full match and of
# every parenthesised sub-expression in the pattern.
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)

# Print the raw match data (positions plus the "match.length" attribute).
m

# Turn the match data into the matched substrings themselves.
regmatches(x, m)

## Element 3 is the protocol, 4 is the host, 6 is the port, and 7

## is the path.  We can use this to make a function for extracting the

## parts of a URL:

#' Split URLs into protocol, host, port and path.
#'
#' @param x Character vector of URLs.
#' @return A character matrix with one row per element of `x` and the
#'   columns "protocol", "host", "port" and "path". A URL that the pattern
#'   does not match yields a row of `NA`. Zero-length input yields a
#'   0-row, 4-column matrix instead of raising an error.
URL_parts <- function(x) {
    fields <- c("protocol", "host", "port", "path")

    # do.call(rbind, list()) returns NULL, on which `colnames<-` fails, so
    # zero-length input is answered directly with an empty matrix.
    if (length(x) == 0L) {
        return(matrix(character(0), nrow = 0L, ncol = length(fields),
                      dimnames = list(NULL, fields)))
    }

    m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)

    # Elements 3, 4, 6 and 7 of each match are the protocol, host, port and
    # path; the remaining elements are the full match and wrapper groups.
    pieces <- lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))

    parts <- do.call(rbind, pieces)
    colnames(parts) <- fields
    parts
}

# Apply the helper to the sample URL; the single row should read
# protocol "http", host "stat.umn.edu", port "80", path "/xyz".
URL_parts(x)



> regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)

[[1]]

[1]  1  1  1  8 20 21 23

attr(,"match.length")

[1] 26  7  4 12  3  2  4

attr(,"useBytes")

[1] TRUE


> gregexpr("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)

[[1]]

[1] 1

attr(,"match.length")

[1] 26

attr(,"useBytes")

[1] TRUE