String process
Jonie
/ 2018-02-04
grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
fixed = FALSE, useBytes = FALSE, invert = FALSE)
grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
regexec(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
# Sample character vector used as input by the grep/sub/regexpr examples.
td<-c("ab","cabefabkldab","cedab")
grep #查找含对应字符串(只要包含就可以)或对应pattern的字符串的下标。
> grep("a",td)
[1] 1 2 3
> grep(".*ab\\w*ab\\w*",td)
[1] 2
grepl #查找含对应字符串(只要包含就可以)或对应pattern的字符串的逻辑值(是否包含)
> grepl("a",td)
[1] TRUE TRUE TRUE
> grepl(".*ab\\w*ab\\w*",td)
[1] FALSE TRUE FALSE
sub #将td每个元素中第一个匹配到的a替换为xxx。
> sub("a","xxx",td)
[1] "xxxb" "cxxxbefabkldab" "cedxxxb"
gsub #将td每个元素中所有匹配到的a都替换为xxx。
> gsub("a","xxx",td)
[1] "xxxb" "cxxxbefxxxbkldxxxb" "cedxxxb"
regexpr #查找pattern(此处为ab)在td每个元素中第一次匹配的起始位置及匹配长度;无匹配时返回-1。
gregexpr #查找pattern(此处为ab)在td每个元素中所有匹配的起始位置及匹配长度;无匹配时返回-1。
> regexpr("ab",td,perl=T)
[1] 1 2 4
attr(,"match.length")
[1] 2 2 2
attr(,"useBytes")
[1] TRUE
> gregexpr("ab",td,perl=T)
[[1]]
[1] 1
attr(,"match.length")
[1] 2
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 2 6 11
attr(,"match.length")
[1] 2 2 2
attr(,"useBytes")
[1] TRUE
[[3]]
[1] 4
attr(,"match.length")
[1] 2
attr(,"useBytes")
[1] TRUE
> regexpr(".*ab\\w*ab\\w*",td)
[1] -1 1 -1
attr(,"match.length")
[1] -1 12 -1
attr(,"useBytes")
[1] TRUE
> gregexpr("ab\\w{3}ab\\w*",td)
[[1]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
[[2]]
[1] 6
attr(,"match.length")
[1] 7
attr(,"useBytes")
[1] TRUE
[[3]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"useBytes")
[1] TRUE
# Example URL used to demonstrate capture-group extraction with regexec().
x <- "http://stat.umn.edu:80/xyz"
# The pattern has six capture groups: optional "protocol://", the protocol,
# the host, optional ":port", the port digits, and the path starting at "/".
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
# Print the match data (start positions and match lengths).
m
# Extract the matched substrings: the full match followed by each group.
regmatches(x, m)
## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
## is the path. We can use this to make a function for extracting the
## parts of a URL:
# Split URLs into their protocol, host, port and path components.
#
# x: character vector of URLs.
#
# Returns a character matrix with one row per element of x and the columns
# "protocol", "host", "port" and "path". An element that does not match the
# pattern yields a row of NA. Zero-length input yields a 0 x 4 matrix.
URL_parts <- function(x) {
  fields <- c("protocol", "host", "port", "path")
  m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
  # regmatches() gives, per URL, the full match followed by the six capture
  # groups; elements 3, 4, 6 and 7 are protocol, host, port and path.
  pieces <- lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))
  if (length(pieces) == 0L) {
    # do.call(rbind, list()) is NULL, which cannot be given column names,
    # so build the empty result explicitly.
    return(matrix(
      character(0),
      nrow = 0L,
      ncol = length(fields),
      dimnames = list(NULL, fields)
    ))
  }
  parts <- do.call(rbind, pieces)
  colnames(parts) <- fields
  parts
}
URL_parts(x)
> regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
[[1]]
[1] 1 1 1 8 20 21 23
attr(,"match.length")
[1] 26 7 4 12 3 2 4
attr(,"useBytes")
[1] TRUE
> gregexpr("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
[[1]]
[1] 1
attr(,"match.length")
[1] 26
attr(,"useBytes")
[1] TRUE