You can match against the pattern [^\\x00-\\x7F]
to detect non-ASCII characters, use stringi::stri_enc_isascii,
or use utf8ToInt
and test whether any code point is larger than 127.
# Example input: the first element contains the non-ASCII character "≥",
# the second element is pure ASCII.
inclusion <- c("Include patients ≥ 18 years of age", "a")

# Base R: TRUE where a string contains a character outside \x00-\x7F.
grepl("[^\\x00-\\x7F]", inclusion, perl = TRUE)
#[1] TRUE FALSE

# stringr: the same pattern.
stringr::str_detect(inclusion, "[^\\x00-\\x7F]")
#[1] TRUE FALSE

# stringi: negate the "is ASCII" check.
!stringi::stri_enc_isascii(inclusion)
#[1] TRUE FALSE

# Base R: TRUE where any code point of a string is above 127.
# vapply() returns a plain logical vector, like the approaches above.
vapply(inclusion, \(x) any(utf8ToInt(x) > 127), logical(1), USE.NAMES = FALSE)
#[1] TRUE FALSE
And you can replace them using e.g. gsub,
stringr::str_replace_all,
iconv
(thanks to @MrFlick for the comment), stringi::stri_enc_toascii,
or textclean::replace_non_ascii
(thanks to @Quarto for the comment).
# Base R: one "?" per non-ASCII character.
gsub(pattern = "[^\\x00-\\x7F]", replacement = "?", x = inclusion, perl = TRUE)
#[1] "Include patients ? 18 years of age" "a"

# stringr: same pattern, same result.
stringr::str_replace_all(
  string = inclusion,
  pattern = "[^\\x00-\\x7F]",
  replacement = "?"
)
#[1] "Include patients ? 18 years of age" "a"

# iconv: the substitute is applied per non-convertible byte, so the
# three-byte UTF-8 character "≥" becomes "???".
iconv(x = inclusion, from = "UTF-8", to = "ASCII", sub = "?")
#[1] "Include patients ??? 18 years of age"
#[2] "a"

# stringi: the non-ASCII character is replaced by the control character
# printed as \032.
stringi::stri_enc_toascii(str = inclusion)
#[1] "Include patients \032 18 years of age"
#[2] "a"

# textclean: with "?" as the replacement, the result here is "???".
textclean::replace_non_ascii(inclusion, "?")
#[1] "Include patients ??? 18 years of age"
#[2] "a"
To control exactly what each non-ASCII character is replaced with, you can create a lookup table.
# Example input with two different non-ASCII comparison operators.
s <- c("x ≥ y", "x = y", "x ≤ y")
i <- gregexpr("[^\\x00-\\x7F]", s, perl = TRUE) # Match non-ASCII characters
m <- regmatches(s, i) # Extract them
unique(unlist(m))
#[1] "≥" "≤"
# Create lookup table: column 1 = non-ASCII character, column 2 = replacement
u <- read.table(text = "
≥ >=
≤ <=
")
# Exchange them. A character that is missing from the lookup table gives NA
# from match(); keep such a character unchanged instead of passing NA on to
# `regmatches<-`.
regmatches(s, i) <- lapply(m, \(x) {
  r <- u[[2]][match(x, u[[1]])]
  unmapped <- is.na(r)
  r[unmapped] <- x[unmapped]
  r
})
s
#[1] "x >= y" "x = y" "x <= y"
(I'm aware that for the question as asked, fixing the underlying issue in RMarkdown is the real solution. This answer is for anyone who does need to translate text to ASCII.)