There isn't a single built-in way to achieve this; you'll need to build up a set of characters yourself, and possibly add characters that the automated approach misses.
You need to find:
- equivalence sets
[=n=]
or [\p{toNFKD=/n/}]
but Python regular expression engines do not support equivalence sets, so we need to build the set ourselves.
- you need in-script confusables
- you need in-script transliterations that resolve to the character you are trying to match.
Then build a pattern out of those
I will try to limit the number of packages required, but robust Unicode support is needed:
import icu
import regex
To start with we will need some helper functions.
def NFKD(chars):
    """Return *chars* normalized with ICU's NFKD normalizer instance."""
    return icu.Normalizer2.getNFKDInstance().normalize(chars)
# Shared ``Latin-ASCII`` transliterator, created on first use by to_ascii().
_LATIN_ASCII = None


def to_ascii(char):
    """Transliterate *char* with ICU's ``Latin-ASCII`` transform.

    Args:
        char: The string to transliterate (callers in this file pass one
            code point at a time).

    Returns:
        The transliterated string produced by ICU.
    """
    global _LATIN_ASCII
    # This function is called once per code point of a large set, so build
    # the transliterator once and reuse it instead of re-creating it per call.
    if _LATIN_ASCII is None:
        _LATIN_ASCII = icu.Transliterator.createInstance('Latin-ASCII')
    return _LATIN_ASCII.transliterate(char)
def single_letter(chars):
    """Return *chars* if it matches the single-letter pattern, else ``''``.

    The pattern accepts one ``Letter`` character that may be preceded and
    followed by any number of characters whose script is Common or
    Inherited. Anything else yields the empty string.
    """
    accepted = regex.match(
        r'^[\p{sc=Common}\p{sc=Inherited}]*\p{Letter}[\p{sc=Common}\p{sc=Inherited}]*$',
        chars,
    )
    return chars if accepted else ''
NFKD()
speaks for itself: we need decomposed compatibility sequences that match the target letter.
to_ascii()
will asciify any Latin string.
single_letter()
will weed out compatibility decompositions or asciifications that resolve to two or more letters.
We then define the set of characters we want to restrict our choices to. We aren't interested in lookalikes from other scripts, just Latin, but I will include the Inherited and Common scripts as well.
# Candidate characters: every element of the ICU set covering the Latin,
# Inherited and Common scripts, materialised as a Python set.
LATIN = set(
    icu.UnicodeSet(
        r'[[\p{sc=Latin}][\p{sc=Inherited}][\p{sc=Common}]]'
    )
)
We will test our target character against this set using a series of set comprehensions, then create a pattern from the union of those sets:
def get_target_pattern(target):
    """Build a regex character class of LATIN characters related to *target*.

    A character from ``LATIN`` is included when any of these holds:

    * its NFKD form, filtered through ``single_letter``, contains *target*;
    * ICU's spoof checker reports it as confusable with *target*;
    * its ``Latin-ASCII`` transliteration, filtered through
      ``single_letter``, contains *target*.

    Args:
        target: The non-empty string (normally one letter) to match.

    Returns:
        A string of the form ``[...]`` with the members sorted and escaped
        so it can be embedded directly in a regular expression.

    Raises:
        ValueError: If *target* is empty.
    """
    # The empty string is "in" every string, so it would select all of LATIN.
    if not target:
        raise ValueError("target must be a non-empty string")

    equivalence = {x for x in LATIN if target in single_letter(NFKD(x))}

    checker = icu.SpoofChecker()
    checker.setChecks(icu.USpoofChecks.ALL_CHECKS)
    confusable = {x for x in LATIN if checker.areConfusable(target, x) != 0}

    asciified = {x for x in LATIN if target in single_letter(to_ascii(x))}

    # Sort for reproducible output (set order varies between runs) and escape
    # each member so characters such as ']', '\\', '^' or '-' cannot break the
    # class. Plain concatenation also avoids nesting the same quote inside an
    # f-string, which is a syntax error before Python 3.12.
    members = sorted(equivalence | confusable | asciified)
    return '[' + ''.join(regex.escape(x, special_only=True) for x in members) + ']'
For n
this gives us:
# Build the character class for the letter 'n' and display it.
pattern = get_target_pattern('n')
print(pattern)
# Example output:
# [𝗻ň𝓃ɲnǹ𝖓𝘯𝓷𝐧ŋ𝑛ṋ𝔫ꞑₙṇ𝒏ꝴƞᶇ⒩ⓝɳⁿṅ𝙣ȵñńņᵰʼnnṉꞥ𝕟𝚗𝗇]
Update:
There are a number of ways of modifying the results:
- Use NFD instead of NFKD
- Add Cyrillic and Greek script confusables
def normalize(chars, nf="nfd"):
match nf.lower():
case "nfc":
form = "nfc"
mode = icu.UNormalizationMode2.COMPOSE
case "nfkc":
form = "nfkc"
mode = icu.UNormalizationMode2.COMPOSE
case "nkkc_cf":
form = "nfkc_cf"
mode = icu.UNormalizationMode2.COMPOSE
case "nfkd":
form = "nfkc"
mode = icu.UNormalizationMode2.DECOMPOSE
case "nfd":
form = "nfc"
mode = icu.UNormalizationMode2.DECOMPOSE
normalizer = icu.Normalizer2.getInstance(None, form, mode)
return normalizer.normalize(chars)
# Candidate characters limited to the Latin, Inherited and Common scripts.
LATN_CODESPACE = set(
    icu.UnicodeSet(
        r'[[\p{sc=Latin}][\p{sc=Inherited}][\p{sc=Common}]]'
    )
)
# The same candidates, widened with the Cyrillic and Greek scripts.
LCG_CODESPACE = set(
    icu.UnicodeSet(
        r'[[\p{sc=Latin}][\p{sc=Cyrillic}][\p{sc=Greek}][\p{sc=Inherited}][\p{sc=Common}]]'
    )
)
def get_target_pattern(target, norm_form = "nfkd", extended=False):
    """Build a regex character class of characters related to *target*.

    A character is included when any of these holds:

    * it is in ``LATN_CODESPACE`` and its normalized form (per *norm_form*),
      filtered through ``single_letter``, contains *target*;
    * ICU's spoof checker reports it as confusable with *target*;
    * it is in ``LATN_CODESPACE`` and its ``Latin-ASCII`` transliteration,
      filtered through ``single_letter``, contains *target*.

    Args:
        target: The non-empty string (normally one letter) to match.
        norm_form: Normalization form name handed to ``normalize``.
        extended: When true, confusables are searched in ``LCG_CODESPACE``
            with the ``MIXED_SCRIPT_CONFUSABLE`` check; otherwise in
            ``LATN_CODESPACE`` with ``WHOLE_SCRIPT_CONFUSABLE``.

    Returns:
        A string of the form ``[...]`` with the members sorted and escaped
        so it can be embedded directly in a regular expression.

    Raises:
        ValueError: If *target* is empty.
    """
    # The empty string is "in" every string, so it would select everything.
    if not target:
        raise ValueError("target must be a non-empty string")

    equivalence = {
        x for x in LATN_CODESPACE
        if target in single_letter(normalize(x, norm_form))
    }

    # The checks are set exactly once: an earlier ALL_CHECKS call was always
    # overwritten by the branch-specific call and has been dropped.
    checker = icu.SpoofChecker()
    if extended:
        checker.setChecks(icu.USpoofChecks.MIXED_SCRIPT_CONFUSABLE)
        candidates = LCG_CODESPACE
    else:
        checker.setChecks(icu.USpoofChecks.WHOLE_SCRIPT_CONFUSABLE)
        candidates = LATN_CODESPACE
    confusable = {x for x in candidates if checker.areConfusable(target, x) != 0}

    asciified = {x for x in LATN_CODESPACE if target in single_letter(to_ascii(x))}

    # Sort for reproducible output (set order varies between runs) and escape
    # each member so characters such as ']', '\\', '^' or '-' cannot break the
    # class. Plain concatenation also avoids nesting the same quote inside an
    # f-string, which is a syntax error before Python 3.12.
    members = sorted(equivalence | confusable | asciified)
    return '[' + ''.join(regex.escape(x, special_only=True) for x in members) + ']'
# Defaults: "nfkd" normalization, confusables drawn from LATN_CODESPACE.
get_target_pattern('n')
# Same code space, but equivalence is computed with "nfd" normalization.
get_target_pattern('n', norm_form="nfd")
# "nfd" normalization, with confusables drawn from LCG_CODESPACE
# (Latin, Cyrillic, Greek, Inherited and Common).
get_target_pattern('n', norm_form="nfd", extended = True)