1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/functions/language-processing/java/nu/marginalia/language/encoding/UnicodeRanges.java
Viktor Lofgren c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00

87 lines
2.7 KiB
Java

package nu.marginalia.language.encoding;
/**
 * A table of inclusive character ranges, each named after the Unicode block
 * it covers, together with a heuristic for deciding whether a text contains
 * a noteworthy number of characters from that range.
 *
 * <p>Every range lies in the Basic Multilingual Plane, so {@link #test(String)}
 * compares individual UTF-16 {@code char} values against the bounds.
 *
 * <p>Each constant carries a {@code sensitive} flag. The flag only selects which
 * of two match-count thresholds {@link #test(String)} applies: a low one for
 * sensitive ranges and a higher one for the rest.
 */
public enum UnicodeRanges {
    // --- ranges declared with sensitive = false (higher threshold) ---
    GREEK(false, 0x0370, 0x03FF),
    CYRILLIC(false, 0x0400, 0x04FF),
    CYRILLIC2(false, 0x0500, 0x052F),
    ARMENIAN(false, 0x0530, 0x058F),
    HEBREW(false, 0x0590, 0x05FF),
    ARABIC(false, 0x0600, 0x06FF),
    SYRIAC(false, 0x0700, 0x074F),
    THAANA(false, 0x0780, 0x07BF),
    DEVANAGARI(false, 0x0900, 0x097F),
    BENGALI(false, 0x0980, 0x09FF),
    GURMUKHI(false, 0x0A00, 0x0A7F),
    GUJARATI(false, 0x0A80, 0x0AFF),
    ORIYA(false, 0x0B00, 0x0B7F),
    TAMIL(false, 0x0B80, 0x0BFF),
    TELUGU(false, 0x0C00, 0x0C7F),
    KANNADA(false, 0x0C80, 0x0CFF),
    MALAYALAM(false, 0x0D00, 0x0D7F),
    SINHALA(false, 0x0D80, 0x0DFF),
    THAI(false, 0x0E00, 0x0E7F),
    LAO(false, 0x0E80, 0x0EFF),
    TIBETAN(false, 0x0F00, 0x0FFF),
    MYANMAR(false, 0x1000, 0x109F),
    GEORGIAN(false, 0x10A0, 0x10FF),
    HANGUL(false, 0x1100, 0x11FF),
    ETHIOPIC(false, 0x1200, 0x137F),
    CHEROKEE(false, 0x13A0, 0x13FF),
    ABORIGINAL(false, 0x1400, 0x167F),
    OGHAM(false, 0x1680, 0x169F),
    RUNIC(false, 0x16A0, 0x16FF),
    TAGALOG(false, 0x1700, 0x171F),
    HANUNOO(false, 0x1720, 0x173F),
    BUHID(false, 0x1740, 0x175F),
    TAGBANWA(false, 0x1760, 0x177F),
    KHMER(false, 0x1780, 0x17FF),
    MONGOLIAN(false, 0x1800, 0x18AF),
    LIMBU(false, 0x1900, 0x194F),
    TAILE(false, 0x1950, 0x197F),
    KHMER2(false, 0x19E0, 0x19FF),

    // --- ranges declared with sensitive = true (lower threshold) ---
    CJKRADICALS(true, 0x2E80, 0x2EFF),
    KANGXIRADICALS(true, 0x2F00, 0x2FDF),
    IDEOGRAPHICDESCRIPTION(true, 0x2FF0, 0x2FFF),
    CJKSYMBOLS(true, 0x3000, 0x303F),
    HIRAGANA(true, 0x3040, 0x309F),
    KATAKANA(true, 0x30A0, 0x30FF),
    BOPOMOFO(true, 0x3100, 0x312F),
    HANGULJAMO(true, 0x3130, 0x318F),
    KANBUN(true, 0x3190, 0x319F),
    BOPOMOFOEXTENDED(true, 0x31A0, 0x31BF),
    KATAKANAPHONETIC(true, 0x31F0, 0x31FF),
    ENCLOSEDCJK(true, 0x3200, 0x32FF),
    CJKCOMPATIBILITY(true, 0x3300, 0x33FF),
    CJKUNIFIEDA(true, 0x3400, 0x4DBF),
    YIJINGHEXAGRAMSYMBOLS(true, 0x4DC0, 0x4DFF),
    CJKUNIFIEDIDEOGRAPHS(true, 0x4E00, 0x9FFF),
    YISYLLABLES(true, 0xA000, 0xA48F),
    YIRADICALS(true, 0xA490, 0xA4CF),
    HANGULSYLLABLES(true, 0xAC00, 0xD7AF);

    /** Upper bound on how many leading characters of the input are inspected. */
    private static final int SCAN_LIMIT = 2000;

    /** Match-count threshold applied when {@link #sensitive} is {@code true}. */
    private static final int SENSITIVE_THRESHOLD = 15;

    /** Match-count threshold applied when {@link #sensitive} is {@code false}. */
    private static final int DEFAULT_THRESHOLD = 100;

    /** Lowest character value in the range (inclusive). */
    final int min;

    /** Highest character value in the range (inclusive). */
    final int max;

    /** Whether {@link #test(String)} uses the lower threshold for this range. */
    final boolean sensitive;

    /**
     * @param sensitive selects the lower match threshold when {@code true}
     * @param min       first character value of the range, inclusive
     * @param max       last character value of the range, inclusive
     */
    UnicodeRanges(boolean sensitive, int min, int max) {
        this.min = min;
        this.max = max;
        this.sensitive = sensitive;
    }

    /**
     * Reports whether the beginning of {@code text} contains enough characters
     * from this range.
     *
     * <p>At most the first 2000 {@code char}s are examined. The method answers
     * {@code true} as soon as it encounters a matching character while the tally
     * of earlier matches already exceeds the threshold (15 for sensitive ranges,
     * 100 otherwise). In effect that is the 17th match for sensitive ranges and
     * the 102nd match for all others.
     *
     * @param text the text to scan; must not be {@code null}
     * @return {@code true} if the scanned prefix holds enough in-range
     *         characters, {@code false} otherwise
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public boolean test(String text) {
        final int threshold = sensitive ? SENSITIVE_THRESHOLD : DEFAULT_THRESHOLD;
        final int scanLength = Math.min(SCAN_LIMIT, text.length());

        int priorMatches = 0;
        for (int idx = 0; idx < scanLength; idx++) {
            final char ch = text.charAt(idx);
            if (ch < min || ch > max) {
                continue;
            }
            // Compare the tally of *earlier* matches first, then record this one.
            if (priorMatches > threshold) {
                return true;
            }
            priorMatches++;
        }
        return false;
    }
}