(language) Make unicode normalization configurable

Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-05 21:22:39 +02:00)
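Replaces the static AsciiFlattener with a per-language UnicodeNormalization strategy. Each language in languages.xml now declares a <unicodeNormalization> tag whose algorithm attribute selects one of three implementations (minimal, e-accents, or maximal-latin); QueryParser and SentenceSegmentSplitter obtain the normalizer from the LanguageDefinition instead of calling the static flattener, and languages without the tag default to quote normalization only.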
@@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.model.query.*;
 import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
 import nu.marginalia.functions.searchquery.query_parser.QueryParser;
 import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
-import nu.marginalia.api.searchquery.model.query.QueryStrategy;
-import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
 import nu.marginalia.language.WordPatterns;
+import nu.marginalia.language.config.LanguageConfiguration;
+import nu.marginalia.language.model.LanguageDefinition;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -26,16 +26,21 @@ public class QueryFactory {

     private final QueryParser queryParser = new QueryParser();
     private final QueryExpansion queryExpansion;
+    private final LanguageConfiguration languageConfiguration;

     @Inject
-    public QueryFactory(QueryExpansion queryExpansion)
+    public QueryFactory(QueryExpansion queryExpansion, LanguageConfiguration languageConfiguration)
     {
         this.queryExpansion = queryExpansion;
+        this.languageConfiguration = languageConfiguration;
     }

     public ProcessedQuery createQuery(QueryParams params,
                                       @Nullable RpcResultRankingParameters rankingParams) {
+        LanguageDefinition languageDefinition = languageConfiguration.getLanguage(params.langIsoCode());

         final var query = params.humanQuery();

         if (query.length() > 1000) {
@@ -45,7 +50,7 @@ public class QueryFactory {
         List<String> searchTermsHuman = new ArrayList<>();
         List<String> problems = new ArrayList<>();

-        List<QueryToken> basicQuery = queryParser.parse(query);
+        List<QueryToken> basicQuery = queryParser.parse(languageDefinition, query);

         if (basicQuery.size() >= 12) {
             problems.add("Your search query is too long");
@@ -1,9 +1,9 @@
 package nu.marginalia.functions.searchquery.query_parser;

-import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
 import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
+import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
 import nu.marginalia.language.WordPatterns;
-import nu.marginalia.language.encoding.AsciiFlattener;
+import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.util.transform_list.TransformList;

 import java.util.ArrayList;
@@ -12,8 +12,8 @@ import java.util.regex.Pattern;

 public class QueryParser {

-    public List<QueryToken> parse(String query) {
-        List<QueryToken> basicTokens = tokenizeQuery(query);
+    public List<QueryToken> parse(LanguageDefinition languageDefinition, String query) {
+        List<QueryToken> basicTokens = tokenizeQuery(languageDefinition, query);

         TransformList<QueryToken> list = new TransformList<>(basicTokens);

@@ -30,10 +30,10 @@ public class QueryParser {

     private static final Pattern noisePattern = Pattern.compile("[,\\s]");

-    public List<QueryToken> tokenizeQuery(String rawQuery) {
+    public List<QueryToken> tokenizeQuery(LanguageDefinition languageDefinition, String rawQuery) {
         List<QueryToken> tokens = new ArrayList<>();

-        String query = AsciiFlattener.flattenUnicode(rawQuery);
+        String query = languageDefinition.unicodeNormalization().flattenUnicode(rawQuery);
         query = noisePattern.matcher(query).replaceAll(" ");

         int chr = -1;
@@ -1,32 +1,42 @@
 package nu.marginalia.functions.searchquery.query_parser;

+import nu.marginalia.WmsaHome;
 import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
+import nu.marginalia.language.config.LanguageConfiguration;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;

+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
 import java.util.List;

 class QueryParserTest {

+    LanguageConfiguration languageConfiguration = new LanguageConfiguration(WmsaHome.getLanguageModels());
+
+    QueryParserTest() throws IOException, ParserConfigurationException, SAXException {
+    }
+
     @Test
     // https://github.com/MarginaliaSearch/MarginaliaSearch/issues/140
     void parse__builtin_ffs() {
         QueryParser parser = new QueryParser();
-        var tokens = parser.parse("__builtin_ffs");
+        var tokens = parser.parse(languageConfiguration.getLanguage("en"), "__builtin_ffs");
         Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("builtin_ffs", "__builtin_ffs")), tokens);
     }

     @Test
     void trailingParens() {
         QueryParser parser = new QueryParser();
-        var tokens = parser.parse("strcpy()");
+        var tokens = parser.parse(languageConfiguration.getLanguage("en"), "strcpy()");
         Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("strcpy", "strcpy()")), tokens);
     }

     @Test
     void trailingQuote() {
         QueryParser parser = new QueryParser();
-        var tokens = parser.parse("bob's");
+        var tokens = parser.parse(languageConfiguration.getLanguage("en"), "bob's");
         Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("bob", "bob's")), tokens);
     }
 }
@@ -6,12 +6,15 @@ import nu.marginalia.api.searchquery.RpcTemporalBias;
 import nu.marginalia.api.searchquery.model.query.*;
 import nu.marginalia.functions.searchquery.QueryFactory;
 import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
+import nu.marginalia.language.config.LanguageConfiguration;
 import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;

+import javax.xml.parsers.ParserConfigurationException;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -24,13 +27,11 @@ public class QueryFactoryTest {

     static QueryFactory queryFactory;

     @BeforeAll
-    public static void setUpAll() throws IOException {
+    public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {

         var lm = WmsaHome.getLanguageModels();

-        queryFactory = new QueryFactory(
-                new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm))
-        );
+        queryFactory = new QueryFactory(new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm)), new LanguageConfiguration(lm));
     }

     public SearchSpecification parseAndGetSpecs(String query) {
@@ -1,6 +1,7 @@
 package nu.marginalia.index.perftest;

 import gnu.trove.list.array.TLongArrayList;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.api.searchquery.RpcQueryLimits;
 import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
@@ -22,6 +23,7 @@ import nu.marginalia.index.reverse.PrioReverseIndexReader;
 import nu.marginalia.index.reverse.WordLexicon;
 import nu.marginalia.index.reverse.query.IndexQuery;
 import nu.marginalia.index.searchset.SearchSetAny;
+import nu.marginalia.language.config.LanguageConfiguration;
 import nu.marginalia.language.keywords.KeywordHasher;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
 import nu.marginalia.segmentation.NgramLexicon;
@@ -37,7 +39,6 @@ import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
-import java.util.concurrent.TimeoutException;

 public class PerfTestMain {
     static Duration warmupTime = Duration.ofMinutes(1);
@@ -109,18 +110,19 @@ public class PerfTestMain {
         );
     }

-    static QueryFactory createQueryFactory(Path homeDir) throws IOException {
+    static QueryFactory createQueryFactory(Path homeDir) throws Exception {
         return new QueryFactory(
                 new QueryExpansion(
                         new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
                         new NgramLexicon()
-                )
+                ),
+                new LanguageConfiguration(WmsaHome.getLanguageModels())
         );
     }

     public static void runValuation(Path homeDir,
                                     Path indexDir,
-                                    String rawQuery) throws IOException, SQLException, TimeoutException {
+                                    String rawQuery) throws Exception {

         CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
         QueryFactory queryFactory = createQueryFactory(homeDir);
@@ -192,7 +194,7 @@ public class PerfTestMain {

     public static void runExecution(Path homeDir,
                                     Path indexDir,
-                                    String rawQuery) throws IOException, SQLException, InterruptedException {
+                                    String rawQuery) throws Exception {

         CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
         QueryFactory queryFactory = createQueryFactory(homeDir);
@@ -245,7 +247,7 @@ public class PerfTestMain {

     public static void runLookup(Path homeDir,
                                  Path indexDir,
-                                 String rawQuery) throws IOException, SQLException
+                                 String rawQuery) throws Exception
     {
         CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
@@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText;
 import com.google.inject.Inject;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.language.encoding.UnicodeNormalization;
 import nu.marginalia.language.keywords.KeywordHasher;
 import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.language.pos.PosPattern;
@@ -136,9 +137,10 @@ public class LanguageConfiguration {
                 KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
                 Map<PosPatternCategory, List<PosPattern>> posPatterns =
                         parsePosPatterns(posTagger, languageTag, isoCode);
+                UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);

                 languages.put(isoCode,
-                        new LanguageDefinition(isoCode, name, stemmer, keywordHasher, posTagger, posPatterns));
+                        new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
             }
             catch (IOException ex) {
                 logger.error("Failed to set up language " + isoCode, ex);
@@ -146,6 +148,21 @@ public class LanguageConfiguration {
         }
     }

+    private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
+        NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
+        if (normalizationTags.getLength() == 0)
+            return new UnicodeNormalization.JustNormalizeQuotes();
+        Element normalizationTag = (Element) normalizationTags.item(0);
+        String algorithm = normalizationTag.getAttribute("algorithm");
+
+        return switch (algorithm) {
+            case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
+            case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
+            case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
+            default -> throw new IllegalArgumentException("Invalid algorithm " + algorithm + " on language configuration for " + isoCode);
+        };
+    }
+
     private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
                                                                        Element languageTag, String isoCode) {
         if (null == posTagger)
@@ -1,135 +0,0 @@
-package nu.marginalia.language.encoding;
-
-public class AsciiFlattener {
-
-    private static final boolean NO_FLATTEN_UNICODE =
-            Boolean.getBoolean("system.noFlattenUnicode");
-
-    public static String flattenUnicode(String s) {
-        if (NO_FLATTEN_UNICODE)
-            return s;
-
-        if (isPlainAscii(s)) {
-            return s;
-        }
-
-        StringBuilder sb = new StringBuilder(s.length() + 10);
-
-        int numCp = s.codePointCount(0, s.length());
-
-        // Falsehoods programmers believe about the latin alphabet ;-)
-
-        for (int i = 0; i < numCp; i++) {
-            int c = s.codePointAt(i);
-
-            if ("\u201C\u201D".indexOf(c) >= 0) {
-                sb.append('"');
-            }
-            else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
-                sb.append('a');
-            }
-            else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
-                sb.append('b');
-            }
-            else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
-                sb.append('c');
-            }
-            else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
-                sb.append('d');
-            }
-            else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
-                sb.append('e');
-            }
-            else if ("ḟƒ".indexOf(c) >= 0) {
-                sb.append('f');
-            }
-            else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
-                sb.append('g');
-            }
-            else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
-                sb.append('h');
-            }
-            else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
-                sb.append('i');
-            }
-            else if ("ĵǰɉ".indexOf(c) >= 0) {
-                sb.append('j');
-            }
-            else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
-                sb.append('k');
-            }
-            else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
-                sb.append('l');
-            }
-            else if ("ḿṁṃ".indexOf(c) >= 0) {
-                sb.append('m');
-            }
-            else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
-                sb.append('n');
-            }
-            else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
-                sb.append('o');
-            }
-            else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
-                sb.append('p');
-            }
-            else if ("ꝗ".indexOf(c) >= 0) {
-                sb.append('q');
-            }
-            else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
-                sb.append('r');
-            }
-            else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
-                sb.append('s');
-            }
-            else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
-                sb.append('t');
-            }
-            else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
-                sb.append('u');
-            }
-            else if ("ṽṿʋỽ".indexOf(c) >= 0) {
-                sb.append('v');
-            }
-            else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
-                sb.append('w');
-            }
-            else if ("x̂ẍẋ".indexOf(c) >= 0) {
-                sb.append('x');
-            }
-            else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
-                sb.append('y');
-            }
-            else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
-                sb.append('z');
-            }
-            else if ("Þþ".indexOf(c) >= 0) {
-                sb.append("th");
-            }
-            else if ('ß' == c) {
-                sb.append("ss");
-            }
-            else if (isAscii(c)) {
-                sb.append((char) c);
-            }
-        }
-
-        return sb.toString();
-    }
-
-    private static boolean isPlainAscii(String s) {
-        int i;
-
-        int numCp = s.codePointCount(0, s.length());
-
-        for (i = 0; i < numCp && isAscii(s.codePointAt(i)); i++);
-
-        return i == s.length();
-    }
-
-    private static boolean isAscii(int c) {
-        return (c & ~0x7f) == 0;
-    }
-}
@@ -0,0 +1,197 @@
+package nu.marginalia.language.encoding;
+
+public interface UnicodeNormalization {
+
+    String flattenUnicode(String s);
+
+    static final boolean NO_FLATTEN_UNICODE =
+            Boolean.getBoolean("system.noFlattenUnicode");
+
+    class JustNormalizeQuotes implements UnicodeNormalization {
+        public String flattenUnicode(String s) {
+            if (NO_FLATTEN_UNICODE)
+                return s;
+
+            if (isPlainAscii(s)) {
+                return s;
+            }
+
+            StringBuilder sb = new StringBuilder(s.length() + 10);
+
+            int numCp = s.codePointCount(0, s.length());
+
+            for (int i = 0; i < numCp; i++) {
+                int c = s.codePointAt(i);
+
+                if ("\u201C\u201D".indexOf(c) >= 0) {
+                    sb.append('"');
+                }
+                else {
+                    sb.appendCodePoint(c);
+                }
+            }
+
+            return sb.toString();
+        }
+    }
+
+    class FlattenEAccents implements UnicodeNormalization {
+        public String flattenUnicode(String s) {
+            if (NO_FLATTEN_UNICODE)
+                return s;
+
+            if (isPlainAscii(s)) {
+                return s;
+            }
+
+            StringBuilder sb = new StringBuilder(s.length() + 10);
+
+            int numCp = s.codePointCount(0, s.length());
+
+            for (int i = 0; i < numCp; i++) {
+                int c = s.codePointAt(i);
+
+                if ("\u201C\u201D".indexOf(c) >= 0) {
+                    sb.append('"');
+                }
+                else if ("é".indexOf(c) >= 0) {
+                    sb.append('e');
+                }
+                else {
+                    sb.appendCodePoint(c);
+                }
+            }
+
+            return sb.toString();
+        }
+    }
+
+    class FlattenAllLatin implements UnicodeNormalization {
+
+        public String flattenUnicode(String s) {
+            if (NO_FLATTEN_UNICODE)
+                return s;
+
+            if (isPlainAscii(s)) {
+                return s;
+            }
+
+            StringBuilder sb = new StringBuilder(s.length() + 10);
+
+            int numCp = s.codePointCount(0, s.length());
+
+            // Falsehoods programmers believe about the latin alphabet ;-)
+
+            for (int i = 0; i < numCp; i++) {
+                int c = s.codePointAt(i);
+
+                if ("\u201C\u201D".indexOf(c) >= 0) {
+                    sb.append('"');
+                }
+                else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
+                    sb.append('a');
+                }
+                else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
+                    sb.append('b');
+                }
+                else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
+                    sb.append('c');
+                }
+                else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
+                    sb.append('d');
+                }
+                else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
+                    sb.append('e');
+                }
+                else if ("ḟƒ".indexOf(c) >= 0) {
+                    sb.append('f');
+                }
+                else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
+                    sb.append('g');
+                }
+                else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
+                    sb.append('h');
+                }
+                else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
+                    sb.append('i');
+                }
+                else if ("ĵǰɉ".indexOf(c) >= 0) {
+                    sb.append('j');
+                }
+                else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
+                    sb.append('k');
+                }
+                else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
+                    sb.append('l');
+                }
+                else if ("ḿṁṃ".indexOf(c) >= 0) {
+                    sb.append('m');
+                }
+                else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
+                    sb.append('n');
+                }
+                else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
+                    sb.append('o');
+                }
+                else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
+                    sb.append('p');
+                }
+                else if ("ꝗ".indexOf(c) >= 0) {
+                    sb.append('q');
+                }
+                else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
+                    sb.append('r');
+                }
+                else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
+                    sb.append('s');
+                }
+                else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
+                    sb.append('t');
+                }
+                else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
+                    sb.append('u');
+                }
+                else if ("ṽṿʋỽ".indexOf(c) >= 0) {
+                    sb.append('v');
+                }
+                else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
+                    sb.append('w');
+                }
+                else if ("x̂ẍẋ".indexOf(c) >= 0) {
+                    sb.append('x');
+                }
+                else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
+                    sb.append('y');
+                }
+                else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
+                    sb.append('z');
+                }
+                else if ("Þþ".indexOf(c) >= 0) {
+                    sb.append("th");
+                }
+                else if ('ß' == c) {
+                    sb.append("ss");
+                }
+                else if (isAscii(c)) {
+                    sb.append((char) c);
+                }
+            }
+
+            return sb.toString();
+        }
+    }
+
+    private static boolean isPlainAscii(String s) {
+        int i;
+
+        int numCp = s.codePointCount(0, s.length());
+
+        for (i = 0; i < numCp && isAscii(s.codePointAt(i)); i++);
+
+        return i == s.length();
+    }
+
+    private static boolean isAscii(int c) {
+        return (c & ~0x7f) == 0;
+    }
+}
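As a quick usage sketch (not part of the commit; behavior mirrors UnicodeNormalizationTest further down in this diff):

    UnicodeNormalization latin = new UnicodeNormalization.FlattenAllLatin();
    latin.flattenUnicode("Stülpnagelstraße");   // "Stulpnagelstrasse"
    latin.flattenUnicode("[アグレッシブ烈子]");  // "[]", non-flattenable code points are omitted

    UnicodeNormalization quotesOnly = new UnicodeNormalization.JustNormalizeQuotes();
    quotesOnly.flattenUnicode("\u201Cabc\u201D"); // "abc" wrapped in plain ASCII double quotes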
@@ -1,6 +1,7 @@
 package nu.marginalia.language.model;

 import nu.marginalia.language.WordPatterns;
+import nu.marginalia.language.encoding.UnicodeNormalization;
 import nu.marginalia.language.keywords.KeywordHasher;
 import nu.marginalia.language.pos.PosPattern;
 import nu.marginalia.language.pos.PosPatternCategory;
@@ -16,6 +17,7 @@ public final class LanguageDefinition {
     private final String isoCode;
     private final String name;
     private final Stemmer stemmer;
+    private final UnicodeNormalization unicodeNormalization;
     private final KeywordHasher keywordHasher;

     @Nullable
@@ -25,12 +27,14 @@ public final class LanguageDefinition {
     public LanguageDefinition(String isoCode,
                               String name,
                               Stemmer stemmer,
+                              UnicodeNormalization unicodeNormalization,
                               KeywordHasher keywordHasher,
                               @Nullable PosTagger posTagger,
                               Map<PosPatternCategory, List<PosPattern>> posPatterns) {
         this.isoCode = isoCode;
         this.name = name;
         this.stemmer = stemmer;
+        this.unicodeNormalization = unicodeNormalization;
         this.keywordHasher = keywordHasher;
         this.posTagger = posTagger;
         this.posPatterns = posPatterns;
@@ -57,6 +61,10 @@ public final class LanguageDefinition {
         return keywordHasher;
     }

+    public UnicodeNormalization unicodeNormalization() {
+        return unicodeNormalization;
+    }
+
     public long[] posTagSentence(String[] words) {
         if (posTagger == null) return new long[0];
         return posTagger.tagSentence(words);
@@ -129,7 +129,7 @@ public class SentenceExtractor {
                                                  EnumSet<HtmlTag> htmlTags) {
         final Stemmer stemmer = language.stemmer();

-        var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
+        var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);

         String[] words = wordsAndSeps.words();
         BitSet seps = wordsAndSeps.separators();
@@ -218,11 +218,13 @@ public class SentenceExtractor {

         List<DocumentSentence> ret = new ArrayList<>(sentences.length);

+        SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);
+
         if (isNaturalLanguage) {
             // Natural language text; do POS tagging and stemming

             for (String sent : sentences) {
-                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
+                var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                 var tokens = wordsAndSeps.words();
                 var separators = wordsAndSeps.separators();
                 var posTags = language.posTagSentence(tokens);
@@ -274,7 +276,7 @@ public class SentenceExtractor {
             // as this is not likely to be useful

             for (String sent : sentences) {
-                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
+                var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                 var tokens = wordsAndSeps.words();
                 var separators = wordsAndSeps.separators();
                 var posTags = new long[tokens.length];
@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;

 import com.google.common.base.CharMatcher;
 import gnu.trove.list.array.TIntArrayList;
-import nu.marginalia.language.encoding.AsciiFlattener;
+import nu.marginalia.language.encoding.UnicodeNormalization;
+import nu.marginalia.language.model.LanguageDefinition;

 import java.util.ArrayList;
 import java.util.BitSet;
@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;

 public class SentenceSegmentSplitter {

+    private final UnicodeNormalization unicodeNormalization;
+
     public record SeparatedSentence(String[] words, BitSet separators) { }

     private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");

     private static final Pattern wordBreakPattern;

     static {
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
         }
     }

+    SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
+        this.unicodeNormalization = languageDefinition.unicodeNormalization();
+    }
+
     /** Split a sentence into words and separators.
      *
      * @param segment The sentence to split
      * @return A list of words and separators
      */
-    public static SeparatedSentence splitSegment(String segment, int maxLength) {
-        String flatSegment = AsciiFlattener.flattenUnicode(segment);
+    public SeparatedSentence splitSegment(String segment, int maxLength) {
+        String flatSegment = unicodeNormalization.flattenUnicode(segment);

         var matcher = wordBreakPattern.matcher(flatSegment);
@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <!DOCTYPE languages [
 <!ELEMENT languages (language*,resource*)>
-<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
+<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>

 <!ELEMENT resource EMPTY>
 <!ATTLIST resource
@@ -18,6 +18,11 @@
         disabled (true|false) "false"
         >

+<!ELEMENT unicodeNormalization EMPTY>
+<!ATTLIST unicodeNormalization
+        algorithm (minimal|e-accents|maximal-latin) #REQUIRED
+        >
+
 <!ELEMENT stemmer (pospattern?)>
 <!ATTLIST stemmer
         algorithm (porter|snowball|none) #REQUIRED
@@ -37,6 +42,7 @@

 <!ELEMENT ngrams (pospattern*)>
 <!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
+
 <!ELEMENT pospattern (#PCDATA)>

 <!ELEMENT sentenceDetector EMPTY>
@@ -56,6 +62,7 @@
             <pospattern>!(IN TO CC DT)</pospattern>
         </stemmer>
         <sentenceDetector algorithm="opennlp"/>
+        <unicodeNormalization algorithm="maximal-latin" />
         <rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
         <ngrams type="name">
             <pospattern>NNP*</pospattern>
@@ -106,6 +113,7 @@
         <keywordHash algorithm="asciish" />
         <stemmer algorithm="snowball" variant="SWEDISH" />
         <sentenceDetector algorithm="opennlp"/>
+        <unicodeNormalization algorithm="e-accents" />
        <rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
         <ngrams type="name">
             <pospattern>PROPN</pospattern>
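For illustration, a complete language entry then looks roughly like the sketch below; the isoCode and name attributes and the stemmer algorithm are assumptions here (the hunks above only show the changed lines of the real entries). Per parseUnicodeNormalization, a missing unicodeNormalization tag falls back to the minimal quote-only strategy:

    <language isoCode="en" name="English">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="porter" />
        <sentenceDetector algorithm="opennlp" />
        <unicodeNormalization algorithm="maximal-latin" />
        <rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
    </language>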
@@ -1,38 +0,0 @@
-package nu.marginalia.language.encoding;
-
-import org.junit.jupiter.api.Test;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-class AsciiFlattenerTest {
-
-    @Test
-    void flattenUnicodePlainAscii() {
-        String s = "abc";
-
-        // If the string is ascii, we don't want to allocate a copy
-
-        assertSame(s, AsciiFlattener.flattenUnicode(s));
-    }
-
-    @Test
-    void flattenUnicode() {
-        String s = "Stülpnagelstraße";
-
-        assertEquals("Stulpnagelstrasse", AsciiFlattener.flattenUnicode(s));
-    }
-
-    @Test
-    void flattenUnicode2() {
-        String s = "Koncevičius";
-
-        assertEquals("Koncevicius", AsciiFlattener.flattenUnicode(s));
-    }
-
-    @Test
-    void omitNonFlattenable() {
-        String s = "[アグレッシブ烈子]";
-
-        assertEquals("[]", AsciiFlattener.flattenUnicode(s));
-    }
-}
@@ -0,0 +1,41 @@
+package nu.marginalia.language.encoding;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+class UnicodeNormalizationTest {
+
+    UnicodeNormalization unicodeNormalization = new UnicodeNormalization.FlattenAllLatin();
+
+    @Test
+    void flattenUnicodePlainAscii() {
+        String s = "abc";
+
+        // If the string is ascii, we don't want to allocate a copy
+
+        assertSame(s, unicodeNormalization.flattenUnicode(s));
+    }
+
+    @Test
+    void flattenUnicode() {
+        String s = "Stülpnagelstraße";
+
+        assertEquals("Stulpnagelstrasse", unicodeNormalization.flattenUnicode(s));
+    }
+
+    @Test
+    void flattenUnicode2() {
+        String s = "Koncevičius";
+
+        assertEquals("Koncevicius", unicodeNormalization.flattenUnicode(s));
+    }
+
+    @Test
+    void omitNonFlattenable() {
+        String s = "[アグレッシブ烈子]";
+
+        assertEquals("[]", unicodeNormalization.flattenUnicode(s));
+    }
+}
@@ -11,6 +11,7 @@ import org.xml.sax.SAXException;

 import javax.xml.parsers.ParserConfigurationException;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.Objects;

@@ -60,7 +61,8 @@ class SentenceExtractorTest {
     void testJava() {
         var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));

         assertEquals(4, dld.wordsLowerCase.length);
+        System.out.println(Arrays.toString(dld.wordsLowerCase));

         assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
     }