
(language) Make unicode normalization configurable

Viktor Lofgren
2025-09-08 13:18:58 +02:00
parent 1432fc87d7
commit bffc159486
15 changed files with 332 additions and 206 deletions

View File

@@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -26,16 +26,21 @@ public class QueryFactory {
private final QueryParser queryParser = new QueryParser();
private final QueryExpansion queryExpansion;
private final LanguageConfiguration languageConfiguration;
@Inject
public QueryFactory(QueryExpansion queryExpansion)
public QueryFactory(QueryExpansion queryExpansion, LanguageConfiguration languageConfiguration)
{
this.queryExpansion = queryExpansion;
this.languageConfiguration = languageConfiguration;
}
public ProcessedQuery createQuery(QueryParams params,
@Nullable RpcResultRankingParameters rankingParams) {
LanguageDefinition languageDefinition = languageConfiguration.getLanguage(params.langIsoCode());
final var query = params.humanQuery();
if (query.length() > 1000) {
@@ -45,7 +50,7 @@ public class QueryFactory {
List<String> searchTermsHuman = new ArrayList<>();
List<String> problems = new ArrayList<>();
List<QueryToken> basicQuery = queryParser.parse(query);
List<QueryToken> basicQuery = queryParser.parse(languageDefinition, query);
if (basicQuery.size() >= 12) {
problems.add("Your search query is too long");
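Taken together, the query path now resolves the language once per request and threads it through parsing. A minimal sketch of the resulting call sequence, assuming params.langIsoCode() returns a configured code such as "en" (all names as they appear in the hunks above):

    LanguageDefinition language = languageConfiguration.getLanguage(params.langIsoCode());
    List<QueryToken> tokens = queryParser.parse(language, params.humanQuery());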

View File

@@ -1,9 +1,9 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.util.transform_list.TransformList;
import java.util.ArrayList;
@@ -12,8 +12,8 @@ import java.util.regex.Pattern;
public class QueryParser {
public List<QueryToken> parse(String query) {
List<QueryToken> basicTokens = tokenizeQuery(query);
public List<QueryToken> parse(LanguageDefinition languageDefinition, String query) {
List<QueryToken> basicTokens = tokenizeQuery(languageDefinition, query);
TransformList<QueryToken> list = new TransformList<>(basicTokens);
@@ -30,10 +30,10 @@ public class QueryParser {
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
public List<QueryToken> tokenizeQuery(String rawQuery) {
public List<QueryToken> tokenizeQuery(LanguageDefinition languageDefinition, String rawQuery) {
List<QueryToken> tokens = new ArrayList<>();
String query = AsciiFlattener.flattenUnicode(rawQuery);
String query = languageDefinition.unicodeNormalization().flattenUnicode(rawQuery);
query = noisePattern.matcher(query).replaceAll(" ");
int chr = -1;

View File

@@ -1,32 +1,42 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.WmsaHome;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.config.LanguageConfiguration;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.List;
class QueryParserTest {
LanguageConfiguration languageConfiguration = new LanguageConfiguration(WmsaHome.getLanguageModels());
QueryParserTest() throws IOException, ParserConfigurationException, SAXException {
}
@Test
// https://github.com/MarginaliaSearch/MarginaliaSearch/issues/140
void parse__builtin_ffs() {
QueryParser parser = new QueryParser();
var tokens = parser.parse("__builtin_ffs");
var tokens = parser.parse(languageConfiguration.getLanguage("en"), "__builtin_ffs");
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("builtin_ffs", "__builtin_ffs")), tokens);
}
@Test
void trailingParens() {
QueryParser parser = new QueryParser();
var tokens = parser.parse("strcpy()");
var tokens = parser.parse(languageConfiguration.getLanguage("en"), "strcpy()");
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("strcpy", "strcpy()")), tokens);
}
@Test
void trailingQuote() {
QueryParser parser = new QueryParser();
var tokens = parser.parse("bob's");
var tokens = parser.parse(languageConfiguration.getLanguage("en"), "bob's");
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("bob", "bob's")), tokens);
}
}

View File

@@ -6,12 +6,15 @@ import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -24,13 +27,11 @@ public class QueryFactoryTest {
static QueryFactory queryFactory;
@BeforeAll
public static void setUpAll() throws IOException {
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
var lm = WmsaHome.getLanguageModels();
queryFactory = new QueryFactory(
new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm))
);
queryFactory = new QueryFactory(new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm)), new LanguageConfiguration(lm));
}
public SearchSpecification parseAndGetSpecs(String query) {

View File

@@ -1,6 +1,7 @@
package nu.marginalia.index.perftest;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
@@ -22,6 +23,7 @@ import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.WordLexicon;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.segmentation.NgramLexicon;
@@ -37,7 +39,6 @@ import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeoutException;
public class PerfTestMain {
static Duration warmupTime = Duration.ofMinutes(1);
@@ -109,18 +110,19 @@ public class PerfTestMain {
);
}
static QueryFactory createQueryFactory(Path homeDir) throws IOException {
static QueryFactory createQueryFactory(Path homeDir) throws Exception {
return new QueryFactory(
new QueryExpansion(
new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
new NgramLexicon()
)
),
new LanguageConfiguration(WmsaHome.getLanguageModels())
);
}
public static void runValuation(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException, TimeoutException {
String rawQuery) throws Exception {
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
@@ -192,7 +194,7 @@ public class PerfTestMain {
public static void runExecution(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException, InterruptedException {
String rawQuery) throws Exception {
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
@@ -245,7 +247,7 @@ public class PerfTestMain {
public static void runLookup(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException
String rawQuery) throws Exception
{
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);

View File

@@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.pos.PosPattern;
@@ -136,9 +137,10 @@ public class LanguageConfiguration {
KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
Map<PosPatternCategory, List<PosPattern>> posPatterns =
parsePosPatterns(posTagger, languageTag, isoCode);
UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);
languages.put(isoCode,
new LanguageDefinition(isoCode, name, stemmer, keywordHasher, posTagger, posPatterns));
new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
}
catch (IOException ex) {
logger.error("Failed to set up language " + isoCode, ex);
@@ -146,6 +148,21 @@ public class LanguageConfiguration {
}
}
private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
if (normalizationTags.getLength() == 0)
return new UnicodeNormalization.JustNormalizeQuotes();
Element normalizationTag = (Element) normalizationTags.item(0);
String algorithm = normalizationTag.getAttribute("algorithm");
return switch(algorithm) {
case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
default -> throw new IllegalArgumentException("Invalid algorithm " + algorithm + " on language configuration for " + isoCode);
};
}
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
Element languageTag, String isoCode) {
if (null == posTagger)
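On the configuration side, the tag this parser consumes is a single empty element per language. The three accepted algorithm values map onto the strategy classes in the switch above, and omitting the tag falls back to JustNormalizeQuotes; illustrative snippets (values per the DTD further down):

    <unicodeNormalization algorithm="minimal" />        <!-- JustNormalizeQuotes -->
    <unicodeNormalization algorithm="e-accents" />      <!-- FlattenEAccents -->
    <unicodeNormalization algorithm="maximal-latin" />  <!-- FlattenAllLatin -->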

View File

@@ -1,135 +0,0 @@
package nu.marginalia.language.encoding;
public class AsciiFlattener {
private static final boolean NO_FLATTEN_UNICODE =
Boolean.getBoolean("system.noFlattenUnicode");
public static String flattenUnicode(String s) {
if (NO_FLATTEN_UNICODE)
return s;
if (isPlainAscii(s)) {
return s;
}
StringBuilder sb = new StringBuilder(s.length() + 10);
int numCp = s.codePointCount(0, s.length());
// Falsehoods programmers believe about the latin alphabet ;-)
for (int i = 0; i < numCp; i++) {
int c = s.codePointAt(i);
if ("\u201C\u201D".indexOf(c) >= 0) {
sb.append('"');
}
else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
sb.append('a');
}
else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
sb.append('b');
}
else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
sb.append('c');
}
else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
sb.append('d');
}
else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
sb.append('e');
}
else if ("ḟƒ".indexOf(c) >= 0) {
sb.append('f');
}
else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
sb.append('g');
}
else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
sb.append('g');
}
else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
sb.append('i');
}
else if ("ĵǰɉ".indexOf(c) >= 0) {
sb.append('j');
}
else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
sb.append('k');
}
else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
sb.append('l');
}
else if ("ḿṁṃ".indexOf(c) >= 0) {
sb.append('m');
}
else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
sb.append('n');
}
else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
sb.append('o');
}
else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
sb.append('p');
}
else if ("".indexOf(c) >= 0) {
sb.append('q');
}
else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
sb.append('r');
}
else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
sb.append('s');
}
else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
sb.append('t');
}
else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
sb.append('u');
}
else if ("ṽṿʋỽ".indexOf(c) >= 0) {
sb.append('v');
}
else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
sb.append('w');
}
else if ("x̂ẍẋ".indexOf(c) >= 0) {
sb.append('x');
}
else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
sb.append('y');
}
else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
sb.append('z');
}
else if ("Þþ".indexOf(c) >= 0) {
sb.append("th");
}
else if ('ß' == c) {
sb.append("ss");
}
else if (isAscii(c)) {
sb.append((char) c);
}
}
return sb.toString();
}
private static boolean isPlainAscii(String s) {
int i;
int numCp = s.codePointCount(0, s.length());
for (i = 0; i < numCp && isAscii(s.codePointAt(i)); i++);
return i == s.length();
}
private static boolean isAscii(int c) {
return (c & ~0x7f) == 0;
}
}

View File

@@ -0,0 +1,197 @@
package nu.marginalia.language.encoding;
public interface UnicodeNormalization {
String flattenUnicode(String s);
static final boolean NO_FLATTEN_UNICODE =
Boolean.getBoolean("system.noFlattenUnicode");
class JustNormalizeQuotes implements UnicodeNormalization {
public String flattenUnicode(String s) {
if (NO_FLATTEN_UNICODE)
return s;
if (isPlainAscii(s)) {
return s;
}
StringBuilder sb = new StringBuilder(s.length() + 10);
int numCp = s.codePointCount(0, s.length());
for (int i = 0; i < numCp; i++) {
int c = s.codePointAt(i);
if ("\u201C\u201D".indexOf(c) >= 0) {
sb.append('"');
}
sb.appendCodePoint(c);
}
return sb.toString();
}
}
class FlattenEAccents implements UnicodeNormalization {
public String flattenUnicode(String s) {
if (NO_FLATTEN_UNICODE)
return s;
if (isPlainAscii(s)) {
return s;
}
StringBuilder sb = new StringBuilder(s.length() + 10);
int numCp = s.codePointCount(0, s.length());
for (int i = 0; i < numCp; i++) {
int c = s.codePointAt(i);
if ("\u201C\u201D".indexOf(c) >= 0) {
sb.append('"');
}
else if ("é".indexOf(c) >= 0) {
sb.append('e');
}
sb.appendCodePoint(c);
}
return sb.toString();
}
}
class FlattenAllLatin implements UnicodeNormalization {
public String flattenUnicode(String s) {
if (NO_FLATTEN_UNICODE)
return s;
if (isPlainAscii(s)) {
return s;
}
StringBuilder sb = new StringBuilder(s.length() + 10);
int numCp = s.codePointCount(0, s.length());
// Falsehoods programmers believe about the latin alphabet ;-)
for (int i = 0; i < numCp; i++) {
int c = s.codePointAt(i);
if ("\u201C\u201D".indexOf(c) >= 0) {
sb.append('"');
}
else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
sb.append('a');
}
else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
sb.append('b');
}
else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
sb.append('c');
}
else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
sb.append('d');
}
else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
sb.append('e');
}
else if ("ḟƒ".indexOf(c) >= 0) {
sb.append('f');
}
else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
sb.append('g');
}
else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
sb.append('g');
}
else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
sb.append('i');
}
else if ("ĵǰɉ".indexOf(c) >= 0) {
sb.append('j');
}
else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
sb.append('k');
}
else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
sb.append('l');
}
else if ("ḿṁṃ".indexOf(c) >= 0) {
sb.append('m');
}
else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
sb.append('n');
}
else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
sb.append('o');
}
else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
sb.append('p');
}
else if ("".indexOf(c) >= 0) {
sb.append('q');
}
else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
sb.append('r');
}
else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
sb.append('s');
}
else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
sb.append('t');
}
else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
sb.append('u');
}
else if ("ṽṿʋỽ".indexOf(c) >= 0) {
sb.append('v');
}
else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
sb.append('w');
}
else if ("x̂ẍẋ".indexOf(c) >= 0) {
sb.append('x');
}
else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
sb.append('y');
}
else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
sb.append('z');
}
else if ("Þþ".indexOf(c) >= 0) {
sb.append("th");
}
else if ('ß' == c) {
sb.append("ss");
}
else if (isAscii(c)) {
sb.append((char) c);
}
}
return sb.toString();
}
}
private static boolean isPlainAscii(String s) {
int i;
int numCp = s.codePointCount(0, s.length());
for (i = 0; i < numCp && isAscii(s.codePointAt(i)); i++);
return i == s.length();
}
private static boolean isAscii(int c) {
return (c & ~0x7f) == 0;
}
}
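For a sense of the observable behaviour, a short sketch exercising the two extremes of the hierarchy (expected outputs taken from the tests further down; all implementations honour the system.noFlattenUnicode escape hatch):

    UnicodeNormalization minimal = new UnicodeNormalization.JustNormalizeQuotes();
    UnicodeNormalization latin = new UnicodeNormalization.FlattenAllLatin();

    minimal.flattenUnicode("\u201Cabc\u201D");  // smart quotes become plain ASCII '"'
    latin.flattenUnicode("Koncevičius");        // -> "Koncevicius"
    latin.flattenUnicode("Stülpnagelstraße");   // -> "Stulpnagelstrasse"
    latin.flattenUnicode("[アグレッシブ烈子]"); // -> "[]"; unmappable non-Latin codepoints are dropped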

View File

@@ -1,6 +1,7 @@
package nu.marginalia.language.model;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
@@ -16,6 +17,7 @@ public final class LanguageDefinition {
private final String isoCode;
private final String name;
private final Stemmer stemmer;
private final UnicodeNormalization unicodeNormalization;
private final KeywordHasher keywordHasher;
@Nullable
@@ -25,12 +27,14 @@ public final class LanguageDefinition {
public LanguageDefinition(String isoCode,
String name,
Stemmer stemmer,
UnicodeNormalization unicodeNormalization,
KeywordHasher keywordHasher,
@Nullable PosTagger posTagger,
Map<PosPatternCategory, List<PosPattern>> posPatterns) {
this.isoCode = isoCode;
this.name = name;
this.stemmer = stemmer;
this.unicodeNormalization = unicodeNormalization;
this.keywordHasher = keywordHasher;
this.posTagger = posTagger;
this.posPatterns = posPatterns;
@@ -57,6 +61,10 @@ public final class LanguageDefinition {
return keywordHasher;
}
public UnicodeNormalization unicodeNormalization() {
return unicodeNormalization;
}
public long[] posTagSentence(String[] words) {
if (posTagger == null) return new long[0];
return posTagger.tagSentence(words);

View File

@@ -129,7 +129,7 @@ public class SentenceExtractor {
EnumSet<HtmlTag> htmlTags) {
final Stemmer stemmer = language.stemmer();
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);
String[] words = wordsAndSeps.words();
BitSet seps = wordsAndSeps.separators();
@@ -218,11 +218,13 @@ public class SentenceExtractor {
List<DocumentSentence> ret = new ArrayList<>(sentences.length);
SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);
if (isNaturalLanguage) {
// Natural language text; do POS tagging and stemming
for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = language.posTagSentence(tokens);
@@ -274,7 +276,7 @@ public class SentenceExtractor {
// as this is not likely to be useful
for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = new long[tokens.length];

View File

@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;
import com.google.common.base.CharMatcher;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.model.LanguageDefinition;
import java.util.ArrayList;
import java.util.BitSet;
@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;
public class SentenceSegmentSplitter {
private final UnicodeNormalization unicodeNormalization;
public record SeparatedSentence(String[] words, BitSet separators) { }
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
private static final Pattern wordBreakPattern;
static {
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
}
}
SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
this.unicodeNormalization = languageDefinition.unicodeNormalization();
}
/** Split a sentence into words and separators.
*
* @param segment The sentence to split
* @return A list of words and separators
*/
public static SeparatedSentence splitSegment(String segment, int maxLength) {
String flatSegment = AsciiFlattener.flattenUnicode(segment);
public SeparatedSentence splitSegment(String segment, int maxLength) {
String flatSegment = unicodeNormalization.flattenUnicode(segment);
var matcher = wordBreakPattern.matcher(flatSegment);
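Since splitSegment is no longer static, each caller constructs a splitter for its language, as SentenceExtractor does above. A minimal sketch (the maxLength of 250 is illustrative; the production code passes MAX_SENTENCE_LENGTH):

    var splitter = new SentenceSegmentSplitter(languageConfiguration.getLanguage("en"));
    var sent = splitter.splitSegment("Hello, wörld", 250);
    String[] words = sent.words();          // flattened per the language's unicodeNormalization
    BitSet separators = sent.separators();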

View File

@@ -1,7 +1,7 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
<!ELEMENT languages (language*,resource*)>
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
<!ELEMENT resource EMPTY>
<!ATTLIST resource
@@ -18,6 +18,11 @@
disabled (true|false) "false"
>
<!ELEMENT unicodeNormalization EMPTY>
<!ATTLIST unicodeNormalization
algorithm (minimal|e-accents|maximal-latin) #REQUIRED
>
<!ELEMENT stemmer (pospattern?)>
<!ATTLIST stemmer
algorithm (porter|snowball|none) #REQUIRED
@@ -37,6 +42,7 @@
<!ELEMENT ngrams (pospattern*)>
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
<!ELEMENT pospattern (#PCDATA)>
<!ELEMENT sentenceDetector EMPTY>
@@ -56,6 +62,7 @@
<pospattern>!(IN TO CC DT)</pospattern>
</stemmer>
<sentenceDetector algorithm="opennlp"/>
<unicodeNormalization algorithm="maximal-latin" />
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
<ngrams type="name">
<pospattern>NNP*</pospattern>
@@ -106,6 +113,7 @@
<keywordHash algorithm="asciish" />
<stemmer algorithm="snowball" variant="SWEDISH" />
<sentenceDetector algorithm="opennlp"/>
<unicodeNormalization algorithm="e-accents" />
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
<ngrams type="name">
<pospattern>PROPN</pospattern>
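Putting the DTD change together with the English and Swedish entries above, a hypothetical minimal language entry now reads as follows (element order per the DTD; the attribute names on the language element are illustrative):

    <language isoCode="xx" name="Example">
      <keywordHash algorithm="asciish" />
      <unicodeNormalization algorithm="minimal" />
      <stemmer algorithm="none" />
      <sentenceDetector algorithm="opennlp" />
    </language>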

View File

@@ -1,38 +0,0 @@
package nu.marginalia.language.encoding;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class AsciiFlattenerTest {
@Test
void flattenUnicodePlainAscii() {
String s = "abc";
// If the string is ascii, we don't want to allocate a copy
assertSame(s, AsciiFlattener.flattenUnicode(s));
}
@Test
void flattenUnicode() {
String s = "Stülpnagelstraße";
assertEquals("Stulpnagelstrasse", AsciiFlattener.flattenUnicode(s));
}
@Test
void flattenUnicode2() {
String s = "Koncevičius";
assertEquals("Koncevicius", AsciiFlattener.flattenUnicode(s));
}
@Test
void omitNonFlattenable() {
String s = "[アグレッシブ烈子]";
assertEquals("[]", AsciiFlattener.flattenUnicode(s));
}
}

View File

@@ -0,0 +1,41 @@
package nu.marginalia.language.encoding;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
class UnicodeNormalizationTest {
UnicodeNormalization unicodeNormalization = new UnicodeNormalization.FlattenAllLatin();
@Test
void flattenUnicodePlainAscii() {
String s = "abc";
// If the string is ascii, we don't want to allocate a copy
assertSame(s, unicodeNormalization.flattenUnicode(s));
}
@Test
void flattenUnicode() {
String s = "Stülpnagelstraße";
assertEquals("Stulpnagelstrasse", unicodeNormalization.flattenUnicode(s));
}
@Test
void flattenUnicode2() {
String s = "Koncevičius";
assertEquals("Koncevicius", unicodeNormalization.flattenUnicode(s));
}
@Test
void omitNonFlattenable() {
String s = "[アグレッシブ烈子]";
assertEquals("[]", unicodeNormalization.flattenUnicode(s));
}
}

View File

@@ -11,6 +11,7 @@ import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Objects;
@@ -60,7 +61,8 @@ class SentenceExtractorTest {
void testJava() {
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
assertEquals(4, dld.wordsLowerCase.length);
System.out.println(Arrays.toString(dld.wordsLowerCase));
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
}