1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Add support for languages with no POS-tagging

For languages without a POS tagger, this disables a lot of the
smart keyword extraction, which is mostly a crutch for helping
English and similarly large languages find relevant search results.

Smaller languages, where a POS-tag model may not be available,
are probably fine with this disabled, as the search engine can
likely just work through the entire results list unaided.
This commit is contained in:
Viktor Lofgren
2025-08-29 09:48:52 +02:00
parent 59519ed7c4
commit df6434d177
7 changed files with 135 additions and 48 deletions

View File

@@ -34,37 +34,57 @@ public class DocumentKeywordExtractor {
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {
final KeywordExtractor keywordExtractor = new KeywordExtractor(dld.language());
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
if (dld.language().hasPosParsing()) {
KeywordExtractor keywordExtractor = new KeywordExtractor(dld.language());
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var positionMapper = new DocumentPositionMapper(keywordExtractor);
var keywordMetadata = KeywordMetadata.builder()
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
.urlKeywords(urlKeywords)
.build();
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var positionMapper = new DocumentPositionMapper(keywordExtractor);
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
var keywordMetadata = KeywordMetadata.builder()
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
.urlKeywords(urlKeywords)
.build();
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
wordsBuilder.addImportantWords(importantWords);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
return wordsBuilder;
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
wordsBuilder.addImportantWords(importantWords);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
return wordsBuilder;
}
else {
KeywordExtractor keywordExtractor = new KeywordExtractor(dld.language());
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var positionMapper = new DocumentPositionMapper(keywordExtractor);
var keywordMetadata = KeywordMetadata.builder()
.urlKeywords(urlKeywords)
.build();
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
return wordsBuilder;
}
}
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {

View File

@@ -6,18 +6,24 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
import nu.marginalia.keyword.extractors.UrlKeywords;
import nu.marginalia.model.idx.WordFlags;
import javax.annotation.Nullable;
public class KeywordMetadata {
@Nullable
private final TitleKeywords titleKeywords;
@Nullable
private final NameLikeKeywords nameLikeKeywords;
@Nullable
private final SubjectLikeKeywords subjectLikeKeywords;
@Nullable
private final UrlKeywords urlKeywords;
public KeywordMetadata(
TitleKeywords titleKeywords,
NameLikeKeywords nameLikeKeywords,
SubjectLikeKeywords subjectLikeKeywords,
UrlKeywords urlKeywords) {
@Nullable TitleKeywords titleKeywords,
@Nullable NameLikeKeywords nameLikeKeywords,
@Nullable SubjectLikeKeywords subjectLikeKeywords,
@Nullable UrlKeywords urlKeywords) {
this.titleKeywords = titleKeywords;
this.nameLikeKeywords = nameLikeKeywords;
this.subjectLikeKeywords = subjectLikeKeywords;
@@ -32,23 +38,23 @@ public class KeywordMetadata {
byte flags = 0;
if (subjectLikeKeywords.contains(stemmed)) {
if (subjectLikeKeywords != null && subjectLikeKeywords.contains(stemmed)) {
flags |= WordFlags.Subjects.asBit();
}
if (nameLikeKeywords.contains(stemmed)) {
if (nameLikeKeywords != null && nameLikeKeywords.contains(stemmed)) {
flags |= WordFlags.NamesWords.asBit();
}
if (titleKeywords.contains(stemmed)) {
if (titleKeywords != null && titleKeywords.contains(stemmed)) {
flags |= WordFlags.Title.asBit();
}
if (urlKeywords.containsUrl(stemmed)) {
if (urlKeywords != null && urlKeywords.containsUrl(stemmed)) {
flags |= WordFlags.UrlPath.asBit();
}
if (urlKeywords.containsDomain(stemmed)) {
if (urlKeywords != null && urlKeywords.containsDomain(stemmed)) {
flags |= WordFlags.UrlDomain.asBit();
}

View File

@@ -142,8 +142,11 @@ public class LanguageConfiguration {
}
}
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(PosTagger posTagger,
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
Element languageTag, String isoCode) {
if (null == posTagger)
return Map.of();
Map<PosPatternCategory, List<PosPattern>> ret = new HashMap<>();
NodeList ngramsElements = languageTag.getElementsByTagName("ngrams");
@@ -172,11 +175,14 @@ public class LanguageConfiguration {
return ret;
}
@Nullable
private PosTagger parsePosTag(Element languageTag, String isoCode) throws IOException {
NodeList rdrElements = languageTag.getElementsByTagName("rdrTagger");
if (rdrElements.getLength() != 1) {
throw new IllegalArgumentException(
"language.xml: No rdrTagger block for language element " + isoCode);
if (rdrElements.getLength() < 1) {
return null;
}
else if (rdrElements.getLength() > 1) {
throw new IllegalStateException("Multiple rdr taggers defined in " + isoCode);
}
Element rdrElement = (Element) rdrElements.item(0);

View File

@@ -6,18 +6,66 @@ import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;
import javax.annotation.Nullable;
import java.util.List;
import java.util.Map;
public record LanguageDefinition(String isoCode,
String name,
Stemmer stemmer,
KeywordHasher keywordHasher,
PosTagger posTagger,
Map<PosPatternCategory, List<PosPattern>> posPatterns)
{
/**
 * Describes one configured language: its ISO code, display name, stemmer,
 * keyword hasher, and (optionally) a part-of-speech tagger together with
 * the POS patterns associated with it.
 *
 * <p>The POS tagger may be absent. In that case {@link #hasPosParsing()}
 * returns {@code false}, and the tagging helpers on this class fall back
 * to neutral values instead of failing.
 */
public final class LanguageDefinition {

    private final String isoCode;
    private final String name;
    private final Stemmer stemmer;
    private final KeywordHasher keywordHasher;

    /** POS tagger for this language; {@code null} when none is configured. */
    @Nullable
    private final PosTagger posTagger;

    private final Map<PosPatternCategory, List<PosPattern>> posPatterns;

    /**
     * Creates a language definition from its parts.
     *
     * @param isoCode       ISO code of the language
     * @param name          name of the language
     * @param stemmer       stemmer used for the language
     * @param keywordHasher keyword hasher used for the language
     * @param posTagger     POS tagger, or {@code null} if the language has none
     * @param posPatterns   POS patterns, keyed by category
     */
    public LanguageDefinition(String isoCode,
                              String name,
                              Stemmer stemmer,
                              KeywordHasher keywordHasher,
                              @Nullable PosTagger posTagger,
                              Map<PosPatternCategory, List<PosPattern>> posPatterns) {
        this.isoCode = isoCode;
        this.name = name;
        this.stemmer = stemmer;
        this.keywordHasher = keywordHasher;
        this.posTagger = posTagger;
        this.posPatterns = posPatterns;
    }

    /** @return the ISO code of the language */
    public String isoCode() {
        return isoCode;
    }

    /** @return the name of the language */
    public String name() {
        return name;
    }

    /** @return the stemmer used for the language */
    public Stemmer stemmer() {
        return stemmer;
    }

    /** @return the keyword hasher used for the language */
    public KeywordHasher keywordHasher() {
        return keywordHasher;
    }

    /**
     * Tells whether a POS tagger is available for this language.
     *
     * @return {@code true} if a POS tagger was supplied, {@code false} otherwise
     */
    public boolean hasPosParsing() {
        return posTagger != null;
    }

    /**
     * Tags the words of a sentence.
     *
     * @param words the words of the sentence
     * @return the tagger's output when a POS tagger is present; otherwise a
     *         zero-filled array with one entry per word
     */
    public long[] tagSentence(String[] words) {
        return posTagger != null
                ? posTagger.tagSentence(words)
                : new long[words.length];
    }

    /**
     * Decodes a tag value into its name.
     *
     * @param tagName the tag value to decode
     * @return the tagger's decoded name, or the empty string when this
     *         language has no POS tagger
     */
    public String decodeTagName(long tagName) {
        if (posTagger == null) {
            return "";
        }
        return posTagger.decodeTagName(tagName);
    }

    /**
     * Looks up the POS patterns registered for a category.
     *
     * @param category the pattern category to look up
     * @return the patterns mapped to {@code category}, or an empty list when
     *         the category has no mapping
     */
    public List<PosPattern> getPatterns(PosPatternCategory category) {
        final List<PosPattern> none = List.of();
        return posPatterns.getOrDefault(category, none);
    }
}

View File

@@ -135,7 +135,7 @@ public class SentenceExtractor {
BitSet seps = wordsAndSeps.separators();
String[] lc = new String[words.length];
String[] stemmed = new String[words.length];
long[] posTags = language.posTagger().tagSentence(words);
long[] posTags = language.tagSentence(words);
BitSet isCapitalized = new BitSet(words.length);
BitSet isAllCaps = new BitSet(words.length);
@@ -225,7 +225,7 @@ public class SentenceExtractor {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = language.posTagger().tagSentence(tokens);
var posTags = language.tagSentence(tokens);
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];

View File

@@ -1,7 +1,7 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
<!ELEMENT languages (language*,resource*)>
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger,ngrams*)>
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
<!ELEMENT resource EMPTY>
<!ATTLIST resource
@@ -106,6 +106,7 @@
<keywordHash algorithm="asciish" />
<stemmer algorithm="snowball" variant="SWEDISH" />
<sentenceDetector algorithm="opennlp"/>
<!--
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
<ngrams type="name">
<pospattern>PROPN</pospattern>
@@ -113,8 +114,13 @@
<pospattern>PROPN PROPN PROPN</pospattern>
<pospattern>PROPN PROPN PROPN PROPN</pospattern>
</ngrams>
-->
</language>
<language isoCode="fr" name="French" display="ltr">
<keywordHash algorithm="asciish" />
<stemmer algorithm="snowball" variant="FRENCH" />
<sentenceDetector algorithm="opennlp"/>
</language>
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />

View File

@@ -91,7 +91,8 @@
</span>
<rt>
${language.posTagger().decodeTagName(sentence.posTags[pos])}
${language.decodeTagName(sentence.posTags[pos])}
@if (sentence.isAllCaps(pos))
<i class="fa-solid fa-angles-up"></i>
@elseif (sentence.isCapitalized(pos))