mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(language) Add support for languages with no POS-tagging
This disables much of the smart keyword extraction, which is mostly a crutch that helps English and similarly large languages find relevant search results. Smaller languages, for which a POS-tagging model may not be available, are probably fine with it disabled, since the search engine can likely just brute-force its way through the entire results list.
This commit is contained in:
@@ -34,37 +34,57 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
|
||||
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {
|
||||
final KeywordExtractor keywordExtractor = new KeywordExtractor(dld.language());
|
||||
|
||||
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
|
||||
if (dld.language().hasPosParsing()) {
|
||||
KeywordExtractor keywordExtractor = new KeywordExtractor(dld.language());
|
||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
var urlKeywords = new UrlKeywords(url);
|
||||
var positionMapper = new DocumentPositionMapper(keywordExtractor);
|
||||
var keywordMetadata = KeywordMetadata.builder()
|
||||
.titleKeywords(titleKeywords)
|
||||
.nameLikeKeywords(nameLikeKeywords)
|
||||
.subjectLikeKeywords(subjectLikeKeywords)
|
||||
.urlKeywords(urlKeywords)
|
||||
.build();
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
var urlKeywords = new UrlKeywords(url);
|
||||
var positionMapper = new DocumentPositionMapper(keywordExtractor);
|
||||
|
||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
|
||||
|
||||
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
|
||||
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
|
||||
var keywordMetadata = KeywordMetadata.builder()
|
||||
.titleKeywords(titleKeywords)
|
||||
.nameLikeKeywords(nameLikeKeywords)
|
||||
.subjectLikeKeywords(subjectLikeKeywords)
|
||||
.urlKeywords(urlKeywords)
|
||||
.build();
|
||||
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
|
||||
|
||||
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
|
||||
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
|
||||
|
||||
wordsBuilder.addImportantWords(importantWords);
|
||||
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
|
||||
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
|
||||
|
||||
return wordsBuilder;
|
||||
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
|
||||
|
||||
wordsBuilder.addImportantWords(importantWords);
|
||||
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
|
||||
|
||||
return wordsBuilder;
|
||||
}
|
||||
else {
|
||||
KeywordExtractor keywordExtractor = new KeywordExtractor(dld.language());
|
||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
var urlKeywords = new UrlKeywords(url);
|
||||
var positionMapper = new DocumentPositionMapper(keywordExtractor);
|
||||
|
||||
var keywordMetadata = KeywordMetadata.builder()
|
||||
.urlKeywords(urlKeywords)
|
||||
.build();
|
||||
|
||||
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
|
||||
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
|
||||
return wordsBuilder;
|
||||
}
|
||||
}
|
||||
|
||||
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
|
||||
|
@@ -6,18 +6,24 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
|
||||
import nu.marginalia.keyword.extractors.UrlKeywords;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public class KeywordMetadata {
|
||||
|
||||
@Nullable
|
||||
private final TitleKeywords titleKeywords;
|
||||
@Nullable
|
||||
private final NameLikeKeywords nameLikeKeywords;
|
||||
@Nullable
|
||||
private final SubjectLikeKeywords subjectLikeKeywords;
|
||||
@Nullable
|
||||
private final UrlKeywords urlKeywords;
|
||||
|
||||
public KeywordMetadata(
|
||||
TitleKeywords titleKeywords,
|
||||
NameLikeKeywords nameLikeKeywords,
|
||||
SubjectLikeKeywords subjectLikeKeywords,
|
||||
UrlKeywords urlKeywords) {
|
||||
@Nullable TitleKeywords titleKeywords,
|
||||
@Nullable NameLikeKeywords nameLikeKeywords,
|
||||
@Nullable SubjectLikeKeywords subjectLikeKeywords,
|
||||
@Nullable UrlKeywords urlKeywords) {
|
||||
this.titleKeywords = titleKeywords;
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
@@ -32,23 +38,23 @@ public class KeywordMetadata {
|
||||
|
||||
byte flags = 0;
|
||||
|
||||
if (subjectLikeKeywords.contains(stemmed)) {
|
||||
if (subjectLikeKeywords != null && subjectLikeKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.Subjects.asBit();
|
||||
}
|
||||
|
||||
if (nameLikeKeywords.contains(stemmed)) {
|
||||
if (nameLikeKeywords != null && nameLikeKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.NamesWords.asBit();
|
||||
}
|
||||
|
||||
if (titleKeywords.contains(stemmed)) {
|
||||
if (titleKeywords != null && titleKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.Title.asBit();
|
||||
}
|
||||
|
||||
if (urlKeywords.containsUrl(stemmed)) {
|
||||
if (urlKeywords != null && urlKeywords.containsUrl(stemmed)) {
|
||||
flags |= WordFlags.UrlPath.asBit();
|
||||
}
|
||||
|
||||
if (urlKeywords.containsDomain(stemmed)) {
|
||||
if (urlKeywords != null && urlKeywords.containsDomain(stemmed)) {
|
||||
flags |= WordFlags.UrlDomain.asBit();
|
||||
}
|
||||
|
||||
|
@@ -142,8 +142,11 @@ public class LanguageConfiguration {
|
||||
}
|
||||
}
|
||||
|
||||
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(PosTagger posTagger,
|
||||
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
|
||||
Element languageTag, String isoCode) {
|
||||
if (null == posTagger)
|
||||
return Map.of();
|
||||
|
||||
Map<PosPatternCategory, List<PosPattern>> ret = new HashMap<>();
|
||||
NodeList ngramsElements = languageTag.getElementsByTagName("ngrams");
|
||||
|
||||
@@ -172,11 +175,14 @@ public class LanguageConfiguration {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private PosTagger parsePosTag(Element languageTag, String isoCode) throws IOException {
|
||||
NodeList rdrElements = languageTag.getElementsByTagName("rdrTagger");
|
||||
if (rdrElements.getLength() != 1) {
|
||||
throw new IllegalArgumentException(
|
||||
"language.xml: No rdrTagger block for language element " + isoCode);
|
||||
if (rdrElements.getLength() < 1) {
|
||||
return null;
|
||||
}
|
||||
else if (rdrElements.getLength() > 1) {
|
||||
throw new IllegalStateException("Multiple rdr taggers defined in " + isoCode);
|
||||
}
|
||||
Element rdrElement = (Element) rdrElements.item(0);
|
||||
|
||||
|
@@ -6,18 +6,66 @@ import nu.marginalia.language.pos.PosPatternCategory;
|
||||
import nu.marginalia.language.pos.PosTagger;
|
||||
import nu.marginalia.language.stemming.Stemmer;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public record LanguageDefinition(String isoCode,
|
||||
String name,
|
||||
Stemmer stemmer,
|
||||
KeywordHasher keywordHasher,
|
||||
PosTagger posTagger,
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns)
|
||||
{
|
||||
public final class LanguageDefinition {
|
||||
private final String isoCode;
|
||||
private final String name;
|
||||
private final Stemmer stemmer;
|
||||
private final KeywordHasher keywordHasher;
|
||||
@Nullable
|
||||
private final PosTagger posTagger;
|
||||
private final Map<PosPatternCategory, List<PosPattern>> posPatterns;
|
||||
|
||||
public LanguageDefinition(String isoCode,
|
||||
String name,
|
||||
Stemmer stemmer,
|
||||
KeywordHasher keywordHasher,
|
||||
@Nullable PosTagger posTagger,
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns) {
|
||||
this.isoCode = isoCode;
|
||||
this.name = name;
|
||||
this.stemmer = stemmer;
|
||||
this.keywordHasher = keywordHasher;
|
||||
this.posTagger = posTagger;
|
||||
this.posPatterns = posPatterns;
|
||||
}
|
||||
|
||||
public long[] tagSentence(String[] words) {
|
||||
if (posTagger == null) return new long[words.length];
|
||||
return posTagger.tagSentence(words);
|
||||
}
|
||||
|
||||
public boolean hasPosParsing() {
|
||||
return posTagger != null;
|
||||
}
|
||||
|
||||
public List<PosPattern> getPatterns(PosPatternCategory category) {
|
||||
return posPatterns.getOrDefault(category, List.of());
|
||||
}
|
||||
|
||||
public String decodeTagName(long tagName) {
|
||||
if (hasPosParsing())
|
||||
return posTagger.decodeTagName(tagName);
|
||||
return "";
|
||||
}
|
||||
|
||||
public String isoCode() {
|
||||
return isoCode;
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public Stemmer stemmer() {
|
||||
return stemmer;
|
||||
}
|
||||
|
||||
public KeywordHasher keywordHasher() {
|
||||
return keywordHasher;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -135,7 +135,7 @@ public class SentenceExtractor {
|
||||
BitSet seps = wordsAndSeps.separators();
|
||||
String[] lc = new String[words.length];
|
||||
String[] stemmed = new String[words.length];
|
||||
long[] posTags = language.posTagger().tagSentence(words);
|
||||
long[] posTags = language.tagSentence(words);
|
||||
|
||||
BitSet isCapitalized = new BitSet(words.length);
|
||||
BitSet isAllCaps = new BitSet(words.length);
|
||||
@@ -225,7 +225,7 @@ public class SentenceExtractor {
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var tokens = wordsAndSeps.words();
|
||||
var separators = wordsAndSeps.separators();
|
||||
var posTags = language.posTagger().tagSentence(tokens);
|
||||
var posTags = language.tagSentence(tokens);
|
||||
var tokensLc = new String[tokens.length];
|
||||
var stemmed = new String[tokens.length];
|
||||
|
||||
|
@@ -1,7 +1,7 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*,resource*)>
|
||||
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger,ngrams*)>
|
||||
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
|
||||
<!ELEMENT resource EMPTY>
|
||||
<!ATTLIST resource
|
||||
@@ -106,6 +106,7 @@
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="SWEDISH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<!--
|
||||
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
|
||||
<ngrams type="name">
|
||||
<pospattern>PROPN</pospattern>
|
||||
@@ -113,8 +114,13 @@
|
||||
<pospattern>PROPN PROPN PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN PROPN PROPN</pospattern>
|
||||
</ngrams>
|
||||
-->
|
||||
</language>
|
||||
<language isoCode="fr" name="French" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="FRENCH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
</language>
|
||||
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
|
||||
|
@@ -91,7 +91,8 @@
|
||||
|
||||
</span>
|
||||
<rt>
|
||||
${language.posTagger().decodeTagName(sentence.posTags[pos])}
|
||||
${language.decodeTagName(sentence.posTags[pos])}
|
||||
|
||||
@if (sentence.isAllCaps(pos))
|
||||
<i class="fa-solid fa-angles-up"></i>
|
||||
@elseif (sentence.isCapitalized(pos))
|
||||
|
Reference in New Issue
Block a user