1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(converter) Bypass some of the grammar processing in the keyword extraction depending on language selection

This commit is contained in:
Viktor Lofgren
2025-08-17 09:55:29 +02:00
parent 338d300e1a
commit b3b95edcb5
12 changed files with 39 additions and 19 deletions

View File

@@ -17,7 +17,6 @@ public class DocumentKeywordExtractor {
private final TermFrequencyDict dict;
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
@@ -35,16 +34,16 @@ public class DocumentKeywordExtractor {
}
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, String language, LinkTexts linkTexts, EdgeUrl url) {
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, language, tfIdfCounts, dld);
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var positionMapper = new DocumentPositionMapper(language);
var keywordMetadata = KeywordMetadata.builder()
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)

View File

@@ -19,6 +19,12 @@ public class DocumentPositionMapper {
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
private final boolean englishGrammar;
public DocumentPositionMapper(String language) {
englishGrammar = language.equalsIgnoreCase("en");
}
public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData dld,
@@ -38,7 +44,7 @@ public class DocumentPositionMapper {
}
int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
public int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData dld)
@@ -80,11 +86,15 @@ public class DocumentPositionMapper {
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
WordRep rep = new WordRep(sent, names);
byte meta = metadata.getMetadataForWord(rep.stemmed);
if (englishGrammar) {
// FIXME: Grammar based Name Detection is limited to English for now
// this *may* work across languages but needs thorough evaluation
for (var names : keywordExtractor.getProperNames(sent)) {
WordRep rep = new WordRep(sent, names);
byte meta = metadata.getMetadataForWord(rep.stemmed);
wordsBuilder.addMeta(rep.word, meta);
wordsBuilder.addMeta(rep.word, meta);
}
}
}

View File

@@ -14,6 +14,8 @@ public class SubjectLikeKeywords implements WordReps {
private final List<WordRep> wordList;
private final Set<String> stemmed;
// Language codes for which the subject-verb-object heuristic below is applied.
// The set is a fixed lookup table that is never reassigned, so it is final.
private static final Set<String> svoLanguages = Set.of("en", "sv");
// Seeks out subjects in a sentence by constructs like
//
// [Name] (Verbs) (the|a|Adverb|Verb|Noun) ...
@@ -23,9 +25,18 @@ public class SubjectLikeKeywords implements WordReps {
// Steve McQueen drove fast | cars -> Steve McQueen
public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
String language,
WordsTfIdfCounts tfIdfCounts,
DocumentLanguageData dld) {
// FIXME: We can't assume Subject-Verb-Object grammar is universally valid
if (!svoLanguages.contains(language)) {
wordList = List.of();
stemmed = Set.of();
return;
}
Map<String, Set<WordRep>> instances = new HashMap<>();
for (var sentence : dld) {

View File

@@ -32,7 +32,7 @@ class DocumentKeywordExtractorTest {
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
var keywords = extractor.extractKeywords(se.extractSentences(doc), new LinkTexts(), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
var keywords = extractor.extractKeywords(se.extractSentences(doc), "en", new LinkTexts(), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
keywords.getWordToMeta().forEach((k, v) -> {
if (k.contains("_")) {
@@ -52,7 +52,7 @@ class DocumentKeywordExtractorTest {
var keywords = extractor.extractKeywords(
se.extractSentences(doc),
new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
"en", new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
);
var keywordsBuilt = keywords.build();

View File

@@ -21,7 +21,7 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class DocumentPositionMapperTest {
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper("en");
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
@Test

View File

@@ -42,7 +42,7 @@ class SentenceExtractorTest {
var doc = Jsoup.parse(Files.readString(file.toPath()));
long start = System.currentTimeMillis();
var dld = se.extractSentences(doc);
documentKeywordExtractor.extractKeywords(dld, new LinkTexts(), url);
documentKeywordExtractor.extractKeywords(dld, "en", new LinkTexts(), url);
total += (System.currentTimeMillis() - start);
}
System.out.println(total);

View File

@@ -51,7 +51,7 @@ class SubjectLikeKeywordsTest {
WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
SubjectLikeKeywords keywords = new SubjectLikeKeywords(new KeywordExtractor(),
tfIdfCounts,
"en", tfIdfCounts,
dld);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());

View File

@@ -185,7 +185,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
(int) -ret.quality, // ret.quality is negative
documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
ret.description = specialization.getSummary(prunedDoc, words.importantWords);
ret.generator = generatorParts.type();

View File

@@ -136,7 +136,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
(int) -ret.quality,
documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
var tagWords = new MetaTagsBuilder()
.addPubDate(pubDate)

View File

@@ -114,7 +114,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld),
pubDate.yearByte(), (int) -ret.quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
var tagWords = new MetaTagsBuilder()
.addPubDate(pubDate)

View File

@@ -131,7 +131,7 @@ public class StackexchangeSideloader implements SideloadSource {
var dld = sentenceExtractorProvider.get().extractSentences(doc);
ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
ret.words = keywordExtractor.extractKeywords(dld, "en", new LinkTexts(), url);
List<String> syntheticTerms = new ArrayList<>(
List.of("site:" + domainName,

View File

@@ -39,7 +39,7 @@ class SummaryExtractorTest {
Set<String> getImportantWords(Document doc) throws URISyntaxException {
var dld = setenceExtractor.extractSentences(doc);
var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl(
var keywords = keywordExtractor.extractKeywords(dld, "en", new LinkTexts(), new EdgeUrl(
"https://www.marginalia.nu/"
));
System.out.println(keywords.importantWords);