(converter) Bypass some of the grammar processing in keyword extraction depending on the selected language
DocumentKeywordExtractor.java

@@ -17,7 +17,6 @@ public class DocumentKeywordExtractor {
 
     private final TermFrequencyDict dict;
     private final KeywordExtractor keywordExtractor = new KeywordExtractor();
-    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
 
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {

@@ -35,16 +34,16 @@ public class DocumentKeywordExtractor {
     }
 
-    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {
+    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, String language, LinkTexts linkTexts, EdgeUrl url) {
 
         var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
 
         var titleKeywords = new TitleKeywords(keywordExtractor, dld);
         var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
-        var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
+        var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, language, tfIdfCounts, dld);
         var artifactKeywords = new ArtifactKeywords(dld);
         var urlKeywords = new UrlKeywords(url);
 
+        var positionMapper = new DocumentPositionMapper(language);
+
         var keywordMetadata = KeywordMetadata.builder()
                 .titleKeywords(titleKeywords)
                 .nameLikeKeywords(nameLikeKeywords)
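Call sites now supply a language code alongside the sentence data, and the extractor builds its position mapper per request instead of holding one as a field. A minimal caller sketch (the variable names are illustrative, not from the diff; the "en" code mirrors what the updated tests pass):

    // Hypothetical caller: the language tag travels with the extraction
    // request and selects the grammar-dependent steps downstream.
    DocumentLanguageData dld = se.extractSentences(doc);
    DocumentKeywordsBuilder words = extractor.extractKeywords(dld, "en", new LinkTexts(), url);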
DocumentPositionMapper.java

@@ -19,6 +19,12 @@ public class DocumentPositionMapper {
 
     private final KeywordExtractor keywordExtractor = new KeywordExtractor();
 
+    private final boolean englishGrammar;
+
+    public DocumentPositionMapper(String language) {
+        englishGrammar = language.equalsIgnoreCase("en");
+    }
+
     public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
                                                      KeywordMetadata metadata,
                                                      DocumentLanguageData dld,

@@ -38,7 +44,7 @@ public class DocumentPositionMapper {
     }
 
-    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+    public int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
                              KeywordMetadata metadata,
                              DocumentLanguageData dld)

@@ -80,11 +86,15 @@ public class DocumentPositionMapper {
             }
         }
 
-        for (var names : keywordExtractor.getProperNames(sent)) {
-            WordRep rep = new WordRep(sent, names);
-            byte meta = metadata.getMetadataForWord(rep.stemmed);
+        if (englishGrammar) {
+            // FIXME: Grammar based Name Detection is limited to English for now
+            // this *may* work across languages but needs thorough evaluation
+            for (var names : keywordExtractor.getProperNames(sent)) {
+                WordRep rep = new WordRep(sent, names);
+                byte meta = metadata.getMetadataForWord(rep.stemmed);
 
-            wordsBuilder.addMeta(rep.word, meta);
+                wordsBuilder.addMeta(rep.word, meta);
+            }
         }
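The mapper now decides once, at construction time, whether grammar-based proper-name detection applies. A small sketch of the resulting behavior (variable names are illustrative):

    // equalsIgnoreCase("en") sets the englishGrammar flag; any other code
    // makes mapDocumentPositions skip the getProperNames() loop entirely.
    var englishMapper = new DocumentPositionMapper("en"); // englishGrammar == true
    var swedishMapper = new DocumentPositionMapper("sv"); // englishGrammar == false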
SubjectLikeKeywords.java

@@ -14,6 +14,8 @@ public class SubjectLikeKeywords implements WordReps {
     private final List<WordRep> wordList;
     private final Set<String> stemmed;
 
+    private static Set<String> svoLanguages = Set.of("en", "sv");
+
     // Seeks out subjects in a sentence by constructs like
     //
     // [Name] (Verbs) (the|a|Adverb|Verb|Noun) ...

@@ -23,9 +25,18 @@ public class SubjectLikeKeywords implements WordReps {
     // Steve McQueen drove fast | cars -> Steve McQueen
 
     public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
+                               String language,
                                WordsTfIdfCounts tfIdfCounts,
                                DocumentLanguageData dld) {
+
+        // FIXME: We can't assume Subject-Verb-Object grammar is universally valid
+        if (!svoLanguages.contains(language)) {
+            wordList = List.of();
+            stemmed = Set.of();
+            return;
+        }
+
         Map<String, Set<WordRep>> instances = new HashMap<>();
 
         for (var sentence : dld) {
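With the early return, documents in a language outside the SVO list yield empty results rather than grammatically misapplied subjects. A sketch using the getReps() accessor exercised by SubjectLikeKeywordsTest ("de" is just an arbitrary code outside the set):

    // "de" is not in svoLanguages ("en", "sv"), so extraction is bypassed:
    var keywords = new SubjectLikeKeywords(new KeywordExtractor(), "de", tfIdfCounts, dld);
    assert keywords.getReps().isEmpty();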
DocumentKeywordExtractorTest.java

@@ -32,7 +32,7 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
 
-        var keywords = extractor.extractKeywords(se.extractSentences(doc), new LinkTexts(), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
+        var keywords = extractor.extractKeywords(se.extractSentences(doc), "en", new LinkTexts(), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
 
         keywords.getWordToMeta().forEach((k, v) -> {
             if (k.contains("_")) {

@@ -52,7 +52,7 @@ class DocumentKeywordExtractorTest {
 
         var keywords = extractor.extractKeywords(
                 se.extractSentences(doc),
-                new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
+                "en", new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
         );
 
         var keywordsBuilt = keywords.build();
DocumentPositionMapperTest.java

@@ -21,7 +21,7 @@ import java.util.List;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class DocumentPositionMapperTest {
-    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
+    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper("en");
     static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
     @Test
SentenceExtractorTest.java

@@ -42,7 +42,7 @@ class SentenceExtractorTest {
                 var doc = Jsoup.parse(Files.readString(file.toPath()));
                 long start = System.currentTimeMillis();
                 var dld = se.extractSentences(doc);
-                documentKeywordExtractor.extractKeywords(dld, new LinkTexts(), url);
+                documentKeywordExtractor.extractKeywords(dld, "en", new LinkTexts(), url);
                 total += (System.currentTimeMillis() - start);
             }
             System.out.println(total);
SubjectLikeKeywordsTest.java

@@ -51,7 +51,7 @@ class SubjectLikeKeywordsTest {
 
         WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
         SubjectLikeKeywords keywords = new SubjectLikeKeywords(new KeywordExtractor(),
-                tfIdfCounts,
+                "en", tfIdfCounts,
                 dld);
 
         Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
HtmlDocumentProcessorPlugin.java

@@ -185,7 +185,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 (int) -ret.quality, // ret.quality is negative
                 documentFlags);
 
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
 
         ret.description = specialization.getSummary(prunedDoc, words.importantWords);
         ret.generator = generatorParts.type();
PdfDocumentProcessorPlugin.java

@@ -136,7 +136,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 (int) -ret.quality,
                 documentFlags);
 
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
 
         var tagWords = new MetaTagsBuilder()
                 .addPubDate(pubDate)
PlainTextDocumentProcessorPlugin.java

@@ -114,7 +114,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld),
                 pubDate.yearByte(), (int) -ret.quality, documentFlags);
 
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
 
         var tagWords = new MetaTagsBuilder()
                 .addPubDate(pubDate)
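Unlike the tests and the sideloader below, the three processor plugins forward language.get() rather than a literal. The surrounding types are not part of this diff, so the following is an assumption: language appears to be a deferred or optional holder populated by an earlier language-identification step, yielding a code such as "en":

    // Assumption: `language` is a holder filled in by an earlier detection
    // step; .get() yields the code forwarded to the keyword extractor.
    DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);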
StackexchangeSideloader.java

@@ -131,7 +131,7 @@ public class StackexchangeSideloader implements SideloadSource {
         var dld = sentenceExtractorProvider.get().extractSentences(doc);
 
         ret.url = url;
-        ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
+        ret.words = keywordExtractor.extractKeywords(dld, "en", new LinkTexts(), url);
 
         List<String> syntheticTerms = new ArrayList<>(
                 List.of("site:" + domainName,
SummaryExtractorTest.java

@@ -39,7 +39,7 @@ class SummaryExtractorTest {
 
     Set<String> getImportantWords(Document doc) throws URISyntaxException {
        var dld = setenceExtractor.extractSentences(doc);
-        var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl(
+        var keywords = keywordExtractor.extractKeywords(dld, "en", new LinkTexts(), new EdgeUrl(
                 "https://www.marginalia.nu/"
         ));
         System.out.println(keywords.importantWords);