mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(language) Initial integration of new language configuration utility
This commit is contained in:
@@ -1,6 +1,11 @@
|
||||
package nu.marginalia.language.config;
|
||||
|
||||
import com.github.jfasttext.JFastText;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.stemming.Stemmer;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
@@ -16,14 +21,18 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class LanguageConfiguration {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);
|
||||
|
||||
private final Map<String, LanguageDefinition> languages = new HashMap<>();
|
||||
private final JFastText fastTextLanguageModel = new JFastText();
|
||||
|
||||
@Inject
|
||||
public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
|
||||
public LanguageConfiguration(LanguageModels lm) throws IOException, ParserConfigurationException, SAXException {
|
||||
fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());
|
||||
|
||||
// TODO: Read from data directory
|
||||
|
||||
try (var languagesXmlStream = ClassLoader.getSystemResourceAsStream("languages.xml")) {
|
||||
@@ -43,22 +52,66 @@ public class LanguageConfiguration {
|
||||
NodeList classifierNodes = doc.getElementsByTagName("language");
|
||||
|
||||
for (int i = 0; i < classifierNodes.getLength(); i++) {
|
||||
Element classifier = (Element) classifierNodes.item(i);
|
||||
|
||||
String isoCode = classifier.getAttribute("isoCode").toLowerCase();
|
||||
String name = classifier.getAttribute("name");
|
||||
boolean disabled = "TRUE".equalsIgnoreCase(classifier.getAttribute("disabled"));
|
||||
Element languageTag = (Element) classifierNodes.item(i);
|
||||
|
||||
boolean disabled = "TRUE".equalsIgnoreCase(languageTag.getAttribute("disabled"));
|
||||
if (disabled) continue;
|
||||
|
||||
languages.put(isoCode, new LanguageDefinition(isoCode, name));
|
||||
String isoCode = languageTag.getAttribute("isoCode").toLowerCase();
|
||||
String name = languageTag.getAttribute("name");
|
||||
|
||||
Stemmer stemmer = parseStemmerTag(languageTag, isoCode);
|
||||
|
||||
languages.put(isoCode, new LanguageDefinition(isoCode, name, stemmer));
|
||||
}
|
||||
}
|
||||
|
||||
private Stemmer parseStemmerTag(Element languageElement, String isoCode) {
|
||||
NodeList stemmerElements = languageElement.getElementsByTagName("stemmer");
|
||||
if (stemmerElements.getLength() != 1) {
|
||||
throw new IllegalArgumentException("language.xml: No stemmer block for language element " + isoCode);
|
||||
}
|
||||
Element stemmerElement = (Element) stemmerElements.item(0);
|
||||
|
||||
String stemmerName = stemmerElement.getAttribute("algorithm");
|
||||
String stemmerVariant = stemmerElement.getTextContent().trim();
|
||||
|
||||
return switch (stemmerName.toLowerCase()) {
|
||||
case "porter" -> new Stemmer.Porter();
|
||||
case "snowball" -> new Stemmer.Snowball(stemmerVariant);
|
||||
case "none" -> new Stemmer.NoOpStemmer();
|
||||
default -> throw new IllegalArgumentException("language.xml: Unknown stemmer name " + stemmerName + " in " + isoCode);
|
||||
};
|
||||
}
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
|
||||
StringBuilder sampleBuilder = new StringBuilder();
|
||||
jsoupDoc.body().traverse((node, depth) -> {
|
||||
if (sampleBuilder.length() > 4096) return;
|
||||
if (!(node instanceof TextNode tn)) return;
|
||||
|
||||
sampleBuilder.append(' ').append(tn.text());
|
||||
});
|
||||
return identifyLanguage(sampleBuilder.toString());
|
||||
}
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(String sample) {
|
||||
String prediction = fastTextLanguageModel.predict(sample);
|
||||
|
||||
if (prediction.length() == "__label__??".length()) {
|
||||
String isoCode = prediction.substring("__label__".length());
|
||||
return Optional.ofNullable(getLanguage(isoCode));
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
public Optional<LanguageDefinition> identifyLanguage(String sample, String fallbackIsoCode) {
|
||||
return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public LanguageDefinition getLanguage(String language) {
|
||||
return languages.get(language);
|
||||
}
|
||||
|
||||
public record LanguageDefinition(String isoCode, String name) {}
|
||||
}
|
||||
|
@@ -1,49 +0,0 @@
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import com.github.jfasttext.JFastText;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.encoding.UnicodeRanges;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
@Singleton
|
||||
public class LanguageFilter {
|
||||
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final JFastText jft = new JFastText();
|
||||
|
||||
@Inject
|
||||
public LanguageFilter(LanguageModels lm, LanguageConfiguration languageConfiguration) {
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
jft.loadModel(lm.fasttextLanguageModel.toString());
|
||||
}
|
||||
|
||||
public Optional<String> predictLanguage(DocumentLanguageData dld) {
|
||||
String prediction = jft.predict(dld.text());
|
||||
|
||||
if (prediction.length() == "__label__??".length()) {
|
||||
String isoCode = prediction.substring("__label__".length());
|
||||
|
||||
LanguageConfiguration.LanguageDefinition config = languageConfiguration.getLanguage(isoCode);
|
||||
|
||||
if (config != null)
|
||||
return Optional.of(isoCode);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public boolean isBlockedUnicodeRange(String data) {
|
||||
for (var range: UnicodeRanges.values()) {
|
||||
if (range.test(data))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
@@ -15,11 +15,13 @@ import java.util.stream.Stream;
|
||||
*
|
||||
* @see SentenceExtractor
|
||||
*/
|
||||
public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
|
||||
public record DocumentLanguageData(LanguageDefinition language,
|
||||
List<DocumentSentence> sentences,
|
||||
String text) implements Iterable<DocumentSentence> {
|
||||
|
||||
public DocumentLanguageData(List<DocumentSentence> sentences,
|
||||
String text)
|
||||
public DocumentLanguageData(LanguageDefinition language, List<DocumentSentence> sentences, String text)
|
||||
{
|
||||
this.language = language;
|
||||
this.sentences = Collections.unmodifiableList(sentences);
|
||||
this.text = text;
|
||||
}
|
||||
|
@@ -0,0 +1,10 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import nu.marginalia.language.stemming.Stemmer;
|
||||
|
||||
public record LanguageDefinition(String isoCode,
|
||||
String name,
|
||||
Stemmer stemmer)
|
||||
{
|
||||
|
||||
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
/**
 * Signals that a document's language could not be mapped to a configured,
 * supported language.
 */
public class UnsupportedLanguageException extends Exception {

    /** Creates the exception without a detail message. */
    public UnsupportedLanguageException() {
        super();
    }

    /**
     * Creates the exception with a detail message.
     *
     * @param message description of what was unsupported
     */
    public UnsupportedLanguageException(String message) {
        super(message);
    }

    /**
     * Creates the exception with a detail message and the underlying cause.
     *
     * @param message description of what was unsupported
     * @param cause   the failure that led to this exception
     */
    public UnsupportedLanguageException(String message, Throwable cause) {
        super(message, cause);
    }
}
|
@@ -3,15 +3,18 @@ package nu.marginalia.language.sentence;
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
|
||||
import nu.marginalia.language.stemming.Stemmer;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import opennlp.tools.sentdetect.SentenceDetectorME;
|
||||
import opennlp.tools.sentdetect.SentenceModel;
|
||||
import opennlp.tools.stemmer.PorterStemmer;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.slf4j.Logger;
|
||||
@@ -29,12 +32,12 @@ import java.util.*;
|
||||
*/
|
||||
public class SentenceExtractor {
|
||||
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private SentenceDetectorME sentenceDetector;
|
||||
private static RDRPOSTagger rdrposTagger;
|
||||
|
||||
private static NgramLexicon ngramLexicon = null;
|
||||
|
||||
private final PorterStemmer porterStemmer = new PorterStemmer();
|
||||
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
|
||||
|
||||
private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();
|
||||
@@ -46,8 +49,10 @@ public class SentenceExtractor {
|
||||
static final int MAX_SENTENCE_COUNT = 1000;
|
||||
|
||||
@Inject
|
||||
public SentenceExtractor(LanguageModels models)
|
||||
public SentenceExtractor(LanguageConfiguration languageConfiguration, LanguageModels models)
|
||||
{
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
|
||||
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
|
||||
var sentenceModel = new SentenceModel(modelIn);
|
||||
sentenceDetector = new SentenceDetectorME(sentenceModel);
|
||||
@@ -73,9 +78,10 @@ public class SentenceExtractor {
|
||||
|
||||
}
|
||||
|
||||
public DocumentLanguageData extractSentences(Document doc) {
|
||||
public DocumentLanguageData extractSentences(Document doc) throws UnsupportedLanguageException {
|
||||
var language = languageConfiguration.identifyLanguage(doc).orElseThrow(UnsupportedLanguageException::new);
|
||||
|
||||
final List<DocumentSentence> textSentences = new ArrayList<>();
|
||||
|
||||
final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);
|
||||
|
||||
final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
|
||||
@@ -85,7 +91,7 @@ public class SentenceExtractor {
|
||||
String text = taggedString.string();
|
||||
|
||||
textSentences.addAll(
|
||||
extractSentencesFromString(text, taggedString.tags())
|
||||
extractSentencesFromString(language, text, taggedString.tags())
|
||||
);
|
||||
|
||||
if (documentText.isEmpty()) {
|
||||
@@ -96,23 +102,31 @@ public class SentenceExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
return new DocumentLanguageData(textSentences, documentText.toString());
|
||||
return new DocumentLanguageData(language, textSentences, documentText.toString());
|
||||
}
|
||||
|
||||
public DocumentLanguageData extractSentences(String text, String title) {
|
||||
var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
|
||||
var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
|
||||
LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
|
||||
.orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));
|
||||
|
||||
var textSentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));
|
||||
var titleSentences = extractSentencesFromString(language, title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
|
||||
|
||||
List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
|
||||
combined.addAll(titleSentences);
|
||||
combined.addAll(textSentences);
|
||||
|
||||
return new DocumentLanguageData(
|
||||
language,
|
||||
combined,
|
||||
text);
|
||||
}
|
||||
|
||||
public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
|
||||
public DocumentSentence extractSentence(LanguageDefinition language,
|
||||
String text,
|
||||
EnumSet<HtmlTag> htmlTags) {
|
||||
final Stemmer stemmer = language.stemmer();
|
||||
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
|
||||
|
||||
String[] words = wordsAndSeps.words();
|
||||
@@ -134,7 +148,7 @@ public class SentenceExtractor {
|
||||
}
|
||||
|
||||
try {
|
||||
stemmed[i] = porterStemmer.stem(lc[i]);
|
||||
stemmed[i] = stemmer.stem(lc[i]);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
stemmed[i] = "NN"; // ???
|
||||
@@ -152,8 +166,9 @@ public class SentenceExtractor {
|
||||
);
|
||||
}
|
||||
|
||||
public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
|
||||
String[] sentences;
|
||||
public List<DocumentSentence> extractSentencesFromString(LanguageDefinition language, String text, EnumSet<HtmlTag> htmlTags) {
|
||||
final Stemmer stemmer = language.stemmer();
|
||||
|
||||
|
||||
// Safety net against malformed data DOS attacks,
|
||||
// found 5+ MB <p>-tags in the wild that just break
|
||||
@@ -167,7 +182,7 @@ public class SentenceExtractor {
|
||||
text = normalizeSpaces(text);
|
||||
|
||||
// Split into sentences
|
||||
|
||||
String[] sentences;
|
||||
try {
|
||||
sentences = sentenceDetector.sentDetect(text);
|
||||
}
|
||||
@@ -221,7 +236,7 @@ public class SentenceExtractor {
|
||||
}
|
||||
|
||||
try {
|
||||
stemmed[i] = porterStemmer.stem(tokens[i]);
|
||||
stemmed[i] = stemmer.stem(tokens[i]);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
stemmed[i] = "NN"; // ???
|
||||
|
@@ -3,14 +3,15 @@ package nu.marginalia.language.sentence;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class ThreadLocalSentenceExtractorProvider {
|
||||
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;
|
||||
|
||||
@Inject
|
||||
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
|
||||
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
|
||||
public ThreadLocalSentenceExtractorProvider(LanguageConfiguration languageConfiguration, LanguageModels languageModels) {
|
||||
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, languageModels));
|
||||
}
|
||||
|
||||
public SentenceExtractor get() {
|
||||
|
@@ -0,0 +1,39 @@
|
||||
package nu.marginalia.language.stemming;
|
||||
|
||||
import opennlp.tools.stemmer.snowball.SnowballStemmer;
|
||||
|
||||
public sealed interface Stemmer {
|
||||
String stem(String input);
|
||||
|
||||
final class Porter implements Stemmer {
|
||||
private static final ca.rmen.porterstemmer.PorterStemmer porterStemmerImpl = new ca.rmen.porterstemmer.PorterStemmer();
|
||||
@Override
|
||||
public String stem(String input) {
|
||||
return porterStemmerImpl.stemWord(input);
|
||||
}
|
||||
}
|
||||
|
||||
final class Snowball implements Stemmer {
|
||||
private final SnowballStemmer snowballStemmer;
|
||||
|
||||
public Snowball(String algorithmName) {
|
||||
SnowballStemmer.ALGORITHM algorithm = SnowballStemmer.ALGORITHM.valueOf(algorithmName.toUpperCase());
|
||||
snowballStemmer = new SnowballStemmer(algorithm);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String stem(String input) {
|
||||
// Snowball impl declares return value as CharSequence,
|
||||
// but in practice always returns a String
|
||||
return (String) snowballStemmer.stem(input);
|
||||
}
|
||||
}
|
||||
|
||||
final class NoOpStemmer implements Stemmer {
|
||||
|
||||
@Override
|
||||
public String stem(String input) {
|
||||
return input;
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,19 +1,44 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*)>
|
||||
<!ELEMENT language (#PCDATA)>
|
||||
<!ELEMENT language (stemmer,sentenceDetector,rdrTagger)>
|
||||
|
||||
<!ATTLIST language
|
||||
isoCode ID #REQUIRED
|
||||
name CDATA #REQUIRED
|
||||
display (rtl|ltr) #REQUIRED
|
||||
disabled (true|false) #IMPLIED
|
||||
>
|
||||
]>
|
||||
|
||||
<!ELEMENT stemmer (#PCDATA)>
|
||||
<!ATTLIST stemmer
|
||||
algorithm (porter|snowball|none) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT rdrTagger EMPTY>
|
||||
<!ATTLIST rdrTagger
|
||||
dict CDATA #REQUIRED
|
||||
rdr CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT sentenceDetector EMPTY>
|
||||
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
|
||||
]>
|
||||
|
||||
<languages>
|
||||
<language isoCode="en" name="English">
|
||||
|
||||
<language isoCode="xx" name="Undefined" display="ltr">
|
||||
<stemmer algorithm="none" />
|
||||
<sentenceDetector algorithm="none"/>
|
||||
<rdrTagger dict="English.DICT" rdr="English.RDR" />
|
||||
</language>
|
||||
<language isoCode="sv" name="Swedish/Svenska">
|
||||
|
||||
<language isoCode="en" name="English" display="ltr">
|
||||
<stemmer algorithm="porter" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<rdrTagger dict="English.DICT" rdr="English.RDR" />
|
||||
</language>
|
||||
<language isoCode="sv" name="Swedish" display="ltr">
|
||||
<stemmer algorithm="snowball">SWEDISH</stemmer>
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<rdrTagger dict="Swedish.DICT" rdr="Swedish.RDR" />
|
||||
</language>
|
||||
</languages>
|
@@ -1,27 +0,0 @@
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class LanguageDefinitionFilterTest {
|
||||
|
||||
@Test
|
||||
void isPageInteresting() throws IOException, ParserConfigurationException, SAXException {
|
||||
var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels(), new LanguageConfiguration());
|
||||
|
||||
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Carlos fue al bosque y recogió bayas")));
|
||||
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Charlie est allé dans la forêt et a cueilli des baies")));
|
||||
assertEquals(Optional.of("sv"), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Kalle gick i skogen och plockade bär")));
|
||||
assertEquals(Optional.of("en"), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Charlie went to the woods to go berry-picking")));
|
||||
}
|
||||
|
||||
}
|
@@ -1,11 +1,15 @@
|
||||
package nu.marginalia.language.sentence;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Objects;
|
||||
@@ -15,36 +19,38 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class SentenceExtractorTest {
|
||||
private static SentenceExtractor sentenceExtractor;
|
||||
private static LanguageConfiguration languageConfig;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp() {
|
||||
sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
public static void setUp() throws IOException, ParserConfigurationException, SAXException {
|
||||
languageConfig = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
sentenceExtractor = new SentenceExtractor(languageConfig, WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParen() {
|
||||
var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class));
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"),"I am (very) tall", EnumSet.noneOf(HtmlTag.class));
|
||||
|
||||
System.out.println(dld);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCplusplus() {
|
||||
var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "std::vector", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(1, dld.length());
|
||||
assertEquals("std::vector", dld.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPHP() {
|
||||
var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "$_GET", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(1, dld.length());
|
||||
assertEquals("$_get", dld.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPolishArtist() {
|
||||
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"),"Uklański", EnumSet.noneOf(HtmlTag.class));
|
||||
|
||||
assertEquals(1, dld.wordsLowerCase.length);
|
||||
assertEquals("uklanski", dld.wordsLowerCase[0]);
|
||||
@@ -52,7 +58,7 @@ class SentenceExtractorTest {
|
||||
|
||||
@Test
|
||||
void testJava() {
|
||||
var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
|
||||
|
||||
assertEquals(4, dld.wordsLowerCase.length);
|
||||
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
|
||||
@@ -70,7 +76,7 @@ class SentenceExtractorTest {
|
||||
System.out.println(sent);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
} catch (IOException | UnsupportedLanguageException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
@@ -87,13 +93,15 @@ class SentenceExtractorTest {
|
||||
System.out.println(sent);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
} catch (IOException | UnsupportedLanguageException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
@Test
|
||||
void testApostrophe() {
|
||||
var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
|
||||
var lang = Objects.requireNonNull(languageConfig.getLanguage("en"));
|
||||
|
||||
var dld = sentenceExtractor.extractSentence(lang, "duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(7, dld.wordsLowerCase.length);
|
||||
|
||||
assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase);
|
||||
|
@@ -6,7 +6,9 @@ import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.model.Link;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -18,13 +20,21 @@ import java.util.*;
|
||||
|
||||
public class AnchorTextKeywords {
|
||||
private final SentenceExtractor sentenceExtractor;
|
||||
private final LanguageDefinition englishLanguage;
|
||||
private final Set<String> stopList;
|
||||
|
||||
@Inject
|
||||
public AnchorTextKeywords(SentenceExtractor sentenceExtractor)
|
||||
public AnchorTextKeywords(SentenceExtractor sentenceExtractor, LanguageConfiguration languageConfiguration)
|
||||
{
|
||||
this.sentenceExtractor = sentenceExtractor;
|
||||
|
||||
// FIXME: Currently the atags file does not provide information about the language in the source document
|
||||
// which means we have to run the link texts through English processing. For euro-languages this is
|
||||
// likely fine, but for stuff like Japanese it's going to produce bad results. We'll need to add this
|
||||
// information when extracting link texts so we can use the appropriate language processing here later.
|
||||
// (sampling based on the link text alone is likely insufficient, since the sample size is going to be 2-3 words).
|
||||
this.englishLanguage = languageConfiguration.getLanguage("en");
|
||||
|
||||
stopList = readStoplist();
|
||||
}
|
||||
|
||||
@@ -60,7 +70,7 @@ public class AnchorTextKeywords {
|
||||
if (stopList.contains(keyword.text().toLowerCase()))
|
||||
continue;
|
||||
|
||||
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentence = sentenceExtractor.extractSentence(englishLanguage, keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
ret.add(sentence);
|
||||
counts.add(keyword.count());
|
||||
}
|
||||
@@ -82,7 +92,7 @@ public class AnchorTextKeywords {
|
||||
if (stopList.contains(keyword.text().toLowerCase()))
|
||||
continue;
|
||||
|
||||
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentence = sentenceExtractor.extractSentence(englishLanguage, keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
ret.add(sentence);
|
||||
counts.add(keyword.count());
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.atags;
|
||||
|
||||
import nu.marginalia.atags.source.AnchorTagsImpl;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -36,11 +37,9 @@ class DomainAnchorTagsImplTest {
|
||||
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/")));
|
||||
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt")));
|
||||
|
||||
var atagsKeywords = new AnchorTextKeywords(
|
||||
new SentenceExtractor(
|
||||
TestLanguageModels.getLanguageModels()
|
||||
)
|
||||
);
|
||||
var languageConfig = new LanguageConfiguration(TestLanguageModels.getLanguageModels());
|
||||
var atagsKeywords = new AnchorTextKeywords(new SentenceExtractor(languageConfig, TestLanguageModels.getLanguageModels()), languageConfig);
|
||||
|
||||
System.out.println(
|
||||
atagsKeywords.getAnchorTextKeywords(tags, new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/"))
|
||||
);
|
||||
|
@@ -2,15 +2,19 @@ package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.Charset;
|
||||
@@ -22,10 +26,16 @@ import java.util.Set;
|
||||
class DocumentKeywordExtractorTest {
|
||||
|
||||
static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
|
||||
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
static SentenceExtractor se;
|
||||
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeyboards2() throws IOException, URISyntaxException {
|
||||
public void testKeyboards2() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
@@ -43,7 +53,7 @@ class DocumentKeywordExtractorTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testMadonna() throws IOException, URISyntaxException {
|
||||
public void testMadonna() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
@@ -80,17 +90,4 @@ class DocumentKeywordExtractorTest {
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSpam() throws IOException, URISyntaxException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0.5));
|
||||
|
||||
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
|
||||
new TermFrequencyDict(WmsaHome.getLanguageModels()));
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
|
||||
}
|
||||
}
|
@@ -6,14 +6,20 @@ import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.keyword.model.DocumentWordSpan;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
@@ -21,8 +27,16 @@ import java.util.List;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class DocumentPositionMapperTest {
|
||||
private static LanguageDefinition english;
|
||||
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper("en");
|
||||
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
static SentenceExtractor se;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
|
||||
english = config.getLanguage("en");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWordPattern() {
|
||||
@@ -44,8 +58,8 @@ class DocumentPositionMapperTest {
|
||||
@Test
|
||||
public void testBasic() {
|
||||
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
|
||||
DocumentLanguageData dld = new DocumentLanguageData(
|
||||
se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
|
||||
DocumentLanguageData dld = new DocumentLanguageData(english,
|
||||
se.extractSentencesFromString(english, "I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
|
||||
"I am a teapot"
|
||||
);
|
||||
|
||||
@@ -73,7 +87,7 @@ class DocumentPositionMapperTest {
|
||||
public void testLinksSingleWord1Rep() {
|
||||
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
assertEquals(1, sentences.size());
|
||||
TIntList counts = new TIntArrayList(new int[] { 1 });
|
||||
|
||||
@@ -94,7 +108,7 @@ class DocumentPositionMapperTest {
|
||||
public void testLinksSingleWord2Reps() {
|
||||
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
assertEquals(1, sentences.size());
|
||||
TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
|
||||
|
||||
@@ -122,7 +136,7 @@ class DocumentPositionMapperTest {
|
||||
public void testLinksTwoWords2Reps() {
|
||||
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentences = se.extractSentencesFromString(english, "Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
assertEquals(1, sentences.size());
|
||||
TIntList counts = new TIntArrayList(new int[] { 4 });
|
||||
|
||||
@@ -152,8 +166,8 @@ class DocumentPositionMapperTest {
|
||||
public void testLinksTwoSent1Word1Rep() {
|
||||
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentences1 = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
var sentences2 = se.extractSentencesFromString(english, "Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
|
||||
assertEquals(1, sentences1.size());
|
||||
assertEquals(1, sentences2.size());
|
||||
TIntList counts = new TIntArrayList(new int[] { 1, 1 });
|
||||
|
@@ -2,15 +2,21 @@ package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
@@ -23,9 +29,19 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
class SentenceExtractorTest {
|
||||
static final LanguageModels lm = TestLanguageModels.getLanguageModels();
|
||||
|
||||
static SentenceExtractor se = new SentenceExtractor(lm);
|
||||
static SentenceExtractor se;
|
||||
private static LanguageDefinition english;
|
||||
|
||||
public static void main(String... args) throws IOException, URISyntaxException {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
|
||||
english = config.getLanguage("en");
|
||||
|
||||
}
|
||||
|
||||
public static void main(String... args) throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
final LanguageModels lm = TestLanguageModels.getLanguageModels();
|
||||
|
||||
var data = WmsaHome.getHomePath().resolve("test-data/");
|
||||
@@ -58,7 +74,7 @@ class SentenceExtractorTest {
|
||||
|
||||
@Test
|
||||
public void testACDC() {
|
||||
var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
|
||||
var ret = se.extractSentence(english, "AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals("ac/dc", ret.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
|
@@ -1,16 +1,21 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class ArtifactKeywordsTest {
|
||||
|
||||
@Test
|
||||
public void testExtractArtifacts() {
|
||||
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
|
||||
public void testExtractArtifacts() throws IOException, ParserConfigurationException, SAXException {
|
||||
SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels()), TestLanguageModels.getLanguageModels());
|
||||
|
||||
var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
|
||||
System.out.println(artifacts.getWords());
|
||||
|
@@ -4,11 +4,15 @@ import com.google.common.collect.Sets;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Collections;
|
||||
@@ -44,10 +48,16 @@ class NameLikeKeywordsTest {
|
||||
later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
|
||||
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
|
||||
""";
|
||||
static SentenceExtractor se;
|
||||
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
|
||||
NameLikeKeywords keywords = new NameLikeKeywords(new KeywordExtractor(), se.extractSentences(text, "Julius Caesar"), 2);
|
||||
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
|
||||
Set<String> expected = Set.of("caesar", "senate", "roman", "republic", "roman_republic");
|
||||
@@ -58,15 +68,13 @@ class NameLikeKeywordsTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWikiArticle() throws IOException {
|
||||
public void testWikiArticle() throws IOException, UnsupportedLanguageException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0));
|
||||
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
|
||||
var ke = new KeywordExtractor();
|
||||
|
||||
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
|
||||
@@ -74,7 +82,7 @@ class NameLikeKeywordsTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWikiArticleP1() {
|
||||
public void testWikiArticleP1() throws UnsupportedLanguageException {
|
||||
String html = """
|
||||
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.</p>
|
||||
<p>Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.</p>
|
||||
@@ -82,8 +90,6 @@ class NameLikeKeywordsTest {
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0));
|
||||
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
|
||||
var ke = new KeywordExtractor();
|
||||
|
||||
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
|
||||
|
@@ -1,12 +1,17 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
@@ -41,12 +46,18 @@ class SubjectLikeKeywordsTest {
|
||||
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
|
||||
""";
|
||||
|
||||
static SentenceExtractor se;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
var lm = TestLanguageModels.getLanguageModels();
|
||||
var dict = new TermFrequencyDict(lm);
|
||||
|
||||
SentenceExtractor se = new SentenceExtractor(lm);
|
||||
var dld = se.extractSentences(text, "Julius Caesar");
|
||||
|
||||
WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
|
||||
|
@@ -2,12 +2,17 @@ package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
@@ -187,8 +192,8 @@ class TitleKeywordsTest {
|
||||
""";
|
||||
|
||||
@Test
|
||||
public void extractTitleWords() {
|
||||
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
|
||||
public void extractTitleWords() throws IOException, ParserConfigurationException, SAXException, UnsupportedLanguageException {
|
||||
var se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels()), TestLanguageModels.getLanguageModels());
|
||||
|
||||
var dld = se.extractSentences(Jsoup.parse(document));
|
||||
|
||||
|
@@ -12,6 +12,7 @@ import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
|
||||
import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
@@ -114,6 +115,11 @@ public class DocumentProcessor {
|
||||
ret.details.features.add(HtmlFeature.COOKIES);
|
||||
}
|
||||
}
|
||||
catch (UnsupportedLanguageException ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
ret.stateReason = "Language";
|
||||
logger.info(converterAuditMarker, "Disqualified {}: Language", ret.url);
|
||||
}
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
ret.stateReason = ex.reason.toString();
|
||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
@@ -20,7 +21,12 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public abstract class AbstractDocumentProcessorPlugin {
|
||||
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException, IOException;
|
||||
public abstract DetailsWithWords createDetails(
|
||||
CrawledDocument crawledDocument,
|
||||
LinkTexts linkTexts,
|
||||
Set<DomSampleClassification> domSampleClassifications,
|
||||
DocumentClass documentClass)
|
||||
throws DisqualifiedException, UnsupportedLanguageException, URISyntaxException, IOException;
|
||||
public abstract boolean isApplicable(CrawledDocument doc);
|
||||
|
||||
protected static class MetaTagsBuilder {
|
||||
|
@@ -21,8 +21,9 @@ import nu.marginalia.gregex.GuardedRegexFactory;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
@@ -41,7 +42,6 @@ import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason;
|
||||
@@ -56,7 +56,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
private final PubDateSniffer pubDateSniffer;
|
||||
|
||||
private final LanguageFilter languageFilter;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final DocumentLengthLogic documentLengthLogic;
|
||||
|
||||
private final MetaRobotsTag metaRobotsTag;
|
||||
@@ -73,7 +73,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
@Inject
|
||||
public HtmlDocumentProcessorPlugin(
|
||||
@Named("min-document-quality") Double minDocumentQuality,
|
||||
LanguageFilter languageFilter,
|
||||
LanguageConfiguration languageConfiguration,
|
||||
FeatureExtractor featureExtractor,
|
||||
DocumentKeywordExtractor keywordExtractor,
|
||||
PubDateSniffer pubDateSniffer,
|
||||
@@ -83,7 +83,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
HtmlProcessorSpecializations specializations)
|
||||
{
|
||||
this.languageFilter = languageFilter;
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
this.documentLengthLogic = documentLengthLogic;
|
||||
this.minDocumentQuality = minDocumentQuality;
|
||||
this.featureExtractor = featureExtractor;
|
||||
@@ -106,11 +106,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
|
||||
LinkTexts linkTexts,
|
||||
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException, IOException {
|
||||
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody(512))) {
|
||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
throws DisqualifiedException, URISyntaxException, IOException, UnsupportedLanguageException {
|
||||
|
||||
Document doc = crawledDocument.parseBody();
|
||||
|
||||
@@ -151,18 +147,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
}
|
||||
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
|
||||
|
||||
Optional<String> language = languageFilter.predictLanguage(dld);
|
||||
if (language.isEmpty()) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
final String languageIsoCode = dld.language().isoCode();
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
ret.length = length;
|
||||
ret.format = format;
|
||||
ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
|
||||
ret.language = language.get();
|
||||
ret.language = languageIsoCode;
|
||||
|
||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
||||
|
||||
@@ -185,7 +177,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
(int) -ret.quality, // ret.quality is negative
|
||||
documentFlags);
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, languageIsoCode, linkTexts, url);
|
||||
|
||||
ret.description = specialization.getSummary(prunedDoc, words.importantWords);
|
||||
ret.generator = generatorParts.type();
|
||||
@@ -196,7 +188,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
.addFeatures(features)
|
||||
.addFormat(format)
|
||||
.addGenerator(generatorParts.keywords())
|
||||
.addLanguage(language.get())
|
||||
.addLanguage(languageIsoCode)
|
||||
.build();
|
||||
|
||||
|
||||
|
@@ -11,8 +11,9 @@ import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -39,7 +40,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
private final int maxTitleLength;
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
private final LanguageFilter languageFilter;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentLengthLogic documentLengthLogic;
|
||||
private final DefaultSpecialization defaultSpecialization;
|
||||
@@ -49,14 +50,14 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
@Inject
|
||||
public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
|
||||
LanguageFilter languageFilter,
|
||||
LanguageConfiguration languageConfiguration,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor keywordExtractor,
|
||||
DocumentLengthLogic documentLengthLogic,
|
||||
DefaultSpecialization defaultSpecialization)
|
||||
|
||||
{
|
||||
this.languageFilter = languageFilter;
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.documentLengthLogic = documentLengthLogic;
|
||||
this.maxTitleLength = maxTitleLength;
|
||||
@@ -80,14 +81,10 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
|
||||
LinkTexts linkTexts,
|
||||
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException, IOException {
|
||||
throws DisqualifiedException, URISyntaxException, IOException, UnsupportedLanguageException {
|
||||
|
||||
String documentBody = crawledDocument.documentBody();
|
||||
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
|
||||
|
||||
@@ -101,22 +98,19 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);
|
||||
|
||||
Optional<String> language = languageFilter.predictLanguage(dld);
|
||||
if (language.isEmpty()) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, 1.0)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
final String languageIsoCode = dld.language().isoCode();
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
ret.length = documentBody.length();
|
||||
|
||||
ret.format = DocumentFormat.PDF;
|
||||
ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);
|
||||
ret.language = language.get();
|
||||
ret.language = languageIsoCode;
|
||||
|
||||
ret.quality = -5;
|
||||
|
||||
@@ -136,7 +130,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
(int) -ret.quality,
|
||||
documentFlags);
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, languageIsoCode, linkTexts, url);
|
||||
|
||||
var tagWords = new MetaTagsBuilder()
|
||||
.addPubDate(pubDate)
|
||||
@@ -146,7 +140,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
.build();
|
||||
|
||||
words.addAllSyntheticTerms(tagWords);
|
||||
words.addSyntheticTerm("lang:" + language.get());
|
||||
words.addSyntheticTerm("lang:" + languageIsoCode);
|
||||
|
||||
if (pubDate.hasYear()) {
|
||||
ret.pubYear = pubDate.year();
|
||||
|
@@ -12,7 +12,7 @@ import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -32,7 +32,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
private final int maxTitleLength;
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
private final PlainTextLogic plainTextLogic = new PlainTextLogic();
|
||||
private final LanguageFilter languageFilter;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentLengthLogic documentLengthLogic;
|
||||
|
||||
@@ -41,13 +41,13 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
|
||||
@Inject
|
||||
public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
|
||||
LanguageFilter languageFilter,
|
||||
LanguageConfiguration languageConfiguration,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor keywordExtractor,
|
||||
DocumentLengthLogic documentLengthLogic
|
||||
)
|
||||
{
|
||||
this.languageFilter = languageFilter;
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.documentLengthLogic = documentLengthLogic;
|
||||
this.maxTitleLength = maxTitleLength;
|
||||
@@ -74,23 +74,15 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
|
||||
String documentBody = crawledDocument.documentBody();
|
||||
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
|
||||
var dld = sentenceExtractorProvider.get().extractSentences(documentBody, "");
|
||||
|
||||
Optional<String> language = languageFilter.predictLanguage(dld);
|
||||
if (language.isEmpty()) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, 1.0)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
final String languageIsoCode = dld.language().isoCode();
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
|
||||
@@ -99,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
|
||||
ret.format = DocumentFormat.PLAIN;
|
||||
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
|
||||
ret.language = language.get();
|
||||
ret.language = languageIsoCode;
|
||||
|
||||
ret.quality = -1;
|
||||
|
||||
@@ -114,14 +106,14 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld),
|
||||
pubDate.yearByte(), (int) -ret.quality, documentFlags);
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, languageIsoCode, linkTexts, url);
|
||||
|
||||
var tagWords = new MetaTagsBuilder()
|
||||
.addPubDate(pubDate)
|
||||
.addUrl(url)
|
||||
.addFeatures(ret.features)
|
||||
.addFormat(ret.format)
|
||||
.addLanguage(language.get())
|
||||
.addLanguage(languageIsoCode)
|
||||
.build();
|
||||
|
||||
words.addAllSyntheticTerms(tagWords);
|
||||
|
@@ -10,7 +10,6 @@ import nu.marginalia.converting.processor.summary.heuristic.*;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
@@ -36,8 +35,8 @@ class PdfDocumentProcessorPluginTest {
|
||||
static void setUpBeforeClass() throws Exception {
|
||||
var lm = WmsaHome.getLanguageModels();
|
||||
plugin = new PdfDocumentProcessorPlugin(255,
|
||||
new LanguageFilter(lm, new LanguageConfiguration()),
|
||||
new ThreadLocalSentenceExtractorProvider(lm),
|
||||
new LanguageConfiguration(lm),
|
||||
new ThreadLocalSentenceExtractorProvider(new LanguageConfiguration(lm), lm),
|
||||
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
|
||||
new DocumentLengthLogic(100),
|
||||
new DefaultSpecialization(new SummaryExtractor(
|
||||
|
@@ -4,6 +4,8 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.summary.heuristic.*;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.Jsoup;
|
||||
@@ -12,7 +14,9 @@ import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
@@ -25,9 +29,9 @@ class SummaryExtractorTest {
|
||||
private SentenceExtractor setenceExtractor;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
public void setUp() throws IOException, ParserConfigurationException, SAXException {
|
||||
keywordExtractor = new DocumentKeywordExtractor();
|
||||
setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
setenceExtractor = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
|
||||
summaryExtractor = new SummaryExtractor(255,
|
||||
new DomFilterHeuristic(255),
|
||||
@@ -37,7 +41,7 @@ class SummaryExtractorTest {
|
||||
new FallbackHeuristic());
|
||||
}
|
||||
|
||||
Set<String> getImportantWords(Document doc) throws URISyntaxException {
|
||||
Set<String> getImportantWords(Document doc) throws URISyntaxException, UnsupportedLanguageException {
|
||||
var dld = setenceExtractor.extractSentences(doc);
|
||||
var keywords = keywordExtractor.extractKeywords(dld, "en", new LinkTexts(), new EdgeUrl(
|
||||
"https://www.marginalia.nu/"
|
||||
@@ -48,7 +52,7 @@ class SummaryExtractorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTheRegister() throws IOException, URISyntaxException {
|
||||
public void testTheRegister() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
String html = readClassPathFile("html/theregister.html");
|
||||
var doc = Jsoup.parse(html);
|
||||
|
||||
@@ -92,7 +96,7 @@ class SummaryExtractorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void extractSurrey() throws IOException, URISyntaxException {
|
||||
void extractSurrey() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
String html = readClassPathFile("html/summarization/surrey.html");
|
||||
var doc = Jsoup.parse(html);
|
||||
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
|
||||
@@ -104,7 +108,7 @@ class SummaryExtractorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void extractSurrey1() throws IOException, URISyntaxException {
|
||||
void extractSurrey1() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
String html = readClassPathFile("html/summarization/surrey.html.1");
|
||||
var doc = Jsoup.parse(html);
|
||||
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
|
||||
@@ -115,7 +119,7 @@ class SummaryExtractorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void extract187() throws IOException, URISyntaxException {
|
||||
void extract187() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
String html = readClassPathFile("html/summarization/187.shtml");
|
||||
var doc = Jsoup.parse(html);
|
||||
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
|
||||
@@ -126,7 +130,7 @@ class SummaryExtractorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void extractMonadnock() throws IOException, URISyntaxException {
|
||||
void extractMonadnock() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
String html = readClassPathFile("html/monadnock.html");
|
||||
|
||||
var doc = Jsoup.parse(html);
|
||||
@@ -138,7 +142,7 @@ class SummaryExtractorTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWorkSet() throws IOException, URISyntaxException {
|
||||
public void testWorkSet() throws IOException, URISyntaxException, UnsupportedLanguageException {
|
||||
var workSet = readWorkSet();
|
||||
for (Map.Entry<Path, String> entry : workSet.entrySet()) {
|
||||
final Path path = entry.getKey();
|
||||
|
@@ -7,8 +7,8 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
@@ -36,13 +36,13 @@ import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
|
||||
|
||||
public class TermFrequencyExporter implements ExporterIf {
|
||||
private final FileStorageService storageService;
|
||||
private final LanguageFilter lf;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
|
||||
|
||||
@Inject
|
||||
public TermFrequencyExporter(FileStorageService storageService, LanguageConfiguration languageConfiguration) {
|
||||
this.storageService = storageService;
|
||||
this.lf = new LanguageFilter(WmsaHome.getLanguageModels(), languageConfiguration);
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -50,7 +50,7 @@ public class TermFrequencyExporter implements ExporterIf {
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
|
||||
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
|
||||
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, WmsaHome.getLanguageModels()));
|
||||
|
||||
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
|
||||
AtomicInteger docCount = new AtomicInteger();
|
||||
@@ -120,10 +120,13 @@ public class TermFrequencyExporter implements ExporterIf {
|
||||
Document parsed = doc.parseBody();
|
||||
parsed.body().filter(new DomPruningFilter(0.5));
|
||||
|
||||
DocumentLanguageData dld = se.extractSentences(parsed);
|
||||
DocumentLanguageData dld;
|
||||
|
||||
if (lf.predictLanguage(dld).isEmpty()) {
|
||||
return;
|
||||
try {
|
||||
dld = se.extractSentences(parsed);
|
||||
}
|
||||
catch (UnsupportedLanguageException ex) {
|
||||
continue; // This is ok
|
||||
}
|
||||
|
||||
for (var sent : dld) {
|
||||
|
Reference in New Issue
Block a user