1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Initial integration of new language configuration utility

This commit is contained in:
Viktor Lofgren
2025-08-19 15:41:42 +02:00
parent eea32bb7b4
commit de67006c4f
28 changed files with 382 additions and 242 deletions

View File

@@ -1,6 +1,11 @@
package nu.marginalia.language.config;
import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.stemming.Stemmer;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@@ -16,14 +21,18 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
public class LanguageConfiguration {
private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);
private final Map<String, LanguageDefinition> languages = new HashMap<>();
private final JFastText fastTextLanguageModel = new JFastText();
@Inject
public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
public LanguageConfiguration(LanguageModels lm) throws IOException, ParserConfigurationException, SAXException {
fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());
// TODO: Read from data directory
try (var languagesXmlStream = ClassLoader.getSystemResourceAsStream("languages.xml")) {
@@ -43,22 +52,66 @@ public class LanguageConfiguration {
NodeList classifierNodes = doc.getElementsByTagName("language");
for (int i = 0; i < classifierNodes.getLength(); i++) {
Element classifier = (Element) classifierNodes.item(i);
String isoCode = classifier.getAttribute("isoCode").toLowerCase();
String name = classifier.getAttribute("name");
boolean disabled = "TRUE".equalsIgnoreCase(classifier.getAttribute("disabled"));
Element languageTag = (Element) classifierNodes.item(i);
boolean disabled = "TRUE".equalsIgnoreCase(languageTag.getAttribute("disabled"));
if (disabled) continue;
languages.put(isoCode, new LanguageDefinition(isoCode, name));
String isoCode = languageTag.getAttribute("isoCode").toLowerCase();
String name = languageTag.getAttribute("name");
Stemmer stemmer = parseStemmerTag(languageTag, isoCode);
languages.put(isoCode, new LanguageDefinition(isoCode, name, stemmer));
}
}
/**
 * Parses the single mandatory &lt;stemmer&gt; child of a &lt;language&gt; element
 * into a Stemmer instance.
 *
 * @param languageElement the &lt;language&gt; DOM element being configured
 * @param isoCode iso code of the language, used only for error reporting
 * @return the configured stemmer implementation
 * @throws IllegalArgumentException if there is not exactly one stemmer block,
 *                                  or the algorithm name is unrecognized
 */
private Stemmer parseStemmerTag(Element languageElement, String isoCode) {
    NodeList stemmerElements = languageElement.getElementsByTagName("stemmer");
    // Exactly one <stemmer> block is required per language definition
    if (stemmerElements.getLength() != 1) {
        // Note: the resource is named "languages.xml" (see the loading code), so
        // the error message references that file name.
        throw new IllegalArgumentException("languages.xml: Expected exactly one stemmer block for language element "
                + isoCode + ", found " + stemmerElements.getLength());
    }

    Element stemmerElement = (Element) stemmerElements.item(0);
    String stemmerName = stemmerElement.getAttribute("algorithm");
    // The element's text content selects a variant where the algorithm needs one,
    // e.g. <stemmer algorithm="snowball">SWEDISH</stemmer>
    String stemmerVariant = stemmerElement.getTextContent().trim();

    // Locale.ROOT keeps the comparison stable regardless of the default locale
    // (e.g. Turkish casing rules would otherwise break "NONE" -> "none")
    return switch (stemmerName.toLowerCase(java.util.Locale.ROOT)) {
        case "porter" -> new Stemmer.Porter();
        case "snowball" -> new Stemmer.Snowball(stemmerVariant);
        case "none" -> new Stemmer.NoOpStemmer();
        default -> throw new IllegalArgumentException("languages.xml: Unknown stemmer name " + stemmerName + " in " + isoCode);
    };
}
/**
 * Identifies the language of a parsed HTML document by sampling its visible
 * body text and classifying the sample.
 *
 * @param jsoupDoc a parsed document with a non-null body
 * @return the matching language definition, or empty if classification fails
 */
public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
    StringBuilder sample = new StringBuilder();

    // Collect text nodes until we have gathered a bit over 4 KB of sample text;
    // that is plenty for language classification and caps work on huge documents.
    jsoupDoc.body().traverse((node, depth) -> {
        if (sample.length() <= 4096 && node instanceof TextNode textNode) {
            sample.append(' ').append(textNode.text());
        }
    });

    return identifyLanguage(sample.toString());
}
/**
 * Classifies a text sample with the fastText language model.
 *
 * @param sample raw text to classify
 * @return the language definition for the predicted iso code, or empty when the
 *         prediction does not look like a two-letter language label or is not
 *         a configured language
 */
public Optional<LanguageDefinition> identifyLanguage(String sample) {
    final String labelPrefix = "__label__";

    String prediction = fastTextLanguageModel.predict(sample);

    // fastText emits labels of the form "__label__en"; only accept predictions
    // whose label holds exactly a two-character iso code
    if (prediction.length() != labelPrefix.length() + 2) {
        return Optional.empty();
    }

    String isoCode = prediction.substring(labelPrefix.length());
    return Optional.ofNullable(getLanguage(isoCode));
}
/**
 * Classifies a text sample, falling back to a fixed language when
 * classification yields nothing usable.
 *
 * @param sample raw text to classify
 * @param fallbackIsoCode iso code to fall back to; if it is also unknown,
 *                        an empty optional is returned
 */
public Optional<LanguageDefinition> identifyLanguage(String sample, String fallbackIsoCode) {
    Optional<LanguageDefinition> detected = identifyLanguage(sample);
    if (detected.isPresent()) {
        return detected;
    }
    return Optional.ofNullable(getLanguage(fallbackIsoCode));
}
/**
 * Looks up a language definition by iso code.
 * <p>
 * Codes are stored lower-cased, and languages marked disabled in the
 * configuration file are never registered, so they return null here.
 *
 * @param language iso code, e.g. "en"; matching is exact (expects lower case)
 * @return the language definition, or null if the code is unknown or disabled
 */
@Nullable
public LanguageDefinition getLanguage(String language) {
return languages.get(language);
}
public record LanguageDefinition(String isoCode, String name) {}
}

View File

@@ -1,49 +0,0 @@
package nu.marginalia.language.filter;
import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.encoding.UnicodeRanges;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.Optional;
/**
 * Predicts the language of processed documents using a fastText classifier,
 * and screens text for blocked unicode ranges.
 * <p>
 * NOTE(review): this duplicates the fastText prediction logic that also lives
 * in LanguageConfiguration; presumably one of the two is slated for removal —
 * confirm.
 */
@Singleton
public class LanguageFilter {
// Language definitions used to decide which predicted iso codes are accepted
private final LanguageConfiguration languageConfiguration;
// fastText classifier instance; model is loaded once in the constructor
private final JFastText jft = new JFastText();
@Inject
public LanguageFilter(LanguageModels lm, LanguageConfiguration languageConfiguration) {
this.languageConfiguration = languageConfiguration;
jft.loadModel(lm.fasttextLanguageModel.toString());
}
/**
 * Predicts the language of the document's text.
 *
 * @return the predicted iso code if the fastText label has the shape
 *         "__label__" + two characters and that code is a configured
 *         language; empty otherwise
 */
public Optional<String> predictLanguage(DocumentLanguageData dld) {
String prediction = jft.predict(dld.text());
// fastText labels look like "__label__en"; only accept two-letter codes
if (prediction.length() == "__label__??".length()) {
String isoCode = prediction.substring("__label__".length());
// Only report languages that are actually configured (and not disabled)
LanguageConfiguration.LanguageDefinition config = languageConfiguration.getLanguage(isoCode);
if (config != null)
return Optional.of(isoCode);
}
return Optional.empty();
}
/**
 * Returns true if the text matches any blocked unicode range,
 * as defined by {@link UnicodeRanges}.
 */
public boolean isBlockedUnicodeRange(String data) {
for (var range: UnicodeRanges.values()) {
if (range.test(data))
return true;
}
return false;
}
}

View File

@@ -15,11 +15,13 @@ import java.util.stream.Stream;
*
* @see SentenceExtractor
*/
public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
public record DocumentLanguageData(LanguageDefinition language,
List<DocumentSentence> sentences,
String text) implements Iterable<DocumentSentence> {
public DocumentLanguageData(List<DocumentSentence> sentences,
String text)
public DocumentLanguageData(LanguageDefinition language, List<DocumentSentence> sentences, String text)
{
this.language = language;
this.sentences = Collections.unmodifiableList(sentences);
this.text = text;
}

View File

@@ -0,0 +1,10 @@
package nu.marginalia.language.model;
import nu.marginalia.language.stemming.Stemmer;
/**
 * Immutable description of a configured language.
 *
 * @param isoCode lower-cased iso code, e.g. "en" (used as lookup key)
 * @param name    human-readable display name
 * @param stemmer stemming algorithm configured for this language
 */
public record LanguageDefinition(String isoCode,
String name,
Stemmer stemmer)
{
}

View File

@@ -0,0 +1,4 @@
package nu.marginalia.language.model;
/**
 * Thrown when a document's language cannot be identified, or the identified
 * language has no processing configuration in this installation.
 */
public class UnsupportedLanguageException extends Exception {

    /**
     * No-detail constructor; kept so the exception can be raised via a
     * method reference, e.g. {@code orElseThrow(UnsupportedLanguageException::new)}.
     */
    public UnsupportedLanguageException() {
        super();
    }

    /**
     * @param message detail about the failure, e.g. the rejected iso code
     *                or the raw classifier output
     */
    public UnsupportedLanguageException(String message) {
        super(message);
    }
}

View File

@@ -3,15 +3,18 @@ package nu.marginalia.language.sentence;
import com.github.datquocnguyen.RDRPOSTagger;
import com.google.inject.Inject;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
import nu.marginalia.language.stemming.Stemmer;
import nu.marginalia.segmentation.NgramLexicon;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@@ -29,12 +32,12 @@ import java.util.*;
*/
public class SentenceExtractor {
private final LanguageConfiguration languageConfiguration;
private SentenceDetectorME sentenceDetector;
private static RDRPOSTagger rdrposTagger;
private static NgramLexicon ngramLexicon = null;
private final PorterStemmer porterStemmer = new PorterStemmer();
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();
@@ -46,8 +49,10 @@ public class SentenceExtractor {
static final int MAX_SENTENCE_COUNT = 1000;
@Inject
public SentenceExtractor(LanguageModels models)
public SentenceExtractor(LanguageConfiguration languageConfiguration, LanguageModels models)
{
this.languageConfiguration = languageConfiguration;
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
var sentenceModel = new SentenceModel(modelIn);
sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -73,9 +78,10 @@ public class SentenceExtractor {
}
public DocumentLanguageData extractSentences(Document doc) {
public DocumentLanguageData extractSentences(Document doc) throws UnsupportedLanguageException {
var language = languageConfiguration.identifyLanguage(doc).orElseThrow(UnsupportedLanguageException::new);
final List<DocumentSentence> textSentences = new ArrayList<>();
final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);
final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
@@ -85,7 +91,7 @@ public class SentenceExtractor {
String text = taggedString.string();
textSentences.addAll(
extractSentencesFromString(text, taggedString.tags())
extractSentencesFromString(language, text, taggedString.tags())
);
if (documentText.isEmpty()) {
@@ -96,23 +102,31 @@ public class SentenceExtractor {
}
}
return new DocumentLanguageData(textSentences, documentText.toString());
return new DocumentLanguageData(language, textSentences, documentText.toString());
}
public DocumentLanguageData extractSentences(String text, String title) {
var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
.orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));
var textSentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));
var titleSentences = extractSentencesFromString(language, title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
combined.addAll(titleSentences);
combined.addAll(textSentences);
return new DocumentLanguageData(
language,
combined,
text);
}
public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
public DocumentSentence extractSentence(LanguageDefinition language,
String text,
EnumSet<HtmlTag> htmlTags) {
final Stemmer stemmer = language.stemmer();
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
String[] words = wordsAndSeps.words();
@@ -134,7 +148,7 @@ public class SentenceExtractor {
}
try {
stemmed[i] = porterStemmer.stem(lc[i]);
stemmed[i] = stemmer.stem(lc[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
@@ -152,8 +166,9 @@ public class SentenceExtractor {
);
}
public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
String[] sentences;
public List<DocumentSentence> extractSentencesFromString(LanguageDefinition language, String text, EnumSet<HtmlTag> htmlTags) {
final Stemmer stemmer = language.stemmer();
// Safety net against malformed data DOS attacks,
// found 5+ MB <p>-tags in the wild that just break
@@ -167,7 +182,7 @@ public class SentenceExtractor {
text = normalizeSpaces(text);
// Split into sentences
String[] sentences;
try {
sentences = sentenceDetector.sentDetect(text);
}
@@ -221,7 +236,7 @@ public class SentenceExtractor {
}
try {
stemmed[i] = porterStemmer.stem(tokens[i]);
stemmed[i] = stemmer.stem(tokens[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???

View File

@@ -3,14 +3,15 @@ package nu.marginalia.language.sentence;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
@Singleton
public class ThreadLocalSentenceExtractorProvider {
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;
@Inject
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
public ThreadLocalSentenceExtractorProvider(LanguageConfiguration languageConfiguration, LanguageModels languageModels) {
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, languageModels));
}
public SentenceExtractor get() {

View File

@@ -0,0 +1,39 @@
package nu.marginalia.language.stemming;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
/**
 * A word-normalization (stemming) strategy, selected per language from the
 * language configuration file.
 * <p>
 * Sealed so the set of available algorithms is closed to this file's
 * implementations: {@link Porter}, {@link Snowball}, {@link NoOpStemmer}.
 */
public sealed interface Stemmer {
    /** Returns the stemmed form of the given word. */
    String stem(String input);

    /** Porter stemming, delegating to the ca.rmen porter-stemmer library. */
    final class Porter implements Stemmer {
        // Single shared delegate instance.
        // NOTE(review): assumes stemWord() is stateless/thread-safe; confirm
        // before relying on this across threads.
        private static final ca.rmen.porterstemmer.PorterStemmer porterStemmerImpl = new ca.rmen.porterstemmer.PorterStemmer();

        @Override
        public String stem(String input) {
            return porterStemmerImpl.stemWord(input);
        }
    }

    /** Snowball stemming for a configurable language variant (e.g. "SWEDISH"). */
    final class Snowball implements Stemmer {
        private final SnowballStemmer snowballStemmer;

        /**
         * @param algorithmName case-insensitive name of a
         *                      {@code SnowballStemmer.ALGORITHM} constant
         * @throws IllegalArgumentException if no such algorithm exists
         */
        public Snowball(String algorithmName) {
            // Locale.ROOT keeps the enum lookup independent of the default
            // locale (e.g. Turkish dotless-i casing would otherwise mangle
            // names containing 'i')
            SnowballStemmer.ALGORITHM algorithm =
                    SnowballStemmer.ALGORITHM.valueOf(algorithmName.toUpperCase(java.util.Locale.ROOT));
            snowballStemmer = new SnowballStemmer(algorithm);
        }

        @Override
        public String stem(String input) {
            // Snowball impl declares return value as CharSequence,
            // but in practice always returns a String
            return (String) snowballStemmer.stem(input);
        }
    }

    /** Identity stemmer for languages with no stemming configured. */
    final class NoOpStemmer implements Stemmer {
        @Override
        public String stem(String input) {
            return input;
        }
    }
}

View File

@@ -1,19 +1,44 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
<!ELEMENT languages (language*)>
<!ELEMENT language (#PCDATA)>
<!ELEMENT language (stemmer,sentenceDetector,rdrTagger)>
<!ATTLIST language
isoCode ID #REQUIRED
name CDATA #REQUIRED
display (rtl|ltr) #REQUIRED
disabled (true|false) #IMPLIED
>
]>
<!ELEMENT stemmer (#PCDATA)>
<!ATTLIST stemmer
algorithm (porter|snowball|none) #REQUIRED
>
<!ELEMENT rdrTagger EMPTY>
<!ATTLIST rdrTagger
dict CDATA #REQUIRED
rdr CDATA #REQUIRED
>
<!ELEMENT sentenceDetector EMPTY>
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
]>
<languages>
<language isoCode="en" name="English">
<language isoCode="xx" name="Undefined" display="ltr">
<stemmer algorithm="none" />
<sentenceDetector algorithm="none"/>
<rdrTagger dict="English.DICT" rdr="English.RDR" />
</language>
<language isoCode="sv" name="Swedish/Svenska">
<language isoCode="en" name="English" display="ltr">
<stemmer algorithm="porter" />
<sentenceDetector algorithm="opennlp"/>
<rdrTagger dict="English.DICT" rdr="English.RDR" />
</language>
<language isoCode="sv" name="Swedish" display="ltr">
<stemmer algorithm="snowball">SWEDISH</stemmer>
<sentenceDetector algorithm="opennlp"/>
<rdrTagger dict="Swedish.DICT" rdr="Swedish.RDR" />
</language>
</languages>

View File

@@ -1,27 +0,0 @@
package nu.marginalia.language.filter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import static org.junit.jupiter.api.Assertions.assertEquals;
class LanguageDefinitionFilterTest {
@Test
void isPageInteresting() throws IOException, ParserConfigurationException, SAXException {
var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels(), new LanguageConfiguration());
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Carlos fue al bosque y recogió bayas")));
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Charlie est allé dans la forêt et a cueilli des baies")));
assertEquals(Optional.of("sv"), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Kalle gick i skogen och plockade bär")));
assertEquals(Optional.of("en"), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Charlie went to the woods to go berry-picking")));
}
}

View File

@@ -1,11 +1,15 @@
package nu.marginalia.language.sentence;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.EnumSet;
import java.util.Objects;
@@ -15,36 +19,38 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class SentenceExtractorTest {
private static SentenceExtractor sentenceExtractor;
private static LanguageConfiguration languageConfig;
@BeforeAll
public static void setUp() {
sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
public static void setUp() throws IOException, ParserConfigurationException, SAXException {
languageConfig = new LanguageConfiguration(WmsaHome.getLanguageModels());
sentenceExtractor = new SentenceExtractor(languageConfig, WmsaHome.getLanguageModels());
}
@Test
void testParen() {
var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class));
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"),"I am (very) tall", EnumSet.noneOf(HtmlTag.class));
System.out.println(dld);
}
@Test
void testCplusplus() {
var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "std::vector", EnumSet.noneOf(HtmlTag.class));
assertEquals(1, dld.length());
assertEquals("std::vector", dld.wordsLowerCase[0]);
}
@Test
void testPHP() {
var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "$_GET", EnumSet.noneOf(HtmlTag.class));
assertEquals(1, dld.length());
assertEquals("$_get", dld.wordsLowerCase[0]);
}
@Test
void testPolishArtist() {
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"),"Uklański", EnumSet.noneOf(HtmlTag.class));
assertEquals(1, dld.wordsLowerCase.length);
assertEquals("uklanski", dld.wordsLowerCase[0]);
@@ -52,7 +58,7 @@ class SentenceExtractorTest {
@Test
void testJava() {
var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
assertEquals(4, dld.wordsLowerCase.length);
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
@@ -70,7 +76,7 @@ class SentenceExtractorTest {
System.out.println(sent);
}
} catch (IOException e) {
} catch (IOException | UnsupportedLanguageException e) {
throw new RuntimeException(e);
}
}
@@ -87,13 +93,15 @@ class SentenceExtractorTest {
System.out.println(sent);
}
} catch (IOException e) {
} catch (IOException | UnsupportedLanguageException e) {
throw new RuntimeException(e);
}
}
@Test
void testApostrophe() {
var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
var lang = Objects.requireNonNull(languageConfig.getLanguage("en"));
var dld = sentenceExtractor.extractSentence(lang, "duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
assertEquals(7, dld.wordsLowerCase.length);
assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase);

View File

@@ -6,7 +6,9 @@ import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.Link;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
@@ -18,13 +20,21 @@ import java.util.*;
public class AnchorTextKeywords {
private final SentenceExtractor sentenceExtractor;
private final LanguageDefinition englishLanguage;
private final Set<String> stopList;
@Inject
public AnchorTextKeywords(SentenceExtractor sentenceExtractor)
public AnchorTextKeywords(SentenceExtractor sentenceExtractor, LanguageConfiguration languageConfiguration)
{
this.sentenceExtractor = sentenceExtractor;
// FIXME: Currently the atags file does not provide information about the language in the source document
// which means we have to run the link texts through English processing. For euro-languages this is
// likely fine, but for stuff like Japanese it's going to produce bad results. We'll need to add this
// information when extracting link texts so we can use the appropriate language processing here later.
// (sampling based on the link text alone is likely insufficient, since the sample size is going to be 2-3 words).
this.englishLanguage = languageConfiguration.getLanguage("en");
stopList = readStoplist();
}
@@ -60,7 +70,7 @@ public class AnchorTextKeywords {
if (stopList.contains(keyword.text().toLowerCase()))
continue;
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentence = sentenceExtractor.extractSentence(englishLanguage, keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
counts.add(keyword.count());
}
@@ -82,7 +92,7 @@ public class AnchorTextKeywords {
if (stopList.contains(keyword.text().toLowerCase()))
continue;
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentence = sentenceExtractor.extractSentence(englishLanguage, keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
ret.add(sentence);
counts.add(keyword.count());
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.atags;
import nu.marginalia.atags.source.AnchorTagsImpl;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@@ -36,11 +37,9 @@ class DomainAnchorTagsImplTest {
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/")));
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt")));
var atagsKeywords = new AnchorTextKeywords(
new SentenceExtractor(
TestLanguageModels.getLanguageModels()
)
);
var languageConfig = new LanguageConfiguration(TestLanguageModels.getLanguageModels());
var atagsKeywords = new AnchorTextKeywords(new SentenceExtractor(languageConfig, TestLanguageModels.getLanguageModels()), languageConfig);
System.out.println(
atagsKeywords.getAnchorTextKeywords(tags, new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/"))
);

View File

@@ -2,15 +2,19 @@ package nu.marginalia.keyword;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
@@ -22,10 +26,16 @@ import java.util.Set;
class DocumentKeywordExtractorTest {
static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
}
@Test
public void testKeyboards2() throws IOException, URISyntaxException {
public void testKeyboards2() throws IOException, URISyntaxException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -43,7 +53,7 @@ class DocumentKeywordExtractorTest {
@Test
public void testMadonna() throws IOException, URISyntaxException {
public void testMadonna() throws IOException, URISyntaxException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -80,17 +90,4 @@ class DocumentKeywordExtractorTest {
);
}
@Test
public void testSpam() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
new TermFrequencyDict(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
}
}

View File

@@ -6,14 +6,20 @@ import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
@@ -21,8 +27,16 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class DocumentPositionMapperTest {
private static LanguageDefinition english;
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper("en");
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
english = config.getLanguage("en");
}
@Test
public void testWordPattern() {
@@ -44,8 +58,8 @@ class DocumentPositionMapperTest {
@Test
public void testBasic() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
DocumentLanguageData dld = new DocumentLanguageData(
se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
DocumentLanguageData dld = new DocumentLanguageData(english,
se.extractSentencesFromString(english, "I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
"I am a teapot"
);
@@ -73,7 +87,7 @@ class DocumentPositionMapperTest {
public void testLinksSingleWord1Rep() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 1 });
@@ -94,7 +108,7 @@ class DocumentPositionMapperTest {
public void testLinksSingleWord2Reps() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
@@ -122,7 +136,7 @@ class DocumentPositionMapperTest {
public void testLinksTwoWords2Reps() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences = se.extractSentencesFromString(english, "Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 4 });
@@ -152,8 +166,8 @@ class DocumentPositionMapperTest {
public void testLinksTwoSent1Word1Rep() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences1 = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences2 = se.extractSentencesFromString(english, "Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences1.size());
assertEquals(1, sentences2.size());
TIntList counts = new TIntArrayList(new int[] { 1, 1 });

View File

@@ -2,15 +2,21 @@ package nu.marginalia.keyword;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
@@ -23,9 +29,19 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class SentenceExtractorTest {
static final LanguageModels lm = TestLanguageModels.getLanguageModels();
static SentenceExtractor se = new SentenceExtractor(lm);
static SentenceExtractor se;
private static LanguageDefinition english;
public static void main(String... args) throws IOException, URISyntaxException {
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
english = config.getLanguage("en");
}
public static void main(String... args) throws IOException, URISyntaxException, UnsupportedLanguageException {
final LanguageModels lm = TestLanguageModels.getLanguageModels();
var data = WmsaHome.getHomePath().resolve("test-data/");
@@ -58,7 +74,7 @@ class SentenceExtractorTest {
@Test
public void testACDC() {
var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
var ret = se.extractSentence(english, "AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
assertEquals("ac/dc", ret.wordsLowerCase[0]);
}

View File

@@ -1,16 +1,21 @@
package nu.marginalia.keyword.extractors;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import static org.junit.jupiter.api.Assertions.*;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import static org.junit.jupiter.api.Assertions.assertTrue;
class ArtifactKeywordsTest {
@Test
public void testExtractArtifacts() {
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
public void testExtractArtifacts() throws IOException, ParserConfigurationException, SAXException {
SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels()), TestLanguageModels.getLanguageModels());
var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
System.out.println(artifacts.getWords());

View File

@@ -4,11 +4,15 @@ import com.google.common.collect.Sets;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
@@ -44,10 +48,16 @@ class NameLikeKeywordsTest {
later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
}
@Test
public void test() {
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
NameLikeKeywords keywords = new NameLikeKeywords(new KeywordExtractor(), se.extractSentences(text, "Julius Caesar"), 2);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of("caesar", "senate", "roman", "republic", "roman_republic");
@@ -58,15 +68,13 @@ class NameLikeKeywordsTest {
}
@Test
public void testWikiArticle() throws IOException {
public void testWikiArticle() throws IOException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var ke = new KeywordExtractor();
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
@@ -74,7 +82,7 @@ class NameLikeKeywordsTest {
}
@Test
public void testWikiArticleP1() {
public void testWikiArticleP1() throws UnsupportedLanguageException {
String html = """
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for clientserver web applications, with a reported 9 million developers.</p>
<p>Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.</p>
@@ -82,8 +90,6 @@ class NameLikeKeywordsTest {
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var ke = new KeywordExtractor();
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);

View File

@@ -1,12 +1,17 @@
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
@@ -41,12 +46,18 @@ class SubjectLikeKeywordsTest {
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
}
@Test
public void test() throws IOException {
var lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm);
SentenceExtractor se = new SentenceExtractor(lm);
var dld = se.extractSentences(text, "Julius Caesar");
WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);

View File

@@ -2,12 +2,17 @@ package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;
@@ -187,8 +192,8 @@ class TitleKeywordsTest {
""";
@Test
public void extractTitleWords() {
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
public void extractTitleWords() throws IOException, ParserConfigurationException, SAXException, UnsupportedLanguageException {
var se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels()), TestLanguageModels.getLanguageModels());
var dld = se.extractSentences(Jsoup.parse(document));

View File

@@ -12,6 +12,7 @@ import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
import nu.marginalia.domclassifier.DomSampleClassification;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
@@ -114,6 +115,11 @@ public class DocumentProcessor {
ret.details.features.add(HtmlFeature.COOKIES);
}
}
catch (UnsupportedLanguageException ex) {
ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = "Language";
logger.info(converterAuditMarker, "Disqualified {}: Language", ret.url);
}
catch (DisqualifiedException ex) {
ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = ex.reason.toString();

View File

@@ -6,6 +6,7 @@ import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.domclassifier.DomSampleClassification;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
@@ -20,7 +21,12 @@ import java.util.List;
import java.util.Set;
public abstract class AbstractDocumentProcessorPlugin {
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException, IOException;
public abstract DetailsWithWords createDetails(
CrawledDocument crawledDocument,
LinkTexts linkTexts,
Set<DomSampleClassification> domSampleClassifications,
DocumentClass documentClass)
throws DisqualifiedException, UnsupportedLanguageException, URISyntaxException, IOException;
public abstract boolean isApplicable(CrawledDocument doc);
protected static class MetaTagsBuilder {

View File

@@ -21,8 +21,9 @@ import nu.marginalia.gregex.GuardedRegexFactory;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.DocumentFormat;
@@ -41,7 +42,6 @@ import java.io.IOException;
import java.net.URISyntaxException;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason;
@@ -56,7 +56,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final DocumentKeywordExtractor keywordExtractor;
private final PubDateSniffer pubDateSniffer;
private final LanguageFilter languageFilter;
private final LanguageConfiguration languageConfiguration;
private final DocumentLengthLogic documentLengthLogic;
private final MetaRobotsTag metaRobotsTag;
@@ -73,7 +73,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
@Inject
public HtmlDocumentProcessorPlugin(
@Named("min-document-quality") Double minDocumentQuality,
LanguageFilter languageFilter,
LanguageConfiguration languageConfiguration,
FeatureExtractor featureExtractor,
DocumentKeywordExtractor keywordExtractor,
PubDateSniffer pubDateSniffer,
@@ -83,7 +83,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
HtmlProcessorSpecializations specializations)
{
this.languageFilter = languageFilter;
this.languageConfiguration = languageConfiguration;
this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality;
this.featureExtractor = featureExtractor;
@@ -106,11 +106,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
LinkTexts linkTexts,
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
throws DisqualifiedException, URISyntaxException, IOException {
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody(512))) {
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
}
throws DisqualifiedException, URISyntaxException, IOException, UnsupportedLanguageException {
Document doc = crawledDocument.parseBody();
@@ -151,18 +147,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
}
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
Optional<String> language = languageFilter.predictLanguage(dld);
if (language.isEmpty()) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}
final String languageIsoCode = dld.language().isoCode();
var ret = new ProcessedDocumentDetails();
ret.length = length;
ret.format = format;
ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
ret.language = language.get();
ret.language = languageIsoCode;
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
@@ -185,7 +177,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
(int) -ret.quality, // ret.quality is negative
documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, languageIsoCode, linkTexts, url);
ret.description = specialization.getSummary(prunedDoc, words.importantWords);
ret.generator = generatorParts.type();
@@ -196,7 +188,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.addFeatures(features)
.addFormat(format)
.addGenerator(generatorParts.keywords())
.addLanguage(language.get())
.addLanguage(languageIsoCode)
.build();

View File

@@ -11,8 +11,9 @@ import nu.marginalia.domclassifier.DomSampleClassification;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
@@ -39,7 +40,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final int maxTitleLength;
private final DocumentKeywordExtractor keywordExtractor;
private final LanguageFilter languageFilter;
private final LanguageConfiguration languageConfiguration;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentLengthLogic documentLengthLogic;
private final DefaultSpecialization defaultSpecialization;
@@ -49,14 +50,14 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
@Inject
public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
LanguageFilter languageFilter,
LanguageConfiguration languageConfiguration,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor,
DocumentLengthLogic documentLengthLogic,
DefaultSpecialization defaultSpecialization)
{
this.languageFilter = languageFilter;
this.languageConfiguration = languageConfiguration;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentLengthLogic = documentLengthLogic;
this.maxTitleLength = maxTitleLength;
@@ -80,14 +81,10 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
LinkTexts linkTexts,
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
throws DisqualifiedException, URISyntaxException, IOException {
throws DisqualifiedException, URISyntaxException, IOException, UnsupportedLanguageException {
String documentBody = crawledDocument.documentBody();
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
@@ -101,22 +98,19 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);
Optional<String> language = languageFilter.predictLanguage(dld);
if (language.isEmpty()) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, 1.0)) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
}
final String languageIsoCode = dld.language().isoCode();
var ret = new ProcessedDocumentDetails();
ret.length = documentBody.length();
ret.format = DocumentFormat.PDF;
ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);
ret.language = language.get();
ret.language = languageIsoCode;
ret.quality = -5;
@@ -136,7 +130,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
(int) -ret.quality,
documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, languageIsoCode, linkTexts, url);
var tagWords = new MetaTagsBuilder()
.addPubDate(pubDate)
@@ -146,7 +140,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.build();
words.addAllSyntheticTerms(tagWords);
words.addSyntheticTerm("lang:" + language.get());
words.addSyntheticTerm("lang:" + languageIsoCode);
if (pubDate.hasYear()) {
ret.pubYear = pubDate.year();

View File

@@ -12,7 +12,7 @@ import nu.marginalia.domclassifier.DomSampleClassification;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
@@ -32,7 +32,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
private final int maxTitleLength;
private final DocumentKeywordExtractor keywordExtractor;
private final PlainTextLogic plainTextLogic = new PlainTextLogic();
private final LanguageFilter languageFilter;
private final LanguageConfiguration languageConfiguration;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentLengthLogic documentLengthLogic;
@@ -41,13 +41,13 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
@Inject
public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
LanguageFilter languageFilter,
LanguageConfiguration languageConfiguration,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor,
DocumentLengthLogic documentLengthLogic
)
{
this.languageFilter = languageFilter;
this.languageConfiguration = languageConfiguration;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentLengthLogic = documentLengthLogic;
this.maxTitleLength = maxTitleLength;
@@ -74,23 +74,15 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
String documentBody = crawledDocument.documentBody();
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
var dld = sentenceExtractorProvider.get().extractSentences(documentBody, "");
Optional<String> language = languageFilter.predictLanguage(dld);
if (language.isEmpty()) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, 1.0)) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
}
final String languageIsoCode = dld.language().isoCode();
var ret = new ProcessedDocumentDetails();
List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
@@ -99,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
ret.format = DocumentFormat.PLAIN;
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
ret.language = language.get();
ret.language = languageIsoCode;
ret.quality = -1;
@@ -114,14 +106,14 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld),
pubDate.yearByte(), (int) -ret.quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, language.get(), linkTexts, url);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, languageIsoCode, linkTexts, url);
var tagWords = new MetaTagsBuilder()
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(ret.features)
.addFormat(ret.format)
.addLanguage(language.get())
.addLanguage(languageIsoCode)
.build();
words.addAllSyntheticTerms(tagWords);

View File

@@ -10,7 +10,6 @@ import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
@@ -36,8 +35,8 @@ class PdfDocumentProcessorPluginTest {
static void setUpBeforeClass() throws Exception {
var lm = WmsaHome.getLanguageModels();
plugin = new PdfDocumentProcessorPlugin(255,
new LanguageFilter(lm, new LanguageConfiguration()),
new ThreadLocalSentenceExtractorProvider(lm),
new LanguageConfiguration(lm),
new ThreadLocalSentenceExtractorProvider(new LanguageConfiguration(lm), lm),
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
new DocumentLengthLogic(100),
new DefaultSpecialization(new SummaryExtractor(

View File

@@ -4,6 +4,8 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
@@ -12,7 +14,9 @@ import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
@@ -25,9 +29,9 @@ class SummaryExtractorTest {
private SentenceExtractor setenceExtractor;
@BeforeEach
public void setUp() {
public void setUp() throws IOException, ParserConfigurationException, SAXException {
keywordExtractor = new DocumentKeywordExtractor();
setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
setenceExtractor = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
summaryExtractor = new SummaryExtractor(255,
new DomFilterHeuristic(255),
@@ -37,7 +41,7 @@ class SummaryExtractorTest {
new FallbackHeuristic());
}
Set<String> getImportantWords(Document doc) throws URISyntaxException {
Set<String> getImportantWords(Document doc) throws URISyntaxException, UnsupportedLanguageException {
var dld = setenceExtractor.extractSentences(doc);
var keywords = keywordExtractor.extractKeywords(dld, "en", new LinkTexts(), new EdgeUrl(
"https://www.marginalia.nu/"
@@ -48,7 +52,7 @@ class SummaryExtractorTest {
}
@Test
public void testTheRegister() throws IOException, URISyntaxException {
public void testTheRegister() throws IOException, URISyntaxException, UnsupportedLanguageException {
String html = readClassPathFile("html/theregister.html");
var doc = Jsoup.parse(html);
@@ -92,7 +96,7 @@ class SummaryExtractorTest {
}
@Test
void extractSurrey() throws IOException, URISyntaxException {
void extractSurrey() throws IOException, URISyntaxException, UnsupportedLanguageException {
String html = readClassPathFile("html/summarization/surrey.html");
var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@@ -104,7 +108,7 @@ class SummaryExtractorTest {
}
@Test
void extractSurrey1() throws IOException, URISyntaxException {
void extractSurrey1() throws IOException, URISyntaxException, UnsupportedLanguageException {
String html = readClassPathFile("html/summarization/surrey.html.1");
var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@@ -115,7 +119,7 @@ class SummaryExtractorTest {
}
@Test
void extract187() throws IOException, URISyntaxException {
void extract187() throws IOException, URISyntaxException, UnsupportedLanguageException {
String html = readClassPathFile("html/summarization/187.shtml");
var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@@ -126,7 +130,7 @@ class SummaryExtractorTest {
}
@Test
void extractMonadnock() throws IOException, URISyntaxException {
void extractMonadnock() throws IOException, URISyntaxException, UnsupportedLanguageException {
String html = readClassPathFile("html/monadnock.html");
var doc = Jsoup.parse(html);
@@ -138,7 +142,7 @@ class SummaryExtractorTest {
}
@Test
public void testWorkSet() throws IOException, URISyntaxException {
public void testWorkSet() throws IOException, URISyntaxException, UnsupportedLanguageException {
var workSet = readWorkSet();
for (Map.Entry<Path, String> entry : workSet.entrySet()) {
final Path path = entry.getKey();

View File

@@ -7,8 +7,8 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
@@ -36,13 +36,13 @@ import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
public class TermFrequencyExporter implements ExporterIf {
private final FileStorageService storageService;
private final LanguageFilter lf;
private final LanguageConfiguration languageConfiguration;
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
@Inject
public TermFrequencyExporter(FileStorageService storageService, LanguageConfiguration languageConfiguration) {
this.storageService = storageService;
this.lf = new LanguageFilter(WmsaHome.getLanguageModels(), languageConfiguration);
this.languageConfiguration = languageConfiguration;
}
@Override
@@ -50,7 +50,7 @@ public class TermFrequencyExporter implements ExporterIf {
Path inputDir = storageService.getStorage(crawlId).asPath();
FileStorage destStorage = storageService.getStorage(destId);
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, WmsaHome.getLanguageModels()));
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
AtomicInteger docCount = new AtomicInteger();
@@ -120,10 +120,13 @@ public class TermFrequencyExporter implements ExporterIf {
Document parsed = doc.parseBody();
parsed.body().filter(new DomPruningFilter(0.5));
DocumentLanguageData dld = se.extractSentences(parsed);
DocumentLanguageData dld;
if (lf.predictLanguage(dld).isEmpty()) {
return;
try {
dld = se.extractSentences(parsed);
}
catch (UnsupportedLanguageException ex) {
continue; // This is ok
}
for (var sent : dld) {