mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(language) Initial embryo for language configuration
This commit is contained in:
@@ -0,0 +1,32 @@
|
||||
package nu.marginalia.language.config;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.WmsaHome;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class LanguageConfiguration {
|
||||
private final Map<String, Language> languages = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public LanguageConfiguration() {
|
||||
Path languageConfigurationFile = WmsaHome.getDataPath().resolve("language.xml");
|
||||
|
||||
// TODO: read the xml
|
||||
|
||||
// for now:
|
||||
languages.put("en", new Language("en", "English", true));
|
||||
languages.put("sv", new Language("sv", "Swedish/Svenska", true));
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public Language getLanguage(String language) {
|
||||
return languages.get(language);
|
||||
}
|
||||
|
||||
public record Language(String isoCode, String name, boolean permitted) {
|
||||
}
|
||||
}
|
@@ -4,20 +4,21 @@ import com.github.jfasttext.JFastText;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.encoding.UnicodeRanges;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class LanguageFilter {
|
||||
|
||||
private final Set<String> permittedLanguages = Set.of("en", "sv");
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final JFastText jft = new JFastText();
|
||||
|
||||
@Inject
|
||||
public LanguageFilter(LanguageModels lm) {
|
||||
public LanguageFilter(LanguageModels lm, LanguageConfiguration languageConfiguration) {
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
jft.loadModel(lm.fasttextLanguageModel.toString());
|
||||
}
|
||||
|
||||
@@ -27,9 +28,10 @@ public class LanguageFilter {
|
||||
if (prediction.length() == "__label__??".length()) {
|
||||
String isoCode = prediction.substring("__label__".length());
|
||||
|
||||
if (permittedLanguages.contains(isoCode)) {
|
||||
LanguageConfiguration.Language config = languageConfiguration.getLanguage(isoCode);
|
||||
|
||||
if (config != null && config.permitted())
|
||||
return Optional.of(isoCode);
|
||||
}
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@@ -12,7 +13,7 @@ class LanguageFilterTest {
|
||||
|
||||
@Test
|
||||
void isPageInteresting() {
|
||||
var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels());
|
||||
var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels(), new LanguageConfiguration());
|
||||
|
||||
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Carlos fue al bosque y recogió bayas")));
|
||||
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Charlie est allé dans la forêt et a cueilli des baies")));
|
||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.converting.processor.summary.heuristic.*;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
@@ -35,7 +36,7 @@ class PdfDocumentProcessorPluginTest {
|
||||
static void setUpBeforeClass() throws Exception {
|
||||
var lm = WmsaHome.getLanguageModels();
|
||||
plugin = new PdfDocumentProcessorPlugin(255,
|
||||
new LanguageFilter(lm),
|
||||
new LanguageFilter(lm, new LanguageConfiguration()),
|
||||
new ThreadLocalSentenceExtractorProvider(lm),
|
||||
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
|
||||
new DocumentLengthLogic(100),
|
||||
|
@@ -6,6 +6,7 @@ import gnu.trove.set.hash.TLongHashSet;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@@ -35,7 +36,7 @@ import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
|
||||
|
||||
public class TermFrequencyExporter implements ExporterIf {
|
||||
private final FileStorageService storageService;
|
||||
private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
|
||||
private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels(), new LanguageConfiguration());
|
||||
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
|
||||
|
||||
@Inject
|
||||
|
Reference in New Issue
Block a user