1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Initial embryo for language configuration

This commit is contained in:
Viktor Lofgren
2025-08-19 09:29:09 +02:00
parent 1cca16a58e
commit b564b33028
5 changed files with 45 additions and 8 deletions

View File

@@ -0,0 +1,32 @@
package nu.marginalia.language.config;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import javax.annotation.Nullable;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
/**
 * Registry of the languages known to the system, looked up by language code.
 *
 * <p>The set of languages is currently hard-coded in the constructor. The
 * constructor also resolves a {@code language.xml} path under
 * {@code WmsaHome.getDataPath()}, but that file is not read yet (see the TODO).
 */
public class LanguageConfiguration {
    /** Registered languages, keyed by {@link Language#isoCode()}. */
    private final Map<String, Language> languages = new HashMap<>();

    /**
     * Builds the configuration with the built-in language definitions.
     */
    @Inject
    public LanguageConfiguration() {
        // Resolved but not consumed yet; placeholder for the XML loading below.
        Path languageConfigurationFile = WmsaHome.getDataPath().resolve("language.xml");

        // TODO: read the xml
        // for now, register a fixed set of languages:
        register(new Language("en", "English", true));
        register(new Language("sv", "Swedish/Svenska", true));
    }

    /** Stores a language definition under its own ISO code. */
    private void register(Language definition) {
        languages.put(definition.isoCode(), definition);
    }

    /**
     * Looks up the definition registered for a language code.
     *
     * @param language the language code to look up, e.g. {@code "en"}
     * @return the matching definition, or {@code null} if none is registered
     */
    @Nullable
    public Language getLanguage(String language) {
        return languages.get(language);
    }

    /**
     * A single language definition.
     *
     * @param isoCode   the code the language is registered under
     * @param name      human-readable name of the language
     * @param permitted flag carried with the definition; its interpretation is up to the caller
     */
    public record Language(String isoCode, String name, boolean permitted) {
    }
}

View File

@@ -4,20 +4,21 @@ import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.encoding.UnicodeRanges;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.Optional;
import java.util.Set;
@Singleton
public class LanguageFilter {
private final Set<String> permittedLanguages = Set.of("en", "sv");
private final LanguageConfiguration languageConfiguration;
private final JFastText jft = new JFastText();
@Inject
public LanguageFilter(LanguageModels lm) {
public LanguageFilter(LanguageModels lm, LanguageConfiguration languageConfiguration) {
this.languageConfiguration = languageConfiguration;
jft.loadModel(lm.fasttextLanguageModel.toString());
}
@@ -27,9 +28,10 @@ public class LanguageFilter {
if (prediction.length() == "__label__??".length()) {
String isoCode = prediction.substring("__label__".length());
if (permittedLanguages.contains(isoCode)) {
LanguageConfiguration.Language config = languageConfiguration.getLanguage(isoCode);
if (config != null && config.permitted())
return Optional.of(isoCode);
}
}
return Optional.empty();

View File

@@ -1,5 +1,6 @@
package nu.marginalia.language.filter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import org.junit.jupiter.api.Test;
@@ -12,7 +13,7 @@ class LanguageFilterTest {
@Test
void isPageInteresting() {
var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels());
var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels(), new LanguageConfiguration());
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Carlos fue al bosque y recogió bayas")));
assertEquals(Optional.empty(), languageFilter.predictLanguage(new DocumentLanguageData(List.of(), "Charlie est allé dans la forêt et a cueilli des baies")));

View File

@@ -9,6 +9,7 @@ import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawldata.CrawledDocument;
@@ -35,7 +36,7 @@ class PdfDocumentProcessorPluginTest {
static void setUpBeforeClass() throws Exception {
var lm = WmsaHome.getLanguageModels();
plugin = new PdfDocumentProcessorPlugin(255,
new LanguageFilter(lm),
new LanguageFilter(lm, new LanguageConfiguration()),
new ThreadLocalSentenceExtractorProvider(lm),
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
new DocumentLengthLogic(100),

View File

@@ -6,6 +6,7 @@ import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
@@ -35,7 +36,7 @@ import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
public class TermFrequencyExporter implements ExporterIf {
private final FileStorageService storageService;
private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels(), new LanguageConfiguration());
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
@Inject