1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Fix RDRPosTagger back to working order and integrate with SentenceExtractor

This commit is contained in:
Viktor Lofgren
2025-08-24 16:35:58 +02:00
parent 686a40e69b
commit bbf7c5a1cb
4 changed files with 113 additions and 9 deletions

View File

@@ -60,6 +60,9 @@ public class LanguageConfiguration {
public Optional<LanguageDefinition> identifyLanguage(String sample) {
String prediction = fastTextLanguageModel.predict(sample);
System.out.println("prediction: " + prediction);
if (null == prediction)
return Optional.empty();
if (prediction.length() == "__label__??".length()) {
String isoCode = prediction.substring("__label__".length());
@@ -73,6 +76,9 @@ public class LanguageConfiguration {
return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
}
/**
 * Returns the configured language definitions as a new list.
 *
 * The list is a copy of the values held in the {@code languages} map, so
 * callers may modify it without touching the configuration itself.
 *
 * @return a freshly allocated list of the values in {@code languages}
 */
public List<LanguageDefinition> languages() {
    List<LanguageDefinition> snapshot = new ArrayList<>();
    snapshot.addAll(languages.values());
    return snapshot;
}
@Nullable
public LanguageDefinition getLanguage(String language) {
return languages.get(language);

View File

@@ -34,7 +34,7 @@ public class SentenceExtractor {
private final LanguageConfiguration languageConfiguration;
private SentenceDetectorME sentenceDetector;
private static RDRPOSTagger rdrposTagger;
private static HashMap<String, RDRPOSTagger> taggers;
private static NgramLexicon ngramLexicon = null;
@@ -67,11 +67,16 @@ public class SentenceExtractor {
ngramLexicon = new NgramLexicon(models);
}
if (rdrposTagger == null) {
try {
rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
} catch (Exception ex) {
throw new IllegalStateException(ex);
if (taggers == null) {
taggers = new HashMap<>();
for (var langauge : languageConfiguration.languages()) {
try {
var tagger = new RDRPOSTagger( langauge.posTaggingData().dictFilePath, langauge.posTaggingData().rdrFilePath);
taggers.put(langauge.isoCode(), tagger);
}
catch (IOException ex) {
logger.error("Failed to initialize RDRPosTagger for language " + langauge.isoCode());
}
}
}
}
@@ -122,6 +127,17 @@ public class SentenceExtractor {
text);
}
/**
 * Extracts sentences from plain text, identifying the language of the text first.
 *
 * The language is resolved through {@code languageConfiguration.identifyLanguage}
 * with "en" passed as the fallback iso code; the sentences are then extracted
 * with an empty set of HTML tags.
 *
 * @param text the text to split into sentences
 * @return the resolved language, the extracted sentences and the original text
 * @throws RuntimeException if no language definition could be resolved
 */
public DocumentLanguageData extractSentences(String text) {
    var resolved = languageConfiguration.identifyLanguage(text, "en");
    LanguageDefinition language = resolved.orElseThrow(
            () -> new RuntimeException("Language not found for default isoCode 'en'"));

    // Plain text carries no markup, hence the empty tag set.
    EnumSet<HtmlTag> noTags = EnumSet.noneOf(HtmlTag.class);

    var sentences = extractSentencesFromString(language, text, noTags);
    return new DocumentLanguageData(language, sentences, text);
}
public DocumentSentence extractSentence(LanguageDefinition language,
String text,
EnumSet<HtmlTag> htmlTags) {
@@ -158,7 +174,7 @@ public class SentenceExtractor {
return new DocumentSentence(
seps,
lc,
rdrposTagger.tagsForEnSentence(words),
taggers.get(language.isoCode()).tagSentence(words),
stemmed,
htmlTags,
isCapitalized,
@@ -211,7 +227,7 @@ public class SentenceExtractor {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = rdrposTagger.tagsForEnSentence(tokens);
var posTags = taggers.get(language.isoCode()).tagSentence(tokens);
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];

View File

@@ -109,6 +109,70 @@ public class InitialTagger
return Character.isLowerCase(s.charAt(0)) && s.endsWith("s");
}
/**
 * Assigns an initial tag to every token of a sentence.
 *
 * Each token is looked up independently via {@code getTagForWord}; the result
 * has the same length and ordering as {@code sentence}.
 *
 * @param DICT     lexicon mapping words and special keys to tags
 * @param sentence the tokens to tag
 * @return one initial tag per token, in token order
 */
public static String[] InitTagger4Sentence(
        HashMap<String, String> DICT, String[] sentence)
{
    final int tokenCount = sentence.length;
    String[] initialTags = new String[tokenCount];

    int idx = 0;
    for (String token : sentence) {
        initialTags[idx++] = getTagForWord(DICT, token);
    }

    return initialTags;
}
/**
 * Looks up the initial tag for a single token.
 *
 * Resolution order:
 * <ol>
 *   <li>tokens that are a substring of {@code "[]()<>!"} get the tag {@code "?"};</li>
 *   <li>an exact match of the token in {@code DICT};</li>
 *   <li>a match of the lower-cased token in {@code DICT};</li>
 *   <li>if {@code cd(word)} holds, the value stored under {@code "TAG4UNKN-NUM"};</li>
 *   <li>the longest matching suffix key of the form {@code ".*" + suffix},
 *       trying 5, 4, 3 and then 2 trailing characters;</li>
 *   <li>{@code "TAG4UNKN-CAPITAL"} if the first code point is upper case,
 *       otherwise {@code "TAG4UNKN-WORD"}.</li>
 * </ol>
 *
 * Lower-casing uses {@link java.util.Locale#ROOT} so that the dictionary lookup
 * does not depend on the JVM's default locale (with a Turkish default locale,
 * {@code "I".toLowerCase()} yields a dotless i and would miss the entry).
 *
 * @param DICT lexicon mapping words, suffix patterns and the TAG4UNKN-* keys to tags
 * @param word the token to tag
 * @return the tag for the token; may be {@code null} when the relevant
 *         TAG4UNKN-* key is absent from {@code DICT}
 */
private static String getTagForWord(HashMap<String, String> DICT, String word) {
    // Note: this is a substring test, so it also matches the empty string.
    if ("[]()<>!".contains(word)) {
        return "?";
    }

    if (DICT.containsKey(word)) {
        return DICT.get(word);
    }

    String lowerW = word.toLowerCase(java.util.Locale.ROOT);
    if (DICT.containsKey(lowerW)) {
        return DICT.get(lowerW);
    }

    // cd() is defined elsewhere in this class; presumably a numeric-token check,
    // given the key it selects -- confirm against its definition.
    if (cd(word)) {
        return DICT.get("TAG4UNKN-NUM");
    }

    int wordLength = word.length();

    // Longest suffix wins. A suffix of 5 characters is only tried for words of
    // length >= 6, 4 characters for length >= 5, and 3 or 2 characters for
    // length >= 4 -- the same thresholds as before.
    for (int suffixLength = 5; suffixLength >= 2; suffixLength--) {
        int minWordLength = Math.max(4, suffixLength + 1);
        if (wordLength < minWordLength) {
            continue;
        }

        String suffixKey = ".*" + word.substring(wordLength - suffixLength);
        if (DICT.containsKey(suffixKey)) {
            return DICT.get(suffixKey);
        }
    }

    if (Character.isUpperCase(word.codePointAt(0))) {
        return DICT.get("TAG4UNKN-CAPITAL");
    }
    return DICT.get("TAG4UNKN-WORD");
}
public static String[] EnInitTagger4Sentence(
HashMap<String, String> DICT, String[] sentence)
{

View File

@@ -2,7 +2,10 @@ package com.github.datquocnguyen;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.*;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Arrays;
@@ -172,4 +175,19 @@ public class RDRPOSTagger
return tags;
}
/**
 * Tags every token of a sentence.
 *
 * Initial tags are produced by {@code InitialTagger.InitTagger4Sentence} using
 * {@code FREQDICT}; then, for each position, an {@code FWObject} is populated
 * through {@code Utils.getObject} and passed to {@code findFiredNode}, whose
 * result becomes the tag for that position.
 *
 * @param sentence the tokens to tag
 * @return one tag per token, in token order
 */
public String[] tagSentence(String[] sentence)
{
    String[] seedTags = InitialTagger.InitTagger4Sentence(FREQDICT, sentence);
    final int tokenCount = seedTags.length;

    String[] result = new String[tokenCount];

    // A single FWObject instance is reused for every position.
    FWObject context = new FWObject(true);

    int pos = 0;
    while (pos < tokenCount) {
        Utils.getObject(context, sentence, seedTags, tokenCount, pos);
        result[pos] = findFiredNode(context);
        pos++;
    }

    return result;
}
}