(language) Fix RDRPosTagger back to working order and integrate with SentenceExtractor

2025-10-05 21:22:39 +02:00 · 2025-08-24 16:35:58 +02:00
parent 686a40e69b
commit bbf7c5a1cb
4 changed files with 113 additions and 9 deletions
--- a/code/libraries/language-processing/java/nu/marginalia/language/config/LanguageConfiguration.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/config/LanguageConfiguration.java
@@ -60,6 +60,9 @@ public class LanguageConfiguration {

    public Optional<LanguageDefinition> identifyLanguage(String sample) {
        String prediction = fastTextLanguageModel.predict(sample);
+        // predict() can return null; report "no language identified" before prediction.length() is evaluated below.
+        if (null == prediction)
+            return Optional.empty();

        if (prediction.length() == "__label__??".length()) {
            String isoCode = prediction.substring("__label__".length());
@@ -73,6 +76,9 @@ public class LanguageConfiguration {
        return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
    }

+    /** Returns a new mutable list of every configured language definition; adding or removing entries does not touch the internal map. */
+    public List<LanguageDefinition> languages() { return new ArrayList<>(languages.values()); }
+
    @Nullable
    public LanguageDefinition getLanguage(String language) {
        return languages.get(language);
--- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
@@ -34,7 +34,7 @@ public class SentenceExtractor {

    private final LanguageConfiguration languageConfiguration;
    private SentenceDetectorME sentenceDetector;
-    private static RDRPOSTagger rdrposTagger;
+    private static HashMap<String, RDRPOSTagger> taggers;

    private static NgramLexicon ngramLexicon = null;

@@ -67,11 +67,16 @@ public class SentenceExtractor {
                ngramLexicon = new NgramLexicon(models);
            }

-            if (rdrposTagger == null) {
-                try {
-                    rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
-                } catch (Exception ex) {
-                    throw new IllegalStateException(ex);
+            if (taggers == null) {
+                taggers = new HashMap<>();
+                for (var languageDef : languageConfiguration.languages()) {
+                    try {
+                        var tagger = new RDRPOSTagger(languageDef.posTaggingData().dictFilePath, languageDef.posTaggingData().rdrFilePath);
+                        taggers.put(languageDef.isoCode(), tagger);
+                    } catch (IOException ex) {
+                        // Log the cause as well; after a failure this language has no entry in taggers.
+                        logger.error("Failed to initialize RDRPosTagger for language " + languageDef.isoCode(), ex);
+                    }
+                }
+            }
        }
@@ -122,6 +127,17 @@ public class SentenceExtractor {
                text);
    }

+
+    /** Identifies the language of {@code text}, falling back to "en", and splits it into sentences tagged with no HTML tags. */
+    public DocumentLanguageData extractSentences(String text) {
+        final var detected = languageConfiguration.identifyLanguage(text, "en");
+        final LanguageDefinition language = detected
+                .orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));
+        final EnumSet<HtmlTag> noTags = EnumSet.noneOf(HtmlTag.class);
+        return new DocumentLanguageData(language, extractSentencesFromString(language, text, noTags), text);
+    }
+
+
    public DocumentSentence extractSentence(LanguageDefinition language,
                                            String text,
                                            EnumSet<HtmlTag> htmlTags) {
@@ -158,7 +174,7 @@ public class SentenceExtractor {
        return new DocumentSentence(
                seps,
                lc,
-                rdrposTagger.tagsForEnSentence(words),
+                taggers.get(language.isoCode()).tagSentence(words),
                stemmed,
                htmlTags,
                isCapitalized,
@@ -211,7 +227,7 @@ public class SentenceExtractor {
                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var tokens = wordsAndSeps.words();
                var separators = wordsAndSeps.separators();
-                var posTags = rdrposTagger.tagsForEnSentence(tokens);
+                var posTags = taggers.get(language.isoCode()).tagSentence(tokens);
                var tokensLc = new String[tokens.length];
                var stemmed = new String[tokens.length];

--- a/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/InitialTagger.java
+++ b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/InitialTagger.java
@@ -109,6 +109,70 @@ public class InitialTagger
 		return Character.isLowerCase(s.charAt(0)) && s.endsWith("s");
 	}

+    /** Returns the initial tag of every token in {@code sentence}, index-aligned, as chosen by {@code getTagForWord} from DICT. */
+    public static String[] InitTagger4Sentence(HashMap<String, String> DICT, String[] sentence) {
+        final String[] initialTags = new String[sentence.length];
+        int position = 0;
+        for (String token : sentence) {
+            initialTags[position] = getTagForWord(DICT, token);
+            position++;
+        }
+        return initialTags;
+    }
+
+
+    /**
+     * Chooses the initial tag for one token by consulting DICT.
+     * <p>
+     * Lookup order: the exact form, the lower-cased form, the {@code TAG4UNKN-NUM} entry when
+     * {@code cd(word)} is true, the longest matching {@code ".*" + suffix} entry (5, 4, 3, then 2
+     * trailing characters), and finally {@code TAG4UNKN-CAPITAL} or {@code TAG4UNKN-WORD}.
+     *
+     * @param DICT maps words, {@code ".*suffix"} patterns and {@code TAG4UNKN-*} keys to tags
+     * @param word the token to tag
+     * @return the tag; {@code "?"} for tokens found inside {@code "[]()<>!"};
+     *         {@code null} if the chosen fallback key is absent from DICT
+     */
+    private static String getTagForWord(HashMap<String, String> DICT, String word) {
+        // Substring test: it also matches the empty string, which keeps codePointAt(0) below from throwing.
+        if ("[]()<>!".contains(word)) {
+            return "?";
+        }
+
+        if (DICT.containsKey(word)) {
+            return DICT.get(word);
+        }
+
+        // Locale.ROOT: the default-locale toLowerCase() maps 'I' to a dotless i under e.g. a Turkish JVM locale,
+        // which would make dictionary hits depend on where the code runs.
+        String lowerW = word.toLowerCase(java.util.Locale.ROOT);
+        if (DICT.containsKey(lowerW)) {
+            return DICT.get(lowerW);
+        }
+
+        if (cd(word)) {
+            return DICT.get("TAG4UNKN-NUM");
+        }
+
+        // Longest suffix first. A suffix of k characters is only tried on words longer than k,
+        // and no suffix at all is tried on words shorter than 4 characters.
+        int wordLength = word.length();
+        for (int suffixLength = 5; suffixLength >= 2; suffixLength--) {
+            if (wordLength < Math.max(suffixLength + 1, 4)) {
+                continue;
+            }
+            String suffixKey = ".*" + word.substring(wordLength - suffixLength);
+            if (DICT.containsKey(suffixKey)) {
+                return DICT.get(suffixKey);
+            }
+        }
+
+        if (Character.isUpperCase(word.codePointAt(0))) {
+            return DICT.get("TAG4UNKN-CAPITAL");
+        }
+        return DICT.get("TAG4UNKN-WORD");
+    }
+
 	public static String[] EnInitTagger4Sentence(
 		HashMap<String, String> DICT, String[] sentence)
 	{
--- a/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java
+++ b/third-party/rdrpostagger/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java
@@ -2,7 +2,10 @@ package com.github.datquocnguyen;

 import gnu.trove.map.hash.TObjectIntHashMap;

-import java.io.*;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.util.Arrays;
@@ -172,4 +175,19 @@ public class RDRPOSTagger
 		return tags;
 	}

+    /**
+     * Tags a tokenized sentence: seeds every token with {@code InitialTagger.InitTagger4Sentence(FREQDICT, sentence)},
+     * then takes the final tag for each position from {@code findFiredNode}. The result is index-aligned with the input.
+     */
+    public String[] tagSentence(String[] sentence) {
+        final String[] seedTags = InitialTagger.InitTagger4Sentence(FREQDICT, sentence);
+        final int tokenCount = seedTags.length;
+        final String[] finalTags = new String[tokenCount];
+        final FWObject context = new FWObject(true);
+        for (int position = 0; position < tokenCount; position++) {
+            Utils.getObject(context, sentence, seedTags, tokenCount, position);
+            finalTags[position] = findFiredNode(context);
+        }
+        return finalTags;
+    }
 }