mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(language) Fix RDRPosTagger back to working order and integrate with SentenceExtractor
This commit is contained in:
@@ -60,6 +60,9 @@ public class LanguageConfiguration {
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(String sample) {
|
||||
String prediction = fastTextLanguageModel.predict(sample);
|
||||
System.out.println("prediction: " + prediction);
|
||||
if (null == prediction)
|
||||
return Optional.empty();
|
||||
|
||||
if (prediction.length() == "__label__??".length()) {
|
||||
String isoCode = prediction.substring("__label__".length());
|
||||
@@ -73,6 +76,9 @@ public class LanguageConfiguration {
|
||||
return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
|
||||
}
|
||||
|
||||
public List<LanguageDefinition> languages() {
|
||||
return new ArrayList<>(this.languages.values());
|
||||
}
|
||||
@Nullable
|
||||
public LanguageDefinition getLanguage(String language) {
|
||||
return languages.get(language);
|
||||
|
@@ -34,7 +34,7 @@ public class SentenceExtractor {
|
||||
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private SentenceDetectorME sentenceDetector;
|
||||
private static RDRPOSTagger rdrposTagger;
|
||||
private static HashMap<String, RDRPOSTagger> taggers;
|
||||
|
||||
private static NgramLexicon ngramLexicon = null;
|
||||
|
||||
@@ -67,11 +67,16 @@ public class SentenceExtractor {
|
||||
ngramLexicon = new NgramLexicon(models);
|
||||
}
|
||||
|
||||
if (rdrposTagger == null) {
|
||||
try {
|
||||
rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
|
||||
} catch (Exception ex) {
|
||||
throw new IllegalStateException(ex);
|
||||
if (taggers == null) {
|
||||
taggers = new HashMap<>();
|
||||
for (var langauge : languageConfiguration.languages()) {
|
||||
try {
|
||||
var tagger = new RDRPOSTagger( langauge.posTaggingData().dictFilePath, langauge.posTaggingData().rdrFilePath);
|
||||
taggers.put(langauge.isoCode(), tagger);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to initialize RDRPosTagger for language " + langauge.isoCode());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -122,6 +127,17 @@ public class SentenceExtractor {
|
||||
text);
|
||||
}
|
||||
|
||||
|
||||
public DocumentLanguageData extractSentences(String text) {
|
||||
LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
|
||||
.orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));
|
||||
|
||||
var sentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));
|
||||
|
||||
return new DocumentLanguageData(language, sentences, text);
|
||||
}
|
||||
|
||||
|
||||
public DocumentSentence extractSentence(LanguageDefinition language,
|
||||
String text,
|
||||
EnumSet<HtmlTag> htmlTags) {
|
||||
@@ -158,7 +174,7 @@ public class SentenceExtractor {
|
||||
return new DocumentSentence(
|
||||
seps,
|
||||
lc,
|
||||
rdrposTagger.tagsForEnSentence(words),
|
||||
taggers.get(language.isoCode()).tagSentence(words),
|
||||
stemmed,
|
||||
htmlTags,
|
||||
isCapitalized,
|
||||
@@ -211,7 +227,7 @@ public class SentenceExtractor {
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var tokens = wordsAndSeps.words();
|
||||
var separators = wordsAndSeps.separators();
|
||||
var posTags = rdrposTagger.tagsForEnSentence(tokens);
|
||||
var posTags = taggers.get(language.isoCode()).tagSentence(tokens);
|
||||
var tokensLc = new String[tokens.length];
|
||||
var stemmed = new String[tokens.length];
|
||||
|
||||
|
@@ -109,6 +109,70 @@ public class InitialTagger
|
||||
return Character.isLowerCase(s.charAt(0)) && s.endsWith("s");
|
||||
}
|
||||
|
||||
public static String[] InitTagger4Sentence(
|
||||
HashMap<String, String> DICT, String[] sentence)
|
||||
{
|
||||
String[] wordtags = new String[sentence.length];
|
||||
|
||||
for (int i = 0; i < sentence.length; i++) {
|
||||
wordtags[i] = getTagForWord(DICT, sentence[i]);
|
||||
}
|
||||
return wordtags;
|
||||
}
|
||||
|
||||
|
||||
private static String getTagForWord(HashMap<String, String> DICT, String word) {
|
||||
if ("[]()<>!".contains(word)) {
|
||||
return "?";
|
||||
}
|
||||
|
||||
String tag = "";
|
||||
String lowerW = word.toLowerCase();
|
||||
if (DICT.containsKey(word))
|
||||
tag = DICT.get(word);
|
||||
else if (DICT.containsKey(lowerW))
|
||||
tag = DICT.get(lowerW);
|
||||
else {
|
||||
if (cd(word)) {
|
||||
tag = DICT.get("TAG4UNKN-NUM");
|
||||
}
|
||||
else {
|
||||
String suffixL2 = null, suffixL3 = null, suffixL4 = null, suffixL5 = null;
|
||||
|
||||
int wordLength = word.length();
|
||||
if (wordLength >= 4) {
|
||||
suffixL2 = ".*" + word.substring(wordLength - 2);
|
||||
suffixL3 = ".*" + word.substring(wordLength - 3);
|
||||
}
|
||||
if (wordLength >= 5) {
|
||||
suffixL4 = ".*" + word.substring(wordLength - 4);
|
||||
}
|
||||
if (wordLength >= 6) {
|
||||
suffixL5 = ".*" + word.substring(wordLength - 5);
|
||||
}
|
||||
|
||||
if (DICT.containsKey(suffixL5)) {
|
||||
tag = DICT.get(suffixL5);
|
||||
}
|
||||
else if (DICT.containsKey(suffixL4)) {
|
||||
tag = DICT.get(suffixL4);
|
||||
}
|
||||
else if (DICT.containsKey(suffixL3)) {
|
||||
tag = DICT.get(suffixL3);
|
||||
}
|
||||
else if (DICT.containsKey(suffixL2)) {
|
||||
tag = DICT.get(suffixL2);
|
||||
}
|
||||
else if (Character.isUpperCase(word.codePointAt(0)))
|
||||
tag = DICT.get("TAG4UNKN-CAPITAL");
|
||||
else
|
||||
tag = DICT.get("TAG4UNKN-WORD");
|
||||
}
|
||||
}
|
||||
|
||||
return tag;
|
||||
}
|
||||
|
||||
public static String[] EnInitTagger4Sentence(
|
||||
HashMap<String, String> DICT, String[] sentence)
|
||||
{
|
||||
|
@@ -2,7 +2,10 @@ package com.github.datquocnguyen;
|
||||
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
@@ -172,4 +175,19 @@ public class RDRPOSTagger
|
||||
return tags;
|
||||
}
|
||||
|
||||
public String[] tagSentence(String[] sentence)
|
||||
{
|
||||
|
||||
var initialTags = InitialTagger.InitTagger4Sentence(FREQDICT, sentence);
|
||||
|
||||
String[] tags = new String[initialTags.length];
|
||||
FWObject object = new FWObject(true);
|
||||
|
||||
for (int i = 0; i < initialTags.length; i++) {
|
||||
Utils.getObject(object, sentence, initialTags, initialTags.length, i);
|
||||
tags[i] = findFiredNode(object);
|
||||
}
|
||||
|
||||
return tags;
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user