diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java index 5ce8a910a..e815fd0c2 100644 --- a/code/common/config/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/java/nu/marginalia/WmsaHome.java @@ -114,4 +114,7 @@ public class WmsaHome { } + /** Returns the path "conf/languages.xml" resolved against getHomePath(); existence of the file is not checked here. NOTE(review): the method name misspells "Language" - it is kept as-is because callers reference it. */ public static Path getLangugeConfig() { + return getHomePath().resolve("conf/languages.xml"); + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/config/LanguageConfiguration.java b/code/libraries/language-processing/java/nu/marginalia/language/config/LanguageConfiguration.java index 11890e642..976d2d8b8 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/config/LanguageConfiguration.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/config/LanguageConfiguration.java @@ -111,6 +111,19 @@ public class LanguageConfiguration { logger.info("Loaded language configuration: {}", languages); } + /** Opens the language configuration as a stream. Lookup order: (1) the file at WmsaHome.getLangugeConfig() if it exists on disk; (2) otherwise the classpath resource "languages-experimental.xml" when the system property "language.experimental" is true; (3) otherwise the classpath resource "languages-default.xml". The returned stream is not closed by this method. NOTE(review): ClassLoader.getSystemResourceAsStream returns null when the resource cannot be found, so this method can return null - confirm the caller handles that, and confirm "languages-default.xml" is actually packaged on the classpath. */ InputStream findLanguageConfiguration() throws IOException { + Path filesystemPath = WmsaHome.getLangugeConfig(); + if (Files.exists(filesystemPath)) { + return Files.newInputStream(filesystemPath, StandardOpenOption.READ); + } + if (Boolean.getBoolean("language.experimental")) { + return ClassLoader.getSystemResourceAsStream("languages-experimental.xml"); + } + else { + return ClassLoader.getSystemResourceAsStream("languages-default.xml"); + } + } + private void parseLanguages(Document doc) { NodeList languageNodes = doc.getElementsByTagName("language"); diff --git a/code/libraries/language-processing/resources/languages-experimental.xml b/code/libraries/language-processing/resources/languages-experimental.xml new file mode 100644 index 000000000..d2268e3ac --- /dev/null +++ b/code/libraries/language-processing/resources/languages-experimental.xml @@ -0,0 +1,135 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]> + + + + + + !(IN TO CC DT) + + + + + + NNP* + NNP* NNP* + NNP* (NNP* IN 
DT CC) NNP* + NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP* + + + VBG + RB VBG + (NNP* JJ) + (NN* JJ) NN* + (NN* JJ) (NN* JJ) NN* + (NN* JJ) (NN* JJ) (NN* JJ) NN* + (NNP* JJ) (NNP* IN TO CC) NNP* + (NNP* JJ) (NNP* IN TO CC) DT NNP* + (NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP* + + + (VBD VBZ) + MD VB + VBZ DT + (DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ) + + + !(CC IN DT TO) + !CC !(IN DT TO) + !CC * !(IN DT TO) + !CC * * !(IN DT TO) + + + + (N* VBG VBN JJ* R* VBG) + + (N* VBG VBN JJ* R* VBG) (N* VBG VBN) + (N* VBG VBN) CD + + (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN) + NNP* (IN TO CC NNP*) (N* VBG VBN) + (N* VBG VBN) (N* VBG VBN) CD + + (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN) + NNP* (DT IN TO CC) (IN TO CC) NNP* + + + + + + + + + + + PROPN + PROPN PROPN + PROPN PROPN PROPN + PROPN PROPN PROPN PROPN + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/code/libraries/language-processing/resources/languages.xml b/code/libraries/language-processing/resources/languages.xml index d2268e3ac..a3255e477 100644 --- a/code/libraries/language-processing/resources/languages.xml +++ b/code/libraries/language-processing/resources/languages.xml @@ -101,35 +101,9 @@ (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN) NNP* (DT IN TO CC) (IN TO CC) NNP* + - - - - - - - - - PROPN - PROPN PROPN - PROPN PROPN PROPN - PROPN PROPN PROPN PROPN - - - - - - - - - - - - - - - \ No newline at end of file