mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(language) Make language configuration configurable
This commit is contained in:
@@ -114,4 +114,7 @@ public class WmsaHome {
|
||||
}
|
||||
|
||||
|
||||
public static Path getLangugeConfig() {
|
||||
return getHomePath().resolve("conf/languages.xml");
|
||||
}
|
||||
}
|
||||
|
@@ -111,6 +111,19 @@ public class LanguageConfiguration {
|
||||
logger.info("Loaded language configuration: {}", languages);
|
||||
}
|
||||
|
||||
InputStream findLanguageConfiguration() throws IOException {
|
||||
Path filesystemPath = WmsaHome.getLangugeConfig();
|
||||
if (Files.exists(filesystemPath)) {
|
||||
return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
|
||||
}
|
||||
if (Boolean.getBoolean("language.experimental")) {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
}
|
||||
else {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
|
||||
private void parseLanguages(Document doc) {
|
||||
NodeList languageNodes = doc.getElementsByTagName("language");
|
||||
|
||||
|
@@ -0,0 +1,135 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*,resource*)>
|
||||
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
|
||||
<!ELEMENT resource EMPTY>
|
||||
<!ATTLIST resource
|
||||
id ID #REQUIRED
|
||||
md5 CDATA #REQUIRED
|
||||
path CDATA #REQUIRED
|
||||
href CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ATTLIST language
|
||||
isoCode ID #REQUIRED
|
||||
name CDATA #REQUIRED
|
||||
display (rtl|ltr) #REQUIRED
|
||||
disabled (true|false) "false"
|
||||
>
|
||||
|
||||
<!ELEMENT unicodeNormalization EMPTY>
|
||||
<!ATTLIST unicodeNormalization
|
||||
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT stemmer (pospattern?)>
|
||||
<!ATTLIST stemmer
|
||||
algorithm (porter|snowball|none) #REQUIRED
|
||||
variant CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT keywordHash (#PCDATA)>
|
||||
<!ATTLIST keywordHash
|
||||
algorithm (asciish|utf8) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT rdrTagger EMPTY>
|
||||
<!ATTLIST rdrTagger
|
||||
dictId IDREF #REQUIRED
|
||||
rdrId IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT ngrams (pospattern*)>
|
||||
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
|
||||
|
||||
<!ELEMENT pospattern (#PCDATA)>
|
||||
|
||||
<!ELEMENT sentenceDetector EMPTY>
|
||||
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
|
||||
]>
|
||||
|
||||
<languages>
|
||||
<language isoCode="en" name="English" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="porter">
|
||||
<pospattern>!(IN TO CC DT)</pospattern>
|
||||
</stemmer>
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="maximal-latin" />
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
<ngrams type="name">
|
||||
<pospattern>NNP*</pospattern>
|
||||
<pospattern>NNP* NNP*</pospattern>
|
||||
<pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
|
||||
<pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="noun">
|
||||
<pospattern>VBG</pospattern>
|
||||
<pospattern>RB VBG</pospattern>
|
||||
<pospattern>(NNP* JJ)</pospattern>
|
||||
<pospattern>(NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="subject-suffix">
|
||||
<pospattern>(VBD VBZ)</pospattern>
|
||||
<pospattern>MD VB</pospattern>
|
||||
<pospattern>VBZ DT</pospattern>
|
||||
<pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="title">
|
||||
<pospattern>!(CC IN DT TO)</pospattern>
|
||||
<pospattern>!CC !(IN DT TO)</pospattern>
|
||||
<pospattern>!CC * !(IN DT TO)</pospattern>
|
||||
<pospattern>!CC * * !(IN DT TO)</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="keyword">
|
||||
<!-- length = 1 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
|
||||
<!-- length = 2 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>(N* VBG VBN) CD</pospattern>
|
||||
<!-- length = 3 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
|
||||
<pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
|
||||
<!-- length = 4 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
|
||||
</language>
|
||||
<language isoCode="sv" name="Swedish" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="SWEDISH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="e-accents" />
|
||||
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
|
||||
<ngrams type="name">
|
||||
<pospattern>PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN PROPN PROPN</pospattern>
|
||||
</ngrams>
|
||||
</language>
|
||||
<language isoCode="fr" name="French" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="FRENCH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
</language>
|
||||
<language isoCode="de" name="German" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<!-- German must use the German Snowball variant; the earlier FRENCH value appears to have been copied from the French entry. -->
<stemmer algorithm="snowball" variant="GERMAN" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="german" />
|
||||
</language>
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
|
||||
<resource id="pos-rdr-sv" md5="" path="rdr/Swedish.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.RDR" />
|
||||
|
||||
</languages>
|
@@ -101,35 +101,9 @@
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
</language>
|
||||
|
||||
</language>
|
||||
<language isoCode="sv" name="Swedish" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="SWEDISH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="e-accents" />
|
||||
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
|
||||
<ngrams type="name">
|
||||
<pospattern>PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN PROPN</pospattern>
|
||||
<pospattern>PROPN PROPN PROPN PROPN</pospattern>
|
||||
</ngrams>
|
||||
</language>
|
||||
<language isoCode="fr" name="French" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="FRENCH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
</language>
|
||||
<language isoCode="de" name="German" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="FRENCH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="german" />
|
||||
</language>
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
|
||||
<resource id="pos-rdr-sv" md5="" path="rdr/Swedish.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.RDR" />
|
||||
|
||||
</languages>
|
Reference in New Issue
Block a user