1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Update documentation for the language processing function

This commit is contained in:
Viktor Lofgren
2025-10-04 11:20:24 +02:00
parent b7d3b67a1d
commit 1ed1f2f299
3 changed files with 28 additions and 6 deletions

View File

@@ -2,6 +2,7 @@ plugins {
id 'java'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
id 'application'
}
java {
@@ -9,6 +10,10 @@ java {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Settings for the Gradle 'application' plugin: names the class whose main
// method is launched and the name given to the application.
application {
mainClass = 'nu.marginalia.language.LanguageProcessingTool'
applicationName = 'language-processing-tool'
}
apply from: "$rootProject.projectDir/srcsets.gradle"

View File

@@ -28,6 +28,7 @@ public class LanguageProcessingTool extends Jooby {
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final TermFrequencyDict termFrequencyDict;
/**
 * Program entry point. Passes the command line arguments to
 * {@code Jooby.runApp} together with a constructor reference that is
 * used to create the {@code LanguageProcessingTool} application instance.
 *
 * @param args command line arguments, forwarded unchanged to Jooby
 */
static void main(String[] args) {
Jooby.runApp(args, LanguageProcessingTool::new);
}
@@ -41,7 +42,14 @@ public class LanguageProcessingTool extends Jooby {
new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
languageModels
);
// Depending on how the tool is started, we may be in the project root, or the module root;
// so here's some guesswork to try to suss out which one it is...
Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
if (!Files.exists(basePath)) {
basePath = Path.of(".").toAbsolutePath();
}
System.out.println("Base path: " + basePath);
if (Files.exists(basePath.resolve("resources/ltt/jte")))

View File

@@ -1,14 +1,23 @@
# Language Processing
This library contains various tools used in language processing.
This function gathers various tools used in language processing,
keyword extraction, and so on.
## Language Processing Tool
It also houses a tool for inspecting the output of keyword extraction,
which can be accessed by running the command below from the root of the project.
The tool becomes accessible on port 8080.
```bash
$ ./gradlew :code:functions:language-processing:run
```
## Central Classes
* [SentenceExtractor](java/nu/marginalia/language/sentence/SentenceExtractor.java) -
Creates a [DocumentLanguageData](java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
its words, how they stem, POS tags, and so on.
## See Also
[converting-process/ft-keyword-extraction](../../processes/converting-process/ft-keyword-extraction) uses this code to identify which keywords
are important.
* [LanguageConfiguration](java/nu/marginalia/language/config/LanguageConfiguration.java) - parses language configuration XML files into LanguageDefinition objects
* [LanguageDefinition](java/nu/marginalia/language/model/LanguageDefinition.java) - holds all per-language customizations that are fed into the language processing pipeline
* [DocumentKeywordExtractor](java/nu/marginalia/keyword/DocumentKeywordExtractor.java) - extracts keywords from documents