Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-05 21:22:39 +02:00

Compare commits: detached ... f1a71e9033 (10 commits)

* f1a71e9033
* 7b525918c9
* 0f3aede66f
* 88236f3836
* ad31a22fbb
* 2785ae8241
* 1ed1f2f299
* b7d3b67a1d
* d28010b7e6
* 2689bd9eaa
@@ -2,6 +2,7 @@ plugins {
    id 'java'
    id 'jvm-test-suite'
    id 'gg.jte.gradle' version '3.1.15'
+   id 'application'
}

java {
@@ -9,6 +10,10 @@ java {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

+application {
+    mainClass = 'nu.marginalia.language.LanguageProcessingTool'
+    applicationName = 'language-processing-tool'
+}

apply from: "$rootProject.projectDir/srcsets.gradle"
@@ -28,6 +28,7 @@ public class LanguageProcessingTool extends Jooby {
    private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
    private final TermFrequencyDict termFrequencyDict;

    static void main(String[] args) {
        Jooby.runApp(args, LanguageProcessingTool::new);
    }
@@ -41,7 +42,14 @@ public class LanguageProcessingTool extends Jooby {
                new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
                languageModels
        );

+       // Depending on how the tool is started, we may be in the project root, or the module root;
+       // so here's some guesswork to try to suss out which one it is...
+       Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
+       if (!Files.exists(basePath)) {
+           basePath = Path.of(".").toAbsolutePath();
+       }

+       System.out.println("Base path: " + basePath);

        if (Files.exists(basePath.resolve("resources/ltt/jte")))
@@ -1,14 +1,31 @@
# Language Processing

-This library contains various tools used in language processing.
+This function gathers various tools used in language processing,
+keyword extraction, and so on.

## Language Configuration

The files [resources/languages-default.xml](resources/languages-default.xml) and [resources/languages-experimental.xml](resources/languages-experimental.xml) hold the language definitions used by the search engine;
the former is used in production and the latter in most tests that require language processing.

The search engine excludes any languages not configured in these files, though it is relatively easy to define a stub
configuration that gets a simpler behavior out of the search engine.

## Language Processing Tool

It also houses a tool for inspecting the output of keyword extraction,
which can be accessed by running the command below from the root of the project.
The tool becomes accessible on port 8080.

```bash
$ ./gradlew :code:functions:language-processing:run
```

## Central Classes

* [SentenceExtractor](java/nu/marginalia/language/sentence/SentenceExtractor.java) -
Creates a [DocumentLanguageData](java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
its words, how they stem, POS tags, and so on.

## See Also

[converting-process/ft-keyword-extraction](../../processes/converting-process/ft-keyword-extraction) uses this code to identify which keywords
are important.

* [LanguageConfiguration](java/nu/marginalia/language/config/LanguageConfiguration.java) - parses language configuration xml files into LanguageDefinition objects
* [LanguageDefinition](java/nu/marginalia/language/model/LanguageDefinition.java) - holds all per-language customizations that are fed into the language processing pipeline
* [DocumentKeywordExtractor](java/nu/marginalia/keyword/DocumentKeywordExtractor.java) - extracts keywords from documents
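The README above describes the central classes individually but does not show them working together. Below is a minimal sketch of the flow it describes, from raw text to a DocumentLanguageData and then to keywords. The class names and import paths follow the README's links; the method names (`extractSentences`, `extractKeywords`) and their parameters are assumptions for illustration only and may not match the actual signatures.

```java
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;

class KeywordExtractionSketch {
    // Sketch only: method names and parameters below are assumed, not taken from the diff.
    static void demo(SentenceExtractor sentenceExtractor,
                     DocumentKeywordExtractor keywordExtractor,
                     String title,
                     String body) {
        // SentenceExtractor turns text into DocumentLanguageData: the document's
        // words, how they stem, POS tags, and so on (as described in the README).
        DocumentLanguageData dld = sentenceExtractor.extractSentences(body, title);

        // DocumentKeywordExtractor then identifies which keywords are important;
        // converting-process/ft-keyword-extraction builds on this (per "See Also").
        var keywords = keywordExtractor.extractKeywords(dld);

        System.out.println(keywords);
    }
}
```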
@@ -123,7 +123,7 @@
</language>
<language isoCode="de" name="German" display="ltr">
    <keywordHash algorithm="asciish" />
-   <stemmer algorithm="snowball" variant="FRENCH" />
+   <stemmer algorithm="snowball" variant="GERMAN" />
    <sentenceDetector algorithm="opennlp"/>
    <unicodeNormalization algorithm="german" />
</language>
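The hunk above edits one `<language>` entry in the XML files described in the README's Language Configuration section. For orientation, here is a rough sketch of loading that configuration programmatically. The `LanguageConfiguration` constructor call mirrors the one visible in the `LanguageProcessingTool` hunk earlier in this diff; the import paths for `LanguageConfigLocation` and `LanguageModels`, the way a `LanguageModels` instance is obtained, and the `getLanguage` accessor are assumptions for illustration.

```java
import nu.marginalia.LanguageModels;                       // package assumed, not shown in this diff
import nu.marginalia.language.config.LanguageConfigLocation; // package assumed, alongside LanguageConfiguration
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;

class LanguageConfigSketch {
    // Sketch only: the constructor call mirrors the LanguageProcessingTool diff above;
    // LanguageModels is passed in because this diff does not show how it is created,
    // and getLanguage(...) is a hypothetical accessor.
    static LanguageDefinition loadGerman(LanguageModels languageModels) throws Exception {
        var config = new LanguageConfiguration(
                languageModels,
                new LanguageConfigLocation.Experimental()); // languages-experimental.xml, per the README

        // The README states the XML is parsed into LanguageDefinition objects, which hold
        // the per-language customizations (stemmer, sentence detector, and so on).
        return config.getLanguage("de");
    }
}
```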
@@ -92,6 +92,13 @@ public class DomainTestingQueue {
    }

    public void fetch() {
+       try (var conn = dataSource.getConnection()) {
+           refreshQueue(conn);
+       } catch (Exception e) {
+           logger.error("Error refreshing the ndp queue");
+           throw new RuntimeException(e);
+       }
+
        while (true) {
            List<DomainToTest> domains = new ArrayList<>(2000);
            try (var conn = dataSource.getConnection();
@@ -126,6 +133,7 @@ public class DomainTestingQueue {
                throw e; // Rethrow runtime exceptions to avoid wrapping them in another runtime exception
            }
            catch (Exception e) {
+               logger.error("Error in ndp process");
                throw new RuntimeException("Failed to fetch domains from database", e);
            }
@@ -193,7 +201,8 @@ public class DomainTestingQueue {

        /* Insert new domains into NDP_NEW_DOMAINS table */
        try (var insertStmt = conn.prepareStatement("""
-               INSERT IGNORE INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
+               INSERT INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
+               ON DUPLICATE KEY UPDATE PRIORITY = VALUES(PRIORITY)
                """)) {
            conn.setAutoCommit(false);

@@ -228,7 +237,10 @@ public class DomainTestingQueue {
        // This acts not only to clean up domains that we've flagged as ACCEPTED, but also to
        // repair inconsistent states where domains might have incorrectly been added to NDP_NEW_DOMAINS
        try (var stmt = conn.createStatement()) {
            conn.setAutoCommit(false);
            stmt.executeUpdate("DELETE FROM NDP_NEW_DOMAINS WHERE DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0)");
            stmt.executeUpdate("UPDATE NDP_NEW_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NDP_NEW_DOMAINS.DOMAIN_ID SET PRIORITY=1 WHERE DOMAIN_TOP='tumblr.com'");
            conn.commit();
        }
        catch (Exception e) {
            throw new RuntimeException("Failed to clean up NDP_NEW_DOMAINS", e);
@@ -58,7 +58,6 @@ public class NdpMain extends ProcessMainClass {

    public void run(int goalCount) throws InterruptedException {
        logger.info("Wait for blacklist to load...");
        domainBlacklist.waitUntilLoaded();

        SimpleBlockingThreadPool threadPool = new SimpleBlockingThreadPool(
@@ -89,4 +89,6 @@ tasks.register('paperDoll', Test) {
        includeTags "paperdoll"
    }
    jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
    classpath = sourceSets.test.runtimeClasspath
    testClassesDirs = sourceSets.test.output.classesDirs
}
@@ -135,4 +135,6 @@ tasks.register('paperDoll', Test) {
        includeTags "paperdoll"
    }
    jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
    classpath = sourceSets.test.runtimeClasspath
    testClassesDirs = sourceSets.test.output.classesDirs
}
@@ -79,9 +79,9 @@
<div class="mt-8 flex justify-center space-x-2 font-mono text-sm">
    @for(ResultsPage page : results.getResultPages())
        @if (page.current())
-           <a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 border dark:border-gray-600 border-gray-300 bg-gray-100 dark:bg-gray-900">${page.number()}</a>
+           <a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 border dark:border-gray-600 border-gray-400 bg-gray-200 dark:bg-gray-900">${page.number()}</a>
        @else
-           <a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 bg-white border dark:border-gray-600 border-gray-300 hover:bg-gray-100 dark:bg-gray-800 hover:bg-gray-900">${page.number()}</a>
+           <a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 bg-white border dark:border-gray-600 border-gray-300 hover:bg-gray-100 dark:bg-gray-800 dark:hover:bg-gray-900">${page.number()}</a>
        @endif
    @endfor
</div>