1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

10 Commits

Author SHA1 Message Date
Viktor Lofgren
f1a71e9033 (ndp) Deprioritize tumblr in the visitation order 2025-10-05 12:17:46 +02:00
Viktor Lofgren
7b525918c9 (ndp) Deprioritize tumblr in the visitation order 2025-10-05 12:16:05 +02:00
Viktor Lofgren
0f3aede66f (ndp) Clean up code 2025-10-05 11:56:41 +02:00
Viktor Lofgren
88236f3836 (ndp) Use mariadb syntax instead of sqlite syntax when querying mariadb 2025-10-05 11:56:31 +02:00
Viktor Lofgren
ad31a22fbb (ndp) Refresh the ndp queue on restart 2025-10-05 10:32:05 +02:00
Viktor Lofgren
2785ae8241 (language) Further amend the docs to mention the language configuration files 2025-10-05 09:04:12 +02:00
Viktor Lofgren
1ed1f2f299 (language) Update documentation for the language processing function 2025-10-04 11:20:24 +02:00
Viktor Lofgren
b7d3b67a1d (language) Fix language configuration stub for German to not use French stemming 2025-10-02 10:15:30 +02:00
Viktor Lofgren
d28010b7e6 (search) Fix pagination in light mode 2025-10-02 09:04:49 +02:00
Viktor Lofgren
2689bd9eaa (chore) Update to Java 25
Unbreak test suites
2025-10-02 09:04:25 +02:00
9 changed files with 56 additions and 11 deletions

View File

@@ -2,6 +2,7 @@ plugins {
id 'java'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
id 'application'
}
java {
@@ -9,6 +10,10 @@ java {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
application {
mainClass = 'nu.marginalia.language.LanguageProcessingTool'
applicationName = 'language-processing-tool'
}
apply from: "$rootProject.projectDir/srcsets.gradle"

View File

@@ -28,6 +28,7 @@ public class LanguageProcessingTool extends Jooby {
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final TermFrequencyDict termFrequencyDict;
static void main(String[] args) {
Jooby.runApp(args, LanguageProcessingTool::new);
}
@@ -41,7 +42,14 @@ public class LanguageProcessingTool extends Jooby {
new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
languageModels
);
// Depending on how the tool is started, we may be in the project root, or the module root;
// so here's some guesswork to try to suss out which one it is...
Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
if (!Files.exists(basePath)) {
basePath = Path.of(".").toAbsolutePath();
}
System.out.println("Base path: " + basePath);
if (Files.exists(basePath.resolve("resources/ltt/jte")))

View File

@@ -1,14 +1,31 @@
# Language Processing
This library contains various tools used in language processing.
This function gathers various tools used in language processing,
keyword extraction, and so on.
## Language Configuration
The files [resources/languages-default.xml](resources/languages-default.xml) and [resources/languages-experimental.xml](resources/languages-experimental.xml) hold the language definitions used by the search engine;
the former is used in production and the latter in most tests that require language processing.
The search engine excludes any languages not configured in these files, though it is relatively easy to define a stub
configuration that gets a simpler behavior out of the search engine.
## Language Processing Tool
It also houses a tool for inspecting the output of keyword extraction,
which can be accessed by running the command below from the root of the project.
The tool becomes accessible on port 8080.
```bash
$ ./gradlew :code:functions:language-processing:run
```
## Central Classes
* [SentenceExtractor](java/nu/marginalia/language/sentence/SentenceExtractor.java) -
Creates a [DocumentLanguageData](java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
its words, how they stem, POS tags, and so on.
## See Also
[converting-process/ft-keyword-extraction](../../processes/converting-process/ft-keyword-extraction) uses this code to identify which keywords
are important.
* [LanguageConfiguration](java/nu/marginalia/language/config/LanguageConfiguration.java) - parses language configuration XML files into LanguageDefinition objects
* [LanguageDefinition](java/nu/marginalia/language/model/LanguageDefinition.java) - holds all per-language customizations that are fed into the language processing pipeline
* [DocumentKeywordExtractor](java/nu/marginalia/keyword/DocumentKeywordExtractor.java) - extracts keywords from documents

View File

@@ -123,7 +123,7 @@
</language>
<language isoCode="de" name="German" display="ltr">
<keywordHash algorithm="asciish" />
<stemmer algorithm="snowball" variant="FRENCH" />
<stemmer algorithm="snowball" variant="GERMAN" />
<sentenceDetector algorithm="opennlp"/>
<unicodeNormalization algorithm="german" />
</language>

View File

@@ -92,6 +92,13 @@ public class DomainTestingQueue {
}
public void fetch() {
try (var conn = dataSource.getConnection()) {
refreshQueue(conn);
} catch (Exception e) {
logger.error("Error refreshing the ndp queue");
throw new RuntimeException(e);
}
while (true) {
List<DomainToTest> domains = new ArrayList<>(2000);
try (var conn = dataSource.getConnection();
@@ -126,6 +133,7 @@ public class DomainTestingQueue {
throw e; // Rethrow runtime exceptions to avoid wrapping them in another runtime exception
}
catch (Exception e) {
logger.error("Error in ndp process");
throw new RuntimeException("Failed to fetch domains from database", e);
}
@@ -193,7 +201,8 @@ public class DomainTestingQueue {
/* Insert new domains into NDP_NEW_DOMAINS table */
try (var insertStmt = conn.prepareStatement("""
INSERT IGNORE INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
INSERT INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
ON DUPLICATE KEY UPDATE PRIORITY = VALUES(PRIORITY)
""")) {
conn.setAutoCommit(false);
@@ -228,7 +237,10 @@ public class DomainTestingQueue {
// This acts not only to clean up domains that we've flagged as ACCEPTED, but also to
// repair inconsistent states where domains might have incorrectly been added to NDP_NEW_DOMAINS
try (var stmt = conn.createStatement()) {
conn.setAutoCommit(false);
stmt.executeUpdate("DELETE FROM NDP_NEW_DOMAINS WHERE DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0)");
stmt.executeUpdate("UPDATE NDP_NEW_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NDP_NEW_DOMAINS.DOMAIN_ID SET PRIORITY=1 WHERE DOMAIN_TOP='tumblr.com'");
conn.commit();
}
catch (Exception e) {
throw new RuntimeException("Failed to clean up NDP_NEW_DOMAINS", e);

View File

@@ -58,7 +58,6 @@ public class NdpMain extends ProcessMainClass {
public void run(int goalCount) throws InterruptedException {
logger.info("Wait for blacklist to load...");
domainBlacklist.waitUntilLoaded();
SimpleBlockingThreadPool threadPool = new SimpleBlockingThreadPool(

View File

@@ -89,4 +89,6 @@ tasks.register('paperDoll', Test) {
includeTags "paperdoll"
}
jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
classpath = sourceSets.test.runtimeClasspath
testClassesDirs = sourceSets.test.output.classesDirs
}

View File

@@ -135,4 +135,6 @@ tasks.register('paperDoll', Test) {
includeTags "paperdoll"
}
jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
classpath = sourceSets.test.runtimeClasspath
testClassesDirs = sourceSets.test.output.classesDirs
}

View File

@@ -79,9 +79,9 @@
<div class="mt-8 flex justify-center space-x-2 font-mono text-sm">
@for(ResultsPage page : results.getResultPages())
@if (page.current())
<a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 border dark:border-gray-600 border-gray-300 bg-gray-100 dark:bg-gray-900">${page.number()}</a>
<a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 border dark:border-gray-600 border-gray-400 bg-gray-200 dark:bg-gray-900">${page.number()}</a>
@else
<a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 bg-white border dark:border-gray-600 border-gray-300 hover:bg-gray-100 dark:bg-gray-800 hover:bg-gray-900">${page.number()}</a>
<a href="${results.getParams().withPage(page.number()).renderUrl()}" class="px-2 py-1 bg-white border dark:border-gray-600 border-gray-300 hover:bg-gray-100 dark:bg-gray-800 dark:hover:bg-gray-900">${page.number()}</a>
@endif
@endfor
</div>