1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(refac) Fold term-frequency-dict into language-processing

This commit is contained in:
Viktor Lofgren
2025-09-03 12:55:01 +02:00
parent acb9ec7b15
commit 673c65d3c9
25 changed files with 51 additions and 217 deletions

View File

@@ -33,7 +33,7 @@ dependencies {
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:language-processing')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api')

View File

@@ -23,14 +23,12 @@ dependencies {
implementation project(':code:functions:search-query:api')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:openzim')
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j

View File

@@ -77,7 +77,7 @@ dependencies {
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -23,7 +23,6 @@ dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:language-processing')
implementation project(':code:common:linkdb')
implementation project(':code:index')

View File

@@ -18,10 +18,10 @@ dependencies {
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:commons-codec')
implementation project(':third-party:openzim')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:coded-sequence')
implementation libs.notnull
implementation libs.bundles.jooby

View File

@@ -1,7 +1,6 @@
package nu.marginalia.segmentation;
import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.util.SimpleBlockingThreadPool;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openzim.ZIMTypes.ZIMFile;
@@ -11,7 +10,7 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.ForkJoinPool;
public class NgramExtractorMain {
public static void main(String... args) throws IOException, InterruptedException {
@@ -112,50 +111,45 @@ public class NgramExtractorMain {
var orderedHasher = HasherGroup.ordered();
var pool = new SimpleBlockingThreadPool("ngram-extractor",
Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
32
);
try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {
reader.forEachTitles((title) -> {
pool.submitQuietly(() -> {
LongArrayList orderedHashesTitle = new LongArrayList();
reader.forEachTitles((title) -> {
pool.submit(() -> {
LongArrayList orderedHashesTitle = new LongArrayList();
String normalizedTitle = title.replace('_', ' ');
String normalizedTitle = title.replace('_', ' ');
for (var sent : getNgramTitleTerms(normalizedTitle)) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
}
synchronized (lexicon) {
for (var hash : orderedHashesTitle) {
lexicon.incOrderedTitle(hash);
for (var sent : getNgramTitleTerms(normalizedTitle)) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
}
}
synchronized (lexicon) {
for (var hash : orderedHashesTitle) {
lexicon.incOrderedTitle(hash);
}
}
});
});
});
reader.forEachArticles((title, body) -> {
pool.submit(() -> {
LongArrayList orderedHashesBody = new LongArrayList();
reader.forEachArticles((title, body) -> {
pool.submitQuietly(() -> {
LongArrayList orderedHashesBody = new LongArrayList();
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesBody.add(orderedHasher.rollingHash(terms));
}
synchronized (lexicon) {
for (var hash : orderedHashesBody) {
lexicon.incOrderedBody(hash);
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesBody.add(orderedHasher.rollingHash(terms));
}
}
});
}, p -> true);
synchronized (lexicon) {
for (var hash : orderedHashesBody) {
lexicon.incOrderedBody(hash);
}
}
});
pool.shutDown();
pool.awaitTermination(10, TimeUnit.DAYS);
}, p -> true);
}
lexicon.saveCounts(countsOutputFile);
}

View File

@@ -5,16 +5,19 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.LanguageModels;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** Dictionary with term frequency information for (stemmed) words.
*
@@ -38,15 +41,23 @@ public class TermFrequencyDict {
}
private static Long2IntOpenHashMap load(Path file) throws IOException {
try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
try (Arena arena = Arena.ofConfined();
FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {
int size = (int) Files.size(file) / 16;
long fileSizeBytes = Files.size(file);
MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);
int size = (int) fileSizeBytes / 16;
var ret = new Long2IntOpenHashMap(size, 0.5f);
ret.defaultReturnValue(0);
for (int i = 0; i < size; i++) {
ret.put(array.get(2 * i), (int) array.get(2 * i + 1));
long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);
ret.put(key, (int) val);
}
return ret;

View File

@@ -1,44 +0,0 @@
// Gradle build script for a plain Java library module.
// NOTE(review): presumably this is the build file of the term-frequency-dict
// library that this commit removes — the file path is not shown here; confirm.
plugins {
id 'java'
id 'jvm-test-suite'
}
// The Java toolchain version is read from the root project's `jvmVersion`
// extension property rather than being hard-coded in this module.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Applies the srcsets.gradle script from the root project directory
// (presumably the shared source-set layout — verify against that script).
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
// In-repository project dependencies.
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:commons-codec')
implementation project(':third-party:openzim')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:array')
implementation project(':code:libraries:blocking-thread-pool')
// Libraries resolved through the `libs` version catalog.
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.guava
// Guice is added with its transitive Guava excluded; Guava is declared
// explicitly via libs.guava above.
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.jsoup
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.nlp
implementation libs.commons.lang3
// Test-only dependencies.
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -1,8 +0,0 @@
# Term Frequency Dictionary
This dictionary is used by various parts of the system to evaluate, for example,
the TF-IDF score of a keyword.
## Central Classes
* [TermFrequencyDict](java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java)

View File

@@ -54,7 +54,7 @@ dependencies {
implementation project(':code:functions:live-capture:api')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:processes:crawling-process:model')
implementation libs.slop

View File

@@ -18,7 +18,6 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j

View File

@@ -29,7 +29,6 @@ dependencies {
implementation project(':code:functions:link-graph:api')
implementation project(':code:processes:process-mq-api')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:processes:converting-process:ft-anchor-keywords')

View File

@@ -39,7 +39,6 @@ dependencies {
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:math:api')

View File

@@ -36,7 +36,6 @@ dependencies {
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:math:api')

View File

@@ -40,7 +40,6 @@ dependencies {
implementation project(':code:libraries:domain-lock')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':third-party:symspell')

View File

@@ -29,7 +29,6 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:crawling-process:model')

View File

@@ -1,23 +0,0 @@
// Gradle build script for a small Java module.
// NOTE(review): presumably this is the build file of the load-test tool that
// this commit removes — the file path is not shown here; confirm.
plugins {
id 'java'
id 'jvm-test-suite'
}
// The Java toolchain version is read from the root project's `jvmVersion`
// extension property rather than being hard-coded in this module.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Applies the srcsets.gradle script from the root project directory
// (presumably the shared source-set layout — verify against that script).
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
// In-repository project dependencies.
implementation project(':code:common:config')
implementation project(':code:common:model')
implementation project(':code:libraries:term-frequency-dict')
// Libraries resolved through the `libs` version catalog.
implementation libs.bundles.slf4j
implementation libs.notnull
}

View File

@@ -1,76 +0,0 @@
package nu.marginalia.load_test;
import nu.marginalia.WmsaHome;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.logging.log4j.util.Strings;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Ad-hoc load generator: sends randomized three-word search queries to the
 * search endpoint on 127.0.0.1:8080 and prints the mean response time (in
 * milliseconds) each time more than 100 samples have been collected.
 */
public class LoadTestMain {
    /** Shared generator, so a new instance is not created and seeded for every query. */
    private static final Random RANDOM = new Random();

    /** Candidate query terms; populated once by {@link #main(String...)}. */
    private static List<String> commonWords;

    /**
     * Runs 10000 sequential search requests and reports average latencies on stdout.
     *
     * @throws URISyntaxException   if a generated query does not form a valid URI
     * @throws IOException          if the word list cannot be loaded or a request fails
     * @throws InterruptedException if the thread is interrupted while waiting for a response
     */
    public static void main(String... args) throws URISyntaxException, IOException, InterruptedException {
        commonWords = loadCommonWords();
        System.out.println(commonWords.size());

        HttpClient client = HttpClient.newHttpClient();
        List<Long> times = new ArrayList<>();

        for (int i = 0; i < 10000; i++) {
            String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
                    String.join("+", pickNCommonWords(3))
            );

            HttpRequest req = HttpRequest.newBuilder(new URI(uri))
                    .build();

            // System.nanoTime() is monotonic; wall-clock time may jump (e.g. on
            // NTP adjustments) and is not suitable for measuring elapsed time.
            long startTime = System.nanoTime();
            client.send(req, HttpResponse.BodyHandlers.ofString());
            long elapsedMillis = (System.nanoTime() - startTime) / 1_000_000;

            times.add(elapsedMillis);

            if (times.size() > 100) {
                System.out.println(times.stream().mapToLong(Long::longValue).average().orElse(-1));
                times.clear();
            }
        }
    }

    /**
     * Reads the system word list and keeps the distinct lower-cased words whose
     * term frequency in the dictionary exceeds 100000.
     *
     * @return the distinct, lower-cased common words
     * @throws IOException if the word list cannot be opened
     */
    private static List<String> loadCommonWords() throws IOException {
        var dict = new TermFrequencyDict(WmsaHome.getLanguageModels());

        try (var lines = Files.lines(Path.of("/usr/share/dict/american-english"))) {
            return lines
                    // Locale.ROOT keeps the casing independent of the machine's default locale.
                    .map(term -> term.toLowerCase(Locale.ROOT))
                    .filter(term -> dict.getTermFreq(term) > 100000)
                    // Lower-casing can map several entries onto the same word.
                    .distinct()
                    .collect(Collectors.toList());
        }
    }

    /**
     * Picks {@code n} distinct words at random from the loaded common words.
     *
     * @param n the number of distinct words to pick
     * @return a list of {@code n} distinct words, in no particular order
     * @throws IllegalArgumentException if {@code n} is negative or exceeds the number of available words
     */
    static List<String> pickNCommonWords(int n) {
        assert commonWords.size() > 10*n;

        // Assertions are disabled by default; without this check the loop below
        // would never terminate when fewer than n words are available.
        if (n > commonWords.size()) {
            throw new IllegalArgumentException(
                    "Cannot pick " + n + " distinct words from " + commonWords.size() + " candidates");
        }

        Set<String> words = new HashSet<>(n);
        while (words.size() < n) {
            words.add(commonWords.get(RANDOM.nextInt(0, commonWords.size())));
        }
        return new ArrayList<>(words);
    }
}

View File

@@ -1,9 +0,0 @@
# Load Test
Performs random queries to put load on the local
Marginalia deployment to enable profiling of the index.
Run in the IDE.
Configure the profiler to look at port 7021 for the index service,
and 7023 for the search service.

View File

@@ -51,7 +51,6 @@ include 'code:libraries:random-write-funnel'
include 'code:libraries:blocking-thread-pool'
include 'code:libraries:braille-block-punch-cards'
include 'code:libraries:language-processing'
include 'code:libraries:term-frequency-dict'
include 'code:libraries:test-helpers'
include 'code:libraries:domain-lock'
@@ -92,7 +91,6 @@ include 'code:processes:export-task-process'
include 'code:tools:experiment-runner'
include 'code:tools:screenshot-capture-tool'
include 'code:tools:load-test'
include 'code:tools:integration-test'
include 'code:tools:browserless'