Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git

(refac) Fold term-frequency-dict into language-processing

@@ -33,7 +33,7 @@ dependencies {
     implementation project(':third-party:commons-codec')

     implementation project(':code:libraries:message-queue')
-    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:libraries:language-processing')

     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:live-capture:api')

@@ -23,14 +23,12 @@ dependencies {
     implementation project(':code:functions:search-query:api')

     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:term-frequency-dict')

     implementation project(':third-party:porterstemmer')
     implementation project(':third-party:openzim')
     implementation project(':third-party:commons-codec')

     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:term-frequency-dict')

     implementation libs.bundles.slf4j

@@ -77,7 +77,7 @@ dependencies {
     testImplementation 'org.testcontainers:mariadb:1.17.4'
     testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
     testImplementation project(':code:libraries:test-helpers')
-    testImplementation project(':code:libraries:term-frequency-dict')
+    testImplementation project(':code:libraries:language-processing')
     testImplementation project(':code:libraries:braille-block-punch-cards')
     testImplementation project(':code:libraries:test-helpers')
 }

@@ -23,7 +23,6 @@ dependencies {
     implementation project(':code:libraries:array')
     implementation project(':code:libraries:native')
     implementation project(':code:libraries:btree')
-    implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:common:linkdb')
     implementation project(':code:index')

@@ -18,10 +18,10 @@ dependencies {
     implementation project(':third-party:rdrpostagger')
     implementation project(':third-party:porterstemmer')
     implementation project(':third-party:commons-codec')
     implementation project(':third-party:openzim')
     implementation project(':code:common:model')
     implementation project(':code:common:service')
     implementation project(':code:libraries:easy-lsh')
-    implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:libraries:coded-sequence')
     implementation libs.notnull
     implementation libs.bundles.jooby

@@ -1,7 +1,6 @@
 package nu.marginalia.segmentation;

-import it.unimi.dsi.fastutil.longs.*;
-import nu.marginalia.util.SimpleBlockingThreadPool;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.openzim.ZIMTypes.ZIMFile;

@@ -11,7 +10,7 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.ForkJoinPool;

 public class NgramExtractorMain {
     public static void main(String... args) throws IOException, InterruptedException {

@@ -112,50 +111,45 @@ public class NgramExtractorMain {

         var orderedHasher = HasherGroup.ordered();

-        var pool = new SimpleBlockingThreadPool("ngram-extractor",
-                Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
-                32
-        );
+        try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {

-        reader.forEachTitles((title) -> {
-            pool.submitQuietly(() -> {
-                LongArrayList orderedHashesTitle = new LongArrayList();
+            reader.forEachTitles((title) -> {
+                pool.submit(() -> {
+                    LongArrayList orderedHashesTitle = new LongArrayList();

-                String normalizedTitle = title.replace('_', ' ');
+                    String normalizedTitle = title.replace('_', ' ');

-                for (var sent : getNgramTitleTerms(normalizedTitle)) {
-                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
-                    orderedHashesTitle.add(orderedHasher.rollingHash(terms));
-                }
-                synchronized (lexicon) {
-                    for (var hash : orderedHashesTitle) {
-                        lexicon.incOrderedTitle(hash);
-                    }
-                }
-            });
-        });
+                    for (var sent : getNgramTitleTerms(normalizedTitle)) {
+                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
+                        orderedHashesTitle.add(orderedHasher.rollingHash(terms));
+                    }
+                    synchronized (lexicon) {
+                        for (var hash : orderedHashesTitle) {
+                            lexicon.incOrderedTitle(hash);
+                        }
+                    }
+                });
+            });

-        reader.forEachArticles((title, body) -> {
-            pool.submitQuietly(() -> {
-                LongArrayList orderedHashesBody = new LongArrayList();
+            reader.forEachArticles((title, body) -> {
+                pool.submit(() -> {
+                    LongArrayList orderedHashesBody = new LongArrayList();

-                for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
-                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
-                    orderedHashesBody.add(orderedHasher.rollingHash(terms));
-                }
-
-                synchronized (lexicon) {
-                    for (var hash : orderedHashesBody) {
-                        lexicon.incOrderedBody(hash);
-                    }
-                }
-            });
-        }, p -> true);
-
-        pool.shutDown();
-        pool.awaitTermination(10, TimeUnit.DAYS);
+                    for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
+                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
+                        orderedHashesBody.add(orderedHasher.rollingHash(terms));
+                    }
+
+                    synchronized (lexicon) {
+                        for (var hash : orderedHashesBody) {
+                            lexicon.incOrderedBody(hash);
+                        }
+                    }
+                });
+            }, p -> true);
+        }

         lexicon.saveCounts(countsOutputFile);
     }
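
The hunk above swaps the project's SimpleBlockingThreadPool for a plain ForkJoinPool opened in a try-with-resources block. A minimal sketch of that pattern follows, assuming Java 21+ (for Math.clamp) and the Java 19+ ExecutorService auto-close behaviour; the class and task names here are illustrative, not part of the commit.

```java
import java.util.concurrent.ForkJoinPool;

class ForkJoinPoolSketch {
    public static void main(String[] args) {
        // Clamp the worker count to the 2..32 range, roughly as the new extractor does.
        int threads = Math.clamp(Runtime.getRuntime().availableProcessors(), 2, 32);

        // Since Java 19, ExecutorService implements AutoCloseable: close() initiates an
        // orderly shutdown and waits for submitted tasks to finish, which replaces the
        // explicit shutDown()/awaitTermination(10, TimeUnit.DAYS) calls in the old code.
        try (var pool = new ForkJoinPool(threads)) {
            for (int i = 0; i < 8; i++) {
                int taskNo = i; // effectively-final copy for the lambda
                pool.submit(() -> System.out.println("task " + taskNo));
            }
        } // all submitted tasks have completed once the pool is closed here
    }
}
```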

@@ -5,16 +5,19 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import nu.marginalia.LanguageModels;
-import nu.marginalia.array.LongArray;
-import nu.marginalia.array.LongArrayFactory;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;

 /** Dictionary with term frequency information for (stemmed) words.
  *

@@ -38,15 +41,23 @@ public class TermFrequencyDict {
     }

     private static Long2IntOpenHashMap load(Path file) throws IOException {
-        try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
+        try (Arena arena = Arena.ofConfined();
+             FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {

-            int size = (int) Files.size(file) / 16;
+            long fileSizeBytes = Files.size(file);
+            MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);
+
+            int size = (int) fileSizeBytes / 16;
             var ret = new Long2IntOpenHashMap(size, 0.5f);

             ret.defaultReturnValue(0);

             for (int i = 0; i < size; i++) {
-                ret.put(array.get(2 * i), (int) array.get(2 * i + 1));
+                long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
+                long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);
+
+                ret.put(key, (int) val);
             }

             return ret;

@@ -1,44 +0,0 @@
-plugins {
-    id 'java'
-
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation project(':third-party:rdrpostagger')
-    implementation project(':third-party:porterstemmer')
-    implementation project(':third-party:commons-codec')
-    implementation project(':third-party:openzim')
-    implementation project(':code:common:model')
-    implementation project(':code:common:config')
-    implementation project(':code:libraries:easy-lsh')
-    implementation project(':code:libraries:array')
-    implementation project(':code:libraries:blocking-thread-pool')
-
-    implementation libs.bundles.slf4j
-    implementation libs.notnull
-
-    implementation libs.guava
-    implementation dependencies.create(libs.guice.get()) {
-        exclude group: 'com.google.guava'
-    }
-    implementation libs.jsoup
-    implementation libs.trove
-    implementation libs.fastutil
-
-    implementation libs.bundles.nlp
-    implementation libs.commons.lang3
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}

@@ -1,8 +0,0 @@
-# Term Frequency Dictionary
-
-This dictionary is used by various parts of the system to evaluate, for example,
-the TF-IDF score of a keyword.
-
-## Central Classes
-
-* [TermFrequencyDict](java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java)
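
The deleted README describes the dictionary's role in TF-IDF style weighting. As a rough illustration only (not the project's actual ranking code), a weight could be derived from TermFrequencyDict.getTermFreq like this; the corpus-size constant and the helper class are assumptions for the example, and the import uses the pre-refactor package name visible elsewhere in this diff.

```java
import nu.marginalia.term_frequency_dict.TermFrequencyDict;

class TfIdfSketch {
    // Hypothetical corpus size, used only for this sketch.
    private static final double ASSUMED_CORPUS_DOCS = 1_000_000.0;

    /** Classic tf-idf: term frequency in the document times inverse document frequency. */
    static double tfIdf(TermFrequencyDict dict, String stemmedTerm, int countInDoc, int docLength) {
        double tf = (double) countInDoc / Math.max(1, docLength);
        double df = Math.max(1, dict.getTermFreq(stemmedTerm)); // corpus-wide count for the stemmed term
        return tf * Math.log(ASSUMED_CORPUS_DOCS / df);
    }
}
```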

@@ -54,7 +54,7 @@ dependencies {

     implementation project(':code:functions:live-capture:api')

-    testImplementation project(':code:libraries:term-frequency-dict')
+    testImplementation project(':code:libraries:language-processing')
     testImplementation project(':code:processes:crawling-process:model')

     implementation libs.slop

@@ -18,7 +18,6 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:term-frequency-dict')


     implementation libs.bundles.slf4j

@@ -29,7 +29,6 @@ dependencies {
     implementation project(':code:functions:link-graph:api')
     implementation project(':code:processes:process-mq-api')
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:libraries:blocking-thread-pool')
     implementation project(':code:processes:crawling-process:ft-link-parser')
     implementation project(':code:processes:converting-process:ft-anchor-keywords')

@@ -39,7 +39,6 @@ dependencies {
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:braille-block-punch-cards')
-    implementation project(':code:libraries:term-frequency-dict')

     implementation project(':code:functions:live-capture:api')
     implementation project(':code:functions:math:api')

@@ -36,7 +36,6 @@ dependencies {
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:braille-block-punch-cards')
-    implementation project(':code:libraries:term-frequency-dict')

     implementation project(':code:functions:live-capture:api')
     implementation project(':code:functions:math:api')

@@ -40,7 +40,6 @@ dependencies {
     implementation project(':code:libraries:domain-lock')
     implementation project(':code:libraries:geo-ip')
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:term-frequency-dict')

     implementation project(':third-party:symspell')

@@ -29,7 +29,6 @@ dependencies {
     implementation project(':code:common:config')
     implementation project(':code:common:service')
     implementation project(':code:libraries:language-processing')
-    implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:processes:converting-process')
     implementation project(':code:processes:crawling-process:model')

@@ -1,23 +0,0 @@
-plugins {
-    id 'java'
-
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation project(':code:common:config')
-    implementation project(':code:common:model')
-    implementation project(':code:libraries:term-frequency-dict')
-    implementation libs.bundles.slf4j
-    implementation libs.notnull
-}

@@ -1,76 +0,0 @@
-package nu.marginalia.load_test;
-
-import nu.marginalia.WmsaHome;
-import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import org.apache.logging.log4j.util.Strings;
-
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
-import java.net.http.HttpResponse;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-import java.util.stream.Collectors;
-
-public class LoadTestMain {
-    private static List<String> commonWords;
-
-    public static void main(String... args) throws URISyntaxException, IOException, InterruptedException {
-        commonWords = loadCommonWords();
-
-        System.out.println(commonWords.size());
-
-        HttpClient client = HttpClient.newHttpClient();
-
-        List<Long> times = new ArrayList<>();
-
-        for (int i = 0; i < 10000; i++) {
-            String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
-                    Strings.join(pickNCommonWords(3), '+')
-            );
-
-            HttpRequest req = HttpRequest.newBuilder(new URI(uri))
-                    .build();
-
-            long startTime = System.currentTimeMillis();
-
-            client.send(req, HttpResponse.BodyHandlers.ofString());
-
-            long stopTime = System.currentTimeMillis();
-
-            times.add(stopTime - startTime);
-            if (times.size() > 100) {
-                System.out.println(times.stream().mapToLong(Long::longValue).average().orElse(-1));
-                times.clear();
-            }
-        }
-    }
-
-    private static List<String> loadCommonWords() throws IOException {
-        var dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
-
-        try (var lines = Files.lines(Path.of("/usr/share/dict/american-english"))) {
-            return lines.map(String::toLowerCase).filter(term -> dict.getTermFreq(term) > 100000).collect(Collectors.toList());
-        }
-        catch (IOException ex) {
-            throw new RuntimeException(ex);
-        }
-
-    }
-
-    static List<String> pickNCommonWords(int n) {
-        assert commonWords.size() > 10*n;
-
-        Set<String> words = new HashSet<>(n);
-        Random r = new Random(System.nanoTime());
-        while (words.size() < n) {
-            words.add(commonWords.get(r.nextInt(0, commonWords.size())));
-        }
-
-        return new ArrayList<>(words);
-    }
-}

@@ -1,9 +0,0 @@
-# Load Test
-
-Performs random queries to put load on the local
-Marginalia deployment to enable profiling of the index.
-
-Run in the IDE.
-
-Configure the profiler to look at port 7021 for the index service,
-and 7023 for the search service.

@@ -51,7 +51,6 @@ include 'code:libraries:random-write-funnel'
 include 'code:libraries:blocking-thread-pool'
 include 'code:libraries:braille-block-punch-cards'
 include 'code:libraries:language-processing'
-include 'code:libraries:term-frequency-dict'
 include 'code:libraries:test-helpers'
 include 'code:libraries:domain-lock'

@@ -92,7 +91,6 @@ include 'code:processes:export-task-process'

 include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
-include 'code:tools:load-test'
 include 'code:tools:integration-test'
 include 'code:tools:browserless'