mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
14 Commits
deploy-014
...
deploy-014
Author | SHA1 | Date | |
---|---|---|---|
|
fe36d4ed20 | ||
|
acf4bef98d | ||
|
2a737c34bb | ||
|
90a577af82 | ||
|
f0c9b935d8 | ||
|
7b5493dd51 | ||
|
c246a59158 | ||
|
0b99781d24 | ||
|
39db9620c1 | ||
|
1781599363 | ||
|
6b2d18fb9b | ||
|
59b1d200ab | ||
|
897010a2cf | ||
|
602af7a77e |
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Resume(behavior = ActorResumeBehavior.ERROR)
|
||||
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
Files.deleteIfExists(Path.of(tarFileName));
|
||||
|
||||
try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
|
||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||
is.transferTo(os);
|
||||
HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
|
||||
|
||||
try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
|
||||
long size = urlConnection.getContentLengthLong();
|
||||
byte[] buffer = new byte[8192];
|
||||
|
||||
try (var is = new BufferedInputStream(urlConnection.getInputStream());
|
||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||
long copiedSize = 0;
|
||||
|
||||
while (copiedSize < size) {
|
||||
int read = is.read(buffer);
|
||||
|
||||
if (read < 0) // We've been promised a file of length 'size'
|
||||
throw new IOException("Unexpected end of stream");
|
||||
|
||||
os.write(buffer, 0, read);
|
||||
copiedSize += read;
|
||||
|
||||
// Update progress bar
|
||||
hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
||||
logger.error("Error downloading sample", ex);
|
||||
yield new Error();
|
||||
}
|
||||
finally {
|
||||
urlConnection.disconnect();
|
||||
}
|
||||
|
||||
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
||||
yield new Extract(fileStorageId, tarFileName);
|
||||
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public DownloadSampleActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
ServiceEventLog eventLog)
|
||||
ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.eventLog = eventLog;
|
||||
this.heartbeat = heartbeat;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -495,7 +495,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// (mostly a case when migrating from legacy->warc)
|
||||
reference.delete();
|
||||
|
||||
// Convert the WARC file to Parquet
|
||||
// Convert the WARC file to Slop
|
||||
SlopCrawlDataRecord
|
||||
.convertWarc(domain, userAgent, newWarcFile, slopFile);
|
||||
|
||||
|
@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
static final int MAX_TIME = 30_000;
|
||||
|
||||
/** Maximum (decompressed) size we'll save */
|
||||
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
||||
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
|
||||
|
||||
private final WarcWriter writer;
|
||||
private final Path warcFile;
|
||||
|
@@ -43,18 +43,18 @@ public class DomainLocks {
|
||||
return new Semaphore(16);
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return new Semaphore(8);
|
||||
|
||||
if (topDomain.equals("tumblr.com"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return new Semaphore(4);
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("github.io"))
|
||||
return new Semaphore(4);
|
||||
return new Semaphore(8);
|
||||
|
||||
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||
// to not get blocked.
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
if (topDomain.endsWith(".edu")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
|
||||
return new Semaphore(2);
|
||||
}
|
||||
|
@@ -6,6 +6,7 @@ public class ContentTypes {
|
||||
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
||||
"application/xhtml",
|
||||
"text/html",
|
||||
"application/pdf",
|
||||
"image/x-icon",
|
||||
"text/plain");
|
||||
|
||||
@@ -19,4 +20,9 @@ public class ContentTypes {
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean isBinary(String contentTypeHeader) {
|
||||
String lcHeader = contentTypeHeader.toLowerCase();
|
||||
return lcHeader.startsWith("application/pdf");
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
String ctLc = contentType.toLowerCase();
|
||||
|
||||
// Permit all plain text content types
|
||||
if (ctLc.startsWith("text/"))
|
||||
return true;
|
||||
// PDF
|
||||
else if (ctLc.startsWith("application/pdf"))
|
||||
return true;
|
||||
else if (ctLc.startsWith("x-marginalia/"))
|
||||
return true;
|
||||
|
||||
|
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
|
||||
|
||||
public class ContentTypeLogic {
|
||||
|
||||
private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
|
||||
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
|
||||
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
|
||||
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
|
||||
private static final List<String> acceptedContentTypePrefixes = List.of(
|
||||
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
|
||||
"application/rss+xml",
|
||||
"application/x-rss+xml",
|
||||
"application/rdf+xml",
|
||||
"application/pdf",
|
||||
"x-rss+xml"
|
||||
);
|
||||
private boolean allowAllContentTypes = false;
|
||||
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
|
||||
public boolean isUrlLikeBinary(EdgeUrl url) {
|
||||
String pathLowerCase = url.path.toLowerCase();
|
||||
|
||||
if (probableHtmlPattern.test(pathLowerCase))
|
||||
if (probableGoodPattern.test(pathLowerCase))
|
||||
return false;
|
||||
|
||||
return probableBinaryPattern.test(pathLowerCase);
|
||||
|
@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
|
||||
return false;
|
||||
}
|
||||
|
||||
// If the format is binary, we don't want to translate it if the response is truncated
|
||||
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
|
||||
private static EdgeUrl badHttpStatusUrl;
|
||||
private static EdgeUrl keepAliveUrl;
|
||||
|
||||
private static EdgeUrl pdfUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
@@ -133,6 +135,13 @@ class HttpFetcherImplFetchTest {
|
||||
));
|
||||
|
||||
|
||||
pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "application/pdf")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
wireMockServer.start();
|
||||
|
||||
}
|
||||
@@ -352,6 +361,14 @@ class HttpFetcherImplFetchTest {
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPdf() {
|
||||
var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
|
||||
private List<WarcRecord> getWarcRecords() throws IOException {
|
||||
List<WarcRecord> records = new ArrayList<>();
|
||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
@@ -24,13 +24,14 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class WarcRecorderTest {
|
||||
Path fileNameWarc;
|
||||
Path fileNameParquet;
|
||||
Path fileNameSlop;
|
||||
WarcRecorder client;
|
||||
|
||||
HttpClient httpClient;
|
||||
@@ -39,7 +40,7 @@ class WarcRecorderTest {
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
fileNameSlop = Files.createTempFile("test", ".slop.zip");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
@@ -159,17 +160,28 @@ class WarcRecorderTest {
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request3);
|
||||
|
||||
CrawledDocumentParquetRecordFileWriter.convertWarc(
|
||||
HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
|
||||
request4.addHeader("User-agent", "test.marginalia.nu");
|
||||
request4.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request4);
|
||||
|
||||
SlopCrawlDataRecord.convertWarc(
|
||||
"www.marginalia.nu",
|
||||
new UserAgent("test", "test"),
|
||||
fileNameWarc,
|
||||
fileNameParquet);
|
||||
fileNameSlop);
|
||||
|
||||
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
|
||||
assertEquals(2, urls.size());
|
||||
List<String> urls;
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
|
||||
urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
|
||||
}
|
||||
|
||||
assertEquals(3, urls.size());
|
||||
assertEquals("https://www.marginalia.nu/", urls.get(0));
|
||||
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
|
||||
// sanic.jpg gets filtered out for its bad mime type
|
||||
assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
|
||||
|
||||
}
|
||||
|
||||
|
@@ -1,9 +1,16 @@
|
||||
This is a bit of a hack!
|
||||
|
||||
This class exists to let tailwind we're using these classes even though they aren't visible in the code,
|
||||
as we sometimes generate classes from Java code!
|
||||
as we sometimes generate classes from Java code or javascript!
|
||||
|
||||
<i class="text-blue-800 bg-blue-50 dark:text-blue-200 dark:bg-blue-950"></i>
|
||||
<i class="text-green-800 bg-green-50 dark:text-green-200 dark:bg-green-950"></i>
|
||||
<i class="text-purple-800 bg-purple-50 dark:text-purple-200 dark:bg-purple-950"></i>
|
||||
<i class="text-blue-950 bg-gray-100 dark:text-blue-50 dark:bg-gray-900"></i>
|
||||
<span class="hover:bg-gray-300 "></span>
|
||||
|
||||
<label class="suggestion group block relative">
|
||||
<input type="radio" name="suggestion" class="peer hidden" checked>
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
|
||||
</div>
|
||||
</label>
|
@@ -13,7 +13,7 @@
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
value="${query}"
|
||||
autofocus
|
||||
placeholder="Search..."
|
||||
placeholder="Search the web!"
|
||||
autocomplete="off"
|
||||
name="query"
|
||||
id="searchInput" />
|
||||
@@ -21,13 +21,13 @@
|
||||
<input type="text"
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
value="${query}"
|
||||
placeholder="Search..."
|
||||
placeholder="Search the web!"
|
||||
autocomplete="off"
|
||||
name="query"
|
||||
id="searchInput" />
|
||||
@endif
|
||||
|
||||
<div id="searchSuggestions" class="text-sm absolute top-2 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-200 rounded-lg shadow-lg hidden"></div>
|
||||
<div aria-hidden="true" id="searchSuggestions" class="text-sm absolute top-3 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-300 rounded-lg shadow-lg hidden"></div>
|
||||
|
||||
<button class="px-4 py-2 bg-margeblue text-white ml-2 rounded whitespace-nowrap active:text-slate-200">
|
||||
<i class="fas fa-search text-sm sm:mr-3"></i>
|
||||
|
@@ -43,13 +43,13 @@ function displaySuggestions(suggestions) {
|
||||
}
|
||||
|
||||
suggestionsContainer.innerHTML = suggestions.map((suggestion, index) => `
|
||||
<div
|
||||
class="suggestion px-4 py-2 cursor-pointer hover:bg-gray-100 ${index === selectedIndex ? 'bg-blue-50' : ''}"
|
||||
data-index="${index}"
|
||||
>
|
||||
${suggestion}
|
||||
</div>
|
||||
`).join('');
|
||||
<label class="suggestion group block relative">
|
||||
<input type="radio" name="suggestion" class="peer hidden" ${index === selectedIndex ? 'checked' : ''}>
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full" data-index="${index}">
|
||||
${suggestion}
|
||||
</div>
|
||||
</label>
|
||||
`).join('');
|
||||
|
||||
suggestionsContainer.classList.remove('hidden');
|
||||
|
||||
|
@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
|
||||
|
||||
public class AssistantModule extends AbstractModule {
|
||||
public void configure() {
|
||||
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
@@ -0,0 +1,465 @@
|
||||
package nu.marginalia.assistant.suggest;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** Unhinged data structure for fast prefix searching.
|
||||
*/
|
||||
public class PrefixSearchStructure {
|
||||
// Core data structures
|
||||
private final HashMap<String, TIntArrayList> prefixIndex; // Short prefix index (up to 8 chars)
|
||||
private final HashMap<String, TIntArrayList> longPrefixIndex; // Long prefix index (9-16 chars)
|
||||
private final ArrayList<String> words; // All words by ID
|
||||
private final TIntArrayList wordScores; // Scores for all words
|
||||
|
||||
// Configuration
|
||||
private static final int SHORT_PREFIX_LENGTH = 8;
|
||||
private static final int MAX_INDEXED_PREFIX_LENGTH = 16;
|
||||
|
||||
public int size() {
|
||||
return words.size();
|
||||
}
|
||||
|
||||
// For sorting efficiency
|
||||
private static class WordScorePair {
|
||||
final String word;
|
||||
final int score;
|
||||
|
||||
WordScorePair(String word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new PrefixTrie for typeahead search.
|
||||
*/
|
||||
public PrefixSearchStructure() {
|
||||
prefixIndex = new HashMap<>(1024);
|
||||
longPrefixIndex = new HashMap<>(1024);
|
||||
words = new ArrayList<>(1024);
|
||||
wordScores = new TIntArrayList(1024);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a prefix to the index.
|
||||
*/
|
||||
private void indexPrefix(String word, int wordId) {
|
||||
// Index short prefixes
|
||||
for (int i = 1; i <= Math.min(word.length(), SHORT_PREFIX_LENGTH); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
|
||||
prefix, k -> new TIntArrayList(16));
|
||||
wordIds.add(wordId);
|
||||
}
|
||||
|
||||
// Index longer prefixes
|
||||
for (int i = SHORT_PREFIX_LENGTH + 1; i <= Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
TIntArrayList wordIds = longPrefixIndex.computeIfAbsent(
|
||||
prefix, k -> new TIntArrayList(8));
|
||||
wordIds.add(wordId);
|
||||
}
|
||||
|
||||
// If the word contains spaces, also index by each term for multi-word queries
|
||||
if (word.contains(" ")) {
|
||||
String[] terms = word.split("\\s+");
|
||||
for (String term : terms) {
|
||||
if (term.length() >= 2) {
|
||||
for (int i = 1; i <= Math.min(term.length(), SHORT_PREFIX_LENGTH); i++) {
|
||||
String termPrefix = "t:" + term.substring(0, i);
|
||||
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
|
||||
termPrefix, k -> new TIntArrayList(16));
|
||||
wordIds.add(wordId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts a word with its associated score.
|
||||
*/
|
||||
public void insert(String word, int score) {
|
||||
if (word == null || word.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Add to the word list and index
|
||||
int wordId = words.size();
|
||||
words.add(word);
|
||||
wordScores.add(score);
|
||||
indexPrefix(word, wordId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the top k completions for a given prefix.
|
||||
*/
|
||||
public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
|
||||
if (prefix == null || prefix.isEmpty()) {
|
||||
// Return top k words by score
|
||||
return getTopKWords(k);
|
||||
}
|
||||
|
||||
// Check if this is a term search (t:) - for searching within multi-word items
|
||||
boolean isTermSearch = false;
|
||||
if (prefix.startsWith("t:") && prefix.length() > 2) {
|
||||
isTermSearch = true;
|
||||
prefix = prefix.substring(2);
|
||||
}
|
||||
|
||||
// 1. Fast path for short prefixes
|
||||
if (prefix.length() <= SHORT_PREFIX_LENGTH) {
|
||||
String lookupPrefix = isTermSearch ? "t:" + prefix : prefix;
|
||||
TIntArrayList wordIds = prefixIndex.get(lookupPrefix);
|
||||
if (wordIds != null) {
|
||||
return getTopKFromWordIds(wordIds, k);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Fast path for long prefixes (truncate to MAX_INDEXED_PREFIX_LENGTH)
|
||||
if (prefix.length() > SHORT_PREFIX_LENGTH) {
|
||||
// Try exact match in longPrefixIndex first
|
||||
if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
|
||||
TIntArrayList wordIds = longPrefixIndex.get(prefix);
|
||||
if (wordIds != null) {
|
||||
return getTopKFromWordIds(wordIds, k);
|
||||
}
|
||||
}
|
||||
|
||||
// If prefix is longer than MAX_INDEXED_PREFIX_LENGTH, truncate and filter
|
||||
if (prefix.length() > MAX_INDEXED_PREFIX_LENGTH) {
|
||||
String truncatedPrefix = prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH);
|
||||
TIntArrayList candidateIds = longPrefixIndex.get(truncatedPrefix);
|
||||
if (candidateIds != null) {
|
||||
// Filter candidates by the full prefix
|
||||
return getFilteredTopKFromWordIds(candidateIds, prefix, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Optimized fallback for long prefixes - use prefix tree for segments
|
||||
List<ScoredSuggestion> results = new ArrayList<>();
|
||||
|
||||
// Handle multi-segment queries by finding candidates from first 8 chars
|
||||
if (prefix.length() > SHORT_PREFIX_LENGTH) {
|
||||
String shortPrefix = prefix.substring(0, Math.min(prefix.length(), SHORT_PREFIX_LENGTH));
|
||||
TIntArrayList candidates = prefixIndex.get(shortPrefix);
|
||||
|
||||
if (candidates != null) {
|
||||
return getFilteredTopKFromWordIds(candidates, prefix, k);
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Last resort - optimized binary search in sorted segments
|
||||
return findByBinarySearchPrefix(prefix, k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to get the top k words by score.
|
||||
*/
|
||||
private List<ScoredSuggestion> getTopKWords(int k) {
|
||||
// Create pairs of (score, wordId)
|
||||
int[][] pairs = new int[words.size()][2];
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
pairs[i][0] = wordScores.get(i);
|
||||
pairs[i][1] = i;
|
||||
}
|
||||
|
||||
// Sort by score (descending)
|
||||
Arrays.sort(pairs, (a, b) -> Integer.compare(b[0], a[0]));
|
||||
|
||||
// Take top k
|
||||
List<ScoredSuggestion> results = new ArrayList<>();
|
||||
for (int i = 0; i < Math.min(k, pairs.length); i++) {
|
||||
String word = words.get(pairs[i][1]);
|
||||
int score = pairs[i][0];
|
||||
results.add(new ScoredSuggestion(word, score));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to get the top k words from a list of word IDs.
|
||||
*/
|
||||
private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
|
||||
if (wordIds == null || wordIds.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// For small lists, avoid sorting
|
||||
if (wordIds.size() <= k) {
|
||||
List<ScoredSuggestion> results = new ArrayList<>(wordIds.size());
|
||||
int[] ids = wordIds.toArray();
|
||||
for (int wordId : ids) {
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
|
||||
}
|
||||
}
|
||||
results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
|
||||
return results;
|
||||
}
|
||||
|
||||
// For larger lists, use an array-based approach for better performance
|
||||
// Find top k without full sorting
|
||||
int[] topScores = new int[k];
|
||||
int[] topWordIds = new int[k];
|
||||
int[] ids = wordIds.toArray();
|
||||
|
||||
// Initialize with first k elements
|
||||
int filledCount = Math.min(k, ids.length);
|
||||
for (int i = 0; i < filledCount; i++) {
|
||||
int wordId = ids[i];
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
topWordIds[i] = wordId;
|
||||
topScores[i] = wordScores.get(wordId);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort initial elements
|
||||
for (int i = 0; i < filledCount; i++) {
|
||||
for (int j = i + 1; j < filledCount; j++) {
|
||||
if (topScores[j] > topScores[i]) {
|
||||
// Swap scores
|
||||
int tempScore = topScores[i];
|
||||
topScores[i] = topScores[j];
|
||||
topScores[j] = tempScore;
|
||||
|
||||
// Swap word IDs
|
||||
int tempId = topWordIds[i];
|
||||
topWordIds[i] = topWordIds[j];
|
||||
topWordIds[j] = tempId;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process remaining elements
|
||||
int minScore = filledCount > 0 ? topScores[filledCount - 1] : Integer.MIN_VALUE;
|
||||
|
||||
for (int i = k; i < ids.length; i++) {
|
||||
int wordId = ids[i];
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
int score = wordScores.get(wordId);
|
||||
|
||||
if (score > minScore) {
|
||||
// Replace the lowest element
|
||||
topScores[filledCount - 1] = score;
|
||||
topWordIds[filledCount - 1] = wordId;
|
||||
|
||||
// Bubble up the new element
|
||||
for (int j = filledCount - 1; j > 0; j--) {
|
||||
if (topScores[j] > topScores[j - 1]) {
|
||||
// Swap scores
|
||||
int tempScore = topScores[j];
|
||||
topScores[j] = topScores[j - 1];
|
||||
topScores[j - 1] = tempScore;
|
||||
|
||||
// Swap word IDs
|
||||
int tempId = topWordIds[j];
|
||||
topWordIds[j] = topWordIds[j - 1];
|
||||
topWordIds[j - 1] = tempId;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Update min score
|
||||
minScore = topScores[filledCount - 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create result list
|
||||
List<ScoredSuggestion> results = new ArrayList<>(filledCount);
|
||||
for (int i = 0; i < filledCount; i++) {
|
||||
results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use binary search on sorted word segments to efficiently find matches.
|
||||
*/
|
||||
private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
|
||||
// If we have a lot of words, use an optimized segment approach
|
||||
if (words.size() > 1000) {
|
||||
// Divide words into segments for better locality
|
||||
int segmentSize = 1000;
|
||||
int numSegments = (words.size() + segmentSize - 1) / segmentSize;
|
||||
|
||||
// Find matches using binary search within each segment
|
||||
List<WordScorePair> allMatches = new ArrayList<>();
|
||||
for (int segment = 0; segment < numSegments; segment++) {
|
||||
int start = segment * segmentSize;
|
||||
int end = Math.min(start + segmentSize, words.size());
|
||||
|
||||
// Binary search for first potential match
|
||||
int pos = Collections.binarySearch(
|
||||
words.subList(start, end),
|
||||
prefix,
|
||||
(a, b) -> a.compareTo(b)
|
||||
);
|
||||
|
||||
if (pos < 0) {
|
||||
pos = -pos - 1;
|
||||
}
|
||||
|
||||
// Collect all matches
|
||||
for (int i = start + pos; i < end && i < words.size(); i++) {
|
||||
String word = words.get(i);
|
||||
if (word.startsWith(prefix)) {
|
||||
allMatches.add(new WordScorePair(word, wordScores.get(i)));
|
||||
} else if (word.compareTo(prefix) > 0) {
|
||||
break; // Past potential matches
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by score and take top k
|
||||
allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
|
||||
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, allMatches.size()));
|
||||
for (int i = 0; i < Math.min(k, allMatches.size()); i++) {
|
||||
WordScorePair pair = allMatches.get(i);
|
||||
results.add(new ScoredSuggestion(pair.word, pair.score));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
// Fallback for small dictionaries - linear scan but optimized
|
||||
return simpleSearchFallback(prefix, k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Optimized linear scan - only used for small dictionaries.
|
||||
*/
|
||||
private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
|
||||
// Use primitive arrays for better cache locality
|
||||
int[] matchScores = new int[Math.min(words.size(), 100)]; // Assume we won't find more than 100 matches
|
||||
String[] matchWords = new String[matchScores.length];
|
||||
int matchCount = 0;
|
||||
|
||||
for (int i = 0; i < words.size() && matchCount < matchScores.length; i++) {
|
||||
String word = words.get(i);
|
||||
if (word.startsWith(prefix)) {
|
||||
matchWords[matchCount] = word;
|
||||
matchScores[matchCount] = wordScores.get(i);
|
||||
matchCount++;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort matches by score (in-place for small arrays)
|
||||
for (int i = 0; i < matchCount; i++) {
|
||||
for (int j = i + 1; j < matchCount; j++) {
|
||||
if (matchScores[j] > matchScores[i]) {
|
||||
// Swap scores
|
||||
int tempScore = matchScores[i];
|
||||
matchScores[i] = matchScores[j];
|
||||
matchScores[j] = tempScore;
|
||||
|
||||
// Swap words
|
||||
String tempWord = matchWords[i];
|
||||
matchWords[i] = matchWords[j];
|
||||
matchWords[j] = tempWord;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create results
|
||||
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
|
||||
for (int i = 0; i < Math.min(k, matchCount); i++) {
|
||||
results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get top k words from candidate IDs, filtering by the full prefix.
|
||||
*/
|
||||
private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
|
||||
if (wordIds == null || wordIds.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// Make primitive arrays for better performance
|
||||
String[] matchWords = new String[Math.min(wordIds.size(), 1000)];
|
||||
int[] matchScores = new int[matchWords.length];
|
||||
int matchCount = 0;
|
||||
|
||||
int[] ids = wordIds.toArray();
|
||||
for (int i = 0; i < ids.length && matchCount < matchWords.length; i++) {
|
||||
int wordId = ids[i];
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
String word = words.get(wordId);
|
||||
if (word.startsWith(fullPrefix)) {
|
||||
matchWords[matchCount] = word;
|
||||
matchScores[matchCount] = wordScores.get(wordId);
|
||||
matchCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by score (efficient insertion sort for small k)
|
||||
for (int i = 0; i < Math.min(matchCount, k); i++) {
|
||||
int maxPos = i;
|
||||
for (int j = i + 1; j < matchCount; j++) {
|
||||
if (matchScores[j] > matchScores[maxPos]) {
|
||||
maxPos = j;
|
||||
}
|
||||
}
|
||||
if (maxPos != i) {
|
||||
// Swap
|
||||
int tempScore = matchScores[i];
|
||||
matchScores[i] = matchScores[maxPos];
|
||||
matchScores[maxPos] = tempScore;
|
||||
|
||||
String tempWord = matchWords[i];
|
||||
matchWords[i] = matchWords[maxPos];
|
||||
matchWords[maxPos] = tempWord;
|
||||
}
|
||||
}
|
||||
|
||||
// Create result list (only up to k elements)
|
||||
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
|
||||
for (int i = 0; i < Math.min(k, matchCount); i++) {
|
||||
results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class representing a suggested completion.
|
||||
*/
|
||||
public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
|
||||
private final String word;
|
||||
private final int score;
|
||||
|
||||
public ScoredSuggestion(String word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return word;
|
||||
}
|
||||
|
||||
public int getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return word + " (" + score + ")";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
|
||||
return Integer.compare(this.score, o.score);
|
||||
}
|
||||
}
|
||||
}
|
@@ -2,74 +2,83 @@ package nu.marginalia.assistant.suggest;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.functions.math.dict.SpellChecker;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import org.apache.commons.collections4.trie.PatriciaTrie;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.*;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
public class Suggestions {
|
||||
private PatriciaTrie<String> suggestionsTrie = null;
|
||||
private TermFrequencyDict termFrequencyDict = null;
|
||||
private volatile boolean ready = false;
|
||||
private final SpellChecker spellChecker;
|
||||
List<PrefixSearchStructure> searchStructures = new ArrayList<>();
|
||||
|
||||
private volatile boolean ready = false;
|
||||
|
||||
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
|
||||
private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
|
||||
|
||||
private static final int MIN_SUGGEST_LENGTH = 3;
|
||||
@Inject
|
||||
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
|
||||
SpellChecker spellChecker,
|
||||
TermFrequencyDict dict
|
||||
public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
|
||||
@Named("suggestions-file2") Path suggestionsFile2
|
||||
) {
|
||||
this.spellChecker = spellChecker;
|
||||
|
||||
Thread.ofPlatform().start(() -> {
|
||||
suggestionsTrie = loadSuggestions(suggestionsFile);
|
||||
termFrequencyDict = dict;
|
||||
searchStructures.add(loadSuggestions(suggestionsFile1));
|
||||
searchStructures.add(loadSuggestions(suggestionsFile2));
|
||||
ready = true;
|
||||
logger.info("Loaded {} suggestions", suggestionsTrie.size());
|
||||
logger.info("Loaded suggestions");
|
||||
});
|
||||
}
|
||||
|
||||
private static PatriciaTrie<String> loadSuggestions(Path file) {
|
||||
private static PrefixSearchStructure loadSuggestions(Path file) {
|
||||
PrefixSearchStructure ret = new PrefixSearchStructure();
|
||||
|
||||
if (!Files.exists(file)) {
|
||||
logger.error("Suggestions file {} absent, loading empty suggestions db", file);
|
||||
return new PatriciaTrie<>();
|
||||
return ret;
|
||||
}
|
||||
try (var lines = Files.lines(file)) {
|
||||
var ret = new PatriciaTrie<String>();
|
||||
|
||||
lines.filter(suggestionPattern.asPredicate())
|
||||
.filter(line -> line.length()<32)
|
||||
.map(String::toLowerCase)
|
||||
.forEach(w -> ret.put(w, w));
|
||||
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
|
||||
while (scanner.hasNextLine()) {
|
||||
String line = scanner.nextLine().trim();
|
||||
String[] parts = StringUtils.split(line, " ,", 2);
|
||||
if (parts.length != 2) {
|
||||
logger.warn("Invalid suggestion line: {}", line);
|
||||
continue;
|
||||
}
|
||||
int cnt = Integer.parseInt(parts[0]);
|
||||
if (cnt > 1) {
|
||||
String word = parts[1];
|
||||
|
||||
// Add special keywords to the suggestions
|
||||
for (var feature : HtmlFeature.values()) {
|
||||
String keyword = feature.getKeyword();
|
||||
// Remove quotes and trailing periods if this is a CSV
|
||||
if (word.startsWith("\"") && word.endsWith("\"")) {
|
||||
word = word.substring(1, word.length() - 1);
|
||||
}
|
||||
|
||||
ret.put(keyword, keyword);
|
||||
ret.put("-" + keyword, "-" + keyword);
|
||||
// Remove trailing periods
|
||||
while (word.endsWith(".")) {
|
||||
word = word.substring(0, word.length() - 1);
|
||||
}
|
||||
|
||||
// Remove junk items we may have gotten from link extraction
|
||||
if (word.startsWith("click here"))
|
||||
continue;
|
||||
|
||||
if (word.length() > 3) {
|
||||
ret.insert(word, cnt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to load suggestions file", ex);
|
||||
return new PatriciaTrie<>();
|
||||
return new PrefixSearchStructure();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,96 +92,36 @@ public class Suggestions {
|
||||
|
||||
searchWord = StringUtils.stripStart(searchWord.toLowerCase(), " ");
|
||||
|
||||
return Stream.of(
|
||||
new SuggestionStream("", getSuggestionsForKeyword(count, searchWord)),
|
||||
suggestionsForLastWord(count, searchWord),
|
||||
spellCheckStream(searchWord)
|
||||
)
|
||||
.flatMap(SuggestionsStreamable::stream)
|
||||
.limit(count)
|
||||
.collect(Collectors.toList());
|
||||
return getSuggestionsForKeyword(count, searchWord);
|
||||
}
|
||||
|
||||
private SuggestionsStreamable suggestionsForLastWord(int count, String searchWord) {
|
||||
int sp = searchWord.lastIndexOf(' ');
|
||||
|
||||
if (sp < 0) {
|
||||
return Stream::empty;
|
||||
}
|
||||
|
||||
String prefixString = searchWord.substring(0, sp+1);
|
||||
String suggestString = searchWord.substring(sp+1);
|
||||
|
||||
return new SuggestionStream(prefixString, getSuggestionsForKeyword(count, suggestString));
|
||||
|
||||
}
|
||||
|
||||
private SuggestionsStreamable spellCheckStream(String word) {
|
||||
int start = word.lastIndexOf(' ');
|
||||
String prefix;
|
||||
String corrWord;
|
||||
|
||||
if (start < 0) {
|
||||
corrWord = word;
|
||||
prefix = "";
|
||||
}
|
||||
else {
|
||||
prefix = word.substring(0, start + 1);
|
||||
corrWord = word.substring(start + 1);
|
||||
}
|
||||
|
||||
if (corrWord.length() >= MIN_SUGGEST_LENGTH) {
|
||||
Supplier<Stream<String>> suggestionsLazyEval = () -> spellChecker.correct(corrWord).stream();
|
||||
return new SuggestionStream(prefix, Stream.of(suggestionsLazyEval).flatMap(Supplier::get));
|
||||
}
|
||||
else {
|
||||
return Stream::empty;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
|
||||
public List<String> getSuggestionsForKeyword(int count, String prefix) {
|
||||
if (!ready)
|
||||
return Stream.empty();
|
||||
return List.of();
|
||||
|
||||
if (prefix.length() < MIN_SUGGEST_LENGTH) {
|
||||
return Stream.empty();
|
||||
return List.of();
|
||||
}
|
||||
|
||||
var start = suggestionsTrie.select(prefix);
|
||||
List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
|
||||
|
||||
if (start == null) {
|
||||
return Stream.empty();
|
||||
for (var searchStructure : searchStructures) {
|
||||
resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
|
||||
}
|
||||
resultsAll.sort(Comparator.reverseOrder());
|
||||
List<String> ret = new ArrayList<>(count);
|
||||
|
||||
Set<String> seen = new HashSet<>();
|
||||
for (var result : resultsAll) {
|
||||
if (seen.add(result.getWord())) {
|
||||
ret.add(result.getWord());
|
||||
}
|
||||
if (ret.size() >= count) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!start.getKey().startsWith(prefix)) {
|
||||
return Stream.empty();
|
||||
}
|
||||
|
||||
SuggestionsValueCalculator sv = new SuggestionsValueCalculator();
|
||||
|
||||
return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
|
||||
.takeWhile(s -> s.startsWith(prefix))
|
||||
.limit(256)
|
||||
.sorted(Comparator.comparing(sv::get).thenComparing(String::length).thenComparing(Comparator.naturalOrder()))
|
||||
.limit(count);
|
||||
return ret;
|
||||
}
|
||||
|
||||
private record SuggestionStream(String prefix, Stream<String> suggestionStream) implements SuggestionsStreamable {
|
||||
public Stream<String> stream() {
|
||||
return suggestionStream.map(s -> prefix + s);
|
||||
}
|
||||
}
|
||||
|
||||
interface SuggestionsStreamable { Stream<String> stream(); }
|
||||
|
||||
private class SuggestionsValueCalculator {
|
||||
|
||||
private final Map<String, Long> hashCache = new HashMap<>(512);
|
||||
|
||||
public int get(String s) {
|
||||
long hash = hashCache.computeIfAbsent(s, TermFrequencyDict::getStringHash);
|
||||
return -termFrequencyDict.getTermFreqHash(hash);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -59,9 +59,14 @@ public class ControlMain extends MainClass {
|
||||
download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
|
||||
}
|
||||
|
||||
Path suggestionsFile = dataPath.resolve("suggestions.txt");
|
||||
Path suggestionsFile = dataPath.resolve("suggestions2.txt.gz");
|
||||
if (!Files.exists(suggestionsFile)) {
|
||||
downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
|
||||
download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
|
||||
}
|
||||
|
||||
Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
|
||||
if (!Files.exists(altSuggestionsFile)) {
|
||||
download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
|
||||
}
|
||||
|
||||
Path asnRawData = dataPath.resolve("asn-data-raw-table");
|
||||
|
@@ -24,25 +24,25 @@ This is a sample of real crawl data. It is intended for demo, testing and devel
|
||||
<tr>
|
||||
<td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-s">Small</label></td>
|
||||
<td>1000 Domains. About 2 GB. </td>
|
||||
<td>1000 Domains. About 1 GB. </td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-m">Medium</label></td>
|
||||
<td>2000 Domains. About 6 GB. Recommended.</td>
|
||||
<td>2000 Domains. About 2 GB. Recommended.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-l">Large</label></td>
|
||||
<td>5000 Domains. About 20 GB.</td>
|
||||
<td>5000 Domains. About 7 GB.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-xl">Huge</label></td>
|
||||
<td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
|
||||
<td>50,000 Domains. Around 80 GB. Primarily intended for pre-production like testing environments.
|
||||
Expect hours of processing time. </td>
|
||||
</tr>
|
||||
</table>
|
||||
|
@@ -1,4 +1,5 @@
|
||||
## This is a token file for automatic deployment
|
||||
|
||||
2025-01-08: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
||||
2025-04-24: Deploy executor.
|
Reference in New Issue
Block a user