mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-08 10:02:41 +02:00

Compare commits


15 Commits

Author SHA1 Message Date
Viktor
f0c9b935d8 Merge pull request #192 from MarginaliaSearch/improve-suggestions
Improve typeahead suggestions
2025-04-23 20:17:49 +02:00
Viktor Lofgren
7b5493dd51 (assistant) Improve typeahead suggestions
Implement a new prefix search structure (hash table based rather than a trie) with a per-entry score used to rank completions.
2025-04-23 20:13:53 +02:00
Viktor Lofgren
c246a59158 (search) Make it clearer that it's a search engine 2025-04-22 16:03:42 +02:00
Viktor
0b99781d24 Merge pull request #191 from MarginaliaSearch/pdf-support-in-crawler
Pdf support in crawler
2025-04-22 15:52:41 +02:00
Viktor Lofgren
39db9620c1 (crawler) Increase maximum permitted file size to 32 MB 2025-04-22 15:51:03 +02:00
Viktor Lofgren
1781599363 (crawler) Add support for crawling PDF files 2025-04-22 15:50:05 +02:00
Viktor Lofgren
6b2d18fb9b (crawler) Adjust domain limits to be generally more permissive. 2025-04-22 15:27:57 +02:00
Viktor
59b1d200ab Merge pull request #190 from MarginaliaSearch/download-sample-chores
Download sample chores
2025-04-22 13:29:49 +02:00
Viktor Lofgren
897010a2cf (control) Update download sample data actor with better UI
The original implementation gave little feedback about what it was doing. This adds a progress bar to the download step.

Relates to issue 189.
2025-04-22 13:27:22 +02:00
Viktor Lofgren
602af7a77e (control) Update UI with new sample sizes
Relates to issue 189.
2025-04-22 13:27:13 +02:00
Viktor Lofgren
a7d91c8527 (crawler) Clean up fetcher detailed logging 2025-04-21 12:53:52 +02:00
Viktor Lofgren
7151602124 (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Cleaning up after changes.
2025-04-21 12:47:03 +02:00
Viktor Lofgren
884e33bd4a (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Change back to an unbounded queue, tighten sleep times a bit.
2025-04-21 11:48:15 +02:00
Viktor Lofgren
e84d5c497a (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Change to a bounded queue and add a sleep to reduce the number of effectively busy-looping threads.
2025-04-21 00:39:26 +02:00
Viktor Lofgren
2d2d3e2466 (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Change to a bounded queue and add a sleep to reduce the number of effectively busy-looping threads.
2025-04-21 00:36:48 +02:00
17 changed files with 647 additions and 164 deletions

File: DownloadSampleActor.java

@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

     private final FileStorageService storageService;
     private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {

                 Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        while (copiedSize < size) {
+                            int read = is.read(buffer);
+
+                            if (read < 0) // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+
+                            // Update progress bar
+                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                        }
+                    }
                 }
                 catch (Exception ex) {
                     eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                     logger.error("Error downloading sample", ex);
                     yield new Error();
                 }
+                finally {
+                    urlConnection.disconnect();
+                }

                 eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                 yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
     @Inject
     public DownloadSampleActor(Gson gson,
                                FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog,
+                               ServiceHeartbeat heartbeat)
     {
         super(gson);
         this.storageService = storageService;
         this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
     }
 }
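
The new download loop above is a standard chunked copy with progress reporting: read the advertised Content-Length up front, copy in fixed-size chunks, and report the running total after each chunk. Below is a self-contained sketch of the same pattern; the class name and the LongConsumer callback are illustrative stand-ins for the project's heartbeat API.

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.function.LongConsumer;

class ProgressDownload {
    // Copies the response body to 'dest', reporting the byte total after each chunk
    static void download(String url, Path dest, LongConsumer onProgress) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
        try (var is = new BufferedInputStream(conn.getInputStream());
             var os = new BufferedOutputStream(Files.newOutputStream(dest, StandardOpenOption.CREATE))) {
            long expected = conn.getContentLengthLong(); // -1 if the server sent no Content-Length
            byte[] buffer = new byte[8192];
            long copied = 0;
            for (int read; (read = is.read(buffer)) >= 0; ) {
                os.write(buffer, 0, read);
                copied += read;
                onProgress.accept(copied);
            }
            if (expected >= 0 && copied < expected)
                throw new IOException("Unexpected end of stream");
        }
        finally {
            conn.disconnect();
        }
    }
}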

File: CrawlerMain.java

@@ -66,6 +66,7 @@ public class CrawlerMain extends ProcessMainClass {
     private final DomainLocks domainLocks = new DomainLocks();

     private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
+    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();

     private final AtomicInteger tasksDone = new AtomicInteger(0);
@@ -289,10 +290,13 @@ public class CrawlerMain extends ProcessMainClass {
             if (hasTasks || hasRetryTasks || hasRunningTasks) {
                 retryQueue.drainTo(taskList);
+
+                // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
                 taskList.removeIf(this::trySubmitDeferredTask);

                 // Add a small pause here to avoid busy looping toward the end of the execution cycle when
                 // we might have no new viable tasks to run for hours on end
-                TimeUnit.MILLISECONDS.sleep(50);
+                TimeUnit.MILLISECONDS.sleep(5);
             } else {
                 // We have no tasks to run, and no tasks in the retry queue
                 // but we wait a bit to see if any new tasks come in via the retry queue
@@ -430,7 +434,7 @@ public class CrawlerMain extends ProcessMainClass {
         /** Best effort indicator whether we could start this now without getting stuck in
          * DomainLocks purgatory */
         public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
         }

         @Override
@@ -445,12 +449,21 @@ public class CrawlerMain extends ProcessMainClass {
             // We don't have a lock, so we can't run this task
             // we return to avoid blocking the pool for too long
             if (lock.isEmpty()) {
-                retryQueue.add(this);
+                if (retryQueue.remainingCapacity() > 0) {
+                    // Sleep a moment to avoid busy looping via the retry queue
+                    // in the case when few tasks remain and almost all are ineligible for
+                    // immediate restart
+                    Thread.sleep(5);
+                }
+
+                retryQueue.put(this);
                 return;
             }
             DomainLocks.DomainLock domainLock = lock.get();

             try (domainLock) {
+                Thread.currentThread().setName("crawling:" + domain);
+
                 Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
                 Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
                 Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
@@ -482,7 +495,7 @@ public class CrawlerMain extends ProcessMainClass {
                 // (mostly a case when migrating from legacy->warc)
                 reference.delete();

-                // Convert the WARC file to Parquet
+                // Convert the WARC file to Slop
                 SlopCrawlDataRecord
                         .convertWarc(domain, userAgent, newWarcFile, slopFile);
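
The retry mechanism above drains deferred tasks back into the submission loop once per cycle. A compact, self-contained sketch of that drain-and-resubmit pattern, with placeholder task and submission logic (names are illustrative, not the repo's exact code):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

class RetrySubmitLoop {
    record Task(String domain) {}

    private final LinkedBlockingQueue<Task> retryQueue = new LinkedBlockingQueue<>();

    // Placeholder: true if the task could be handed to the worker pool
    private boolean trySubmitDeferredTask(Task task) {
        return false;
    }

    void pollOnce() throws InterruptedException {
        List<Task> taskList = new ArrayList<>();

        // Pull everything that was deferred, resubmit what we can,
        // and push the rest back for the next round
        retryQueue.drainTo(taskList);
        taskList.removeIf(this::trySubmitDeferredTask);
        retryQueue.addAll(taskList);

        // Short pause so an empty round doesn't spin the CPU
        TimeUnit.MILLISECONDS.sleep(5);
    }
}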

File: HttpFetcherImpl.java

@@ -53,6 +53,7 @@ import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
 import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
+import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
@@ -393,25 +394,31 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
         if (probeType == HttpFetcher.ProbeType.FULL) {
             try {
                 var probeResult = probeContentType(url, cookies, timer, contentTags);
-                logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
+
                 switch (probeResult) {
                     case HttpFetcher.ContentTypeProbeResult.NoOp():
                         break; //
                     case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                        logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
                         url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
                         break;
                     case ContentTypeProbeResult.BadContentType badContentType:
                         warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
+                        logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
                         return new HttpFetchResult.ResultNone();
                     case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
+                        logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
                         warcRecorder.flagAsTimeout(url);
                         return new HttpFetchResult.ResultException(ex);
                     case ContentTypeProbeResult.Exception(Exception ex):
+                        logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
                         warcRecorder.flagAsError(url, ex);
                         return new HttpFetchResult.ResultException(ex);
                     case ContentTypeProbeResult.HttpError httpError:
+                        logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
                         return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
                     case ContentTypeProbeResult.Redirect redirect:
+                        logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
                         return new HttpFetchResult.ResultRedirect(redirect.location());
                 }
             } catch (Exception ex) {
@@ -430,27 +437,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
             contentTags.paint(request);

             try (var sl = new SendLock()) {
+                Instant start = Instant.now();
                 HttpFetchResult result = warcRecorder.fetch(client, cookies, request);

+                Duration fetchDuration = Duration.between(start, Instant.now());
+
                 if (result instanceof HttpFetchResult.ResultOk ok) {
                     if (ok.statusCode() == 304) {
-                        return new HttpFetchResult.Result304Raw();
+                        result = new HttpFetchResult.Result304Raw();
                     }
                 }

                 switch (result) {
-                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
+                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
                     case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
                     case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
-                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
+                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
                     case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
                     case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
                 }

                 return result;
             }
         }
         catch (Exception ex) {
-            ex.printStackTrace();
+            logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
             return new HttpFetchResult.ResultException(ex);
         }
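
All of the audit lines route through an SLF4J Marker (crawlerAuditMarker), which lets the logging configuration direct crawler audit events to a dedicated appender. A minimal sketch of the pattern follows; the marker string is an assumption, since the diff only shows the field name:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

class AuditLogExample {
    private static final Logger logger = LoggerFactory.getLogger(AuditLogExample.class);

    // The marker name is illustrative; log filters/appenders can match on it
    private static final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER_AUDIT");

    void logFetchOk(String url, int statusCode, long millis) {
        logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", statusCode, url, millis);
    }
}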

File: WarcRecorder.java

@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
     static final int MAX_TIME = 30_000;

     /** Maximum (decompressed) size we'll save */
-    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

     private final WarcWriter writer;
     private final Path warcFile;
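
Note that Integer.getInteger reads a JVM system property, so the 32 MB figure is only a default: operators can still override the cap at launch, e.g. with -Dcrawler.maxFetchSize=16777216 for a 16 MB limit (the value is in bytes).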

File: DomainLocks.java

@@ -20,16 +20,17 @@ public class DomainLocks {
      * and may be held by another thread. The caller is responsible for locking and releasing the lock.
      */
     public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        var ret = new DomainLock(domain.toString(),
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
-        ret.lock();
-        return ret;
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        sem.acquire();
+
+        return new DomainLock(sem);
     }

     public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
         var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
         if (sem.tryAcquire(1)) {
-            return Optional.of(new DomainLock(domain.toString(), sem));
+            return Optional.of(new DomainLock(sem));
         }
         else {
             // We don't have a lock, so we return an empty optional
@@ -42,23 +43,27 @@ public class DomainLocks {
             return new Semaphore(16);
         if (topDomain.equals("blogspot.com"))
             return new Semaphore(8);
+        if (topDomain.equals("tumblr.com"))
+            return new Semaphore(8);
         if (topDomain.equals("neocities.org"))
-            return new Semaphore(4);
+            return new Semaphore(8);
         if (topDomain.equals("github.io"))
-            return new Semaphore(4);
+            return new Semaphore(8);

+        // Substack really dislikes broad-scale crawlers, so we need to be careful
+        // to not get blocked.
         if (topDomain.equals("substack.com")) {
             return new Semaphore(1);
         }
+        if (topDomain.endsWith(".edu")) {
+            return new Semaphore(1);
+        }

         return new Semaphore(2);
     }

-    public boolean canLock(EdgeDomain domain) {
+    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
+     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
+     * after this method returns true)
+     */
+    public boolean isLockableHint(EdgeDomain domain) {
         Semaphore sem = locks.get(domain.topDomain.toLowerCase());
         if (null == sem)
             return true;
@@ -67,25 +72,16 @@ public class DomainLocks {
     }

     public static class DomainLock implements AutoCloseable {
-        private final String domainName;
         private final Semaphore semaphore;

-        DomainLock(String domainName, Semaphore semaphore) {
-            this.domainName = domainName;
+        DomainLock(Semaphore semaphore) {
             this.semaphore = semaphore;
         }

-        // This method is called to lock the domain. It will block until the lock is available.
-        private void lock() throws InterruptedException {
-            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
-            semaphore.acquire();
-            Thread.currentThread().setName("crawling:" + domainName);
-        }
-
         @Override
         public void close() throws Exception {
             semaphore.release();
-            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+            Thread.currentThread().setName("[idle]");
         }
     }
 }
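
The slimmed-down DomainLock is just an AutoCloseable wrapper over a per-top-domain Semaphore, so a crawl task can hold the permit with try-with-resources. A self-contained sketch of the pattern, with plain strings standing in for the project's EdgeDomain type:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

class PerDomainPermits {
    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

    // AutoCloseable wrapper so try-with-resources releases the permit
    record Permit(Semaphore sem) implements AutoCloseable {
        @Override
        public void close() {
            sem.release();
        }
    }

    Permit lockDomain(String topDomain) throws InterruptedException {
        var sem = locks.computeIfAbsent(topDomain.toLowerCase(), k -> new Semaphore(2));
        sem.acquire();
        return new Permit(sem);
    }
}

// Usage:
//   try (var permit = permits.lockDomain("example.com")) {
//       // at most two crawl tasks touch example.com concurrently
//   }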

File: ContentTypes.java

@@ -6,6 +6,7 @@ public class ContentTypes {
     public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
             "application/xhtml",
             "text/html",
+            "application/pdf",
             "image/x-icon",
             "text/plain");
@@ -19,4 +20,9 @@ public class ContentTypes {
         return false;
     }

+    public static boolean isBinary(String contentTypeHeader) {
+        String lcHeader = contentTypeHeader.toLowerCase();
+        return lcHeader.startsWith("application/pdf");
+    }
+
 }

File: SlopSerializableCrawlDataStream.java

@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
     public boolean filter(String url, int status, String contentType) {
         String ctLc = contentType.toLowerCase();

+        // Permit all plain text content types
         if (ctLc.startsWith("text/"))
             return true;
+        // PDF
+        else if (ctLc.startsWith("application/pdf"))
+            return true;
         else if (ctLc.startsWith("x-marginalia/"))
             return true;

File: ContentTypeLogic.java

@@ -10,7 +10,7 @@ import java.util.regex.Pattern;

 public class ContentTypeLogic {

-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
+    private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
     private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
     private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
     private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
             "application/rss+xml",
             "application/x-rss+xml",
             "application/rdf+xml",
+            "application/pdf",
             "x-rss+xml"
     );
     private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
     public boolean isUrlLikeBinary(EdgeUrl url) {
         String pathLowerCase = url.path.toLowerCase();

-        if (probableHtmlPattern.test(pathLowerCase))
+        if (probableGoodPattern.test(pathLowerCase))
             return false;

         return probableBinaryPattern.test(pathLowerCase);

File: SlopCrawlDataRecord.java

@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
             return false;
         }

+        // If the format is binary, we don't want to translate it if the response is truncated
+        if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
+            return false;
+        }
+
         return true;
     }

File: HttpFetcherImplFetchTest.java

@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
     private static EdgeUrl badHttpStatusUrl;
     private static EdgeUrl keepAliveUrl;
+    private static EdgeUrl pdfUrl;
+
     @BeforeAll
     public static void setupAll() throws URISyntaxException {
         wireMockServer =
@@ -133,6 +135,13 @@
                 ));

+        pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/pdf")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
         wireMockServer.start();
     }
@@ -352,6 +361,14 @@
         Assertions.assertTrue(result.isOk());
     }

+    @Test
+    public void testPdf() {
+        var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+    }
+
     private List<WarcRecord> getWarcRecords() throws IOException {
         List<WarcRecord> records = new ArrayList<>();

File: WarcRecorderTest.java

@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.slop.SlopCrawlDataRecord;
 import org.apache.hc.client5.http.classic.HttpClient;
 import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 class WarcRecorderTest {
     Path fileNameWarc;
-    Path fileNameParquet;
+    Path fileNameSlop;
     WarcRecorder client;

     HttpClient httpClient;
@@ -39,7 +40,7 @@ class WarcRecorderTest {
         httpClient = HttpClients.createDefault();

         fileNameWarc = Files.createTempFile("test", ".warc");
-        fileNameParquet = Files.createTempFile("test", ".parquet");
+        fileNameSlop = Files.createTempFile("test", ".slop.zip");
         client = new WarcRecorder(fileNameWarc);
     }
@@ -159,17 +160,28 @@ class WarcRecorderTest {
         client.fetch(httpClient, new DomainCookies(), request3);

-        CrawledDocumentParquetRecordFileWriter.convertWarc(
+        HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
+        request4.addHeader("User-agent", "test.marginalia.nu");
+        request4.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, new DomainCookies(), request4);
+
+        SlopCrawlDataRecord.convertWarc(
                 "www.marginalia.nu",
                 new UserAgent("test", "test"),
                 fileNameWarc,
-                fileNameParquet);
+                fileNameSlop);

-        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(2, urls.size());
+        List<String> urls;
+        try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
+            urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
+        }
+
+        assertEquals(3, urls.size());
         assertEquals("https://www.marginalia.nu/", urls.get(0));
         assertEquals("https://www.marginalia.nu/log/", urls.get(1));

         // sanic.jpg gets filtered out for its bad mime type
+        assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
     }

File: search form template (filename not shown)

@@ -13,7 +13,7 @@
           class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
           value="${query}"
           autofocus
-          placeholder="Search..."
+          placeholder="Search the web!"
           autocomplete="off"
           name="query"
           id="searchInput" />
@@ -21,7 +21,7 @@
     <input type="text"
           class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
           value="${query}"
-          placeholder="Search..."
+          placeholder="Search the web!"
           autocomplete="off"
           name="query"
           id="searchInput" />

File: AssistantModule.java

@@ -10,7 +10,7 @@ import static com.google.inject.name.Names.named;

 public class AssistantModule extends AbstractModule {
     public void configure() {
-        bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
+        bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));

         bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
     }
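
The binding pairs with a matching @Named injection point on the consuming side; a minimal sketch of how the bound path would be received (the class shown is illustrative, not the repo's exact constructor):

import com.google.inject.Inject;
import com.google.inject.name.Named;
import java.nio.file.Path;

class SuggestionsConsumer {
    private final Path suggestionsFile;

    @Inject
    SuggestionsConsumer(@Named("suggestions-file") Path suggestionsFile) {
        this.suggestionsFile = suggestionsFile;
    }
}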

File: PrefixSearchStructure.java (new file)

@@ -0,0 +1,459 @@
package nu.marginalia.assistant.suggest;
import gnu.trove.list.array.TIntArrayList;
import java.util.*;
/** Unhinged data structure for fast prefix searching.
*/
public class PrefixSearchStructure {
// Core data structures
private final HashMap<String, TIntArrayList> prefixIndex; // Short prefix index (up to 8 chars)
private final HashMap<String, TIntArrayList> longPrefixIndex; // Long prefix index (9-16 chars)
private final ArrayList<String> words; // All words by ID
private final TIntArrayList wordScores; // Scores for all words
// Configuration
private static final int SHORT_PREFIX_LENGTH = 8;
private static final int MAX_INDEXED_PREFIX_LENGTH = 16;
public int size() {
return words.size();
}
// For sorting efficiency
private static class WordScorePair {
final String word;
final int score;
WordScorePair(String word, int score) {
this.word = word;
this.score = score;
}
}
/**
* Creates a new PrefixSearchStructure for typeahead search.
*/
public PrefixSearchStructure() {
prefixIndex = new HashMap<>(1024);
longPrefixIndex = new HashMap<>(1024);
words = new ArrayList<>(1024);
wordScores = new TIntArrayList(1024);
}
/**
* Adds a prefix to the index.
*/
private void indexPrefix(String word, int wordId) {
// Index short prefixes
for (int i = 1; i <= Math.min(word.length(), SHORT_PREFIX_LENGTH); i++) {
String prefix = word.substring(0, i);
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
prefix, k -> new TIntArrayList(16));
wordIds.add(wordId);
}
// Index longer prefixes
for (int i = SHORT_PREFIX_LENGTH + 1; i <= Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH); i++) {
String prefix = word.substring(0, i);
TIntArrayList wordIds = longPrefixIndex.computeIfAbsent(
prefix, k -> new TIntArrayList(8));
wordIds.add(wordId);
}
// If the word contains spaces, also index by each term for multi-word queries
if (word.contains(" ")) {
String[] terms = word.split("\\s+");
for (String term : terms) {
if (term.length() >= 2) {
for (int i = 1; i <= Math.min(term.length(), SHORT_PREFIX_LENGTH); i++) {
String termPrefix = "t:" + term.substring(0, i);
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
termPrefix, k -> new TIntArrayList(16));
wordIds.add(wordId);
}
}
}
}
}
/**
* Inserts a word with its associated score.
*/
public void insert(String word, int score) {
if (word == null || word.isEmpty()) {
return;
}
// Add to the word list and index
int wordId = words.size();
words.add(word);
wordScores.add(score);
indexPrefix(word, wordId);
}
/**
* Returns the top k completions for a given prefix.
*/
public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
if (prefix == null || prefix.isEmpty()) {
// Return top k words by score
return getTopKWords(k);
}
// Check if this is a term search (t:) - for searching within multi-word items
boolean isTermSearch = false;
if (prefix.startsWith("t:") && prefix.length() > 2) {
isTermSearch = true;
prefix = prefix.substring(2);
}
// 1. Fast path for short prefixes
if (prefix.length() <= SHORT_PREFIX_LENGTH) {
String lookupPrefix = isTermSearch ? "t:" + prefix : prefix;
TIntArrayList wordIds = prefixIndex.get(lookupPrefix);
if (wordIds != null) {
return getTopKFromWordIds(wordIds, k);
}
}
// 2. Fast path for long prefixes (truncate to MAX_INDEXED_PREFIX_LENGTH)
if (prefix.length() > SHORT_PREFIX_LENGTH) {
// Try exact match in longPrefixIndex first
if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
TIntArrayList wordIds = longPrefixIndex.get(prefix);
if (wordIds != null) {
return getTopKFromWordIds(wordIds, k);
}
}
// If prefix is longer than MAX_INDEXED_PREFIX_LENGTH, truncate and filter
if (prefix.length() > MAX_INDEXED_PREFIX_LENGTH) {
String truncatedPrefix = prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH);
TIntArrayList candidateIds = longPrefixIndex.get(truncatedPrefix);
if (candidateIds != null) {
// Filter candidates by the full prefix
return getFilteredTopKFromWordIds(candidateIds, prefix, k);
}
}
}
// 3. Optimized fallback for long prefixes - use prefix tree for segments
List<ScoredSuggestion> results = new ArrayList<>();
// Handle multi-segment queries by finding candidates from first 8 chars
if (prefix.length() > SHORT_PREFIX_LENGTH) {
String shortPrefix = prefix.substring(0, Math.min(prefix.length(), SHORT_PREFIX_LENGTH));
TIntArrayList candidates = prefixIndex.get(shortPrefix);
if (candidates != null) {
return getFilteredTopKFromWordIds(candidates, prefix, k);
}
}
// 4. Last resort - optimized binary search in sorted segments
return findByBinarySearchPrefix(prefix, k);
}
/**
* Helper to get the top k words by score.
*/
private List<ScoredSuggestion> getTopKWords(int k) {
// Create pairs of (score, wordId)
int[][] pairs = new int[words.size()][2];
for (int i = 0; i < words.size(); i++) {
pairs[i][0] = wordScores.get(i);
pairs[i][1] = i;
}
// Sort by score (descending)
Arrays.sort(pairs, (a, b) -> Integer.compare(b[0], a[0]));
// Take top k
List<ScoredSuggestion> results = new ArrayList<>();
for (int i = 0; i < Math.min(k, pairs.length); i++) {
String word = words.get(pairs[i][1]);
int score = pairs[i][0];
results.add(new ScoredSuggestion(word, score));
}
return results;
}
/**
* Helper to get the top k words from a list of word IDs.
*/
private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
if (wordIds == null || wordIds.isEmpty()) {
return Collections.emptyList();
}
// For small lists, avoid sorting
if (wordIds.size() <= k) {
List<ScoredSuggestion> results = new ArrayList<>(wordIds.size());
int[] ids = wordIds.toArray();
for (int wordId : ids) {
if (wordId >= 0 && wordId < words.size()) {
results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
}
}
results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
return results;
}
// For larger lists, use an array-based approach for better performance
// Find top k without full sorting
int[] topScores = new int[k];
int[] topWordIds = new int[k];
int[] ids = wordIds.toArray();
// Initialize with first k elements
int filledCount = Math.min(k, ids.length);
for (int i = 0; i < filledCount; i++) {
int wordId = ids[i];
if (wordId >= 0 && wordId < words.size()) {
topWordIds[i] = wordId;
topScores[i] = wordScores.get(wordId);
}
}
// Sort initial elements
for (int i = 0; i < filledCount; i++) {
for (int j = i + 1; j < filledCount; j++) {
if (topScores[j] > topScores[i]) {
// Swap scores
int tempScore = topScores[i];
topScores[i] = topScores[j];
topScores[j] = tempScore;
// Swap word IDs
int tempId = topWordIds[i];
topWordIds[i] = topWordIds[j];
topWordIds[j] = tempId;
}
}
}
// Process remaining elements
int minScore = filledCount > 0 ? topScores[filledCount - 1] : Integer.MIN_VALUE;
for (int i = k; i < ids.length; i++) {
int wordId = ids[i];
if (wordId >= 0 && wordId < words.size()) {
int score = wordScores.get(wordId);
if (score > minScore) {
// Replace the lowest element
topScores[filledCount - 1] = score;
topWordIds[filledCount - 1] = wordId;
// Bubble up the new element
for (int j = filledCount - 1; j > 0; j--) {
if (topScores[j] > topScores[j - 1]) {
// Swap scores
int tempScore = topScores[j];
topScores[j] = topScores[j - 1];
topScores[j - 1] = tempScore;
// Swap word IDs
int tempId = topWordIds[j];
topWordIds[j] = topWordIds[j - 1];
topWordIds[j - 1] = tempId;
} else {
break;
}
}
// Update min score
minScore = topScores[filledCount - 1];
}
}
}
// Create result list
List<ScoredSuggestion> results = new ArrayList<>(filledCount);
for (int i = 0; i < filledCount; i++) {
results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
}
return results;
}
/**
* Use binary search on sorted word segments to efficiently find matches.
*/
private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
// If we have a lot of words, use an optimized segment approach
if (words.size() > 1000) {
// Divide words into segments for better locality
int segmentSize = 1000;
int numSegments = (words.size() + segmentSize - 1) / segmentSize;
// Find matches using binary search within each segment
List<WordScorePair> allMatches = new ArrayList<>();
for (int segment = 0; segment < numSegments; segment++) {
int start = segment * segmentSize;
int end = Math.min(start + segmentSize, words.size());
// Binary search for first potential match
int pos = Collections.binarySearch(
words.subList(start, end),
prefix,
(a, b) -> a.compareTo(b)
);
if (pos < 0) {
pos = -pos - 1;
}
// Collect all matches
for (int i = start + pos; i < end && i < words.size(); i++) {
String word = words.get(i);
if (word.startsWith(prefix)) {
allMatches.add(new WordScorePair(word, wordScores.get(i)));
} else if (word.compareTo(prefix) > 0) {
break; // Past potential matches
}
}
}
// Sort by score and take top k
allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, allMatches.size()));
for (int i = 0; i < Math.min(k, allMatches.size()); i++) {
WordScorePair pair = allMatches.get(i);
results.add(new ScoredSuggestion(pair.word, pair.score));
}
return results;
}
// Fallback for small dictionaries - linear scan but optimized
return simpleSearchFallback(prefix, k);
}
/**
* Optimized linear scan - only used for small dictionaries.
*/
private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
// Use primitive arrays for better cache locality
int[] matchScores = new int[Math.min(words.size(), 100)]; // Assume we won't find more than 100 matches
String[] matchWords = new String[matchScores.length];
int matchCount = 0;
for (int i = 0; i < words.size() && matchCount < matchScores.length; i++) {
String word = words.get(i);
if (word.startsWith(prefix)) {
matchWords[matchCount] = word;
matchScores[matchCount] = wordScores.get(i);
matchCount++;
}
}
// Sort matches by score (in-place for small arrays)
for (int i = 0; i < matchCount; i++) {
for (int j = i + 1; j < matchCount; j++) {
if (matchScores[j] > matchScores[i]) {
// Swap scores
int tempScore = matchScores[i];
matchScores[i] = matchScores[j];
matchScores[j] = tempScore;
// Swap words
String tempWord = matchWords[i];
matchWords[i] = matchWords[j];
matchWords[j] = tempWord;
}
}
}
// Create results
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
for (int i = 0; i < Math.min(k, matchCount); i++) {
results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
}
return results;
}
/**
* Get top k words from candidate IDs, filtering by the full prefix.
*/
private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
if (wordIds == null || wordIds.isEmpty()) {
return Collections.emptyList();
}
// Make primitive arrays for better performance
String[] matchWords = new String[Math.min(wordIds.size(), 1000)];
int[] matchScores = new int[matchWords.length];
int matchCount = 0;
int[] ids = wordIds.toArray();
for (int i = 0; i < ids.length && matchCount < matchWords.length; i++) {
int wordId = ids[i];
if (wordId >= 0 && wordId < words.size()) {
String word = words.get(wordId);
if (word.startsWith(fullPrefix)) {
matchWords[matchCount] = word;
matchScores[matchCount] = wordScores.get(wordId);
matchCount++;
}
}
}
// Sort by score (efficient insertion sort for small k)
for (int i = 0; i < Math.min(matchCount, k); i++) {
int maxPos = i;
for (int j = i + 1; j < matchCount; j++) {
if (matchScores[j] > matchScores[maxPos]) {
maxPos = j;
}
}
if (maxPos != i) {
// Swap
int tempScore = matchScores[i];
matchScores[i] = matchScores[maxPos];
matchScores[maxPos] = tempScore;
String tempWord = matchWords[i];
matchWords[i] = matchWords[maxPos];
matchWords[maxPos] = tempWord;
}
}
// Create result list (only up to k elements)
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
for (int i = 0; i < Math.min(k, matchCount); i++) {
results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
}
return results;
}
/**
* Class representing a suggested completion.
*/
public static class ScoredSuggestion {
private final String word;
private final int score;
public ScoredSuggestion(String word, int score) {
this.word = word;
this.score = score;
}
public String getWord() {
return word;
}
public int getScore() {
return score;
}
@Override
public String toString() {
return word + " (" + score + ")";
}
}
}
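
A short usage sketch of the structure above, with made-up suggestion data:

PrefixSearchStructure index = new PrefixSearchStructure();
index.insert("linux", 900);
index.insert("linux mint", 250);
index.insert("linear algebra", 120);

// Highest-scored completions come back first
for (var suggestion : index.getTopCompletions("lin", 2)) {
    System.out.println(suggestion);   // "linux (900)", then "linux mint (250)"
}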

File: Suggestions.java

@@ -4,23 +4,24 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.functions.math.dict.SpellChecker;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import nu.marginalia.model.crawl.HtmlFeature;
-import org.apache.commons.collections4.trie.PatriciaTrie;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
-import java.util.function.Supplier;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Scanner;
 import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;

 public class Suggestions {
-    private PatriciaTrie<String> suggestionsTrie = null;
+    private PrefixSearchStructure searchStructure = null;
     private TermFrequencyDict termFrequencyDict = null;
     private volatile boolean ready = false;
     private final SpellChecker spellChecker;
@@ -37,39 +38,40 @@ public class Suggestions {
         this.spellChecker = spellChecker;

         Thread.ofPlatform().start(() -> {
-            suggestionsTrie = loadSuggestions(suggestionsFile);
+            searchStructure = loadSuggestions(suggestionsFile);
             termFrequencyDict = dict;
             ready = true;
-            logger.info("Loaded {} suggestions", suggestionsTrie.size());
+            logger.info("Loaded {} suggestions", searchStructure.size());
         });
     }

-    private static PatriciaTrie<String> loadSuggestions(Path file) {
+    private static PrefixSearchStructure loadSuggestions(Path file) {
+        PrefixSearchStructure ret = new PrefixSearchStructure();
+
         if (!Files.exists(file)) {
             logger.error("Suggestions file {} absent, loading empty suggestions db", file);
-            return new PatriciaTrie<>();
+            return ret;
         }

-        try (var lines = Files.lines(file)) {
-            var ret = new PatriciaTrie<String>();
-
-            lines.filter(suggestionPattern.asPredicate())
-                    .filter(line -> line.length()<32)
-                    .map(String::toLowerCase)
-                    .forEach(w -> ret.put(w, w));
-
-            // Add special keywords to the suggestions
-            for (var feature : HtmlFeature.values()) {
-                String keyword = feature.getKeyword();
-
-                ret.put(keyword, keyword);
-                ret.put("-" + keyword, "-" + keyword);
-            }
-
+        try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
+            while (scanner.hasNextLine()) {
+                String line = scanner.nextLine();
+                String[] parts = StringUtils.split(line, " ", 2);
+                if (parts.length != 2) {
+                    logger.warn("Invalid suggestion line: {}", line);
+                    continue;
+                }
+                int cnt = Integer.parseInt(parts[0]);
+                if (cnt > 1) {
+                    String word = parts[1];
+                    ret.insert(word, cnt);
+                }
+            }
             return ret;
         }
         catch (IOException ex) {
             logger.error("Failed to load suggestions file", ex);
-            return new PatriciaTrie<>();
+            return new PrefixSearchStructure();
         }
     }
@@ -83,96 +85,24 @@ public class Suggestions {
         searchWord = StringUtils.stripStart(searchWord.toLowerCase(), " ");

-        return Stream.of(
-                new SuggestionStream("", getSuggestionsForKeyword(count, searchWord)),
-                suggestionsForLastWord(count, searchWord),
-                spellCheckStream(searchWord)
-        )
-                .flatMap(SuggestionsStreamable::stream)
-                .limit(count)
-                .collect(Collectors.toList());
-    }
-
-    private SuggestionsStreamable suggestionsForLastWord(int count, String searchWord) {
-        int sp = searchWord.lastIndexOf(' ');
-        if (sp < 0) {
-            return Stream::empty;
-        }
-        String prefixString = searchWord.substring(0, sp+1);
-        String suggestString = searchWord.substring(sp+1);
-        return new SuggestionStream(prefixString, getSuggestionsForKeyword(count, suggestString));
-    }
-
-    private SuggestionsStreamable spellCheckStream(String word) {
-        int start = word.lastIndexOf(' ');
-        String prefix;
-        String corrWord;
-        if (start < 0) {
-            corrWord = word;
-            prefix = "";
-        }
-        else {
-            prefix = word.substring(0, start + 1);
-            corrWord = word.substring(start + 1);
-        }
-        if (corrWord.length() >= MIN_SUGGEST_LENGTH) {
-            Supplier<Stream<String>> suggestionsLazyEval = () -> spellChecker.correct(corrWord).stream();
-            return new SuggestionStream(prefix, Stream.of(suggestionsLazyEval).flatMap(Supplier::get));
-        }
-        else {
-            return Stream::empty;
-        }
-    }
-
-    public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
+        return getSuggestionsForKeyword(count, searchWord);
+    }
+
+    public List<String> getSuggestionsForKeyword(int count, String prefix) {
         if (!ready)
-            return Stream.empty();
+            return List.of();

         if (prefix.length() < MIN_SUGGEST_LENGTH) {
-            return Stream.empty();
+            return List.of();
         }

-        var start = suggestionsTrie.select(prefix);
-        if (start == null) {
-            return Stream.empty();
-        }
-        if (!start.getKey().startsWith(prefix)) {
-            return Stream.empty();
-        }
-
-        SuggestionsValueCalculator sv = new SuggestionsValueCalculator();
-
-        return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
-                .takeWhile(s -> s.startsWith(prefix))
-                .limit(256)
-                .sorted(Comparator.comparing(sv::get).thenComparing(String::length).thenComparing(Comparator.naturalOrder()))
-                .limit(count);
-    }
-
-    private record SuggestionStream(String prefix, Stream<String> suggestionStream) implements SuggestionsStreamable {
-        public Stream<String> stream() {
-            return suggestionStream.map(s -> prefix + s);
-        }
-    }
-
-    interface SuggestionsStreamable { Stream<String> stream(); }
-
-    private class SuggestionsValueCalculator {
-        private final Map<String, Long> hashCache = new HashMap<>(512);
-
-        public int get(String s) {
-            long hash = hashCache.computeIfAbsent(s, TermFrequencyDict::getStringHash);
-            return -termFrequencyDict.getTermFreqHash(hash);
-        }
-    }
+        var results = searchStructure.getTopCompletions(prefix, count);
+        List<String> ret = new ArrayList<>(count);
+        for (var result : results) {
+            ret.add(result.getWord());
+        }
+        return ret;
+    }
 }
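
For reference, the loader above expects suggestions2.txt.gz to contain one entry per line: an integer occurrence count, a single space, then the suggestion phrase; entries whose count does not exceed 1 are dropped. Illustrative (made-up) lines:

1200 how to exit vim
37 prefix search structure
1 seen-only-once (skipped, since the count must exceed 1)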

File: ControlMain.java

@@ -59,9 +59,9 @@ public class ControlMain extends MainClass {
             download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
         }

-        Path suggestionsFile = dataPath.resolve("suggestions.txt");
+        Path suggestionsFile = dataPath.resolve("suggestions2.txt.gz");
         if (!Files.exists(suggestionsFile)) {
-            downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
+            download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
         }

         Path asnRawData = dataPath.resolve("asn-data-raw-table");

File: sample data download template (control UI)

@@ -24,25 +24,25 @@ This is a sample of real crawl data. It is intended for demo, testing and devel
     <tr>
         <td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
         <td><label for="sample-s">Small</label></td>
-        <td>1000 Domains. About 2 GB. </td>
+        <td>1000 Domains. About 1 GB. </td>
     </tr>
     <tr>
         <td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
         <td><label for="sample-m">Medium</label></td>
-        <td>2000 Domains. About 6 GB. Recommended.</td>
+        <td>2000 Domains. About 2 GB. Recommended.</td>
     </tr>
     <tr>
         <td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
         <td><label for="sample-l">Large</label></td>
-        <td>5000 Domains. About 20 GB.</td>
+        <td>5000 Domains. About 7 GB.</td>
     </tr>
     <tr>
         <td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
         <td><label for="sample-xl">Huge</label></td>
-        <td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
+        <td>50,000 Domains. Around 80 GB. Primarily intended for pre-production like testing environments.
         Expect hours of processing time. </td>
     </tr>
 </table>