Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 07:32:38 +02:00)

Compare commits: deploy-007...deploy-008 (20 commits)
Commits (SHA1):

fbba392491
530eb35949
c2dd2175a2
b8581b0f56
2ea34767d8
e9af838231
ae0cad47c4
5fbc8ef998
32c6dd9e6a
6ece6a6cfb
39cd1c18f8
eb65daaa88
0bebdb6e33
1e50e392c6
fb673de370
eee73ab16c
5354e034bf
72384ad6ca
a2b076f9be
c8b0a32c0f
@@ -24,58 +24,4 @@ public class LanguageModels {
         this.fasttextLanguageModel = fasttextLanguageModel;
         this.segments = segments;
     }
-
-    public static LanguageModelsBuilder builder() {
-        return new LanguageModelsBuilder();
-    }
-
-    public static class LanguageModelsBuilder {
-        private Path termFrequencies;
-        private Path openNLPSentenceDetectionData;
-        private Path posRules;
-        private Path posDict;
-        private Path fasttextLanguageModel;
-        private Path segments;
-
-        LanguageModelsBuilder() {
-        }
-
-        public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
-            this.termFrequencies = termFrequencies;
-            return this;
-        }
-
-        public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
-            this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
-            return this;
-        }
-
-        public LanguageModelsBuilder posRules(Path posRules) {
-            this.posRules = posRules;
-            return this;
-        }
-
-        public LanguageModelsBuilder posDict(Path posDict) {
-            this.posDict = posDict;
-            return this;
-        }
-
-        public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
-            this.fasttextLanguageModel = fasttextLanguageModel;
-            return this;
-        }
-
-        public LanguageModelsBuilder segments(Path segments) {
-            this.segments = segments;
-            return this;
-        }
-
-        public LanguageModels build() {
-            return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
-        }
-
-        public String toString() {
-            return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
-        }
-    }
 }
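This hunk deletes the hand-written builder, leaving direct construction as the only way to create a LanguageModels. A hypothetical call-site sketch, using the six-argument order visible in the deleted build() method; apart from the two model files named elsewhere in this changeset, the paths are made up for illustration:

import java.nio.file.Path;

// Hypothetical call site; argument order follows the deleted build() method.
LanguageModels models = new LanguageModels(
        Path.of("model/tfreq-new-algo3.bin"),  // termFrequencies
        Path.of("model/opennlp-sentence.bin"), // openNLPSentenceDetectionData (illustrative path)
        Path.of("model/pos-rules.txt"),        // posRules (illustrative path)
        Path.of("model/pos-dict.txt"),         // posDict (illustrative path)
        Path.of("model/lid.176.ftz"),          // fasttextLanguageModel
        Path.of("model/segments.bin")          // segments (illustrative path)
);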
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.LocalDateTime;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
 import java.util.function.Function;
 
 /** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
         return new WorkLoadIterable<>(logFile, mapper);
     }
 
+    public static int countEntries(Path crawlerLog) throws IOException {
+        try (var linesStream = Files.lines(crawlerLog)) {
+            return (int) linesStream.filter(WorkLogEntry::isJobId).count();
+        }
+    }
+
     // Use synchro over concurrent set to avoid competing writes
     // - correct is better than fast here, it's sketchy enough to use
     // a PrintWriter
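The new helper makes one cheap pass over the log so that callers can report progress against a known total; the migration actor later in this changeset uses it exactly like:

// From the MigrateCrawlDataActor hunk below:
int totalEntries = WorkLog.countEntries(crawlerLog);
// ... then, per processed entry:
// heartbeat.progress(taskName, entryIdx++, totalEntries);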
@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.nodecfg.NodeConfigurationService;
 import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.module.ServiceConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.time.Duration;
 import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
 
     private final NodeConfigurationService nodeConfigurationService;
     private final MqPersistence persistence;
+    private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
 
     @Inject
     public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateRefresh(int count, long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateRefresh is taking a very long time");
+                    yield new UpdateRefresh(count, msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateClean(long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateClean is taking a very long time");
+                    yield new UpdateClean(msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
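Both hunks replace a hard failure with a retry: when waitForMessageTerminalState returns null (no terminal state within the wait window), the actor now logs a warning and yields its own current step again, looping back to keep waiting; an Error is only yielded once the message reaches a terminal state other than OK. A minimal, self-contained sketch of the pattern with hypothetical names, outside the real actor framework:

// Hypothetical illustration of retry-by-re-yielding: on timeout, the state
// machine re-enters the same state instead of failing outright.
public class RetryPatternSketch {
    sealed interface Step permits Wait, Done, Failed {}
    record Wait(long msgId) implements Step {}
    record Done() implements Step {}
    record Failed(String reason) implements Step {}

    record Message(boolean ok, String state) {}

    // msg == null models the wait call timing out without a terminal state
    static Step next(Step current, Message msg) {
        return switch (current) {
            case Wait(long msgId) -> {
                if (msg == null) yield new Wait(msgId);            // keep waiting
                else if (!msg.ok()) yield new Failed(msg.state()); // terminal, not OK
                else yield new Done();
            }
            case Done d -> d;
            case Failed f -> f;
        };
    }
}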
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
@@ -18,6 +19,7 @@ import org.slf4j.LoggerFactory;
 
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 import java.util.Map;
 import java.util.Optional;
 import java.util.function.Function;
@@ -26,14 +28,15 @@ import java.util.function.Function;
 public class MigrateCrawlDataActor extends RecordActorPrototype {
 
     private final FileStorageService fileStorageService;
+    private final ServiceHeartbeat serviceHeartbeat;
     private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
 
     @Inject
-    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService) {
+    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
         super(gson);
 
         this.fileStorageService = fileStorageService;
+        this.serviceHeartbeat = serviceHeartbeat;
     }
 
     public record Run(long fileStorageId) implements ActorStep {}
@@ -49,33 +52,50 @@ public class MigrateCrawlDataActor extends RecordActorPrototype {
                 Path crawlerLog = root.resolve("crawler.log");
                 Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
 
-                try (WorkLog workLog = new WorkLog(newCrawlerLog)) {
+                int totalEntries = WorkLog.countEntries(crawlerLog);
+
+                try (WorkLog workLog = new WorkLog(newCrawlerLog);
+                     var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
+                ) {
+                    int entryIdx = 0;
 
                     for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
 
-                        var entry = item.getKey();
-                        var path = item.getValue();
+                        final WorkLogEntry entry = item.getKey();
+                        final Path inputPath = item.getValue();
 
-                        logger.info("Converting {}", entry.id());
+                        Path outputPath = inputPath;
+                        heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
 
-                        if (path.toFile().getName().endsWith(".parquet")) {
+                        if (inputPath.toString().endsWith(".parquet")) {
                             String domain = entry.id();
                             String id = Integer.toHexString(domain.hashCode());
 
-                            Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
+                            outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
 
-                            SlopCrawlDataRecord.convertFromParquet(path, outputFile);
+                            if (Files.exists(inputPath)) {
+                                try {
+                                    SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
+                                    Files.deleteIfExists(inputPath);
+                                } catch (Exception ex) {
+                                    outputPath = inputPath; // don't update the work log on error
+                                    logger.error("Failed to convert " + inputPath, ex);
+                                }
+                            }
+                            else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
+                                // if the input file is missing, and the output file is missing, we just write the log
+                                // record identical to the old one
+                                outputPath = inputPath;
+                            }
+                        }
 
-                            workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
-                        }
-                        else {
-                            workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
-                        }
+                        // Write a log entry for the (possibly) converted file
+                        workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
                     }
                 }
 
                 Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
-                Files.move(crawlerLog, oldCrawlerLog);
+                Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
                 Files.move(newCrawlerLog, crawlerLog);
 
                 yield new End();
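The migration now counts entries up front so it can report progress as step/total through an ad-hoc task heartbeat, instead of logging one line per domain. A generic sketch of the shape of that pattern, with a hypothetical ProgressHeartbeat standing in for the project's ServiceHeartbeat API:

import java.nio.file.Path;
import java.util.List;

// Hypothetical stand-in for the project's ad-hoc task heartbeat API.
interface ProgressHeartbeat extends AutoCloseable {
    void progress(String taskName, int step, int totalSteps);
    @Override void close();
}

class MigrationProgressSketch {
    static void migrateAll(List<Path> inputs, ProgressHeartbeat heartbeat) {
        int total = inputs.size();  // known up front, like WorkLog.countEntries()
        int idx = 0;
        try (heartbeat) {
            for (Path input : inputs) {
                heartbeat.progress("Migrating " + input.getFileName(), idx++, total);
                // ... convert input here ...
            }
        }
    }
}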
@@ -34,6 +34,7 @@ dependencies {
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3
     implementation libs.commons.io
+    implementation libs.wiremock
 
     implementation libs.prometheus
     implementation libs.guava
@@ -1,6 +1,7 @@
 package nu.marginalia.livecapture;
 
 import com.google.gson.Gson;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -27,6 +28,8 @@ public class BrowserlessClient implements AutoCloseable {
     private final URI browserlessURI;
     private final Gson gson = GsonFactory.get();
 
+    private final String userAgent = WmsaHome.getUserAgent().uaString();
+
     public BrowserlessClient(URI browserlessURI) {
         this.browserlessURI = browserlessURI;
     }
@@ -34,6 +37,7 @@ public class BrowserlessClient implements AutoCloseable {
     public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "gotoOptions", gotoOptions
         );
 
@@ -60,6 +64,7 @@ public class BrowserlessClient implements AutoCloseable {
 
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "options", screenshotOptions,
                 "gotoOptions", gotoOptions
         );
@@ -1,5 +1,8 @@
 package nu.marginalia.livecapture;
 
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.WmsaHome;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
@@ -8,19 +11,74 @@ import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;
 
+import java.io.IOException;
+import java.net.SocketException;
 import java.net.URI;
 import java.util.Map;
+
+import static com.github.tomakehurst.wiremock.client.WireMock.*;
+import static java.net.NetworkInterface.getNetworkInterfaces;
 
 @Testcontainers
 @Tag("slow")
 public class BrowserlessClientTest {
     static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
             .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withNetworkMode("bridge")
             .withExposedPorts(3000);
+
+    static WireMockServer wireMockServer = new WireMockServer(WireMockConfiguration.wireMockConfig().port(18089));
+
+    static String localIp;
 
     @BeforeAll
-    public static void setup() {
+    public static void setup() throws IOException {
         container.start();
+
+        wireMockServer.start();
+        wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
+
+        localIp = findLocalIp();
+    }
+
+    private static String findLocalIp() throws SocketException {
+        var interfaces = getNetworkInterfaces();
+
+        while (interfaces.hasMoreElements()) {
+            var iface = interfaces.nextElement();
+
+            if (iface.isLoopback())
+                continue;
+            else if (iface.isVirtual())
+                continue;
+
+            var addresses = iface.getInetAddresses();
+
+            while (addresses.hasMoreElements()) {
+                var address = addresses.nextElement();
+
+                if (!address.isSiteLocalAddress()) continue;
+
+                return address.getHostAddress();
+            }
+        }
+
+        return "127.0.0.1";
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testInspectContentUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
+            client.content("http://" + localIp + ":18089/", BrowserlessClient.GotoOptions.defaultValues());
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testInspectScreenshotUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
+            client.screenshot("http://" + localIp + ":18089/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
     }
 
     @Test
@@ -134,6 +134,10 @@ public class QueryExpansion {
             if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
                 graph.addVariantForSpan(prev, qw, joinedWord);
             }
+            else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
+                graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
+                graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
+            }
         }
 
         prev = qw;
@@ -213,6 +213,18 @@ public class QueryFactoryTest {
         System.out.println(subquery);
     }
 
+    @Test
+    public void testContractionWordNum() {
+        var subquery = parseAndGetSpecs("glove 80");
+
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
+    }
+
+
     @Test
     public void testCplusPlus() {
         var subquery = parseAndGetSpecs("std::vector::push_back vector");
@@ -5,9 +5,7 @@ import nu.marginalia.actor.state.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;
 
 public abstract class RecordActorPrototype implements ActorPrototype {
 
@@ -118,7 +116,7 @@ public abstract class RecordActorPrototype implements ActorPrototype {
     }
 
     private String functionName(Class<? extends ActorStep> functionClass) {
-        return functionClass.getSimpleName().toUpperCase();
+        return ActorStep.functionName(functionClass);
     }
 
     private ActorStep constructState(String message) throws ReflectiveOperationException {
@@ -145,4 +143,43 @@ public abstract class RecordActorPrototype implements ActorPrototype {
         }
     }
 
+    /** Get a list of JSON prototypes for each actor step declared by this actor */
+    @SuppressWarnings("unchecked")
+    public Map<String, String> getMessagePrototypes() {
+        Map<String, String> messagePrototypes = new HashMap<>();
+
+        for (var clazz : getClass().getDeclaredClasses()) {
+            if (!clazz.isRecord() || !ActorStep.class.isAssignableFrom(clazz))
+                continue;
+
+            StringJoiner sj = new StringJoiner(",\n\t", "{\n\t", "\n}");
+
+            renderToJsonPrototype(sj, (Class<? extends Record>) clazz);
+
+            messagePrototypes.put(ActorStep.functionName((Class<? extends ActorStep>) clazz), sj.toString());
+        }
+
+        return messagePrototypes;
+    }
+
+    @SuppressWarnings("unchecked")
+    private void renderToJsonPrototype(StringJoiner sj, Class<? extends Record> recordType) {
+        for (var field : recordType.getDeclaredFields()) {
+            String typeName = field.getType().getSimpleName();
+
+            if ("List".equals(typeName)) {
+                sj.add(String.format("\"%s\": [ ]", field.getName()));
+            }
+            else if (field.getType().isRecord()) {
+                var innerSj = new StringJoiner(",", "{", "}");
+                renderToJsonPrototype(innerSj, (Class<? extends Record>) field.getType());
+                sj.add(String.format("\"%s\": %s", field.getName(), sj));
+            }
+            else {
+                sj.add(String.format("\"%s\": \"%s\"", field.getName(), typeName));
+            }
+        }
+    }
+
 }
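As a worked example of the rendering above: for an actor declaring public record Run(long fileStorageId) implements ActorStep {}, getMessagePrototypes() maps the key "RUN" (the upper-cased simple name from ActorStep.functionName) to the prototype string below, with the tab indentation coming from the ",\n\t" / "{\n\t" joiner:

{
	"fileStorageId": "long"
}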
@@ -1,3 +1,7 @@
 package nu.marginalia.actor.state;
 
-public interface ActorStep {}
+public interface ActorStep {
+    static String functionName(Class<? extends ActorStep> type) {
+        return type.getSimpleName().toUpperCase();
+    }
+}
@@ -35,6 +35,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Optional;
@@ -203,11 +204,18 @@ public class ConverterMain extends ProcessMainClass {
 
             logger.info("Processing small items");
 
+            // We separate the large and small domains to reduce the number of critical sections,
+            // as the large domains have a separate processing track that doesn't store everything
+            // in memory
+
+            final List<Path> bigTasks = new ArrayList<>();
+
             // First process the small items
             for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
                     new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
             {
                 if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
+                    bigTasks.add(dataPath);
                     continue;
                 }
 
@@ -235,30 +243,28 @@ public class ConverterMain extends ProcessMainClass {
 
             logger.info("Processing large items");
 
-            // Next the big items domain-by-domain
-            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
-                    new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
-            {
-                int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
-                if (sizeHint < SIDELOAD_THRESHOLD) {
-                    continue;
-                }
+            try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
+                int bigTaskIdx = 0;
+                // Next the big items domain-by-domain
+                for (var dataPath : bigTasks) {
+                    hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
 
                     try {
                         // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
                         // closed before it's consumed by the converterWriter. Instead, the converterWriter guarantees it
                         // will close it after it's consumed.
                         var stream = SerializableCrawlDataStream.openDataStream(dataPath);
-                        ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint);
+                        ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
 
                         converterWriter.accept(writable);
                     }
                     catch (Exception ex) {
                         logger.info("Error in processing", ex);
                     }
                     finally {
                         heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
                     }
+                }
             }
         }
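Taken together, the two ConverterMain hunks switch from scanning the work log twice (once filtering for small items, once for large) to a single pass that defers oversized domains into bigTasks and replays that list afterwards with per-task progress. A generic sketch of the two-pass partitioning pattern, with hypothetical names and a made-up size threshold:

import java.util.ArrayList;
import java.util.List;

class TwoPassSketch {
    static final int THRESHOLD = 10_000; // illustrative sideload threshold

    static void process(List<String> tasks) {
        List<String> bigTasks = new ArrayList<>();

        // Pass 1: handle small tasks immediately, defer the big ones
        for (String task : tasks) {
            if (sizeHint(task) >= THRESHOLD) {
                bigTasks.add(task);
                continue;
            }
            handleSmall(task);
        }

        // Pass 2: big tasks one at a time, with progress against a known total
        int idx = 0;
        for (String task : bigTasks) {
            System.out.printf("large task %d/%d: %s%n", ++idx, bigTasks.size(), task);
            handleLarge(task);
        }
    }

    static int sizeHint(String t) { return t.length(); }
    static void handleSmall(String t) { /* ... */ }
    static void handleLarge(String t) { /* ... */ }
}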
@@ -116,7 +116,7 @@ public class AdblockSimulator {
 
 
     // Refrain from cleaning up this code, it's very hot code and needs to be fast.
-    // This version is about 100x faster than the a "clean" first stab implementation.
+    // This version is about 100x faster than a "clean" first stab implementation.
 
     class RuleVisitor implements NodeFilter {
         public boolean sawAds;
@@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
 
         var tags = doc.select("meta[name=generator]");
 
-        if (tags.size() == 0) {
+        if (tags.isEmpty()) {
             // Some sites have a comment in the head instead of a meta tag
             return fingerprintServerTech(doc, responseHeaders);
         }
@@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
         }
         fullHtml.append("</div></body></html>");
 
-        var doc = sideloaderProcessing
+        return sideloaderProcessing
                 .processDocument(fullUrl,
                         fullHtml.toString(),
                         List.of("encyclopedia", "wiki"),
@@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
                         anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
                         LocalDate.now().getYear(),
                         10_000_000);
-
-        return doc;
     }
 
     private String normalizeUtf8(String url) {
@@ -20,7 +20,6 @@ import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
 import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.io.CrawlerOutputFile;
-import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.process.ProcessConfiguration;
@@ -417,13 +416,13 @@ public class CrawlerMain extends ProcessMainClass {
             try {
                 Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
                 if (Files.exists(slopPath)) {
-                    return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
+                    return new CrawlDataReference(slopPath);
                 }
 
                 Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
                 if (Files.exists(parquetPath)) {
                     slopPath = migrateParquetData(parquetPath, domain, outputDir);
-                    return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
+                    return new CrawlDataReference(slopPath);
                 }
 
             } catch (IOException e) {
@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
     private final Duration requestTimeout = Duration.ofSeconds(10);
+    private final Duration probeTimeout = Duration.ofSeconds(30);
 
     @Override
     public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,23 +108,27 @@ public class HttpFetcherImpl implements HttpFetcher {
                     .HEAD()
                     .uri(url.asURI())
                     .header("User-agent", userAgentString)
-                    .timeout(requestTimeout)
+                    .timeout(probeTimeout)
                     .build();
         } catch (URISyntaxException e) {
             return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
         }
 
-        try {
-            var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
-            EdgeUrl rspUri = new EdgeUrl(rsp.uri());
+        for (int tries = 0;; tries++) {
+            try {
+                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
+                EdgeUrl rspUri = new EdgeUrl(rsp.uri());
 
-            if (!Objects.equals(rspUri.domain, url.domain)) {
-                return new DomainProbeResult.Redirect(rspUri.domain);
+                if (!Objects.equals(rspUri.domain, url.domain)) {
+                    return new DomainProbeResult.Redirect(rspUri.domain);
+                }
+                return new DomainProbeResult.Ok(rspUri);
+            } catch (Exception ex) {
+                if (tries > 3) {
+                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
+                }
+                // else try again ...
             }
-            return new DomainProbeResult.Ok(rspUri);
-        }
-        catch (Exception ex) {
-            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
         }
     }
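The probe now tolerates transient failures: it uses the longer 30-second probeTimeout and retries the HEAD request, only giving up once tries exceeds 3, rather than failing a whole domain on the first network hiccup. A generic bounded-retry sketch of the same shape, with hypothetical names:

import java.util.concurrent.Callable;

class RetrySketch {
    // Retry the call up to maxTries attempts; rethrow the last failure.
    static <T> T withRetries(Callable<T> call, int maxTries) throws Exception {
        for (int tries = 0;; tries++) {
            try {
                return call.call();
            } catch (Exception ex) {
                if (tries + 1 >= maxTries) {
                    throw ex;
                }
                // else try again ...
            }
        }
    }
}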
@@ -143,7 +148,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         var headBuilder = HttpRequest.newBuilder()
                 .HEAD()
                 .uri(url.asURI())
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .header("Accept-Encoding", "gzip")
                 .timeout(requestTimeout)
         ;
@@ -215,7 +220,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         var getBuilder = HttpRequest.newBuilder()
                 .GET()
                 .uri(url.asURI())
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .header("Accept-Encoding", "gzip")
                 .header("Accept-Language", "en,*;q=0.5")
                 .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
@@ -307,7 +312,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .uri(sitemapUrl.asURI())
                 .header("Accept-Encoding", "gzip")
                 .header("Accept", "text/*, */*;q=0.9")
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .timeout(requestTimeout)
                 .build();
 
@@ -386,7 +391,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .uri(url.asURI())
                 .header("Accept-Encoding", "gzip")
                 .header("Accept", "text/*, */*;q=0.9")
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .timeout(requestTimeout);
 
         HttpFetchResult result = recorder.fetch(client, getRequest.build());
@@ -4,6 +4,7 @@ import nu.marginalia.ContentTypes;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.crawldata.CrawledDocument;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -11,51 +12,73 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.Optional;
 
 /** A reference to a domain that has been crawled before. */
-public class CrawlDataReference implements AutoCloseable {
+public class CrawlDataReference implements AutoCloseable, Iterable<CrawledDocument> {
+
+    private boolean closed = false;
+
+    @Nullable
+    private final Path path;
+
+    @Nullable
+    private SerializableCrawlDataStream data = null;
 
-    private final SerializableCrawlDataStream data;
     private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
 
-    public CrawlDataReference(SerializableCrawlDataStream data) {
-        this.data = data;
+    public CrawlDataReference(@Nullable Path path) {
+        this.path = path;
     }
 
     public CrawlDataReference() {
-        this(SerializableCrawlDataStream.empty());
+        this(null);
     }
 
     /** Delete the associated data from disk, if it exists */
     public void delete() throws IOException {
-        Path filePath = data.path();
-
-        if (filePath != null) {
-            Files.deleteIfExists(filePath);
+        if (path != null) {
+            Files.deleteIfExists(path);
         }
     }
 
-    /** Get the next document from the crawl data,
-     * returning null when there are no more documents
-     * available
-     */
-    @Nullable
-    public CrawledDocument nextDocument() {
-        try {
-            while (data.hasNext()) {
-                if (data.next() instanceof CrawledDocument doc) {
-                    if (!ContentTypes.isAccepted(doc.contentType))
-                        continue;
-
-                    return doc;
-                }
-            }
-        }
-        catch (IOException ex) {
-            logger.error("Failed to read next document", ex);
-        }
-
-        return null;
+    public @NotNull Iterator<CrawledDocument> iterator() {
+        requireStream();
+        // Guaranteed by requireStream, but helps java
+        Objects.requireNonNull(data);
+
+        return data.map(next -> {
+            if (next instanceof CrawledDocument doc && ContentTypes.isAccepted(doc.contentType)) {
+                return Optional.of(doc);
+            }
+            else {
+                return Optional.empty();
+            }
+        });
+    }
+
+    /** After calling this method, data is guaranteed to be non-null */
+    private void requireStream() {
+        if (closed) {
+            throw new IllegalStateException("Use after close()");
+        }
+
+        if (data == null) {
+            try {
+                if (path != null) {
+                    data = SerializableCrawlDataStream.openDataStream(path);
+                    return;
+                }
+            }
+            catch (Exception ex) {
+                logger.error("Failed to open stream", ex);
+            }
+
+            data = SerializableCrawlDataStream.empty();
+        }
     }
 
     public static boolean isContentBodySame(byte[] one, byte[] other) {
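CrawlDataReference now holds only a Path and opens the underlying stream lazily on first iteration, falling back to an empty stream when the file is absent or unreadable; constructing a reference is therefore free, and resources are only acquired if the recrawl actually consumes it. A generic, JDK-only sketch of the lazy-open Iterable pattern with hypothetical names (single-use, like the crawl data stream it mirrors):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.stream.Stream;

// Hypothetical illustration: open the backing file only when iteration starts.
class LazyLines implements Iterable<String>, AutoCloseable {
    private final Path path;
    private Stream<String> stream = null;

    LazyLines(Path path) { this.path = path; }

    @Override
    public Iterator<String> iterator() {
        if (stream == null) {
            try {
                stream = Files.lines(path);
            } catch (IOException ex) {
                stream = Stream.empty();  // degrade to "no previous data"
            }
        }
        return stream.iterator();
    }

    @Override
    public void close() {
        if (stream != null) stream.close();
    }
}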
@@ -98,7 +121,12 @@ public class CrawlDataReference implements AutoCloseable {
     }
 
     @Override
-    public void close() throws Exception {
-        data.close();
+    public void close() throws IOException {
+        if (!closed) {
+            if (data != null) {
+                data.close();
+            }
+            closed = true;
+        }
     }
 }
@@ -89,30 +89,45 @@ public class CrawlerRetreiver implements AutoCloseable {
     }
 
     public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
-        try {
+        try (oldCrawlData) {
             // Do an initial domain probe to determine the root URL
-            EdgeUrl rootUrl;
-
             var probeResult = probeRootUrl();
-            switch (probeResult) {
+
+            return switch (probeResult) {
                 case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
-                    rootUrl = probedUrl; // Good track
+                    // Sleep after the initial probe, we don't have access to the robots.txt yet
+                    // so we don't know the crawl delay
+                    TimeUnit.SECONDS.sleep(1);
+
+                    final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
+                    final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
+
+                    delayTimer.waitFetchDelay(0); // initial delay after robots.txt
+
+                    DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
+                    domainStateDb.save(summaryRecord);
+
+                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
+                    if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                        // If we have reference data, we will always grow the crawl depth a bit
+                        crawlFrontier.increaseDepth(1.5, 2500);
+                    }
+
+                    oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
+
+                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
                 }
                 case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                     domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
-                    return 1;
+                    yield 1;
                 }
                 case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
                     domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
-                    return 1;
+                    yield 1;
                 }
-            }
-
-            // Sleep after the initial probe, we don't have access to the robots.txt yet
-            // so we don't know the crawl delay
-            TimeUnit.SECONDS.sleep(1);
-
-            return crawlDomain(oldCrawlData, rootUrl, domainLinks);
+            };
         }
         catch (Exception ex) {
             logger.error("Error crawling domain {}", domain, ex);
@@ -120,28 +135,15 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData,
-                            EdgeUrl rootUrl,
-                            DomainLinks domainLinks) throws InterruptedException {
-
-        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
-        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
-
-        delayTimer.waitFetchDelay(0); // initial delay after robots.txt
-
-        DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
-        domainStateDb.save(summaryRecord);
-
-        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
-            // If we have reference data, we will always grow the crawl depth a bit
-            crawlFrontier.increaseDepth(1.5, 2500);
-        }
-
+    private int crawlDomain(EdgeUrl rootUrl,
+                            SimpleRobotRules robotsRules,
+                            CrawlDelayTimer delayTimer,
+                            DomainLinks domainLinks) {
+
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
 
         // Fetch sitemaps
         for (var sitemap : robotsRules.getSitemaps()) {
             crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
@@ -379,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
             if (docOpt.isPresent()) {
                 var doc = docOpt.get();
 
-                crawlFrontier.enqueueLinksFromDocument(top, doc);
-                crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
+                var responseUrl = new EdgeUrl(ok.uri());
+
+                crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
+                crawlFrontier.addVisited(responseUrl);
             }
         }
         else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
@@ -40,18 +40,12 @@ public class CrawlerRevisitor {
         int errors = 0;
         int skipped = 0;
 
-        for (;;) {
+        for (CrawledDocument doc : oldCrawlData) {
             if (errors > 20) {
                 // If we've had too many errors, we'll stop trying to recrawl
                 break;
             }
 
-            CrawledDocument doc = oldCrawlData.nextDocument();
-
-            if (doc == null)
-                break;
-
-            // This Shouldn't Happen (TM)
             var urlMaybe = EdgeUrl.parse(doc.url);
             if (urlMaybe.isEmpty())
                 continue;
@@ -12,8 +12,7 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.http.HttpHeaders;
-import java.util.Arrays;
-import java.util.Optional;
+import java.util.*;
 
 /* FIXME: This interface has a very unfortunate name that is not very descriptive.
  */
@@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
     ) implements HttpFetchResult {
 
         public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
-            this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
+            this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
+        }
+
+        private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
+            Map<String, List<String>> inputMap = messageHeaders.map();
+            Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
+
+            inputMap.forEach((k, v) -> {
+                if (k.isBlank()) return;
+                if (!Character.isAlphabetic(k.charAt(0))) return;
+
+                filteredMap.put(k, v);
+            });
+
+            return HttpHeaders.of(filteredMap, (k,v) -> true);
         }
 
         public boolean isOk() {
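The filter drops blank keys and keys whose first character is not alphabetic, which presumably guards against HTTP/2 pseudo-headers such as :status leaking in through the raw MessageHeaders map. A small self-contained illustration of the same filtering, assuming nothing beyond the JDK:

import java.net.http.HttpHeaders;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class HeaderFilterSketch {
    public static void main(String[] args) {
        // A raw header map as it might arrive from an HTTP/2 exchange
        Map<String, List<String>> raw = Map.of(
                ":status", List.of("200"),            // pseudo-header: filtered out
                "Content-Type", List.of("text/html")  // regular header: kept
        );

        Map<String, List<String>> filtered = new HashMap<>();
        raw.forEach((k, v) -> {
            if (k.isBlank()) return;
            if (!Character.isAlphabetic(k.charAt(0))) return;
            filtered.put(k, v);
        });

        HttpHeaders headers = HttpHeaders.of(filtered, (k, v) -> true);
        System.out.println(headers.map()); // {Content-Type=[text/html]}
    }
}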
@@ -108,15 +108,17 @@ public record SlopCrawlDataRecord(String domain,
     public static void convertFromParquet(Path parquetInput, Path slopOutput) throws IOException {
         Path tempDir = Files.createTempDirectory(slopOutput.getParent(), "conversion");
 
-        try (var writer = new Writer(tempDir)) {
-            CrawledDocumentParquetRecordFileReader.stream(parquetInput).forEach(
-                    parquetRecord -> {
-                        try {
-                            writer.write(new SlopCrawlDataRecord(parquetRecord));
-                        } catch (IOException e) {
-                            throw new RuntimeException(e);
-                        }
-                    });
+        try (var writer = new Writer(tempDir);
+             var stream = CrawledDocumentParquetRecordFileReader.stream(parquetInput))
+        {
+            stream.forEach(
+                    parquetRecord -> {
+                        try {
+                            writer.write(new SlopCrawlDataRecord(parquetRecord));
+                        } catch (IOException e) {
+                            throw new RuntimeException(e);
+                        }
+                    });
         }
         catch (IOException ex) {
             FileUtils.deleteDirectory(tempDir.toFile());
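The change moves the record stream into the try-with-resources header: a stream backed by a file (here, the parquet reader) holds an open handle that forEach alone never releases. A minimal JDK-only illustration of the same fix, with a hypothetical input path:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

class StreamCloseSketch {
    static long countNonEmpty(Path input) throws IOException {
        // Files.lines() holds a file handle; closing the Stream releases it
        try (Stream<String> lines = Files.lines(input)) {
            return lines.filter(line -> !line.isBlank()).count();
        }
    }
}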
@@ -375,7 +375,7 @@ class CrawlerRetreiverTest {
         doCrawl(tempFileWarc1, specs);
         convertToParquet(tempFileWarc1, tempFileParquet1);
         doCrawlWithReferenceStream(specs,
-                SerializableCrawlDataStream.openDataStream(tempFileParquet1)
+                new CrawlDataReference(tempFileParquet1)
         );
         convertToParquet(tempFileWarc2, tempFileParquet2);
 
@@ -447,11 +447,9 @@ class CrawlerRetreiverTest {
             throw new RuntimeException(e);
         }
 
-        var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1);
-
         System.out.println("---");
 
-        doCrawlWithReferenceStream(specs, stream);
+        doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
 
         var revisitCrawlFrontier = new DomainCrawlFrontier(
                 new EdgeDomain("www.marginalia.nu"),
@@ -508,12 +506,11 @@ class CrawlerRetreiverTest {
         }
     }
 
-    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
+    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
         try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
             var db = new DomainStateDb(tempFileDb)
        ) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
-                    new CrawlDataReference(stream));
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
         }
         catch (IOException | SQLException ex) {
             Assertions.fail(ex);
@@ -9,7 +9,7 @@
         <span>
             Access logs containing IP-addresses are retained for up to 24 hours,
             anonymized logs with source addresses removed are sometimes kept longer
-            for to help diagnosing bugs.
+            to help diagnose bugs.
         </span>
     </div>
     <div class="flex space-y-4 flex-col">
@@ -33,4 +33,4 @@
     </span>
     </div>
 
 </footer>
@@ -16,8 +16,6 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
 The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
 graalce is a good distribution choice but it doesn't matter too much.
 
-**Tailwindcss** - Install NPM and run `npm install tailwindcss @tailwindcss/cli`
-
 ## Quick Set up
 
 [https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
@@ -74,3 +74,7 @@ download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/Mar
 download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569
 
 popd
+
+pushd $(dirname $0)/..
+npm install -D tailwindcss@3
+popd
@@ -234,11 +234,12 @@ dependencyResolutionManagement {
         library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
         library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
 
-        library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-5-SNAPSHOT')
+        library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
         library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
         library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
         library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
+
+        library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
         library('jte','gg.jte','jte').version('3.1.15')
 
         bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])