Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-06 17:32:39 +02:00

Compare commits: deploy-006...deploy-007 (16 commits)
SHA1:
f0d74aa3bb
74a1f100f4
eb049658e4
db138b2a6f
1673fc284c
503ea57d5b
18ca926c7f
db99242db2
2b9d2985ba
eeb6ecd711
1f58aeadbf
3d68be64da
668f3b16ef
98a340a0d1
8862100f7e
274941f6de
@@ -5,7 +5,7 @@ plugins {

     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
 }

 group 'marginalia'
@@ -155,8 +155,15 @@ public class SentenceExtractor {
     public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
         String[] sentences;

+        // Safety net against malformed data DOS attacks,
+        // found 5+ MB <p>-tags in the wild that just break
+        // the sentence extractor causing it to stall forever.
+        if (text.length() > 50_000) {
+            // 50k chars can hold a small novel, let alone single html tags
+            text = text.substring(0, 50_000);
+        }
+
         // Normalize spaces
         text = normalizeSpaces(text);

         // Split into sentences
@@ -12,6 +12,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
 import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.converting.writer.ConverterWriter;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mqapi.converting.ConvertRequest;
 import nu.marginalia.process.ProcessConfiguration;

@@ -49,6 +50,7 @@ public class ConverterMain extends ProcessMainClass {
     private final ProcessHeartbeat heartbeat;
     private final FileStorageService fileStorageService;
     private final SideloadSourceFactory sideloadSourceFactory;
+    private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);

     public static void main(String... args) throws Exception {

@@ -199,12 +201,19 @@ public class ConverterMain extends ProcessMainClass {
             processedDomains.set(batchingWorkLog.size());
             heartbeat.setProgress(processedDomains.get() / (double) totalDomains);

-            for (var domain : WorkLog.iterableMap(crawlDir.getLogFile(),
+            logger.info("Processing small items");
+
+            // First process the small items
+            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
                     new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
             {
+                if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
+                    continue;
+                }
+
                 pool.submit(() -> {
-                    try {
-                        ConverterBatchWritableIf writable = processor.createWritable(domain);
+                    try (var dataStream = SerializableCrawlDataStream.openDataStream(dataPath)) {
+                        ConverterBatchWritableIf writable = processor.fullProcessing(dataStream);
                         converterWriter.accept(writable);
                     }
                     catch (Exception ex) {

@@ -223,6 +232,37 @@ public class ConverterMain extends ProcessMainClass {
             do {
                 System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining");
             } while (!pool.awaitTermination(60, TimeUnit.SECONDS));

+            logger.info("Processing large items");
+
+            // Next the big items domain-by-domain
+            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
+                    new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
+            {
+                int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
+                if (sizeHint < SIDELOAD_THRESHOLD) {
+                    continue;
+                }
+
+                try {
+                    // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
+                    // closed before it's consumed by the converterWriter.  Instead, the converterWriter guarantees it
+                    // will close it after it's consumed.
+
+                    var stream = SerializableCrawlDataStream.openDataStream(dataPath);
+                    ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint);
+
+                    converterWriter.accept(writable);
+                }
+                catch (Exception ex) {
+                    logger.info("Error in processing", ex);
+                }
+                finally {
+                    heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
+                }
+            }
+
+            logger.info("Processing complete");
         }
     }
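A note on the new SIDELOAD_THRESHOLD constant above: Integer.getInteger reads a JVM system property, so the 10_000 default can be overridden per run. The following is a minimal, hypothetical illustration of that property mechanism (class name and values are made up, not part of the change set); in a real deployment the property would normally be passed to the JVM as -Dconverter.sideloadThreshold=... rather than set programmatically.

// Hypothetical illustration of the converter.sideloadThreshold system property.
public class SideloadThresholdExample {
    public static void main(String[] args) {
        // Pretend the operator raised the threshold for this run.
        System.setProperty("converter.sideloadThreshold", "20000");

        // Same lookup ConverterMain performs; falls back to 10_000 when the property is unset.
        int threshold = Integer.getInteger("converter.sideloadThreshold", 10_000);

        System.out.println("Size hints >= " + threshold + " are deferred to the large-item (simple processing) pass");
    }
}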
@@ -14,7 +14,6 @@ import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.geoip.sources.AsnTable;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;

@@ -28,13 +27,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
-import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.regex.Pattern;

 public class DomainProcessor {
-    private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
     private final DocumentProcessor documentProcessor;
     private final SiteWords siteWords;
     private final AnchorTagsSource anchorTagsSource;

@@ -56,21 +53,6 @@ public class DomainProcessor {
         geoIpDictionary.waitReady();
     }

-    public ConverterBatchWritableIf createWritable(Path path) throws IOException {
-
-        var dataStream = CrawledDomainReader.createDataStream(path);
-
-        final int sizeHint = dataStream.sizeHint();
-
-        if (sizeHint > SIDELOAD_THRESHOLD) {
-            // If the file is too big, we run a processing mode that doesn't
-            // require loading the entire dataset into RAM
-            return simpleProcessing(dataStream, sizeHint);
-        }
-
-        return fullProcessing(dataStream);
-    }
-
     public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
         try {
             return new SimpleProcessing(dataStream, sizeHint, extraKeywords);

@@ -159,6 +141,7 @@ public class DomainProcessor {
         private final Set<String> processedUrls = new HashSet<>();
         private final DomainLinks externalDomainLinks;
         private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();
+
         private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
                 Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors())
         );

@@ -194,6 +177,7 @@ public class DomainProcessor {
         @Override
         public Iterator<ProcessedDocument> getDocumentsStream() {
             return iteratorFactory.create((taskConsumer) -> {
+
                 while (dataStream.hasNext())
                 {
                     if (!(dataStream.next() instanceof CrawledDocument doc))
@@ -39,6 +39,9 @@ public class ConverterWriter implements AutoCloseable {
         workerThread.start();
     }

+    /** Queue and eventually write the domain into the converter journal
+     * The domain object will be closed after it's processed.
+     * */
     public void accept(@Nullable ConverterBatchWritableIf domain) {
         if (null == domain)
             return;

@@ -72,15 +75,15 @@ public class ConverterWriter implements AutoCloseable {

                 if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
                     logger.warn("Skipping already logged item {}", id);
+                }
+                else {
+                    currentWriter.write(data);
+                    workLog.logItem(id);
                     data.close();
-                    continue;
                 }

-                currentWriter.write(data);
-
-                workLog.logItem(id);
-
                 switcher.tick();
+                data.close();
             }
         }
         catch (Exception ex) {
@@ -26,7 +26,7 @@ public class DocumentBodyToString {
         return new String(data, charset);
     }

-    public static Document getParsedData(ContentType type, byte[] data, String url) throws IOException {
+    public static Document getParsedData(ContentType type, byte[] data, int maxLength, String url) throws IOException {
         final Charset charset;

         if (type.charset() == null || type.charset().isBlank()) {

@@ -35,7 +35,7 @@ public class DocumentBodyToString {
             charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
         }

-        ByteArrayInputStream bais = new ByteArrayInputStream(data);
+        ByteArrayInputStream bais = new ByteArrayInputStream(data, 0, Math.min(data.length, maxLength));

         return Jsoup.parse(bais, charset.name(), url);
     }
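The essence of the new maxLength parameter is that Jsoup is never handed more than maxLength bytes. Below is a self-contained sketch of the same technique using plain Jsoup, with made-up input data and a made-up class name; it is an illustration, not project code.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

class CappedParseExample {
    public static void main(String[] args) throws IOException {
        byte[] data = "<html><head><title>hello</title></head><body><p>world</p></body></html>"
                .getBytes(StandardCharsets.UTF_8);
        int maxLength = 200_000; // same cap CrawledDocument.parseBody passes further down in this change set

        // Only the first maxLength bytes reach the parser, so a pathological multi-megabyte
        // document can no longer stall the converter.
        ByteArrayInputStream bais = new ByteArrayInputStream(data, 0, Math.min(data.length, maxLength));
        Document doc = Jsoup.parse(bais, StandardCharsets.UTF_8.name(), "https://www.marginalia.nu/");

        System.out.println(doc.title());
    }
}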
@@ -19,8 +19,8 @@ import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
 import nu.marginalia.db.DomainBlacklist;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.CrawlerOutputFile;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.process.ProcessConfiguration;

@@ -28,13 +28,11 @@ import nu.marginalia.process.ProcessConfigurationModule;
 import nu.marginalia.process.ProcessMainClass;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
-import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.logging.log4j.util.Strings;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -44,11 +42,13 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.Function;

 import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;

@@ -182,8 +182,6 @@ public class CrawlerMain extends ProcessMainClass {
         // Assign any domains with node_affinity=0 to this node, and then fetch all domains assigned to this node
         // to be crawled.

-        performMigration(outputDir);
-
         try (var conn = dataSource.getConnection()) {
             try (var assignFreeDomains = conn.prepareStatement(
                     """

@@ -417,11 +415,22 @@ public class CrawlerMain extends ProcessMainClass {

         private CrawlDataReference getReference() {
             try {
-                return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
+                Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
+                if (Files.exists(slopPath)) {
+                    return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
+                }
+
+                Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
+                if (Files.exists(parquetPath)) {
+                    slopPath = migrateParquetData(parquetPath, domain, outputDir);
+                    return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
+                }
+
             } catch (IOException e) {
                 logger.debug("Failed to read previous crawl data for {}", specification.domain());
-                return new CrawlDataReference();
             }
+
+            return new CrawlDataReference();
         }

     }
@@ -482,92 +491,19 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }

-        // Data migration logic
-        private void performMigration(Path root) throws IOException {
-
-            Path crawlerLog = root.resolve("crawler.log");
-            Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
-
-            int finishedTasks = 0;
-            int totalTasks;
-
-            try (var oldLog = new WorkLog(crawlerLog)) {
-                totalTasks = oldLog.countFinishedJobs();
+        // Migrate from parquet to slop if necessary
+        //
+        // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
+        private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
+            if (!inputPath.endsWith(".parquet")) {
+                return inputPath;
             }

-            try (WorkLog workLog = new WorkLog(newCrawlerLog);
-                 var migrationHeartbeat = heartbeat.createAdHocTaskHeartbeat("MIGRATING")) {
-
-                for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
-
-                    var entry = item.getKey();
-                    var path = item.getValue();
-
-                    if (path.toFile().getName().endsWith(".parquet")) {
-                        logger.info("Converting {}", entry.id());
-
-                        String domain = entry.id();
-                        String id = Integer.toHexString(domain.hashCode());
-
-                        Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
-
-                        SlopCrawlDataRecord.convertFromParquet(path, outputFile);
-
-                        workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
-                    }
-                    else {
-                        workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
-                    }
-
-                    migrationHeartbeat.progress("Parquet To Slop", ++finishedTasks, totalTasks);
-                }
-            }
-
-            Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
-            Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
-            Files.move(newCrawlerLog, crawlerLog);
-        }
-
-        private static class CrawlDataLocator implements Function<WorkLogEntry, Optional<Map.Entry<WorkLogEntry, Path>>> {
-
-            private final Path crawlRootDir;
-
-            CrawlDataLocator(Path crawlRootDir) {
-                this.crawlRootDir = crawlRootDir;
-            }
-
-            @Override
-            public Optional<Map.Entry<WorkLogEntry, Path>> apply(WorkLogEntry entry) {
-                var path = getCrawledFilePath(crawlRootDir, entry.path());
-
-                if (!Files.exists(path)) {
-                    return Optional.empty();
-                }
-
-                try {
-                    return Optional.of(Map.entry(entry, path));
-                }
-                catch (Exception ex) {
-                    return Optional.empty();
-                }
-            }
-
-            private Path getCrawledFilePath(Path crawlDir, String fileName) {
-                int sp = fileName.lastIndexOf('/');
-
-                // Normalize the filename
-                if (sp >= 0 && sp + 1< fileName.length())
-                    fileName = fileName.substring(sp + 1);
-                if (fileName.length() < 4)
-                    fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
-
-                String sp1 = fileName.substring(0, 2);
-                String sp2 = fileName.substring(2, 4);
-                return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName);
-            }
+            Path outputFile = CrawlerOutputFile.createSlopPath(crawlDataRoot, Integer.toHexString(domain.hashCode()), domain);
+
+            SlopCrawlDataRecord.convertFromParquet(inputPath, outputFile);
+
+            return outputFile;
         }

     }
@@ -9,6 +9,7 @@ import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
 import java.time.Instant;
+import java.util.Objects;
 import java.util.Optional;

 /** Supplemental sqlite database for storing the summary of a crawl.

@@ -60,6 +61,8 @@ public class DomainStateDb implements AutoCloseable {

     }

+    public record FaviconRecord(String contentType, byte[] imageData) {}
+
     public DomainStateDb(Path filename) throws SQLException {
         String sqliteDbString = "jdbc:sqlite:" + filename.toString();
         connection = DriverManager.getConnection(sqliteDbString);

@@ -74,7 +77,13 @@ public class DomainStateDb implements AutoCloseable {
                         feedUrl TEXT
                     )
                     """);
+            stmt.executeUpdate("""
+                    CREATE TABLE IF NOT EXISTS favicon (
+                        domain TEXT PRIMARY KEY,
+                        contentType TEXT NOT NULL,
+                        icon BLOB NOT NULL
+                    )
+                    """);
             stmt.execute("PRAGMA journal_mode=WAL");
         }
     }

@@ -85,6 +94,41 @@ public class DomainStateDb implements AutoCloseable {
     }

+
+    public void saveIcon(String domain, FaviconRecord faviconRecord) {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO favicon (domain, contentType, icon)
+                VALUES(?, ?, ?)
+                """)) {
+            stmt.setString(1, domain);
+            stmt.setString(2, Objects.requireNonNullElse(faviconRecord.contentType, "application/octet-stream"));
+            stmt.setBytes(3, faviconRecord.imageData);
+            stmt.executeUpdate();
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to insert favicon", ex);
+        }
+    }
+
+    public Optional<FaviconRecord> getIcon(String domain) {
+        try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
+            stmt.setString(1, domain);
+            var rs = stmt.executeQuery();
+
+            if (rs.next()) {
+                return Optional.of(
+                        new FaviconRecord(
+                                rs.getString("contentType"),
+                                rs.getBytes("icon")
+                        )
+                );
+            }
+        } catch (SQLException e) {
+            logger.error("Failed to retrieve favicon", e);
+        }
+
+        return Optional.empty();
+    }
+
     public void save(SummaryRecord record) {
         try (var stmt = connection.prepareStatement("""
                 INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
@@ -23,12 +23,10 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URISyntaxException;
-import java.net.URLDecoder;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.net.http.HttpTimeoutException;
-import java.nio.charset.StandardCharsets;
 import java.time.Duration;
 import java.util.*;
 import java.util.concurrent.Executors;

@@ -96,7 +96,7 @@ public class WarcRecorder implements AutoCloseable {
         try {
             response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
         }
-        catch (IOException ex) {
+        catch (Exception ex) {
             logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
             return new HttpFetchResult.ResultException(ex);
         }

@@ -19,7 +19,6 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
-import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -273,7 +272,16 @@ public class CrawlerRetreiver implements AutoCloseable {
         feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));

         // Grab the favicon if it exists
-        fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
+            String contentType = iconResult.header("Content-Type");
+            byte[] iconData = iconResult.getBodyBytes();
+
+            domainStateDb.saveIcon(
+                    domain,
+                    new DomainStateDb.FaviconRecord(contentType, iconData)
+            );
+        }
         timer.waitFetchDelay(0);
     }

@@ -1,55 +0,0 @@
-package nu.marginalia.io;
-
-import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
-import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-public class CrawledDomainReader {
-    private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
-
-    /** An iterator-like access to domain data  This must be closed otherwise it will leak off-heap memory! */
-    public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
-    {
-        String fileName = fullPath.getFileName().toString();
-        if (fileName.endsWith(".parquet")) {
-            try {
-                return new ParquetSerializableCrawlDataStream(fullPath);
-            } catch (Exception ex) {
-                logger.error("Error reading domain data from " + fullPath, ex);
-                return SerializableCrawlDataStream.empty();
-            }
-        }
-        else if (fileName.endsWith(".slop.zip")) {
-            try {
-                return new SlopSerializableCrawlDataStream(fullPath);
-            } catch (Exception ex) {
-                logger.error("Error reading domain data from " + fullPath, ex);
-                return SerializableCrawlDataStream.empty();
-            }
-        }
-        else {
-            logger.error("Unknown file type: {}", fullPath);
-            return SerializableCrawlDataStream.empty();
-        }
-    }
-
-    /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
-    public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
-        Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain);
-
-        if (Files.exists(parquetPath)) {
-            return createDataStream(parquetPath);
-        }
-        else {
-            throw new FileNotFoundException("No such file: " + parquetPath);
-        }
-    }
-
-}

@@ -35,19 +35,6 @@ public class CrawlerOutputFile {
         return destDir.resolve(id + "-" + filesystemSafeName(domain) + "-" + version.suffix + ".warc.gz");
     }

-    public static Path createParquetPath(Path basePath, String id, String domain) throws IOException {
-        id = padId(id);
-
-        String first = id.substring(0, 2);
-        String second = id.substring(2, 4);
-
-        Path destDir = basePath.resolve(first).resolve(second);
-        if (!Files.exists(destDir)) {
-            Files.createDirectories(destDir);
-        }
-        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".parquet");
-    }
-
     public static Path createSlopPath(Path basePath, String id, String domain) throws IOException {
         id = padId(id);

@@ -71,16 +58,17 @@ public class CrawlerOutputFile {
         return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".parquet");
     }

-    public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) {
+    public static Path getSlopPath(Path basePath, String id, String domain) {
         id = padId(id);

         String first = id.substring(0, 2);
         String second = id.substring(2, 4);

         Path destDir = basePath.resolve(first).resolve(second);
-        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".warc" + version.suffix);
+        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".slop.zip");
     }

+
     /**
      * Pads the given ID with leading zeros to ensure it has a length of 4 characters.
      */
@@ -1,5 +1,7 @@
 package nu.marginalia.io;

+import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
+import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;

@@ -18,7 +20,6 @@ import java.util.function.Function;
 /** Closable iterator exceptional over serialized crawl data
  * The data may appear in any order, and the iterator must be closed.
  *
- * @see CrawledDomainReader
  * */
 public interface SerializableCrawlDataStream extends AutoCloseable {
     Logger logger = LoggerFactory.getLogger(SerializableCrawlDataStream.class);

@@ -27,13 +28,58 @@ public interface SerializableCrawlDataStream extends AutoCloseable {

     /** Return a size hint for the stream.  0 is returned if the hint is not available,
      * or if the file is seemed too small to bother */
-    default int sizeHint() { return 0; }
+    default int getSizeHint() { return 0; }

     boolean hasNext() throws IOException;

     @Nullable
     default Path path() { return null; }

+    void close() throws IOException;
+
+    /** An iterator-like access to domain data  This must be closed otherwise it will leak off-heap memory! */
+    static SerializableCrawlDataStream openDataStream(Path fullPath) throws IOException
+    {
+        String fileName = fullPath.getFileName().toString();
+        if (fileName.endsWith(".parquet")) {
+            try {
+                return new ParquetSerializableCrawlDataStream(fullPath);
+            } catch (Exception ex) {
+                logger.error("Error reading domain data from " + fullPath, ex);
+                return SerializableCrawlDataStream.empty();
+            }
+        }
+
+        if (fileName.endsWith(".slop.zip")) {
+            try {
+                return new SlopSerializableCrawlDataStream(fullPath);
+            } catch (Exception ex) {
+                logger.error("Error reading domain data from " + fullPath, ex);
+                return SerializableCrawlDataStream.empty();
+            }
+        }
+
+        logger.error("Unknown file type: {}", fullPath);
+        return SerializableCrawlDataStream.empty();
+    }
+
+    /** Get an idication of the size of the stream.  This is used to determine whether to
+     * load the stream into memory or not.  0 is returned if the hint is not available,
+     * or if the file is seemed too small to bother */
+    static int getSizeHint(Path fullPath) {
+        String fileName = fullPath.getFileName().toString();
+        if (fileName.endsWith(".parquet")) {
+            return ParquetSerializableCrawlDataStream.sizeHint(fullPath);
+        }
+        else if (fileName.endsWith(".slop.zip")) {
+            return SlopSerializableCrawlDataStream.sizeHint(fullPath);
+        }
+        else {
+            return 0;
+        }
+    }
+
     default <T> Iterator<T> map(Function<SerializableCrawlData, Optional<T>> mapper) {
         return new Iterator<>() {
             T next = null;
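For orientation, here is a hedged usage sketch of the two new static helpers, modelled on how the converter and the tests in this change set call them. The class and method names are taken from the diffs above; the wrapper class and the path argument are illustrative.

import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.crawldata.CrawledDocument;

import java.io.IOException;
import java.nio.file.Path;

class CrawlDataDumpExample {
    public static void main(String[] args) throws IOException {
        Path crawlData = Path.of(args[0]); // a .slop.zip file, or a legacy .parquet file

        // Peek at the size without materialising the whole stream; 0 means no hint is available.
        int sizeHint = SerializableCrawlDataStream.getSizeHint(crawlData);
        System.out.println("size hint: " + sizeHint);

        // The stream must be closed, otherwise off-heap memory is leaked (see the javadoc above).
        try (var stream = SerializableCrawlDataStream.openDataStream(crawlData)) {
            while (stream.hasNext()) {
                if (stream.next() instanceof CrawledDocument doc) {
                    System.out.println(doc);
                }
            }
        }
    }
}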
@@ -40,7 +40,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
         return path;
     }

-    public int sizeHint() {
+    public static int sizeHint(Path path) {
         // Only calculate size hint for large files
         // (the reason we calculate them in the first place is to assess whether it is large
         // because it has many documents, or because it is a small number of large documents)

@@ -52,7 +52,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
         return path;
     }

-    public int sizeHint() {
+    public static int sizeHint(Path path) {
         // Only calculate size hint for large files
         // (the reason we calculate them in the first place is to assess whether it is large
         // because it has many documents, or because it is a small number of large documents)

@@ -12,6 +12,7 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.http.HttpHeaders;
+import java.util.Arrays;
 import java.util.Optional;

 /* FIXME: This interface has a very unfortunate name that is not very descriptive.

@@ -58,7 +59,7 @@ public sealed interface HttpFetchResult {
             int statusCode,
             HttpHeaders headers,
             String ipAddress,
-            byte[] bytesRaw,
+            byte[] bytesRaw, // raw data for the entire response including headers
             int bytesStart,
             int bytesLength
     ) implements HttpFetchResult {

@@ -75,6 +76,12 @@ public sealed interface HttpFetchResult {
             return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
         }

+        /** Copy the byte range corresponding to the payload of the response,
+            Warning: Copies the data, use getInputStream() for zero copy access */
+        public byte[] getBodyBytes() {
+            return Arrays.copyOfRange(bytesRaw, bytesStart, bytesStart + bytesLength);
+        }
+
         public Optional<Document> parseDocument() {
             return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
                 if (contentType.is("text/html")) {

@@ -59,9 +59,12 @@ public final class CrawledDocument implements SerializableCrawlData {
     }

     public Document parseBody() throws IOException {
+        // Prevent stalls from parsing excessively large documents
+
         return DocumentBodyToString.getParsedData(
                 ContentType.parse(contentType),
                 documentBodyBytes,
+                200_000,
                 url);
     }

@@ -10,7 +10,7 @@ import java.nio.file.Path;
 import java.sql.SQLException;
 import java.time.Instant;

-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.*;

 class DomainStateDbTest {

@@ -26,7 +26,7 @@ class DomainStateDbTest {
     }

     @Test
-    public void testSunnyDay() throws SQLException {
+    public void testSummaryRecord() throws SQLException {
         try (var db = new DomainStateDb(tempFile)) {
             var allFields = new DomainStateDb.SummaryRecord(
                     "all.marginalia.nu",

@@ -63,4 +63,21 @@ class DomainStateDbTest {
         }
     }

+    @Test
+    public void testFavicon() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            db.saveIcon("www.marginalia.nu", new DomainStateDb.FaviconRecord("text/plain", "hello world".getBytes()));
+
+            var maybeData = db.getIcon("www.marginalia.nu");
+            assertTrue(maybeData.isPresent());
+            var actualData = maybeData.get();
+
+            assertEquals("text/plain", actualData.contentType());
+            assertArrayEquals("hello world".getBytes(), actualData.imageData());
+
+            maybeData = db.getIcon("foobar");
+            assertTrue(maybeData.isEmpty());
+        }
+    }
+
 }
@@ -10,7 +10,6 @@ import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.*;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;

@@ -227,7 +226,7 @@ class CrawlerRetreiverTest {

         convertToParquet(tempFileWarc1, tempFileParquet1);

-        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
                 if (stream.next() instanceof CrawledDocument doc) {
                     data.add(doc);

@@ -280,7 +279,7 @@ class CrawlerRetreiverTest {

         convertToParquet(tempFileWarc1, tempFileParquet1);

-        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
                 if (stream.next() instanceof CrawledDocument doc) {
                     data.add(doc);

@@ -329,7 +328,7 @@ class CrawlerRetreiverTest {
         doCrawl(tempFileWarc1, specs);
         convertToParquet(tempFileWarc1, tempFileParquet1);

-        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
                 if (stream.next() instanceof CrawledDocument doc) {
                     data.add(doc);

@@ -376,7 +375,7 @@ class CrawlerRetreiverTest {
         doCrawl(tempFileWarc1, specs);
         convertToParquet(tempFileWarc1, tempFileParquet1);
         doCrawlWithReferenceStream(specs,
-                CrawledDomainReader.createDataStream(tempFileParquet1)
+                SerializableCrawlDataStream.openDataStream(tempFileParquet1)
         );
         convertToParquet(tempFileWarc2, tempFileParquet2);

@@ -397,7 +396,7 @@ class CrawlerRetreiverTest {
             });
         }

-        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
+        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
             while (ds.hasNext()) {
                 var doc = ds.next();
                 if (doc instanceof CrawledDomain dr) {

@@ -439,7 +438,7 @@ class CrawlerRetreiverTest {

         convertToParquet(tempFileWarc1, tempFileParquet1);

-        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
                 var doc = stream.next();
                 data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);

@@ -448,7 +447,7 @@ class CrawlerRetreiverTest {
                 throw new RuntimeException(e);
             }

-            var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
+            var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1);

             System.out.println("---");

@@ -488,7 +487,7 @@ class CrawlerRetreiverTest {
             });
         }

-        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
+        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
             while (ds.hasNext()) {
                 var doc = ds.next();
                 if (doc instanceof CrawledDomain dr) {

@@ -3,7 +3,6 @@ package nu.marginalia.extractor;
 import com.google.inject.Inject;
 import gnu.trove.set.hash.TLongHashSet;
 import nu.marginalia.hash.MurmurHash3_128;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;

@@ -59,7 +58,7 @@ public class AtagExporter implements ExporterIf {
             }

             Path crawlDataPath = inputDir.resolve(item.relPath());
-            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
+            try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
                 exportLinks(tagWriter, stream);
             }
             catch (Exception ex) {

@@ -1,7 +1,6 @@
 package nu.marginalia.extractor;

 import com.google.inject.Inject;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;

@@ -56,7 +55,7 @@ public class FeedExporter implements ExporterIf {
             }

             Path crawlDataPath = inputDir.resolve(item.relPath());
-            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
+            try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
                 exportFeeds(tagWriter, stream);
             }
             catch (Exception ex) {

@@ -75,7 +74,7 @@ public class FeedExporter implements ExporterIf {
     private boolean exportFeeds(FeedCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
         FeedExtractor feedExtractor = new FeedExtractor(new LinkParser());

-        int size = stream.sizeHint();
+        int size = stream.getSizeHint();

         while (stream.hasNext()) {
             if (!(stream.next() instanceof CrawledDocument doc))

@@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap;
 import gnu.trove.set.hash.TLongHashSet;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.SentenceExtractor;

@@ -103,7 +103,7 @@ public class TermFrequencyExporter implements ExporterIf {
     {
         TLongHashSet words = new TLongHashSet(1000);

-        try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
             while (stream.hasNext()) {
                 if (Thread.interrupted())
                     return;

@@ -228,7 +228,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
         }

         @Override
-        public boolean hasNext() throws IOException {
+        public boolean hasNext() {
             if (dataStack == null) {
                 query();
             }

@@ -236,7 +236,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
         }

         @Override
-        public void close() throws Exception {
+        public void close() {
             dataStack.clear();
         }
     }
@@ -3,7 +3,7 @@ plugins {
|
|||||||
|
|
||||||
id 'application'
|
id 'application'
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.3'
|
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||||
}
|
}
|
||||||
|
|
||||||
java {
|
java {
|
||||||
|
@@ -3,7 +3,7 @@ plugins {
|
|||||||
|
|
||||||
id 'application'
|
id 'application'
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.3'
|
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||||
}
|
}
|
||||||
|
|
||||||
application {
|
application {
|
||||||
|
@@ -3,7 +3,7 @@ plugins {
|
|||||||
|
|
||||||
id 'application'
|
id 'application'
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.3'
|
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||||
}
|
}
|
||||||
|
|
||||||
application {
|
application {
|
||||||
|
@@ -5,7 +5,7 @@ plugins {
|
|||||||
id 'application'
|
id 'application'
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
|
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.3'
|
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||||
}
|
}
|
||||||
|
|
||||||
application {
|
application {
|
||||||
|
@@ -3,7 +3,7 @@ plugins {
|
|||||||
id 'application'
|
id 'application'
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
id 'gg.jte.gradle' version '3.1.15'
|
id 'gg.jte.gradle' version '3.1.15'
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.3'
|
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||||
}
|
}
|
||||||
|
|
||||||
application {
|
application {
|
||||||
@@ -104,6 +104,8 @@ task compileTailwind {
|
|||||||
|
|
||||||
doLast {
|
doLast {
|
||||||
exec {
|
exec {
|
||||||
|
// If you're getting a build error like 'npm error could not determine executable to run'
|
||||||
|
// pointing you here, you need to run `npm install -D tailwindcss`
|
||||||
workingDir projectDir
|
workingDir projectDir
|
||||||
if (System.getProperty('os.name').toLowerCase().contains('windows')) {
|
if (System.getProperty('os.name').toLowerCase().contains('windows')) {
|
||||||
commandLine 'cmd', '/c', 'npx', 'tailwindcss',
|
commandLine 'cmd', '/c', 'npx', 'tailwindcss',
|
||||||
|
@@ -140,7 +140,8 @@ public class SearchSiteInfoService {
|
|||||||
) throws SQLException, ExecutionException {
|
) throws SQLException, ExecutionException {
|
||||||
|
|
||||||
if (null == domainName || domainName.isBlank()) {
|
if (null == domainName || domainName.isBlank()) {
|
||||||
return null;
|
// If we don't get a domain name, we redirect to the /site endpoint
|
||||||
|
return new MapModelAndView("redirect.jte", Map.of("url", "/site"));
|
||||||
}
|
}
|
||||||
|
|
||||||
page = Objects.requireNonNullElse(page, 1);
|
page = Objects.requireNonNullElse(page, 1);
|
||||||
|
@@ -86,7 +86,7 @@
         @endif
 
         @if(result.getFirst().isTracking())
-            <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
+            <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
         @endif
 
         @if(result.getFirst().isScripts())
@@ -94,11 +94,11 @@
         @endif
 
         @if(result.getFirst().isAds())
-            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Ads</span>
+            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Has Ads</span>
         @endif
 
         @if(result.getFirst().isAffiliate())
-            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
+            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
         @endif
 
         </span>
@@ -53,7 +53,7 @@
         @endif
 
         @if(details.isTracking())
-            <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
+            <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
         @endif
 
         @if(details.isScripts())
@@ -65,7 +65,7 @@
         @endif
 
         @if(details.isAffiliate())
-            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
+            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
         @endif
 
         </div>
@@ -2,7 +2,7 @@ plugins {
     id 'java'
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 java {
@@ -3,7 +3,7 @@ plugins {
 
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 application {
@@ -2,7 +2,7 @@ plugins {
     id 'java'
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 java {
@@ -3,7 +3,7 @@ plugins {
 
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 application {
@@ -3,7 +3,7 @@ plugins {
 
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 application {
@@ -3,7 +3,7 @@ plugins {
 
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 application {
@@ -3,7 +3,7 @@ package nu.marginalia.tools;
 import com.google.inject.Guice;
 import com.google.inject.Injector;
 import nu.marginalia.converting.ConverterModule;
-import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.service.module.DatabaseModule;
 
@@ -40,7 +40,7 @@ public class ExperimentRunnerMain {
         Path basePath = Path.of(args[0]);
         for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
             Path crawlDataPath = basePath.resolve(item.relPath());
-            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
+            try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
                 experiment.process(stream);
             }
             catch (Exception ex) {
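These hunks, together with the IntegrationTest hunks further down, replace `CrawledDomainReader.createDataStream(...)` with `SerializableCrawlDataStream.openDataStream(...)`. A minimal caller under the renamed API could look like the sketch below; the `openDataStream` call and the try-with-resources shape come from the diff, while the `hasNext()`/`next()` iteration, the `CrawlDataDump` class, and the path handling are assumptions for illustration.

```java
import java.nio.file.Path;

import nu.marginalia.io.SerializableCrawlDataStream;

// Hypothetical smoke-test utility that walks one crawl data file and prints each record.
class CrawlDataDump {
    public static void main(String[] args) {
        Path crawlDataPath = Path.of(args[0]);

        // openDataStream replaces CrawledDomainReader.createDataStream in this changeset
        try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
            while (stream.hasNext()) {
                // The concrete record type depends on the stream contents;
                // printing each record is enough for a quick inspection.
                System.out.println(stream.next());
            }
        }
        catch (Exception ex) {
            System.err.println("Failed to read " + crawlDataPath + ": " + ex.getMessage());
        }
    }
}
```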
@@ -26,7 +26,7 @@ import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.index.model.SearchParameters;
 import nu.marginalia.index.searchset.SearchSetAny;
-import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
 import nu.marginalia.linkdb.docs.DocumentDbWriter;
 import nu.marginalia.loading.LoaderIndexJournalWriter;
@@ -152,7 +152,7 @@ public class IntegrationTest {
 
         /** PROCESS CRAWL DATA */
 
-        var processedDomain = domainProcessor.fullProcessing(CrawledDomainReader.createDataStream(crawlDataParquet));
+        var processedDomain = domainProcessor.fullProcessing(SerializableCrawlDataStream.openDataStream(crawlDataParquet));
 
         System.out.println(processedDomain);
 
@@ -3,7 +3,7 @@ plugins {
 
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }
 
 java {
@@ -16,7 +16,7 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
 The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
 graalce is a good distribution choice but it doesn't matter too much.
 
-**Tailwindcss** - Install NPM and run `npm install -D tailwindcss`
+**Tailwindcss** - Install NPM and run `npm install tailwindcss @tailwindcss/cli`
 
 ## Quick Set up
 