1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
Viktor Lofgren 1c128e6d82 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:02:03 +02:00

125 lines
5.0 KiB
Java

package nu.marginalia.converting.sideload;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.util.EnumSet;
import java.util.List;
@Singleton
public class SideloaderProcessing {
private final HtmlDocumentProcessorPlugin htmlProcessorPlugin;
@Inject
public SideloaderProcessing(HtmlDocumentProcessorPlugin htmlProcessorPlugin) {
this.htmlProcessorPlugin = htmlProcessorPlugin;
}
public ProcessedDocument processDocument(String url,
String body,
List<String> extraKeywords,
DomainLinks domainLinks,
GeneratorType type,
DocumentClass documentClass,
LinkTexts linkTexts,
int pubYear,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument(
"synthetic",
url,
"text/html",
LocalDateTime.now().toString(),
200,
"OK",
"NP",
"",
body.getBytes(StandardCharsets.UTF_8),
false,
-1,
null,
null
);
// Give the document processing preferential treatment if this is a sideloaded wiki, since we
// truncate the document to the first paragraph, which typically is too short to be included
// on its own.
var ret = new ProcessedDocument();
try {
var details = htmlProcessorPlugin.createDetails(crawledDoc, linkTexts, documentClass);
ret.words = details.words();
for (String keyword : extraKeywords)
ret.words.addMeta(keyword, WordFlags.Subjects.asBit());
if (type == GeneratorType.WIKI) {
ret.words.addAllSyntheticTerms(List.of("generator:wiki"));
} else if (type == GeneratorType.DOCS) {
ret.words.addAllSyntheticTerms(List.of("generator:docs"));
} else if (type == GeneratorType.FORUM) {
ret.words.addAllSyntheticTerms(List.of("generator:forum"));
}
ret.details = details.details();
// Add a few things that we know about the document
// that we can't get from the sideloaded data since it's
// so stripped down
ret.details.format = DocumentFormat.HTML5;
ret.details.pubYear = pubYear;
ret.details.features.add(HtmlFeature.JS);
ret.details.features.add(HtmlFeature.TRACKING);
ret.details.quality = -4.5;
ret.details.generator = type;
ret.details.metadata = new DocumentMetadata(3,
PubDate.toYearByte(pubYear),
(int) -ret.details.quality,
switch (type) {
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded);
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs, DocumentFlags.Sideloaded);
default -> EnumSet.noneOf(DocumentFlags.class);
});
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
// deal but something to keep in mind
int topology = domainLinks.countForUrl(new EdgeUrl(url));
ret.details.metadata = ret.details.metadata
.withSizeAndTopology(size, topology);
ret.url = new EdgeUrl(url);
ret.state = UrlIndexingState.OK;
ret.stateReason = "SIDELOAD";
}
catch (Exception e) {
ret.url = new EdgeUrl(url);
ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = "SIDELOAD";
}
return ret;
}
}