1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

15 Commits

Author SHA1 Message Date
Viktor Lofgren
612f076400 (converter) Tweak escort filter parameters 2025-05-01 17:27:07 +02:00
Viktor Lofgren
2e87614768 (converter) Create a filter for escort ads spam 2025-05-01 16:07:53 +02:00
Viktor Lofgren
b07080db16 (crawler) Don't retry requests when encountering UnknownHostException 2025-05-01 16:07:34 +02:00
Viktor Lofgren
e9d86dca4a (crawler) Add timeout to wrap-up phase of WarcInputBuffer. 2025-05-01 15:57:47 +02:00
Viktor Lofgren
1d693f0efa (build) Upgrade JIB to 3.4.5 2025-04-30 15:26:52 +02:00
Viktor Lofgren
5874a163dc (build) Upgrade gradle to 8.14 2025-04-30 15:26:37 +02:00
Viktor Lofgren
5ec7a1deab (crawler) Fix 80%-ish progress crawler stall
The crawl tasks are started in two phases: first while they are generated in one loop, and then in a second loop that drains the task list. If the first loop contains a long-running crawl task that is triggered late, the rest of the crawl may halt until that task is finished.

Fixed the problem by draining and re-trying also in the first loop.
2025-04-29 12:23:51 +02:00
Viktor Lofgren
7fea2808ed (search) Fix error view
Fix rendering error when query was null

Fix border on error message.
2025-04-27 12:12:56 +02:00
Viktor Lofgren
8da74484f0 (search) Remove unused count modifier from the footer help 2025-04-27 12:08:34 +02:00
Viktor Lofgren
923d5a7234 (search) Add a note for TUI users pointing them to the old UI 2025-04-27 11:52:07 +02:00
Viktor Lofgren
58f88749b8 (deploy) assistant 2025-04-25 13:25:50 +02:00
Viktor Lofgren
77f727a5ba (crawler) Alter conditional request logic to avoid sending both If-None-Match and If-Modified-Since
It seems like some servers dislike this combination, and may turn a 304 into a 200.
2025-04-25 13:19:07 +02:00
Viktor Lofgren
667cfb53dc (assistant) Remove more link text junk from suggestions at loadtime. 2025-04-24 13:35:29 +02:00
Viktor Lofgren
fe36d4ed20 (deploy) Executor services 2025-04-24 13:23:51 +02:00
Viktor Lofgren
acf4bef98d (assistant) Improve search suggestions
Improve suggestions by loading a secondary suggestions set with link text data.
2025-04-24 13:10:59 +02:00
34 changed files with 272 additions and 433 deletions

View File

@@ -5,7 +5,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
}
group 'marginalia'
@@ -47,7 +47,7 @@ ext {
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
dockerImageTag='latest'
dockerImageRegistry='marginalia'
jibVersion = '3.4.4'
jibVersion = '3.4.5'
}
idea {

View File

@@ -26,6 +26,9 @@ public enum HtmlFeature {
ADVERTISEMENT("special:ads"),
CATEGORY_CRAFTS("category:crafts"),
CATEGORY_NSFW("special:nsfw"),
CATEGORY_SPAM("special:spam"),
GA_SPAM("special:gaspam"),
/** For fingerprinting and ranking */

View File

@@ -229,13 +229,15 @@ public class FeedFetcherService {
.timeout(Duration.ofSeconds(15))
;
if (ifModifiedSinceDate != null) {
// Set the If-Modified-Since or If-None-Match headers if we have them
// though since there are certain idiosyncrasies in server implementations,
// we avoid setting both at the same time as that may turn a 304 into a 200.
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
} else if (ifModifiedSinceDate != null) {
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
}
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
}
HttpRequest getRequest = requestBuilder.build();

View File

@@ -1,53 +0,0 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.language.model.DocumentLanguageData;
import org.apache.commons.lang3.StringUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores a document against a caller-supplied list of weighted terms.
 * <p>
 * Each entry of the term list is a word followed by a numeric weight,
 * separated by a space. The word is stemmed before it is stored, so it is
 * matched against the stemmed words of the document.
 */
public class AdHocDetector {
    /** Word count below which no length penalty is applied. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term mapped to its weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    /**
     * Builds the detector from a list of {@code "<word> <weight>"} entries.
     *
     * @param terms the weighted terms, one per list element
     */
    public AdHocDetector(List<String> terms) {
        PorterStemmer stemmer = new PorterStemmer();

        for (String entry : terms) {
            String[] fields = StringUtils.split(entry, ' ');
            String stem = stemmer.stemWord(fields[0]);
            double weight = Double.parseDouble(fields[1]);
            termValues.put(stem, weight);
        }
    }

    /**
     * Calculates the score of the document.
     * <p>
     * The first occurrence of a known term contributes its weight; every
     * further occurrence halves what that term has accumulated so far and
     * then adds the weight again. The per-term scores are summed and scaled
     * by {@code sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount))}, which
     * only reduces the score of documents longer than {@code AVG_LENGTH} words.
     *
     * @param dld the document to score
     * @return the score, or 0 if the document contains no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> scoreByStem = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld) {
            for (var stem : sentence.stemmedWords) {
                wordCount++;

                final Double weight = termValues.get(stem);
                if (weight == null) {
                    continue;
                }
                scoreByStem.merge(stem, weight, (accumulated, added) -> 0.5 * accumulated + added);
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount));
        double total = scoreByStem.values().stream().mapToDouble(Double::doubleValue).sum();

        return total * lengthPenalty;
    }
}

View File

@@ -0,0 +1,123 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Basic spam detector for escort ads.
 * <p>
 * Tries to differentiate between escorts (callgirls), escorts (warships)
 * and the Ford Escort. It does so by summing per-word weights for four
 * vocabularies (suggestive, escort, naval and automotive) and comparing
 * the totals.
 */
public class EscortSpamDetector {
    private static final Logger logger = LoggerFactory.getLogger(EscortSpamDetector.class);
    private static final Marker marker = MarkerFactory.getMarker("FILTER");

    private final PorterStemmer ps = new PorterStemmer();

    /** Stemmed word mapped to weight, one table per vocabulary. */
    private final Map<String, Double> sexyValues = new HashMap<>();
    private final Map<String, Double> escortValues = new HashMap<>();
    private final Map<String, Double> navyValues = new HashMap<>();
    private final Map<String, Double> carValues = new HashMap<>();

    /** Stemmed two-word phrases; element 0 is the first word, element 1 the second. */
    private final List<String[]> callgirlPhrases = new ArrayList<>();

    @Inject
    public EscortSpamDetector() {
        loadSexyTerms();
        loadEscortTerms();
        loadNavyTerms();
        loadCarTerms();
        loadCallgirlPhrases();
    }

    private void loadSexyTerms() {
        weigh(sexyValues, "sexy", 0.5);
        weigh(sexyValues, "hot", 0.1);
        weigh(sexyValues, "girl", 0.3);
        weigh(sexyValues, "massage", 0.3);
        weigh(sexyValues, "adult", 0.3);
        weigh(sexyValues, "companion", 0.3);
        weigh(sexyValues, "date", 0.1);
        // "callgirl" is also present in the escort vocabulary, so it raises both scores
        weigh(sexyValues, "callgirl", 0.5);
    }

    private void loadEscortTerms() {
        weigh(escortValues, "escort", 0.3);
        weigh(escortValues, "callgirl", 1.0);
    }

    private void loadNavyTerms() {
        weigh(navyValues, "navy", 0.1);
        weigh(navyValues, "fleet", 0.2);
        weigh(navyValues, "maritime", 0.3);
        weigh(navyValues, "warship", 0.5);
        weigh(navyValues, "cruiser", 0.5);
        weigh(navyValues, "carrier", 0.3);
        weigh(navyValues, "destroyer", 0.3);
    }

    private void loadCarTerms() {
        weigh(carValues, "ford", 0.3);
        weigh(carValues, "vehicle", 0.3);
        weigh(carValues, "sedan", 0.3);
        weigh(carValues, "hatchback", 0.3);
        weigh(carValues, "transmission", 0.3);
        weigh(carValues, "exhaust", 0.3);
        weigh(carValues, "fuel", 0.3);
    }

    private void loadCallgirlPhrases() {
        phrase("call", "girl");
        phrase("escort", "service");
        phrase("escort", "agency");
    }

    /** Stores the weight under the stemmed form of the word. */
    private void weigh(Map<String, Double> table, String word, double weight) {
        table.put(ps.stemWord(word), weight);
    }

    /** Records a two-word phrase, stemming both words. */
    private void phrase(String first, String second) {
        callgirlPhrases.add(new String[] { ps.stemWord(first), ps.stemWord(second) });
    }

    /**
     * Tests whether the document looks like escort ad spam.
     * <p>
     * Every stemmed word adds its weight (if any) to each vocabulary score.
     * A word that completes one of the registered two-word phrases within
     * the same sentence adds a further 0.5 to both the escort and the
     * suggestive score. The document is flagged when the escort score is at
     * least 1.5 and the suggestive score exceeds the combined naval and
     * automotive scores by more than 1.5.
     *
     * @param dld the document to test
     * @param url the document's URL, used only for the log message
     * @return true if the document was identified as escort spam
     */
    public boolean test(DocumentLanguageData dld, EdgeUrl url) {
        double sexyScore = 0.0;
        double escortScore = 0.0;
        double navyScore = 0.0;
        double carScore = 0.0;
        int wordCount = 0;

        for (var sentence : dld) {
            // Phrases are only matched within a sentence, so start each one afresh
            String previous = "";

            for (var current : sentence.stemmedWords) {
                wordCount++;

                sexyScore += sexyValues.getOrDefault(current, 0.0);
                escortScore += escortValues.getOrDefault(current, 0.0);
                navyScore += navyValues.getOrDefault(current, 0.0);
                carScore += carValues.getOrDefault(current, 0.0);

                for (String[] pair : callgirlPhrases) {
                    boolean completesPhrase = previous.equals(pair[0]) && current.equals(pair[1]);
                    if (!completesPhrase) {
                        continue;
                    }
                    escortScore += 0.5;
                    sexyScore += 0.5;
                }

                previous = current;
            }
        }

        if (wordCount == 0) {
            return false;
        }
        if (escortScore < 1.5) {
            return false;
        }

        boolean isSpam = sexyScore > navyScore + carScore + 1.5;
        if (!isSpam) {
            return false;
        }

        logger.info(marker, "Escort spam identified in {}", url);
        return true;
    }
}

View File

@@ -1,159 +0,0 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.HashMap;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores a document against a fixed vocabulary of textile craft terms
 * (knitting, crochet, weaving, sewing).
 * <p>
 * Words such as "shop", "cart" and "price" carry negative weights and
 * lower the score.
 */
public class TextileCraftDetector {
    /** Word count below which no length penalty is applied. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term mapped to its weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    @Inject
    public TextileCraftDetector() {
        PorterStemmer stemmer = new PorterStemmer();

        // Negatively weighted terms
        weigh(stemmer, "shop", -0.1);
        weigh(stemmer, "newsletter", -0.1);
        weigh(stemmer, "cart", -0.1);
        weigh(stemmer, "item", -0.025);
        weigh(stemmer, "price", -0.1);
        weigh(stemmer, "book", -0.1);
        weigh(stemmer, "order", -0.1);
        weigh(stemmer, "exhibition", -0.1);

        // Positively weighted terms
        weigh(stemmer, "knit", 0.05);
        weigh(stemmer, "stitch", 0.05);
        weigh(stemmer, "yarn", 0.05);
        weigh(stemmer, "crochet", 0.05);
        weigh(stemmer, "ravelry", 0.15);

        weigh(stemmer, "stockinette", 0.075);
        weigh(stemmer, "purl", 0.075);
        weigh(stemmer, "ksp", 0.075);
        weigh(stemmer, "kwise", 0.075);
        weigh(stemmer, "k2tog", 0.075);
        weigh(stemmer, "k1b", 0.075);
        weigh(stemmer, "psso", 0.075);
        weigh(stemmer, "p2sso", 0.075);
        weigh(stemmer, "pwise", 0.075);
        weigh(stemmer, "yrn", 0.075);
        weigh(stemmer, "yon", 0.075);
        weigh(stemmer, "entrelac", 0.075);
        weigh(stemmer, "thrum", 0.075);
        weigh(stemmer, "bobbin", 0.025);
        weigh(stemmer, "boucle", 0.075);
        weigh(stemmer, "lopi", 0.075);
        weigh(stemmer, "eyelash", 0.01);
        weigh(stemmer, "variegated", 0.075);
        weigh(stemmer, "serge", 0.04);
        weigh(stemmer, "selvage", 0.075);
        weigh(stemmer, "topstitch", 0.075);

        weigh(stemmer, "gauge", 0.01);
        weigh(stemmer, "design", 0.01);
        weigh(stemmer, "pattern", 0.01);
        weigh(stemmer, "layer", 0.01);
        weigh(stemmer, "color", 0.01);
        weigh(stemmer, "colour", 0.01);
        weigh(stemmer, "chart", 0.01);
        weigh(stemmer, "grid", 0.01);
        weigh(stemmer, "wool", 0.01);
        weigh(stemmer, "acrylic", 0.01);
        weigh(stemmer, "loose", 0.01);
        weigh(stemmer, "loop", 0.01);
        weigh(stemmer, "needle", 0.01);
        weigh(stemmer, "row", 0.01);
        weigh(stemmer, "circular", 0.01);
        weigh(stemmer, "sew", 0.01);
        weigh(stemmer, "size", 0.01);
        weigh(stemmer, "repeat", 0.01);
        weigh(stemmer, "repetition", 0.01);
        weigh(stemmer, "basketweave", 0.01);
        weigh(stemmer, "weave", 0.01);
        weigh(stemmer, "loom", 0.01);
        weigh(stemmer, "warp", 0.01);
        weigh(stemmer, "weft", 0.01);
        weigh(stemmer, "shuttle", 0.01);
        weigh(stemmer, "brioche", 0.01);
        weigh(stemmer, "spool", 0.01);
        weigh(stemmer, "hem", 0.01);
        weigh(stemmer, "bodice", 0.01);
        weigh(stemmer, "seam", 0.01);
        weigh(stemmer, "allowance", 0.01);
        weigh(stemmer, "crinoline", 0.01);
        weigh(stemmer, "petticoat", 0.01);
        weigh(stemmer, "armscye", 0.01);
        weigh(stemmer, "baste", 0.01);
        weigh(stemmer, "cord", 0.01);
        weigh(stemmer, "darning", 0.01);
        weigh(stemmer, "draping", 0.01);
        weigh(stemmer, "embroider", 0.01);
        weigh(stemmer, "eyelet", 0.01);
        weigh(stemmer, "godet", 0.01);
        weigh(stemmer, "gore", 0.01);
        weigh(stemmer, "grain", 0.01);
        weigh(stemmer, "jersey", 0.01);
        weigh(stemmer, "lining", 0.01);
        weigh(stemmer, "muslin", 0.01);
        weigh(stemmer, "needlework", 0.01);
        weigh(stemmer, "pleat", 0.01);
        weigh(stemmer, "quilt", 0.01);
        weigh(stemmer, "silk", 0.01);
        weigh(stemmer, "sloper", 0.01);
        weigh(stemmer, "surplice", 0.01);
        weigh(stemmer, "thread", 0.01);
        weigh(stemmer, "twill", 0.01);

        weigh(stemmer, "ch", 0.01);
        weigh(stemmer, "sp", 0.01);
        weigh(stemmer, "sl", 0.01);
        weigh(stemmer, "sc", 0.01);
        weigh(stemmer, "ss", 0.01);
        weigh(stemmer, "hdc", 0.01);
        weigh(stemmer, "turn", 0.01);
        weigh(stemmer, "skip", 0.01);
        weigh(stemmer, "round", 0.01);
        weigh(stemmer, "ring", 0.01);
        weigh(stemmer, "sequin", 0.01);
        weigh(stemmer, "bobble", 0.01);
        weigh(stemmer, "puff", 0.01);
        weigh(stemmer, "v-stitch", 0.01);
    }

    /** Stores the weight under the stemmed form of the word. */
    private void weigh(PorterStemmer stemmer, String word, double weight) {
        termValues.put(stemmer.stemWord(word), weight);
    }

    /**
     * Calculates the score of the document.
     * <p>
     * The first occurrence of a known term contributes its weight; every
     * further occurrence halves what that term has accumulated so far and
     * then adds the weight again. The per-term scores are summed and scaled
     * by {@code sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount))}, which
     * only reduces the score of documents longer than {@code AVG_LENGTH} words.
     *
     * @param dld the document to score
     * @return the score, or 0 if the document contains no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> scoreByStem = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld) {
            for (var stem : sentence.stemmedWords) {
                wordCount++;

                final Double weight = termValues.get(stem);
                if (weight == null) {
                    continue;
                }
                scoreByStem.merge(stem, weight, (accumulated, added) -> 0.5 * accumulated + added);
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount));
        double total = scoreByStem.values().stream().mapToDouble(Double::doubleValue).sum();

        return total * lengthPenalty;
    }
}

View File

@@ -1,135 +0,0 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.HashMap;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores a document against a fixed vocabulary of woodworking and joinery
 * terms.
 * <p>
 * Words such as "shop", "cart" and "price" carry negative weights and
 * lower the score.
 */
public class WoodworkingDetector {
    /** Word count below which no length penalty is applied. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term mapped to its weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    @Inject
    public WoodworkingDetector() {
        PorterStemmer stemmer = new PorterStemmer();

        // Negatively weighted terms
        weigh(stemmer, "shop", -0.1);
        weigh(stemmer, "newsletter", -0.1);
        weigh(stemmer, "cart", -0.1);
        weigh(stemmer, "item", -0.025);
        weigh(stemmer, "price", -0.1);
        weigh(stemmer, "book", -0.1);
        weigh(stemmer, "order", -0.1);
        weigh(stemmer, "exhibition", -0.1);

        // woodworking and joinery
        weigh(stemmer, "apse", 0.01);
        weigh(stemmer, "baluster", 0.01);
        weigh(stemmer, "beam", 0.01);
        weigh(stemmer, "cornice", 0.01);
        weigh(stemmer, "drill", 0.01);
        weigh(stemmer, "nail", 0.01);
        weigh(stemmer, "saw", 0.01);
        weigh(stemmer, "hacksaw", 0.01);
        weigh(stemmer, "bandsaw", 0.01);
        weigh(stemmer, "whipsaw", 0.01);
        weigh(stemmer, "gimlet", 0.01);
        weigh(stemmer, "clamp", 0.01);
        weigh(stemmer, "glue", 0.01);
        weigh(stemmer, "cut", 0.01);
        weigh(stemmer, "plane", 0.01);
        weigh(stemmer, "sand", 0.01);
        weigh(stemmer, "bevel", 0.01);
        weigh(stemmer, "chamfer", 0.01);
        weigh(stemmer, "dado", 0.075);
        weigh(stemmer, "dowel", 0.05);
        weigh(stemmer, "dovetail", 0.05);
        weigh(stemmer, "joint", 0.01);
        weigh(stemmer, "level", 0.01);
        weigh(stemmer, "edge", 0.01);
        weigh(stemmer, "face", 0.01);
        weigh(stemmer, "fibreboard", 0.01);
        weigh(stemmer, "fiberboard", 0.01);
        weigh(stemmer, "battens", 0.01);
        weigh(stemmer, "furring", 0.01);
        weigh(stemmer, "glulam", 0.025);
        weigh(stemmer, "hardboard", 0.025);
        weigh(stemmer, "hardwood", 0.01);
        weigh(stemmer, "jamb", 0.015);
        weigh(stemmer, "kerf", 0.025);
        weigh(stemmer, "lvl", 0.025);
        weigh(stemmer, "laminated", 0.01);
        weigh(stemmer, "lignin", 0.01);
        weigh(stemmer, "mitre", 0.01);
        weigh(stemmer, "mortise", 0.015);
        weigh(stemmer, "mullion", 0.01);
        weigh(stemmer, "newel", 0.01);
        weigh(stemmer, "nogging", 0.01);
        weigh(stemmer, "ogee", 0.01);
        weigh(stemmer, "ogive", 0.01);
        weigh(stemmer, "ovolo", 0.01);
        weigh(stemmer, "drawknife", 0.01);
        weigh(stemmer, "plywood", 0.01);
        weigh(stemmer, "purlin", 0.01);
        weigh(stemmer, "riser", 0.01);
        weigh(stemmer, "sapwood", 0.01);
        weigh(stemmer, "shingle", 0.01);
        weigh(stemmer, "softwood", 0.01);
        weigh(stemmer, "sapwood", 0.01); // listed twice; this second put is a no-op
        weigh(stemmer, "stave", 0.01);
        weigh(stemmer, "stopper", 0.01);
        weigh(stemmer, "stud", 0.01); // beep beep beep, huh, the stud detector seems to work just well :D
        weigh(stemmer, "transom", 0.01);
        weigh(stemmer, "v-joint", 0.015);
        weigh(stemmer, "veneer", 0.01);
        weigh(stemmer, "quartersaw", 0.015);
        weigh(stemmer, "screw", 0.01);
        weigh(stemmer, "woodturning", 0.01);
        weigh(stemmer, "pine", 0.005);
        weigh(stemmer, "balsa", 0.01);
        weigh(stemmer, "poplar", 0.005);

        weigh(stemmer, "nut", 0.01);
        weigh(stemmer, "bolt", 0.01);
        weigh(stemmer, "tack", 0.01);
        weigh(stemmer, "hinge", 0.01);
        weigh(stemmer, "brass", 0.01);
        weigh(stemmer, "fitting", 0.01);

        weigh(stemmer, "diy", 0.015);
        weigh(stemmer, "dozuki", 0.01);
    }

    /** Stores the weight under the stemmed form of the word. */
    private void weigh(PorterStemmer stemmer, String word, double weight) {
        termValues.put(stemmer.stemWord(word), weight);
    }

    /**
     * Calculates the score of the document.
     * <p>
     * The first occurrence of a known term contributes its weight; every
     * further occurrence halves what that term has accumulated so far and
     * then adds the weight again. The per-term scores are summed and scaled
     * by {@code sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount))}, which
     * only reduces the score of documents longer than {@code AVG_LENGTH} words.
     *
     * @param dld the document to score
     * @return the score, or 0 if the document contains no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> scoreByStem = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld) {
            for (var stem : sentence.stemmedWords) {
                wordCount++;

                final Double weight = termValues.get(stem);
                if (weight == null) {
                    continue;
                }
                scoreByStem.merge(stem, weight, (accumulated, added) -> 0.5 * accumulated + added);
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount));
        double total = scoreByStem.values().stream().mapToDouble(Double::doubleValue).sum();

        return total * lengthPenalty;
    }
}

View File

@@ -5,9 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.EscortSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
@@ -67,21 +66,19 @@ public class FeatureExtractor {
private final AdblockSimulator adblockSimulator;
private final RecipeDetector recipeDetector;
private final TextileCraftDetector textileCraftDetector;
private final WoodworkingDetector woodworkingDetector;
private final EscortSpamDetector escortSpamDetector;
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
@Inject
public FeatureExtractor(AdblockSimulator adblockSimulator,
RecipeDetector recipeDetector,
TextileCraftDetector textileCraftDetector,
WoodworkingDetector woodworkingDetector,
EscortSpamDetector escortSpamDetector,
GoogleAnwersSpamDetector googleAnwersSpamDetector)
{
this.adblockSimulator = adblockSimulator;
this.recipeDetector = recipeDetector;
this.textileCraftDetector = textileCraftDetector;
this.woodworkingDetector = woodworkingDetector;
this.escortSpamDetector = escortSpamDetector;
this.googleAnwersSpamDetector = googleAnwersSpamDetector;
}
@@ -343,9 +340,11 @@ public class FeatureExtractor {
if (recipeDetector.testP(dld) > 0.5)
features.add(HtmlFeature.CATEGORY_FOOD);
// these should be mutually exclusive
else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
features.add(HtmlFeature.CATEGORY_CRAFTS);
if (escortSpamDetector.test(dld, url)) {
features.add(HtmlFeature.CATEGORY_SPAM);
features.add(HtmlFeature.CATEGORY_NSFW);
}
return features;
}

View File

@@ -264,17 +264,16 @@ public class CrawlerMain extends ProcessMainClass {
if (workLog.isJobFinished(crawlSpec.domain))
continue;
var task = new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
if (!trySubmitDeferredTask(task)) {
// Otherwise add to the taskList for deferred execution
// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
retryQueue.drainTo(taskList);
taskList.removeIf(this::trySubmitDeferredTask);
// Then add this new task to the retry queue
taskList.add(task);
}
}

View File

@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
/** Paints the tags onto the request builder. */
public void paint(HttpGet request) {
// Paint the ETag header if present,
// otherwise paint the Last-Modified header
// (but not both at the same time due to some servers not liking it)
if (etag != null) {
request.addHeader("If-None-Match", etag);
}
if (lastMod != null) {
} else if (lastMod != null) {
request.addHeader("If-Modified-Since", lastMod);
}
}

View File

@@ -51,6 +51,7 @@ import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
@@ -635,14 +636,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
@Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
return false;
}
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
return false;
}
return executionCount <= 3;
return switch (exception) {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
default -> executionCount <= 3;
};
}
@Override

View File

@@ -57,6 +57,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer();
}
Instant start = Instant.now();
InputStream is = null;
try {
is = entity.getContent();
@@ -71,8 +72,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
}
finally {
// We're required to consume the stream to avoid leaking connections,
// but we also don't want to get stuck on slow or malicious connections
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
try {
is.skip(Long.MAX_VALUE);
while (is != null) {
// Consume some data
if (is.skip(65536) == 0) {
// Note that skip may return 0 if the stream is empty
// or for other unspecified reasons, so we need to check
// with read() as well to determine if the stream is done
if (is.read() == -1)
is = null;
}
// Check if the time limit has been exceeded
else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
request.abort();
is = null;
}
}
}
catch (IOException e) {
// Ignore the exception

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -5,7 +5,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -26,4 +26,10 @@
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
</head>
</head>
<noscript>
<h1>Users of text-based browsers</h1>
<p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
as it uses fewer modern CSS tricks, and should work better than the new UI. It's functionally nearly identical, but just renders it using a different layout.</p>
<hr>
</noscript>

View File

@@ -26,7 +26,7 @@
<!-- Main content -->
<main class="flex-1 p-4 max-w-2xl space-y-4">
<div class="border dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
<div class="flex space-x-3 place-items-baseline">
<i class="fa fa-circle-exclamation text-red-800"></i>
<div class="grow">${model.errorTitle()}</div>

View File

@@ -80,10 +80,6 @@
<tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
<tr><td>count&gt;10</td><td> The search term must appear in at least 10 results form the domain</td></tr>
<tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>

View File

@@ -7,7 +7,7 @@
<form class="flex-1 max-w-2xl" action="/search">
<div class="flex">
@if (query.isBlank())
@if (query != null && query.isBlank())
<%-- Add autofocus if the query is blank --%>
<input type="text"
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
public class AssistantModule extends AbstractModule {
public void configure() {
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.assistant.suggest;
import gnu.trove.list.array.TIntArrayList;
import org.jetbrains.annotations.NotNull;
import java.util.*;
@@ -434,7 +435,7 @@ public class PrefixSearchStructure {
/**
* Class representing a suggested completion.
*/
public static class ScoredSuggestion {
public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
private final String word;
private final int score;
@@ -455,5 +456,10 @@ public class PrefixSearchStructure {
public String toString() {
return word + " (" + score + ")";
}
@Override
public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
return Integer.compare(this.score, o.score);
}
}
}

View File

@@ -2,8 +2,6 @@ package nu.marginalia.assistant.suggest;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.functions.math.dict.SpellChecker;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -13,35 +11,27 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.*;
import java.util.zip.GZIPInputStream;
public class Suggestions {
private PrefixSearchStructure searchStructure = null;
private TermFrequencyDict termFrequencyDict = null;
private volatile boolean ready = false;
private final SpellChecker spellChecker;
List<PrefixSearchStructure> searchStructures = new ArrayList<>();
private volatile boolean ready = false;
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
private static final int MIN_SUGGEST_LENGTH = 3;
@Inject
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
SpellChecker spellChecker,
TermFrequencyDict dict
public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
@Named("suggestions-file2") Path suggestionsFile2
) {
this.spellChecker = spellChecker;
Thread.ofPlatform().start(() -> {
searchStructure = loadSuggestions(suggestionsFile);
termFrequencyDict = dict;
searchStructures.add(loadSuggestions(suggestionsFile1));
searchStructures.add(loadSuggestions(suggestionsFile2));
ready = true;
logger.info("Loaded {} suggestions", searchStructure.size());
logger.info("Loaded suggestions");
});
}
@@ -55,8 +45,8 @@ public class Suggestions {
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
String[] parts = StringUtils.split(line, " ", 2);
String line = scanner.nextLine().trim();
String[] parts = StringUtils.split(line, " ,", 2);
if (parts.length != 2) {
logger.warn("Invalid suggestion line: {}", line);
continue;
@@ -64,7 +54,30 @@ public class Suggestions {
int cnt = Integer.parseInt(parts[0]);
if (cnt > 1) {
String word = parts[1];
ret.insert(word, cnt);
// Remove quotes and trailing periods if this is a CSV
if (word.startsWith("\"") && word.endsWith("\"")) {
word = word.substring(1, word.length() - 1);
}
// Remove trailing periods
while (word.endsWith(".")) {
word = word.substring(0, word.length() - 1);
}
// Remove junk items we may have gotten from link extraction
if (word.startsWith("click here"))
continue;
if (word.contains("new window"))
continue;
if (word.contains("click to"))
continue;
if (word.startsWith("share "))
continue;
if (word.length() > 3) {
ret.insert(word, cnt);
}
}
}
return ret;
@@ -96,10 +109,22 @@ public class Suggestions {
return List.of();
}
var results = searchStructure.getTopCompletions(prefix, count);
List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
for (var searchStructure : searchStructures) {
resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
}
resultsAll.sort(Comparator.reverseOrder());
List<String> ret = new ArrayList<>(count);
for (var result : results) {
ret.add(result.getWord());
Set<String> seen = new HashSet<>();
for (var result : resultsAll) {
if (seen.add(result.getWord())) {
ret.add(result.getWord());
}
if (ret.size() >= count) {
break;
}
}
return ret;

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -64,6 +64,11 @@ public class ControlMain extends MainClass {
download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
}
Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
if (!Files.exists(altSuggestionsFile)) {
download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
}
Path asnRawData = dataPath.resolve("asn-data-raw-table");
if (!Files.exists(asnRawData)) {
download(asnRawData, new URI("https://thyme.apnic.net/current/data-raw-table"));

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -1,4 +1,6 @@
## This is a token file for automatic deployment
## This is a token file for triggering automatic deployment when no commit is made.
2025-01-08: Deploy executor.
2025-01-07: Deploy executor.
2025-01-07: Deploy executor.
2025-04-24: Deploy executor.
2025-04-24: Deploy assistant.

View File

@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists