1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
612f076400 (converter) Tweak escort filter parameters 2025-05-01 17:27:07 +02:00
Viktor Lofgren
2e87614768 (converter) Create a filter for escort ads spam 2025-05-01 16:07:53 +02:00
6 changed files with 136 additions and 358 deletions

View File

@@ -26,6 +26,9 @@ public enum HtmlFeature {
ADVERTISEMENT("special:ads"),
CATEGORY_CRAFTS("category:crafts"),
CATEGORY_NSFW("special:nsfw"),
CATEGORY_SPAM("special:spam"),
GA_SPAM("special:gaspam"),
/** For fingerprinting and ranking */

View File

@@ -1,53 +0,0 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.language.model.DocumentLanguageData;
import org.apache.commons.lang3.StringUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores a document against a caller-supplied list of weighted terms.
 * <p>
 * Every entry of the term list is a word followed by a numeric weight, separated
 * by a space. The word is stemmed before it is stored, and lookups are made with
 * the stemmed words of the document.
 */
public class AdHocDetector {
    /** Documents with up to this many words receive no length penalty. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term to weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    /**
     * @param terms entries of the form {@code "<word> <weight>"}
     */
    public AdHocDetector(List<String> terms) {
        PorterStemmer stemmer = new PorterStemmer();

        for (String entry : terms) {
            String[] fields = StringUtils.split(entry, ' ');
            String stem = stemmer.stemWord(fields[0]);
            double weight = Double.parseDouble(fields[1]);

            termValues.put(stem, weight);
        }
    }

    /**
     * Computes the weighted term score of the document.
     * <p>
     * The first occurrence of a known term contributes its weight; each further
     * occurrence halves the score accumulated for that term so far and adds the
     * weight again. The total is scaled down for documents that are longer than
     * {@link #AVG_LENGTH} words.
     *
     * @param dld the document to score
     * @return the length-adjusted score, or 0 when the document has no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> scoreByTerm = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld) {
            for (var stem : sentence.stemmedWords) {
                wordCount++;

                final Double weight = termValues.get(stem);
                if (weight == null) {
                    continue;
                }

                Double soFar = scoreByTerm.get(stem);
                scoreByTerm.put(stem, soFar == null ? weight : 0.5 * soFar + weight);
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        double rawScore = scoreByTerm.values().stream()
                .mapToDouble(Double::doubleValue)
                .sum();
        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount));

        return rawScore * lengthPenalty;
    }
}

View File

@@ -0,0 +1,123 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Heuristic spam detector for escort advertisements.
 * <p>
 * The word "escort" is ambiguous: besides call girls it may refer to warships
 * or to the Ford Escort. The detector therefore weighs suggestive vocabulary
 * against naval and automotive vocabulary before flagging a document.
 */
public class EscortSpamDetector {
    private static final Logger logger = LoggerFactory.getLogger(EscortSpamDetector.class);
    private static final Marker marker = MarkerFactory.getMarker("FILTER");

    private final PorterStemmer ps = new PorterStemmer();

    /** Stemmed word to weight, one table per topic. */
    private final Map<String, Double> sexyValues = new HashMap<>();
    private final Map<String, Double> escortValues = new HashMap<>();
    private final Map<String, Double> navyValues = new HashMap<>();
    private final Map<String, Double> carValues = new HashMap<>();

    /** Pairs of stemmed words that count as an escort phrase when adjacent. */
    private final List<String[]> callgirlPhrases = new ArrayList<>();

    @Inject
    public EscortSpamDetector() {
        // Suggestive vocabulary
        register(sexyValues, "sexy", 0.5);
        register(sexyValues, "hot", 0.1);
        register(sexyValues, "girl", 0.3);
        register(sexyValues, "massage", 0.3);
        register(sexyValues, "adult", 0.3);
        register(sexyValues, "companion", 0.3);
        register(sexyValues, "date", 0.1);
        register(sexyValues, "callgirl", 0.5); // also counted in escortValues below

        // Words that establish the escort topic at all
        register(escortValues, "escort", 0.3);
        register(escortValues, "callgirl", 1.0);

        // Escorts of the naval kind
        register(navyValues, "navy", 0.1);
        register(navyValues, "fleet", 0.2);
        register(navyValues, "maritime", 0.3);
        register(navyValues, "warship", 0.5);
        register(navyValues, "cruiser", 0.5);
        register(navyValues, "carrier", 0.3);
        register(navyValues, "destroyer", 0.3);

        // Escorts of the four-wheeled kind
        register(carValues, "ford", 0.3);
        register(carValues, "vehicle", 0.3);
        register(carValues, "sedan", 0.3);
        register(carValues, "hatchback", 0.3);
        register(carValues, "transmission", 0.3);
        register(carValues, "exhaust", 0.3);
        register(carValues, "fuel", 0.3);

        addCallgirlPhrase("call", "girl");
        addCallgirlPhrase("escort", "service");
        addCallgirlPhrase("escort", "agency");
    }

    /** Stores the weight under the stemmed form of the word. */
    private void register(Map<String, Double> map, String word, double value) {
        map.put(ps.stemWord(word), value);
    }

    /** Records a two-word phrase, stemming both words. */
    private void addCallgirlPhrase(String word1, String word2) {
        String[] stemmedPair = { ps.stemWord(word1), ps.stemWord(word2) };
        callgirlPhrases.add(stemmedPair);
    }

    /**
     * Decides whether the document looks like escort spam.
     * <p>
     * A document is flagged only when its escort score reaches 1.5 and its
     * suggestive score exceeds the combined naval and automotive scores by more
     * than 1.5. Phrase matching does not cross sentence boundaries.
     *
     * @param dld the document to examine
     * @param url the document's address, used only in the log message
     * @return true when the document is classified as escort spam
     */
    public boolean test(DocumentLanguageData dld, EdgeUrl url) {
        double sexyScore = 0.0;
        double escortScore = 0.0;
        double navyScore = 0.0;
        double carScore = 0.0;
        int wordCount = 0;

        for (var sentence : dld) {
            String previous = "";

            for (var current : sentence.stemmedWords) {
                wordCount++;

                sexyScore += sexyValues.getOrDefault(current, 0.0);
                escortScore += escortValues.getOrDefault(current, 0.0);
                navyScore += navyValues.getOrDefault(current, 0.0);
                carScore += carValues.getOrDefault(current, 0.0);

                for (String[] pair : callgirlPhrases) {
                    boolean phraseHit = previous.equals(pair[0]) && current.equals(pair[1]);
                    if (!phraseHit) {
                        continue;
                    }
                    escortScore += 0.5;
                    sexyScore += 0.5;
                }

                previous = current;
            }
        }

        if (wordCount == 0) {
            return false;
        }
        if (escortScore < 1.5) {
            return false;
        }

        boolean flagged = sexyScore > navyScore + carScore + 1.5;
        if (!flagged) {
            return false;
        }

        logger.info(marker, "Escort spam identified in {}", url);
        return true;
    }
}

View File

@@ -1,159 +0,0 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.HashMap;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores how strongly a document's vocabulary points at textile crafts such as
 * knitting, crochet, sewing and weaving.
 * <p>
 * Craft terms carry positive weights, while shop-flavoured words such as "cart"
 * and "price" carry negative ones.
 */
public class TextileCraftDetector {
    /** Documents with up to this many words receive no length penalty. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term to weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    @Inject
    public TextileCraftDetector() {
        PorterStemmer stemmer = new PorterStemmer();

        // Terms are registered in a fixed order; a later entry would replace an
        // earlier one if two words happened to share a stem.
        weigh(stemmer, -0.1, "shop", "newsletter", "cart");
        weigh(stemmer, -0.025, "item");
        weigh(stemmer, -0.1, "price", "book", "order", "exhibition");

        weigh(stemmer, 0.05, "knit", "stitch", "yarn", "crochet");
        weigh(stemmer, 0.15, "ravelry");

        weigh(stemmer, 0.075,
                "stockinette", "purl", "ksp", "kwise", "k2tog", "k1b", "psso",
                "p2sso", "pwise", "yrn", "yon", "entrelac", "thrum");
        weigh(stemmer, 0.025, "bobbin");

        weigh(stemmer, 0.075, "boucle", "lopi");
        weigh(stemmer, 0.01, "eyelash");
        weigh(stemmer, 0.075, "variegated");

        weigh(stemmer, 0.04, "serge");
        weigh(stemmer, 0.075, "selvage", "topstitch");

        weigh(stemmer, 0.01,
                "gauge", "design", "pattern", "layer", "color", "colour", "chart",
                "grid", "wool", "acrylic", "loose", "loop", "needle", "row",
                "circular", "sew", "size", "repeat", "repetition", "basketweave",
                "weave", "loom", "warp", "weft", "shuttle", "brioche", "spool",
                "hem", "bodice", "seam", "allowance", "crinoline", "petticoat",
                "armscye", "baste", "cord", "darning", "draping", "embroider",
                "eyelet", "godet", "gore", "grain", "jersey", "lining", "muslin",
                "needlework", "pleat", "quilt", "silk", "sloper", "surplice",
                "thread", "twill",
                "ch", "sp", "sl", "sc", "ss", "hdc", "turn", "skip", "round",
                "ring", "sequin", "bobble", "puff", "v-stitch");
    }

    /** Stores the weight under the stemmed form of each word, in the given order. */
    private void weigh(PorterStemmer stemmer, double weight, String... words) {
        for (String word : words) {
            termValues.put(stemmer.stemWord(word), weight);
        }
    }

    /**
     * Computes the weighted term score of the document.
     * <p>
     * The first occurrence of a known term contributes its weight; each further
     * occurrence halves the score accumulated for that term so far and adds the
     * weight again. The total is scaled down for documents that are longer than
     * {@link #AVG_LENGTH} words.
     *
     * @param dld the document to score
     * @return the length-adjusted score, or 0 when the document has no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> scoreByTerm = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld) {
            for (var stem : sentence.stemmedWords) {
                wordCount++;

                final Double weight = termValues.get(stem);
                if (weight == null) {
                    continue;
                }

                Double soFar = scoreByTerm.get(stem);
                scoreByTerm.put(stem, soFar == null ? weight : 0.5 * soFar + weight);
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        double rawScore = scoreByTerm.values().stream()
                .mapToDouble(Double::doubleValue)
                .sum();
        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount));

        return rawScore * lengthPenalty;
    }
}

View File

@@ -1,135 +0,0 @@
package nu.marginalia.converting.processor.classifier.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.HashMap;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Scores how strongly a document's vocabulary points at woodworking and joinery.
 * <p>
 * Craft terms carry positive weights, while shop-flavoured words such as "cart"
 * and "price" carry negative ones.
 */
public class WoodworkingDetector {
    /** Documents with up to this many words receive no length penalty. */
    private static final int AVG_LENGTH = 1000;

    /** Stemmed term to weight. */
    private final Map<String, Double> termValues = new HashMap<>();

    @Inject
    public WoodworkingDetector() {
        PorterStemmer stemmer = new PorterStemmer();

        // Terms are registered in a fixed order; a later entry would replace an
        // earlier one if two words happened to share a stem.
        weigh(stemmer, -0.1, "shop", "newsletter", "cart");
        weigh(stemmer, -0.025, "item");
        weigh(stemmer, -0.1, "price", "book", "order", "exhibition");

        // woodworking and joinery
        weigh(stemmer, 0.01,
                "apse", "baluster", "beam", "cornice", "drill", "nail", "saw",
                "hacksaw", "bandsaw", "whipsaw", "gimlet", "clamp", "glue", "cut",
                "plane", "sand", "bevel", "chamfer");
        weigh(stemmer, 0.075, "dado");
        weigh(stemmer, 0.05, "dowel", "dovetail");
        weigh(stemmer, 0.01,
                "joint", "level", "edge", "face", "fibreboard", "fiberboard",
                "battens", "furring");
        weigh(stemmer, 0.025, "glulam", "hardboard");
        weigh(stemmer, 0.01, "hardwood");
        weigh(stemmer, 0.015, "jamb");
        weigh(stemmer, 0.025, "kerf", "lvl");
        weigh(stemmer, 0.01, "laminated", "lignin", "mitre");
        weigh(stemmer, 0.015, "mortise");
        // "sapwood" used to be listed twice with the same weight; the second
        // registration only overwrote the first with an identical value.
        weigh(stemmer, 0.01,
                "mullion", "newel", "nogging", "ogee", "ogive", "ovolo",
                "drawknife", "plywood", "purlin", "riser", "sapwood", "shingle",
                "softwood", "stave", "stopper",
                "stud", // beep beep beep, huh, the stud detector seems to work just well :D
                "transom");
        weigh(stemmer, 0.015, "v-joint");
        weigh(stemmer, 0.01, "veneer");
        weigh(stemmer, 0.015, "quartersaw");
        weigh(stemmer, 0.01, "screw", "woodturning");
        weigh(stemmer, 0.005, "pine");
        weigh(stemmer, 0.01, "balsa");
        weigh(stemmer, 0.005, "poplar");
        weigh(stemmer, 0.01, "nut", "bolt", "tack", "hinge", "brass", "fitting");
        weigh(stemmer, 0.015, "diy");
        weigh(stemmer, 0.01, "dozuki");
    }

    /** Stores the weight under the stemmed form of each word, in the given order. */
    private void weigh(PorterStemmer stemmer, double weight, String... words) {
        for (String word : words) {
            termValues.put(stemmer.stemWord(word), weight);
        }
    }

    /**
     * Computes the weighted term score of the document.
     * <p>
     * The first occurrence of a known term contributes its weight; each further
     * occurrence halves the score accumulated for that term so far and adds the
     * weight again. The total is scaled down for documents that are longer than
     * {@link #AVG_LENGTH} words.
     *
     * @param dld the document to score
     * @return the length-adjusted score, or 0 when the document has no words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> scoreByTerm = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld) {
            for (var stem : sentence.stemmedWords) {
                wordCount++;

                final Double weight = termValues.get(stem);
                if (weight != null) {
                    // (old, added) -> half of what was accumulated plus the weight
                    scoreByTerm.merge(stem, weight, (old, added) -> 0.5 * old + added);
                }
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        // Unbox directly; the stream-based sum is kept so the summation
        // algorithm stays the one the JDK chooses for DoubleStream.
        double rawScore = scoreByTerm.values().stream()
                .mapToDouble(Double::doubleValue)
                .sum();
        double lengthPenalty = sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount));

        return rawScore * lengthPenalty;
    }
}

View File

@@ -5,9 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.EscortSpamDetector;
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
@@ -67,21 +66,19 @@ public class FeatureExtractor {
private final AdblockSimulator adblockSimulator;
private final RecipeDetector recipeDetector;
private final TextileCraftDetector textileCraftDetector;
private final WoodworkingDetector woodworkingDetector;
private final EscortSpamDetector escortSpamDetector;
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
/**
 * Creates the extractor from its injected collaborators, storing each argument
 * in the field of the same name.
 * <p>
 * NOTE(review): this span appears to come from a diff hunk with its +/- markers
 * stripped, so the parameter list may mix removed and added detectors; confirm
 * against the actual file before relying on this signature.
 */
@Inject
public FeatureExtractor(AdblockSimulator adblockSimulator,
RecipeDetector recipeDetector,
TextileCraftDetector textileCraftDetector,
WoodworkingDetector woodworkingDetector,
EscortSpamDetector escortSpamDetector,
GoogleAnwersSpamDetector googleAnwersSpamDetector)
{
this.adblockSimulator = adblockSimulator;
this.recipeDetector = recipeDetector;
this.textileCraftDetector = textileCraftDetector;
this.woodworkingDetector = woodworkingDetector;
this.escortSpamDetector = escortSpamDetector;
this.googleAnwersSpamDetector = googleAnwersSpamDetector;
}
@@ -343,9 +340,11 @@ public class FeatureExtractor {
if (recipeDetector.testP(dld) > 0.5)
features.add(HtmlFeature.CATEGORY_FOOD);
// these should be mutually exclusive
else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
features.add(HtmlFeature.CATEGORY_CRAFTS);
if (escortSpamDetector.test(dld, url)) {
features.add(HtmlFeature.CATEGORY_SPAM);
features.add(HtmlFeature.CATEGORY_NSFW);
}
return features;
}