mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
2 Commits
deploy-019
...
spam-filte
Author | SHA1 | Date | |
---|---|---|---|
|
612f076400 | ||
|
2e87614768 |
@@ -26,6 +26,9 @@ public enum HtmlFeature {
|
||||
ADVERTISEMENT("special:ads"),
|
||||
CATEGORY_CRAFTS("category:crafts"),
|
||||
|
||||
CATEGORY_NSFW("special:nsfw"),
|
||||
CATEGORY_SPAM("special:spam"),
|
||||
|
||||
GA_SPAM("special:gaspam"),
|
||||
|
||||
/** For fingerprinting and ranking */
|
||||
|
@@ -1,53 +0,0 @@
|
||||
package nu.marginalia.converting.processor.classifier.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.sqrt;
|
||||
|
||||
public class AdHocDetector {
|
||||
private static final int AVG_LENGTH = 1000;
|
||||
|
||||
private final Map<String, Double> termValues = new HashMap<>();
|
||||
|
||||
public AdHocDetector(List<String> terms) {
|
||||
PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
for (String term : terms) {
|
||||
String[] parts = StringUtils.split(term, ' ');
|
||||
termValues.put(ps.stemWord(parts[0]), Double.parseDouble(parts[1]));
|
||||
}
|
||||
}
|
||||
|
||||
public double testP(DocumentLanguageData dld) {
|
||||
|
||||
Map<String, Double> values = new HashMap<>();
|
||||
int count = 0;
|
||||
for (var sentence : dld) {
|
||||
|
||||
for (var stemmed : sentence.stemmedWords) {
|
||||
count++;
|
||||
|
||||
final Double value = termValues.get(stemmed);
|
||||
|
||||
if (value != null) {
|
||||
values.merge(stemmed, value, (a,b) -> 0.5*a + b);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (count == 0) return 0.;
|
||||
|
||||
double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));
|
||||
|
||||
return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,123 @@
|
||||
package nu.marginalia.converting.processor.classifier.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/** Basic spam detector for escort ads.
|
||||
* <p></p>
|
||||
* Tries to differentiate between escorts (callgirls) and escorts (warships)
|
||||
* and the ford escort.
|
||||
*/
|
||||
public class EscortSpamDetector {
|
||||
|
||||
private final Map<String, Double> sexyValues = new HashMap<>();
|
||||
private final Map<String, Double> escortValues = new HashMap<>();
|
||||
private final Map<String, Double> navyValues = new HashMap<>();
|
||||
private final Map<String, Double> carValues = new HashMap<>();
|
||||
private final List<String[]> callgirlPhrases = new ArrayList<>();
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(EscortSpamDetector.class);
|
||||
private static final Marker marker = MarkerFactory.getMarker("FILTER");
|
||||
|
||||
private final PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
@Inject
|
||||
public EscortSpamDetector() {
|
||||
|
||||
register(sexyValues, "sexy", 0.5);
|
||||
register(sexyValues, "hot", 0.1);
|
||||
register(sexyValues, "girl", 0.3);
|
||||
register(sexyValues, "massage", 0.3);
|
||||
register(sexyValues, "adult", 0.3);
|
||||
register(sexyValues, "companion", 0.3);
|
||||
register(sexyValues, "date", 0.1);
|
||||
register(sexyValues, "callgirl", 0.5); // Note callgirl will raise escortValues too
|
||||
|
||||
register(escortValues, "escort", 0.3);
|
||||
register(escortValues, "callgirl", 1);
|
||||
|
||||
register(navyValues, "navy", 0.1);
|
||||
register(navyValues, "fleet", 0.2);
|
||||
register(navyValues, "maritime", 0.3);
|
||||
register(navyValues, "warship", 0.5);
|
||||
register(navyValues, "cruiser", 0.5);
|
||||
register(navyValues, "carrier", 0.3);
|
||||
register(navyValues, "destroyer", 0.3);
|
||||
|
||||
register(carValues, "ford", 0.3);
|
||||
register(carValues, "vehicle", 0.3);
|
||||
register(carValues, "sedan", 0.3);
|
||||
register(carValues, "hatchback", 0.3);
|
||||
register(carValues, "transmission", 0.3);
|
||||
register(carValues, "exhaust", 0.3);
|
||||
register(carValues, "fuel", 0.3);
|
||||
|
||||
addCallgirlPhrase("call", "girl");
|
||||
addCallgirlPhrase("escort", "service");
|
||||
addCallgirlPhrase("escort", "agency");
|
||||
}
|
||||
|
||||
private void register(Map<String, Double> map, String word, double value) {
|
||||
String stemmed = ps.stemWord(word);
|
||||
map.put(stemmed, value);
|
||||
}
|
||||
|
||||
private void addCallgirlPhrase(String word1, String word2) {
|
||||
String stemmed1 = ps.stemWord(word1);
|
||||
String stemmed2 = ps.stemWord(word2);
|
||||
callgirlPhrases.add(new String[] { stemmed1, stemmed2 });
|
||||
}
|
||||
|
||||
public boolean test(DocumentLanguageData dld, EdgeUrl url) {
|
||||
|
||||
double sexyP = 0.0;
|
||||
double escortP = 0.0;
|
||||
double navyP = 0.0;
|
||||
double carP = 0.0;
|
||||
|
||||
int count = 0;
|
||||
|
||||
for (var sentence : dld) {
|
||||
|
||||
String prev = "";
|
||||
for (var stemmed : sentence.stemmedWords) {
|
||||
count++;
|
||||
|
||||
sexyP += sexyValues.getOrDefault(stemmed, 0.0);
|
||||
escortP += escortValues.getOrDefault(stemmed, 0.0);
|
||||
navyP += navyValues.getOrDefault(stemmed, 0.0);
|
||||
carP += carValues.getOrDefault(stemmed, 0.0);
|
||||
|
||||
for (var phrase : callgirlPhrases) {
|
||||
if (prev.equals(phrase[0]) && stemmed.equals(phrase[1])) {
|
||||
escortP += 0.5;
|
||||
sexyP += 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
prev = stemmed;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (count == 0 || escortP < 1.5) return false;
|
||||
|
||||
boolean is = sexyP > navyP + carP + 1.5;
|
||||
if (is) {
|
||||
logger.info(marker, "Escort spam identified in {}", url);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
|
||||
}
|
@@ -1,159 +0,0 @@
|
||||
package nu.marginalia.converting.processor.classifier.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.sqrt;
|
||||
|
||||
public class TextileCraftDetector {
|
||||
private static final int AVG_LENGTH = 1000;
|
||||
|
||||
private final Map<String, Double> termValues = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public TextileCraftDetector() {
|
||||
PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
termValues.put(ps.stemWord("shop"), -0.1);
|
||||
termValues.put(ps.stemWord("newsletter"), -0.1);
|
||||
termValues.put(ps.stemWord("cart"), -0.1);
|
||||
termValues.put(ps.stemWord("item"), -0.025);
|
||||
termValues.put(ps.stemWord("price"), -0.1);
|
||||
termValues.put(ps.stemWord("book"), -0.1);
|
||||
termValues.put(ps.stemWord("order"), -0.1);
|
||||
termValues.put(ps.stemWord("exhibition"), -0.1);
|
||||
|
||||
termValues.put(ps.stemWord("knit"), 0.05);
|
||||
termValues.put(ps.stemWord("stitch"), 0.05);
|
||||
termValues.put(ps.stemWord("yarn"), 0.05);
|
||||
termValues.put(ps.stemWord("crochet"), 0.05);
|
||||
termValues.put(ps.stemWord("ravelry"), 0.15);
|
||||
|
||||
termValues.put(ps.stemWord("stockinette"), 0.075);
|
||||
termValues.put(ps.stemWord("purl"), 0.075);
|
||||
termValues.put(ps.stemWord("ksp"), 0.075);
|
||||
termValues.put(ps.stemWord("kwise"), 0.075);
|
||||
termValues.put(ps.stemWord("k2tog"), 0.075);
|
||||
termValues.put(ps.stemWord("k1b"), 0.075);
|
||||
termValues.put(ps.stemWord("psso"), 0.075);
|
||||
termValues.put(ps.stemWord("p2sso"), 0.075);
|
||||
termValues.put(ps.stemWord("pwise"), 0.075);
|
||||
termValues.put(ps.stemWord("yrn"), 0.075);
|
||||
termValues.put(ps.stemWord("yon"), 0.075);
|
||||
termValues.put(ps.stemWord("entrelac"), 0.075);
|
||||
termValues.put(ps.stemWord("thrum"), 0.075);
|
||||
termValues.put(ps.stemWord("bobbin"), 0.025);
|
||||
|
||||
termValues.put(ps.stemWord("boucle"), 0.075);
|
||||
termValues.put(ps.stemWord("lopi"), 0.075);
|
||||
termValues.put(ps.stemWord("eyelash"), 0.01);
|
||||
termValues.put(ps.stemWord("variegated"), 0.075);
|
||||
|
||||
termValues.put(ps.stemWord("serge"), 0.04);
|
||||
termValues.put(ps.stemWord("selvage"), 0.075);
|
||||
termValues.put(ps.stemWord("topstitch"), 0.075);
|
||||
|
||||
termValues.put(ps.stemWord("gauge"), 0.01);
|
||||
termValues.put(ps.stemWord("design"), 0.01);
|
||||
termValues.put(ps.stemWord("pattern"), 0.01);
|
||||
termValues.put(ps.stemWord("layer"), 0.01);
|
||||
termValues.put(ps.stemWord("color"), 0.01);
|
||||
termValues.put(ps.stemWord("colour"), 0.01);
|
||||
termValues.put(ps.stemWord("chart"), 0.01);
|
||||
termValues.put(ps.stemWord("grid"), 0.01);
|
||||
termValues.put(ps.stemWord("wool"), 0.01);
|
||||
termValues.put(ps.stemWord("acrylic"), 0.01);
|
||||
termValues.put(ps.stemWord("loose"), 0.01);
|
||||
termValues.put(ps.stemWord("loop"), 0.01);
|
||||
termValues.put(ps.stemWord("needle"), 0.01);
|
||||
termValues.put(ps.stemWord("row"), 0.01);
|
||||
termValues.put(ps.stemWord("circular"), 0.01);
|
||||
termValues.put(ps.stemWord("sew"), 0.01);
|
||||
termValues.put(ps.stemWord("size"), 0.01);
|
||||
termValues.put(ps.stemWord("repeat"), 0.01);
|
||||
termValues.put(ps.stemWord("repetition"), 0.01);
|
||||
termValues.put(ps.stemWord("basketweave"), 0.01);
|
||||
termValues.put(ps.stemWord("weave"), 0.01);
|
||||
termValues.put(ps.stemWord("loom"), 0.01);
|
||||
termValues.put(ps.stemWord("warp"), 0.01);
|
||||
termValues.put(ps.stemWord("weft"), 0.01);
|
||||
termValues.put(ps.stemWord("shuttle"), 0.01);
|
||||
termValues.put(ps.stemWord("brioche"), 0.01);
|
||||
termValues.put(ps.stemWord("spool"), 0.01);
|
||||
termValues.put(ps.stemWord("hem"), 0.01);
|
||||
termValues.put(ps.stemWord("bodice"), 0.01);
|
||||
termValues.put(ps.stemWord("seam"), 0.01);
|
||||
termValues.put(ps.stemWord("allowance"), 0.01);
|
||||
termValues.put(ps.stemWord("crinoline"), 0.01);
|
||||
termValues.put(ps.stemWord("petticoat"), 0.01);
|
||||
termValues.put(ps.stemWord("armscye"), 0.01);
|
||||
termValues.put(ps.stemWord("baste"), 0.01);
|
||||
termValues.put(ps.stemWord("cord"), 0.01);
|
||||
termValues.put(ps.stemWord("darning"), 0.01);
|
||||
termValues.put(ps.stemWord("draping"), 0.01);
|
||||
termValues.put(ps.stemWord("embroider"), 0.01);
|
||||
termValues.put(ps.stemWord("eyelet"), 0.01);
|
||||
termValues.put(ps.stemWord("godet"), 0.01);
|
||||
termValues.put(ps.stemWord("gore"), 0.01);
|
||||
termValues.put(ps.stemWord("grain"), 0.01);
|
||||
termValues.put(ps.stemWord("jersey"), 0.01);
|
||||
termValues.put(ps.stemWord("lining"), 0.01);
|
||||
termValues.put(ps.stemWord("muslin"), 0.01);
|
||||
termValues.put(ps.stemWord("needlework"), 0.01);
|
||||
termValues.put(ps.stemWord("pleat"), 0.01);
|
||||
termValues.put(ps.stemWord("quilt"), 0.01);
|
||||
termValues.put(ps.stemWord("silk"), 0.01);
|
||||
|
||||
termValues.put(ps.stemWord("sloper"), 0.01);
|
||||
termValues.put(ps.stemWord("surplice"), 0.01);
|
||||
termValues.put(ps.stemWord("thread"), 0.01);
|
||||
termValues.put(ps.stemWord("twill"), 0.01);
|
||||
|
||||
termValues.put(ps.stemWord("ch"), 0.01);
|
||||
termValues.put(ps.stemWord("sp"), 0.01);
|
||||
termValues.put(ps.stemWord("sl"), 0.01);
|
||||
termValues.put(ps.stemWord("sc"), 0.01);
|
||||
termValues.put(ps.stemWord("ss"), 0.01);
|
||||
termValues.put(ps.stemWord("hdc"), 0.01);
|
||||
termValues.put(ps.stemWord("turn"), 0.01);
|
||||
termValues.put(ps.stemWord("skip"), 0.01);
|
||||
termValues.put(ps.stemWord("round"), 0.01);
|
||||
termValues.put(ps.stemWord("ring"), 0.01);
|
||||
|
||||
termValues.put(ps.stemWord("sequin"), 0.01);
|
||||
termValues.put(ps.stemWord("bobble"), 0.01);
|
||||
termValues.put(ps.stemWord("puff"), 0.01);
|
||||
termValues.put(ps.stemWord("v-stitch"), 0.01);
|
||||
}
|
||||
|
||||
public double testP(DocumentLanguageData dld) {
|
||||
|
||||
Map<String, Double> values = new HashMap<>();
|
||||
int count = 0;
|
||||
for (var sentence : dld) {
|
||||
|
||||
for (var stemmed : sentence.stemmedWords) {
|
||||
count++;
|
||||
|
||||
final Double value = termValues.get(stemmed);
|
||||
|
||||
if (value != null) {
|
||||
values.merge(stemmed, value, (a,b) -> 0.5*a + b);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (count == 0) return 0.;
|
||||
|
||||
double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));
|
||||
|
||||
return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
|
||||
}
|
||||
|
||||
}
|
@@ -1,135 +0,0 @@
|
||||
package nu.marginalia.converting.processor.classifier.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.sqrt;
|
||||
|
||||
public class WoodworkingDetector {
|
||||
private static final int AVG_LENGTH = 1000;
|
||||
|
||||
private final Map<String, Double> termValues = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public WoodworkingDetector() {
|
||||
PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
termValues.put(ps.stemWord("shop"), -0.1);
|
||||
termValues.put(ps.stemWord("newsletter"), -0.1);
|
||||
termValues.put(ps.stemWord("cart"), -0.1);
|
||||
termValues.put(ps.stemWord("item"), -0.025);
|
||||
termValues.put(ps.stemWord("price"), -0.1);
|
||||
termValues.put(ps.stemWord("book"), -0.1);
|
||||
termValues.put(ps.stemWord("order"), -0.1);
|
||||
termValues.put(ps.stemWord("exhibition"), -0.1);
|
||||
|
||||
// woodworking and joinery
|
||||
termValues.put(ps.stemWord("apse"), 0.01);
|
||||
termValues.put(ps.stemWord("baluster"), 0.01);
|
||||
termValues.put(ps.stemWord("beam"), 0.01);
|
||||
termValues.put(ps.stemWord("cornice"), 0.01);
|
||||
termValues.put(ps.stemWord("drill"), 0.01);
|
||||
termValues.put(ps.stemWord("nail"), 0.01);
|
||||
termValues.put(ps.stemWord("saw"), 0.01);
|
||||
termValues.put(ps.stemWord("hacksaw"), 0.01);
|
||||
termValues.put(ps.stemWord("bandsaw"), 0.01);
|
||||
termValues.put(ps.stemWord("whipsaw"), 0.01);
|
||||
termValues.put(ps.stemWord("gimlet"), 0.01);
|
||||
termValues.put(ps.stemWord("clamp"), 0.01);
|
||||
termValues.put(ps.stemWord("glue"), 0.01);
|
||||
termValues.put(ps.stemWord("cut"), 0.01);
|
||||
termValues.put(ps.stemWord("plane"), 0.01);
|
||||
termValues.put(ps.stemWord("sand"), 0.01);
|
||||
termValues.put(ps.stemWord("bevel"), 0.01);
|
||||
termValues.put(ps.stemWord("chamfer"), 0.01);
|
||||
termValues.put(ps.stemWord("dado"), 0.075);
|
||||
termValues.put(ps.stemWord("dowel"), 0.05);
|
||||
termValues.put(ps.stemWord("dovetail"), 0.05);
|
||||
termValues.put(ps.stemWord("joint"), 0.01);
|
||||
termValues.put(ps.stemWord("level"), 0.01);
|
||||
termValues.put(ps.stemWord("edge"), 0.01);
|
||||
termValues.put(ps.stemWord("face"), 0.01);
|
||||
termValues.put(ps.stemWord("fibreboard"), 0.01);
|
||||
termValues.put(ps.stemWord("fiberboard"), 0.01);
|
||||
termValues.put(ps.stemWord("battens"), 0.01);
|
||||
termValues.put(ps.stemWord("furring"), 0.01);
|
||||
termValues.put(ps.stemWord("glulam"), 0.025);
|
||||
termValues.put(ps.stemWord("hardboard"), 0.025);
|
||||
termValues.put(ps.stemWord("hardwood"), 0.01);
|
||||
termValues.put(ps.stemWord("jamb"), 0.015);
|
||||
termValues.put(ps.stemWord("kerf"), 0.025);
|
||||
termValues.put(ps.stemWord("lvl"), 0.025);
|
||||
termValues.put(ps.stemWord("laminated"), 0.01);
|
||||
termValues.put(ps.stemWord("lignin"), 0.01);
|
||||
termValues.put(ps.stemWord("mitre"), 0.01);
|
||||
termValues.put(ps.stemWord("mortise"), 0.015);
|
||||
termValues.put(ps.stemWord("mullion"), 0.01);
|
||||
termValues.put(ps.stemWord("newel"), 0.01);
|
||||
termValues.put(ps.stemWord("nogging"), 0.01);
|
||||
termValues.put(ps.stemWord("ogee"), 0.01);
|
||||
termValues.put(ps.stemWord("ogive"), 0.01);
|
||||
termValues.put(ps.stemWord("ovolo"), 0.01);
|
||||
termValues.put(ps.stemWord("drawknife"), 0.01);
|
||||
termValues.put(ps.stemWord("plywood"), 0.01);
|
||||
termValues.put(ps.stemWord("purlin"), 0.01);
|
||||
termValues.put(ps.stemWord("riser"), 0.01);
|
||||
termValues.put(ps.stemWord("sapwood"), 0.01);
|
||||
termValues.put(ps.stemWord("shingle"), 0.01);
|
||||
termValues.put(ps.stemWord("softwood"), 0.01);
|
||||
termValues.put(ps.stemWord("sapwood"), 0.01);
|
||||
termValues.put(ps.stemWord("stave"), 0.01);
|
||||
termValues.put(ps.stemWord("stopper"), 0.01);
|
||||
termValues.put(ps.stemWord("stud"), 0.01); // beep beep beep, huh, the stud detector seems to work just well :D
|
||||
termValues.put(ps.stemWord("transom"), 0.01);
|
||||
termValues.put(ps.stemWord("v-joint"), 0.015);
|
||||
termValues.put(ps.stemWord("veneer"), 0.01);
|
||||
termValues.put(ps.stemWord("quartersaw"), 0.015);
|
||||
termValues.put(ps.stemWord("screw"), 0.01);
|
||||
termValues.put(ps.stemWord("woodturning"), 0.01);
|
||||
|
||||
termValues.put(ps.stemWord("pine"), 0.005);
|
||||
termValues.put(ps.stemWord("balsa"), 0.01);
|
||||
termValues.put(ps.stemWord("poplar"), 0.005);
|
||||
|
||||
termValues.put(ps.stemWord("nut"), 0.01);
|
||||
termValues.put(ps.stemWord("bolt"), 0.01);
|
||||
termValues.put(ps.stemWord("tack"), 0.01);
|
||||
termValues.put(ps.stemWord("hinge"), 0.01);
|
||||
termValues.put(ps.stemWord("brass"), 0.01);
|
||||
termValues.put(ps.stemWord("fitting"), 0.01);
|
||||
|
||||
termValues.put(ps.stemWord("diy"), 0.015);
|
||||
termValues.put(ps.stemWord("dozuki"), 0.01);
|
||||
}
|
||||
|
||||
public double testP(DocumentLanguageData dld) {
|
||||
|
||||
Map<String, Double> values = new HashMap<>();
|
||||
int count = 0;
|
||||
for (var sentence : dld) {
|
||||
|
||||
for (var stemmed : sentence.stemmedWords) {
|
||||
count++;
|
||||
|
||||
final Double value = termValues.get(stemmed);
|
||||
|
||||
if (value != null) {
|
||||
values.merge(stemmed, value, (a,b) -> 0.5*a + b);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (count == 0) return 0.;
|
||||
|
||||
double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));
|
||||
|
||||
return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
|
||||
}
|
||||
|
||||
}
|
@@ -5,9 +5,8 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
|
||||
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
|
||||
import nu.marginalia.converting.processor.classifier.topic.EscortSpamDetector;
|
||||
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
|
||||
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
|
||||
import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
@@ -67,21 +66,19 @@ public class FeatureExtractor {
|
||||
|
||||
private final AdblockSimulator adblockSimulator;
|
||||
private final RecipeDetector recipeDetector;
|
||||
private final TextileCraftDetector textileCraftDetector;
|
||||
private final WoodworkingDetector woodworkingDetector;
|
||||
private final EscortSpamDetector escortSpamDetector;
|
||||
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
||||
|
||||
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator,
|
||||
RecipeDetector recipeDetector,
|
||||
TextileCraftDetector textileCraftDetector,
|
||||
WoodworkingDetector woodworkingDetector,
|
||||
EscortSpamDetector escortSpamDetector,
|
||||
GoogleAnwersSpamDetector googleAnwersSpamDetector)
|
||||
{
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
this.recipeDetector = recipeDetector;
|
||||
this.textileCraftDetector = textileCraftDetector;
|
||||
this.woodworkingDetector = woodworkingDetector;
|
||||
this.escortSpamDetector = escortSpamDetector;
|
||||
|
||||
this.googleAnwersSpamDetector = googleAnwersSpamDetector;
|
||||
}
|
||||
|
||||
@@ -343,9 +340,11 @@ public class FeatureExtractor {
|
||||
|
||||
if (recipeDetector.testP(dld) > 0.5)
|
||||
features.add(HtmlFeature.CATEGORY_FOOD);
|
||||
// these should be mutually exclusive
|
||||
else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
|
||||
features.add(HtmlFeature.CATEGORY_CRAFTS);
|
||||
|
||||
if (escortSpamDetector.test(dld, url)) {
|
||||
features.add(HtmlFeature.CATEGORY_SPAM);
|
||||
features.add(HtmlFeature.CATEGORY_NSFW);
|
||||
}
|
||||
|
||||
return features;
|
||||
}
|
||||
|
Reference in New Issue
Block a user