mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
21 Commits
deploy-003
...
deploy-004
Author | SHA1 | Date | |
---|---|---|---|
|
b7f0a2a98e | ||
|
5fb76b2e79 | ||
|
ad8c97f342 | ||
|
dc1b6373eb | ||
|
983d6d067c | ||
|
a84a06975c | ||
|
d2864c13ec | ||
|
03ba53ce51 | ||
|
d4a6684931 | ||
|
6f0485287a | ||
|
59e2dd4c26 | ||
|
ca1807caae | ||
|
26c20e18ac | ||
|
7c90b6b414 | ||
|
b63c54c4ce | ||
|
fecd2f4ec3 | ||
|
39e420de88 | ||
|
dc83619861 | ||
|
87d1c89701 | ||
|
a42a7769e2 | ||
|
202bda884f |
@@ -8,17 +8,18 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
@Singleton
|
||||
public class DbDomainQueries {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
@Inject
|
||||
@@ -101,4 +102,34 @@ public class DbDomainQueries {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) {
|
||||
List<DomainWithNode> ret = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
|
||||
stmt.setString(1, domain.topDomain);
|
||||
stmt.setInt(2, cnt);
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
var sibling = new EdgeDomain(rs.getString(1));
|
||||
|
||||
if (sibling.equals(domain))
|
||||
continue;
|
||||
|
||||
ret.add(new DomainWithNode(sibling, rs.getInt(2)));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to get domain neighbors");
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
|
||||
public boolean isIndexed() {
|
||||
return nodeAffinity > 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -83,6 +83,11 @@ public class QueryParams {
|
||||
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||
return param.startsWith("project=") || param.startsWith("story=");
|
||||
}
|
||||
|
||||
// www.perseus.tufts.edu:
|
||||
if (param.startsWith("collection=")) return true;
|
||||
if (param.startsWith("doc=")) return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@@ -121,6 +121,7 @@ public class IndexProtobufCodec {
|
||||
params.getTcfProximityWeight(),
|
||||
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
|
||||
params.getTemporalBiasWeight(),
|
||||
params.getDisablePenalties(),
|
||||
params.getExportDebugData()
|
||||
);
|
||||
}
|
||||
@@ -146,6 +147,7 @@ public class IndexProtobufCodec {
|
||||
.setTcfProximityWeight(rankingParams.tcfProximity)
|
||||
.setTcfVerbatimWeight(rankingParams.tcfVerbatim)
|
||||
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
|
||||
.setDisablePenalties(rankingParams.disablePenalties)
|
||||
.setExportDebugData(rankingParams.exportDebugData);
|
||||
|
||||
if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) {
|
||||
|
@@ -42,12 +42,14 @@ public class ResultRankingParameters {
|
||||
public double tcfVerbatim;
|
||||
public double tcfProximity;
|
||||
|
||||
|
||||
public TemporalBias temporalBias;
|
||||
public double temporalBiasWeight;
|
||||
|
||||
public boolean disablePenalties;
|
||||
public boolean exportDebugData;
|
||||
|
||||
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
|
||||
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean disablePenalties, boolean exportDebugData) {
|
||||
this.bm25Params = bm25Params;
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
@@ -61,11 +63,11 @@ public class ResultRankingParameters {
|
||||
this.tcfProximity = tcfProximity;
|
||||
this.temporalBias = temporalBias;
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
this.disablePenalties = disablePenalties;
|
||||
this.exportDebugData = exportDebugData;
|
||||
}
|
||||
|
||||
public static ResultRankingParameters sensibleDefaults() {
|
||||
return builder()
|
||||
private static final ResultRankingParameters _sensibleDefaults = builder()
|
||||
.bm25Params(new Bm25Parameters(1.2, 0.5))
|
||||
.shortDocumentThreshold(2000)
|
||||
.shortDocumentPenalty(2.)
|
||||
@@ -80,7 +82,11 @@ public class ResultRankingParameters {
|
||||
.temporalBias(TemporalBias.NONE)
|
||||
.temporalBiasWeight(5.0)
|
||||
.exportDebugData(false)
|
||||
.disablePenalties(false)
|
||||
.build();
|
||||
|
||||
public static ResultRankingParameters sensibleDefaults() {
|
||||
return _sensibleDefaults;
|
||||
}
|
||||
|
||||
public static ResultRankingParametersBuilder builder() {
|
||||
@@ -139,6 +145,8 @@ public class ResultRankingParameters {
|
||||
return this.temporalBiasWeight;
|
||||
}
|
||||
|
||||
/** @return the current value of the {@code disablePenalties} flag */
public boolean isDisablePenalties() {
    return disablePenalties;
}
|
||||
|
||||
public boolean isExportDebugData() {
|
||||
return this.exportDebugData;
|
||||
}
|
||||
@@ -166,6 +174,7 @@ public class ResultRankingParameters {
|
||||
result = 31 * result + Double.hashCode(tcfProximity);
|
||||
result = 31 * result + Objects.hashCode(temporalBias);
|
||||
result = 31 * result + Double.hashCode(temporalBiasWeight);
|
||||
result = 31 * result + Boolean.hashCode(disablePenalties);
|
||||
result = 31 * result + Boolean.hashCode(exportDebugData);
|
||||
return result;
|
||||
}
|
||||
@@ -192,6 +201,7 @@ public class ResultRankingParameters {
|
||||
private double tcfProximity;
|
||||
private TemporalBias temporalBias;
|
||||
private double temporalBiasWeight;
|
||||
private boolean disablePenalties;
|
||||
private boolean exportDebugData;
|
||||
|
||||
ResultRankingParametersBuilder() {
|
||||
@@ -262,17 +272,20 @@ public class ResultRankingParameters {
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public ResultRankingParametersBuilder disablePenalties(boolean disablePenalties) {
|
||||
this.disablePenalties = disablePenalties;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
|
||||
this.exportDebugData = exportDebugData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParameters build() {
|
||||
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
|
||||
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.disablePenalties, this.exportDebugData);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters.ResultRankingParametersBuilder(bm25Params=" + this.bm25Params + ", shortDocumentThreshold=" + this.shortDocumentThreshold + ", shortDocumentPenalty=" + this.shortDocumentPenalty + ", domainRankBonus=" + this.domainRankBonus + ", qualityPenalty=" + this.qualityPenalty + ", shortSentenceThreshold=" + this.shortSentenceThreshold + ", shortSentencePenalty=" + this.shortSentencePenalty + ", bm25Weight=" + this.bm25Weight + ", tcfFirstPosition=" + this.tcfFirstPosition + ", tcfVerbatim=" + this.tcfVerbatim + ", tcfProximity=" + this.tcfProximity + ", temporalBias=" + this.temporalBias + ", temporalBiasWeight=" + this.temporalBiasWeight + ", exportDebugData=" + this.exportDebugData + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -162,6 +162,7 @@ message RpcResultRankingParameters {
|
||||
double temporalBiasWeight = 17;
|
||||
|
||||
bool exportDebugData = 18;
|
||||
bool disablePenalties = 19;
|
||||
|
||||
}
|
||||
|
||||
|
@@ -233,9 +233,19 @@ public class QueryParser {
|
||||
entity.replace(new QueryToken.RankTerm(limit, str));
|
||||
} else if (str.startsWith("qs=")) {
|
||||
entity.replace(new QueryToken.QsTerm(str.substring(3)));
|
||||
} else if (str.contains(":")) {
|
||||
} else if (str.startsWith("site:")
|
||||
|| str.startsWith("format:")
|
||||
|| str.startsWith("file:")
|
||||
|| str.startsWith("tld:")
|
||||
|| str.startsWith("ip:")
|
||||
|| str.startsWith("as:")
|
||||
|| str.startsWith("asn:")
|
||||
|| str.startsWith("generator:")
|
||||
)
|
||||
{
|
||||
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static SpecificationLimit parseSpecificationLimit(String str) {
|
||||
|
@@ -208,6 +208,12 @@ public class QueryFactoryTest {
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCplusPlus() {
|
||||
var subquery = parseAndGetSpecs("std::vector::push_back vector");
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuotedApostrophe() {
|
||||
var subquery = parseAndGetSpecs("\"bob's cars\"");
|
||||
|
@@ -248,6 +248,10 @@ public class IndexResultScoreCalculator {
|
||||
ResultRankingParameters rankingParams,
|
||||
@Nullable DebugRankingFactors debugRankingFactors) {
|
||||
|
||||
if (rankingParams.disablePenalties) {
|
||||
return 0.;
|
||||
}
|
||||
|
||||
int rank = DocumentMetadata.decodeRank(documentMetadata);
|
||||
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
|
||||
int quality = DocumentMetadata.decodeQuality(documentMetadata);
|
||||
|
@@ -152,7 +152,10 @@ public class DocumentPositionMapper {
|
||||
}
|
||||
|
||||
boolean matchesWordPattern(String s) {
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
|
||||
if (s.length() > 48)
|
||||
return false;
|
||||
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,8}
|
||||
|
||||
String wordPartSeparator = ".-_/:+*";
|
||||
|
||||
@@ -169,7 +172,7 @@ public class DocumentPositionMapper {
|
||||
if (i == 0)
|
||||
return false;
|
||||
|
||||
for (int j = 0; j < 5; j++) {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
|
@@ -30,9 +30,11 @@ class DocumentPositionMapperTest {
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
|
||||
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back"));
|
||||
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
|
||||
|
@@ -0,0 +1,113 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class CppreferenceSpecialization extends WikiSpecialization {
|
||||
|
||||
@Inject
|
||||
public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
|
||||
super(summaryExtractor, titleExtractor);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document prune(Document original) {
|
||||
var doc = original.clone();
|
||||
|
||||
doc.getElementsByClass("t-nv").remove();
|
||||
doc.getElementsByClass("toc").remove();
|
||||
doc.getElementsByClass("mw-head").remove();
|
||||
doc.getElementsByClass("printfooter").remove();
|
||||
doc.getElementsByClass("cpp-footer-base").remove();
|
||||
|
||||
doc.title(doc.title() + " " + Strings.join(extractExtraTokens(doc.title()), ' '));
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSummary(Document doc, Set<String> importantWords) {
|
||||
|
||||
Element declTable = doc.getElementsByClass("t-dcl-begin").first();
|
||||
if (declTable != null) {
|
||||
var nextPar = declTable.nextElementSibling();
|
||||
if (nextPar != null) {
|
||||
return nextPar.text();
|
||||
}
|
||||
}
|
||||
|
||||
return super.getSummary(doc, importantWords);
|
||||
}
|
||||
|
||||
|
||||
public List<String> extractExtraTokens(String title) {
|
||||
|
||||
if (!title.contains("::")) {
|
||||
return List.of();
|
||||
}
|
||||
if (!title.contains("-")) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
title = StringUtils.split(title, '-')[0];
|
||||
|
||||
String name = title;
|
||||
for (;;) {
|
||||
int lbidx = name.indexOf('<');
|
||||
int rbidx = name.indexOf('>');
|
||||
|
||||
if (lbidx > 0 && rbidx > lbidx) {
|
||||
String className = name.substring(0, lbidx);
|
||||
String methodName = name.substring(rbidx + 1);
|
||||
name = className + methodName;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
List<String> tokens = new ArrayList<>();
|
||||
|
||||
for (var part : name.split("\\s*,\\s*")) {
|
||||
if (part.endsWith(")") && !part.endsWith("()")) {
|
||||
int parenStart = part.indexOf('(');
|
||||
if (parenStart > 0) { // foo(...) -> foo
|
||||
part = part.substring(0, parenStart);
|
||||
}
|
||||
else if (parenStart == 0) { // (foo) -> foo
|
||||
part = part.substring(1, part.length() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
part = part.trim();
|
||||
if (part.contains("::")) {
|
||||
tokens.add(part);
|
||||
if (part.startsWith("std::")) {
|
||||
tokens.add(part.substring(5));
|
||||
|
||||
int ss = part.indexOf("::", 5);
|
||||
if (ss > 0) {
|
||||
tokens.add(part.substring(0, ss));
|
||||
tokens.add(part.substring(ss+2));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
|
||||
private final WikiSpecialization wikiSpecialization;
|
||||
private final BlogSpecialization blogSpecialization;
|
||||
private final GogStoreSpecialization gogStoreSpecialization;
|
||||
private final CppreferenceSpecialization cppreferenceSpecialization;
|
||||
private final DefaultSpecialization defaultSpecialization;
|
||||
|
||||
@Inject
|
||||
@@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
|
||||
WikiSpecialization wikiSpecialization,
|
||||
BlogSpecialization blogSpecialization,
|
||||
GogStoreSpecialization gogStoreSpecialization,
|
||||
CppreferenceSpecialization cppreferenceSpecialization,
|
||||
DefaultSpecialization defaultSpecialization) {
|
||||
this.domainTypes = domainTypes;
|
||||
this.lemmySpecialization = lemmySpecialization;
|
||||
@@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
|
||||
this.wikiSpecialization = wikiSpecialization;
|
||||
this.blogSpecialization = blogSpecialization;
|
||||
this.gogStoreSpecialization = gogStoreSpecialization;
|
||||
this.cppreferenceSpecialization = cppreferenceSpecialization;
|
||||
this.defaultSpecialization = defaultSpecialization;
|
||||
}
|
||||
|
||||
@@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
|
||||
return mariadbKbSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.getTopDomain().equals("cppreference.com")) {
|
||||
return cppreferenceSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.toString().equals("store.steampowered.com")) {
|
||||
return steamStoreSpecialization;
|
||||
}
|
||||
@@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
|
||||
if (generator.keywords().contains("javadoc")) {
|
||||
return javadocSpecialization;
|
||||
}
|
||||
|
||||
// Must be toward the end, as some specializations are for
|
||||
// wiki-generator content
|
||||
if (generator.type() == GeneratorType.WIKI) {
|
||||
return wikiSpecialization;
|
||||
}
|
||||
@@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
|
||||
|
||||
boolean shouldIndex(EdgeUrl url);
|
||||
double lengthModifier();
|
||||
void amendWords(Document doc, DocumentKeywordsBuilder words);
|
||||
|
||||
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
|
||||
}
|
||||
}
|
||||
|
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@@ -93,6 +92,8 @@ public class WikiSpecialization extends DefaultSpecialization {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
|
||||
@Override
|
||||
public double lengthModifier() {
|
||||
return 2.5;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,27 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
class CppreferenceSpecializationTest {
|
||||
CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
|
||||
|
||||
@Test
|
||||
public void testTitleMagic() {
|
||||
|
||||
List<String> ret;
|
||||
|
||||
ret = specialization.extractExtraTokens("std::multimap<Key, T, Compare, Allocator>::crend - cppreference.com");
|
||||
Assertions.assertTrue(ret.contains("std::multimap::crend"));
|
||||
Assertions.assertTrue(ret.contains("multimap::crend"));
|
||||
Assertions.assertTrue(ret.contains("std::multimap"));
|
||||
Assertions.assertTrue(ret.contains("crend"));
|
||||
|
||||
ret = specialization.extractExtraTokens("std::coroutine_handle<Promise>::operator(), std::coroutine_handle<Promise>::resume - cppreference.com");
|
||||
Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
|
||||
Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
|
||||
}
|
||||
|
||||
}
|
@@ -8,8 +8,8 @@
|
||||
<ShortName>Marginalia</ShortName>
|
||||
<Description>Search Marginalia</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
|
||||
<Image width="16" height="16" type="image/x-icon">https://old-search.marginalia.nu/favicon.ico</Image>
|
||||
<Url type="text/html" method="get"
|
||||
template="https://search.marginalia.nu/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
|
||||
template="https://old-search.marginalia.nu/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://old-search.marginalia.nu/</moz:SearchForm>
|
||||
</OpenSearchDescription>
|
@@ -3,9 +3,9 @@
|
||||
<nav>
|
||||
<a href="#" class="screenreader-only" onClick="">Skip to content</a>
|
||||
<a href="https://www.marginalia.nu/">Marginalia</a>
|
||||
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
|
||||
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Donate</a>
|
||||
<a class="extra" href="https://search.marginalia.nu/explore/random">Random</a>
|
||||
<a href="https://about.marginalia-search.com/">About</a>
|
||||
<a href="https://about.marginalia-search.com/article/supporting/">Donate</a>
|
||||
<a class="extra" href="https://old-search.marginalia.nu/explore/random">Random</a>
|
||||
</nav>
|
||||
<div id="theme">
|
||||
<label for="theme-select" class="screenreader-only">Color Theme</label>
|
||||
|
@@ -1,362 +0,0 @@
|
||||
package nu.marginalia.search.paperdoll;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Guice;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.search.SearchModule;
|
||||
import nu.marginalia.search.SearchService;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import spark.Spark;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.anyInt;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
|
||||
/** This class is a special test class that sets up a search service
|
||||
* and registers some search results, without actually starting the rest
|
||||
* of the environment. This is used to test the search service in isolation
|
||||
* when working on the frontend.
|
||||
* <p></p>
|
||||
* It's not actually a test, but it's in the test directory because it's
|
||||
* using test related classes.
|
||||
* <p></p>
|
||||
* When using gradle, run ./gradlew paperDoll --info to run this test,
|
||||
* the system will wait for you to kill the process to stop the test,
|
||||
* and the UI is available at port 9999.
|
||||
*/
|
||||
@Testcontainers
|
||||
@Tag("paperdoll")
|
||||
public class SearchServicePaperDoll extends AbstractModule {
|
||||
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
private static HikariDataSource dataSource;
|
||||
|
||||
private static List<DecoratedSearchResultItem> results = new ArrayList<>();
|
||||
private static List<SimilarDomain> dummyLinks = new ArrayList<>();
|
||||
private static QueryResponse searchResponse;
|
||||
private static final Gson gson = GsonFactory.get();
|
||||
|
||||
void registerSearchResult(
|
||||
String url,
|
||||
String title,
|
||||
String description,
|
||||
Collection<HtmlFeature> features,
|
||||
double quality,
|
||||
double score,
|
||||
long positions)
|
||||
{
|
||||
try {
|
||||
results.add(new DecoratedSearchResultItem(
|
||||
new SearchResultItem(url.hashCode(), 2, 3, score, 0),
|
||||
new EdgeUrl(url),
|
||||
title,
|
||||
description,
|
||||
quality,
|
||||
"HTML5",
|
||||
HtmlFeature.encode(features),
|
||||
null,
|
||||
url.hashCode(),
|
||||
400,
|
||||
positions,
|
||||
score,
|
||||
4,
|
||||
null)
|
||||
);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException();
|
||||
}
|
||||
}
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() throws URISyntaxException {
|
||||
if (!Boolean.getBoolean("runPaperDoll")) {
|
||||
return;
|
||||
}
|
||||
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
|
||||
System.setProperty("service-name", "search");
|
||||
System.setProperty("search.websiteUrl", "http://localhost:9999/");
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var newsStmt = conn.prepareStatement("""
|
||||
INSERT INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""");
|
||||
var domainStmt = conn.prepareStatement("""
|
||||
INSERT INTO EC_DOMAIN(ID, DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""");
|
||||
var randomStmt = conn.prepareStatement("""
|
||||
INSERT INTO EC_RANDOM_DOMAINS(DOMAIN_ID, DOMAIN_SET)
|
||||
VALUES (?, ?)
|
||||
""")
|
||||
) {
|
||||
newsStmt.setString(1, "Lex Luthor elected president");
|
||||
newsStmt.setString(2, "https://www.example.com/foo");
|
||||
newsStmt.setString(3, "Daily Planet");
|
||||
newsStmt.setDate(4, new java.sql.Date(System.currentTimeMillis()));
|
||||
newsStmt.execute();
|
||||
|
||||
newsStmt.setString(1, "Besieged Alesian onlookers confused as Caesar builds a wall around his wall around the city walls");
|
||||
newsStmt.setString(2, "https://www.example2.com/bar");
|
||||
newsStmt.setString(3, "The Gaulish Observer");
|
||||
newsStmt.setDate(4, new java.sql.Date(System.currentTimeMillis()));
|
||||
newsStmt.execute();
|
||||
|
||||
newsStmt.setString(1, "Marginalia acquires Google");
|
||||
newsStmt.setString(2, "https://www.example3.com/baz");
|
||||
newsStmt.setString(3, "The Dependent");
|
||||
newsStmt.setDate(4, new java.sql.Date(System.currentTimeMillis()));
|
||||
newsStmt.execute();
|
||||
|
||||
domainStmt.setInt(1, 1);
|
||||
domainStmt.setString(2, "www.example.com");
|
||||
domainStmt.setString(3, "example.com");
|
||||
domainStmt.setInt(4, 1);
|
||||
domainStmt.execute();
|
||||
|
||||
domainStmt.setInt(1, 2);
|
||||
domainStmt.setString(2, "www.example2.com");
|
||||
domainStmt.setString(3, "example2.com");
|
||||
domainStmt.setInt(4, 2);
|
||||
domainStmt.execute();
|
||||
|
||||
domainStmt.setInt(1, 3);
|
||||
domainStmt.setString(2, "www.example3.com");
|
||||
domainStmt.setString(3, "example3.com");
|
||||
domainStmt.setInt(4, 3);
|
||||
domainStmt.execute();
|
||||
|
||||
randomStmt.setInt(1, 1);
|
||||
randomStmt.setInt(2, 0);
|
||||
randomStmt.execute();
|
||||
|
||||
randomStmt.setInt(1, 2);
|
||||
randomStmt.setInt(2, 0);
|
||||
randomStmt.execute();
|
||||
|
||||
randomStmt.setInt(1, 3);
|
||||
randomStmt.setInt(2, 0);
|
||||
randomStmt.execute();
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
searchResponse = new QueryResponse(
|
||||
new SearchSpecification(new SearchQuery(), List.of(), "", "test",
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
new QueryLimits(10, 20, 3, 4),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.sensibleDefaults()
|
||||
),
|
||||
results,
|
||||
List.of(),
|
||||
List.of(),
|
||||
1,
|
||||
1,
|
||||
null
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void run() throws Exception {
|
||||
if (!Boolean.getBoolean("runPaperDoll")) {
|
||||
return;
|
||||
}
|
||||
|
||||
var injector = Guice.createInjector(
|
||||
new ServiceConfigurationModule(ServiceId.Search),
|
||||
new SearchModule(),
|
||||
this);
|
||||
|
||||
injector.getInstance(SearchService.class);
|
||||
|
||||
List<String> suggestions = List.of("foo", "bar", "baz");
|
||||
|
||||
Spark.get("/suggest/", (rq, rsp) -> {
|
||||
rsp.type("application/json");
|
||||
return gson.toJson(suggestions);
|
||||
});
|
||||
|
||||
Spark.get("/screenshot/*", (rq, rsp) -> {
|
||||
rsp.type("image/svg+xml");
|
||||
return """
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="640px"
|
||||
height="480px"
|
||||
viewBox="0 0 640 480"
|
||||
version="1.1">
|
||||
<g>
|
||||
<rect
|
||||
style="fill:#808080"
|
||||
id="rect288"
|
||||
width="595.41992"
|
||||
height="430.01825"
|
||||
x="23.034981"
|
||||
y="27.850344" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:100px;fill:#909090;font-family:sans-serif;"
|
||||
x="20"
|
||||
y="120">Placeholder</text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:32px;fill:#000000;font-family:monospace;"
|
||||
x="320" y="240" dominant-baseline="middle" text-anchor="middle">Lorem Ipsum As F</text>
|
||||
</g>
|
||||
</svg>
|
||||
""";
|
||||
});
|
||||
|
||||
registerSearchResult("https://www.example.com/foo", "Foo", "Lorem ipsum dolor sit amet", Set.of(), 0.5, 0.5, ~0L);
|
||||
registerSearchResult("https://www.example2.com/bar", "Bar", "Some text goes here", Set.of(), 0.5, 0.5, 1L);
|
||||
registerSearchResult("https://www.example3.com/baz", "All HTML Features", "This one's got every feature", EnumSet.allOf(HtmlFeature.class), 0.5, 0.5, 1L);
|
||||
|
||||
|
||||
|
||||
|
||||
dummyLinks.add(new SimilarDomain(
|
||||
new EdgeUrl("https://www.example.com/foo"),
|
||||
1,
|
||||
0.5,
|
||||
0.5,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
SimilarDomain.LinkType.FOWARD
|
||||
));
|
||||
dummyLinks.add(new SimilarDomain(
|
||||
new EdgeUrl("https://www.example2.com/foo"),
|
||||
2,
|
||||
0.5,
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
true,
|
||||
SimilarDomain.LinkType.BACKWARD
|
||||
));
|
||||
dummyLinks.add(new SimilarDomain(
|
||||
new EdgeUrl("https://www.example3.com/foo"),
|
||||
3,
|
||||
0,
|
||||
0.5,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
SimilarDomain.LinkType.BIDIRECTIONAL
|
||||
));
|
||||
|
||||
|
||||
for (;;);
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
try {
|
||||
var serviceRegistry = Mockito.mock(ServiceRegistryIf.class);
|
||||
when(serviceRegistry.registerService(any(), any(), any())).thenReturn(new ServiceEndpoint("localhost", 9999));
|
||||
|
||||
bind(ServiceRegistryIf.class).toInstance(serviceRegistry);
|
||||
bind(HikariDataSource.class).toInstance(dataSource);
|
||||
|
||||
var qsMock = Mockito.mock(QueryClient.class);
|
||||
when(qsMock.search(any())).thenReturn(searchResponse);
|
||||
bind(QueryClient.class).toInstance(qsMock);
|
||||
|
||||
var asMock = Mockito.mock(DomainInfoClient.class);
|
||||
|
||||
when(asMock.isAccepting()).thenReturn(true);
|
||||
when(asMock.linkedDomains(anyInt(), anyInt())).thenReturn(CompletableFuture.completedFuture(dummyLinks));
|
||||
when(asMock.similarDomains(anyInt(), anyInt())).thenReturn(CompletableFuture.completedFuture(dummyLinks));
|
||||
when(asMock.domainInformation(anyInt())).thenReturn(CompletableFuture.completedFuture(
|
||||
new DomainInformation(new EdgeDomain("www.example.com"),
|
||||
false,
|
||||
123,
|
||||
123,
|
||||
123,
|
||||
123,
|
||||
123,
|
||||
1,
|
||||
0.5,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
"127.0.0.1",
|
||||
1,
|
||||
"ACME",
|
||||
"CA",
|
||||
"CA",
|
||||
"Exemplary")
|
||||
));
|
||||
|
||||
bind(DomainInfoClient.class).toInstance(asMock);
|
||||
|
||||
var sss = Mockito.mock(ScreenshotService.class);
|
||||
when(sss.hasScreenshot(anyInt())).thenReturn(true);
|
||||
bind(ScreenshotService.class).toInstance(sss);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -1,15 +1,14 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.svc.*;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.JoobyService;
|
||||
import nu.marginalia.service.server.StaticResources;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -34,8 +33,6 @@ public class SearchService extends JoobyService {
|
||||
|
||||
@Inject
|
||||
public SearchService(BaseServiceParams params,
|
||||
WebsiteUrl websiteUrl,
|
||||
StaticResources staticResources,
|
||||
SearchFrontPageService frontPageService,
|
||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||
SearchSiteSubscriptionService siteSubscriptionService,
|
||||
@@ -62,7 +59,25 @@ public class SearchService extends JoobyService {
|
||||
public void startJooby(Jooby jooby) {
|
||||
super.startJooby(jooby);
|
||||
|
||||
final String startTimeAttribute = "start-time";
|
||||
|
||||
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
|
||||
jooby.before((Context ctx) -> {
|
||||
ctx.setAttribute(startTimeAttribute, System.nanoTime());
|
||||
});
|
||||
|
||||
jooby.after((Context ctx, Object result, Throwable failure) -> {
|
||||
if (failure != null) {
|
||||
wmsa_search_service_error_count.labels(ctx.getRoute().getPattern(), ctx.getMethod()).inc();
|
||||
}
|
||||
else {
|
||||
Long startTime = ctx.getAttribute(startTimeAttribute);
|
||||
if (startTime != null) {
|
||||
wmsa_search_service_request_time.labels(ctx.getRoute().getPattern(), ctx.getMethod())
|
||||
.observe((System.nanoTime() - startTime) / 1e9);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
@@ -47,18 +47,23 @@ public class SearchAddToCrawlQueueService {
|
||||
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
|
||||
}
|
||||
|
||||
private void addToCrawlQueue(int id) throws SQLException {
|
||||
/** Mark a domain for crawling by setting node affinity to zero,
|
||||
* unless it is already marked for crawling, then node affinity should
|
||||
* be left unchanged.
|
||||
* */
|
||||
void addToCrawlQueue(int domainId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
|
||||
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
|
||||
UPDATE EC_DOMAIN
|
||||
SET WMSA_prod.EC_DOMAIN.NODE_AFFINITY = 0
|
||||
WHERE ID=? AND WMSA_prod.EC_DOMAIN.NODE_AFFINITY < 0
|
||||
""")) {
|
||||
stmt.setInt(1, id);
|
||||
stmt.setInt(1, domainId);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
private String getDomainName(int id) {
|
||||
String getDomainName(int id) {
|
||||
var domain = domainQueries.getDomain(id);
|
||||
if (domain.isEmpty())
|
||||
throw new IllegalArgumentException();
|
||||
|
@@ -197,7 +197,6 @@ public class SearchSiteInfoService {
|
||||
|
||||
var domain = new EdgeDomain(domainName);
|
||||
final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1);
|
||||
boolean viableAliasDomain = domain.aliasDomain().map(alias -> domainQueries.tryGetDomainId(alias).isPresent()).orElse(false);
|
||||
|
||||
final Future<DomainInformation> domainInfoFuture;
|
||||
final Future<List<SimilarDomain>> similarSetFuture;
|
||||
@@ -232,9 +231,10 @@ public class SearchSiteInfoService {
|
||||
url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
|
||||
}
|
||||
|
||||
|
||||
var result = new SiteInfoWithContext(domainName,
|
||||
isSubscribed,
|
||||
viableAliasDomain ? domain.aliasDomain().map(EdgeDomain::toString) : Optional.empty(),
|
||||
domainQueries.otherSubdomains(domain, 5),
|
||||
domainId,
|
||||
url,
|
||||
hasScreenshot,
|
||||
@@ -352,7 +352,7 @@ public class SearchSiteInfoService {
|
||||
|
||||
public record SiteInfoWithContext(String domain,
|
||||
boolean isSubscribed,
|
||||
Optional<String> aliasDomain,
|
||||
List<DbDomainQueries.DomainWithNode> siblingDomains,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
|
@@ -2,13 +2,24 @@
|
||||
|
||||
This service handles search traffic and is the service
|
||||
you're most directly interacting with when visiting
|
||||
[search.marginalia.nu](https://search.marginalia.nu).
|
||||
[marginalia-search.com](https://marginalia-search.com).
|
||||
|
||||
It interprets a "human" query and translates it into a
|
||||
request that gets passed to the index service, which finds
|
||||
related documents, which this service then ranks and returns
|
||||
to the user.
|
||||
|
||||
The UI is built using [JTE templates](https://jte.gg/syntax/) and the [Jooby framework](https://jooby.io), primarily using
|
||||
its MVC facilities.
|
||||
|
||||
When developing, it's possible to set up a mock version of the UI by running
|
||||
the gradle command
|
||||
|
||||
```$ ./gradlew paperDoll -i```
|
||||
|
||||
The UI will be available at http://localhost:9999/, and has hot reloading of JTE classes
|
||||
and static resources.
|
||||
|
||||
|
||||

|
||||
|
||||
|
@@ -1,3 +1,5 @@
|
||||
@import nu.marginalia.db.DbDomainQueries
|
||||
@import nu.marginalia.model.EdgeDomain
|
||||
@import nu.marginalia.search.svc.SearchSiteInfoService
|
||||
@import nu.marginalia.search.svc.SearchSiteInfoService.*
|
||||
@import nu.marginalia.search.model.UrlDetails
|
||||
@@ -13,18 +15,18 @@
|
||||
<span>${siteInfo.domain()}</span>
|
||||
<div class="grow">
|
||||
</div>
|
||||
<a rel="nofollow noopener external" href="${siteInfo.siteUrl()}" class="fa-solid fa-arrow-up-right-from-square" ></a>
|
||||
<a href="https://web.archive.org/web/*/${siteInfo.domain()}"
|
||||
class="p-1.5 text-white px-4"
|
||||
title="Wayback Machine">
|
||||
<i class="fas fa-clock-rotate-left text-sm"></i>
|
||||
</a>
|
||||
<a title="Visit ${siteInfo.domain()}" rel="nofollow noopener external" href="${siteInfo.siteUrl()}" class="fa-solid fa-arrow-up-right-from-square" ></a>
|
||||
</div>
|
||||
|
||||
@if (siteInfo.hasScreenshot())
|
||||
<a class="mx-3 " tabindex="-1" rel="nofollow noopener external" href="${siteInfo.siteUrl()}">
|
||||
<img class="border dark:border-gray-600 shadow-inner" src="/screenshot/${siteInfo.domainId()}" alt="Screenshot of ${siteInfo.domain()}">
|
||||
</a>
|
||||
@elseif (siteInfo.aliasDomain().isPresent())
|
||||
<div class="mx-3 my-3 text-xs text-slate-800 dark:text-white">
|
||||
The search engine is also aware of links to <a class="underline text-liteblue dark:text-blue-200" href="/site/${siteInfo.aliasDomain().get()}">${siteInfo.aliasDomain().get()}</a>,
|
||||
this may be the canonical address.
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@if (siteInfo.hasFeed())
|
||||
@@ -80,6 +82,34 @@
|
||||
@endif
|
||||
|
||||
|
||||
@if (!siteInfo.siblingDomains().isEmpty())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fas fa-globe"></i>
|
||||
<span>Related Subdomains</span>
|
||||
</div>
|
||||
|
||||
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
|
||||
<thead>
|
||||
<tr class="bg-gray-50 dark:bg-gray-700">
|
||||
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
|
||||
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
|
||||
<tr>
|
||||
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
|
||||
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
|
||||
|
||||
@if (!sibling.isIndexed())
|
||||
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
|
||||
@endif
|
||||
</td>
|
||||
</tr>
|
||||
@endfor
|
||||
</tbody>
|
||||
</table>
|
||||
@endif
|
||||
|
||||
@if (siteInfo.domainInformation().isUnknownDomain())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fa-regular fa-circle-question"></i>
|
||||
|
@@ -8,8 +8,8 @@
|
||||
<ShortName>Marginalia</ShortName>
|
||||
<Description>Search Marginalia</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
|
||||
<Image width="16" height="16" type="image/x-icon">https://marginalia-search.com/favicon.ico</Image>
|
||||
<Url type="text/html" method="get"
|
||||
template="https://search.marginalia.nu/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
|
||||
template="https://marginalia-search.com/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://marginalia-search.com/</moz:SearchForm>
|
||||
</OpenSearchDescription>
|
@@ -6,6 +6,7 @@ import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.browse.model.BrowseResultSet;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@@ -18,7 +19,6 @@ import nu.marginalia.search.svc.SearchSiteInfoService;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
public class MockedSearchResults {
|
||||
@@ -133,7 +133,10 @@ public class MockedSearchResults {
|
||||
return new SearchSiteInfoService.SiteInfoWithContext(
|
||||
"www.example.com",
|
||||
false,
|
||||
Optional.of("other.example.com"),
|
||||
List.of(
|
||||
new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 1),
|
||||
new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 0)
|
||||
),
|
||||
14,
|
||||
"https://www.example.com",
|
||||
true,
|
||||
|
@@ -0,0 +1,85 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
@Tag("slow")
|
||||
@Testcontainers
|
||||
class SearchAddToCrawlQueueServiceTest {
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
|
||||
private DbDomainQueries domainQueries;
|
||||
private SearchAddToCrawlQueueService addToCrawlQueueService;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement()) {
|
||||
stmt.executeQuery("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
|
||||
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('known.example.com', 'example.com', -1)");
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('added.example.com', 'example.com', 0)");
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('indexed.example.com', 'example.com', 1)");
|
||||
}
|
||||
|
||||
domainQueries = new DbDomainQueries(dataSource);
|
||||
addToCrawlQueueService = new SearchAddToCrawlQueueService(domainQueries, dataSource);
|
||||
}
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() {
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
}
|
||||
|
||||
private int getNodeAffinity(String domainName) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
stmt.setString(1, domainName);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Test
|
||||
void addToCrawlQueue() throws SQLException {
|
||||
int knownId = domainQueries.getDomainId(new EdgeDomain("known.example.com"));
|
||||
int addedId = domainQueries.getDomainId(new EdgeDomain("added.example.com"));
|
||||
int indexedId = domainQueries.getDomainId(new EdgeDomain("indexed.example.com"));
|
||||
|
||||
addToCrawlQueueService.addToCrawlQueue(knownId);
|
||||
addToCrawlQueueService.addToCrawlQueue(addedId);
|
||||
addToCrawlQueueService.addToCrawlQueue(indexedId);
|
||||
|
||||
Assertions.assertEquals(0, getNodeAffinity("known.example.com"));
|
||||
Assertions.assertEquals(0, getNodeAffinity("added.example.com"));
|
||||
Assertions.assertEquals(1, getNodeAffinity("indexed.example.com"));
|
||||
}
|
||||
|
||||
}
|
@@ -146,6 +146,7 @@ public class QueryBasicInterface {
|
||||
.shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold))
|
||||
.shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty))
|
||||
.bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight))
|
||||
.disablePenalties(boolFromRequest(request, "disablePenalties", sensibleDefaults.disablePenalties))
|
||||
.exportDebugData(true)
|
||||
.build();
|
||||
}
|
||||
@@ -154,6 +155,13 @@ public class QueryBasicInterface {
|
||||
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Double.parseDouble(request.queryParams(param));
|
||||
}
|
||||
|
||||
boolean boolFromRequest(Request request, String param, boolean defaultValue) {
|
||||
if (param == null)
|
||||
return defaultValue;
|
||||
|
||||
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Boolean.parseBoolean(request.queryParams(param));
|
||||
}
|
||||
|
||||
int intFromRequest(Request request, String param, int defaultValue) {
|
||||
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : parseInt(request.queryParams(param));
|
||||
}
|
||||
|
@@ -67,6 +67,14 @@
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="bm25Weight">BM25 Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25Weight" name="bm25Weight" value="{{bm25Weight}}"></div>
|
||||
|
||||
<div class="col-sm-2"><label for="disablePenalties">Disable Penalties</label></div>
|
||||
<div class="col-sm-2">
|
||||
<select class="form-select" id="disablePenalties" name="disablePenalties">
|
||||
<option value="FALSE" {{#unless disablePenalties}}selected{{/unless}}>FALSE</option>
|
||||
<option value="TRUE" {{#if disablePenalties}}selected{{/if}}>TRUE</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{{/with}}
|
||||
|
@@ -1,6 +1,3 @@
|
||||
## This is a token file for automatic deployment
|
||||
|
||||
A master HEAD tagged with deploy-core*, deploy-executor*, or deploy-index* will trigger a commit.
|
||||
|
||||
2024-12-19-00002: Test deployment of executor
|
||||
2024-12-19-00001: Test deployment of executor
|
||||
2025-01-07: Deploy executor.
|
@@ -11,10 +11,13 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
|
||||
**Docker** - It is a bit of a pain to install, but if you follow
|
||||
[this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems.
|
||||
|
||||
**JDK 21** - The code uses Java 21 preview features.
|
||||
**JDK 23** - The code uses Java 23 preview features.
|
||||
|
||||
The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
|
||||
graalce is a good distribution choice but it doesn't matter too much.
|
||||
|
||||
**Tailwindcss** - Install NPM and run `npm install -D tailwindcss`
|
||||
|
||||
## Quick Set up
|
||||
|
||||
[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
|
||||
|
@@ -701,7 +701,7 @@ public abstract class AbstractRssReader<C extends Channel, I extends Item> {
|
||||
}
|
||||
}
|
||||
} catch (XMLStreamException e) {
|
||||
LOGGER.log(Level.WARNING, "Failed to parse XML.", e);
|
||||
LOGGER.log(Level.FINE, "Failed to parse XML.", e);
|
||||
}
|
||||
|
||||
close();
|
||||
|
@@ -258,6 +258,13 @@ if __name__ == '__main__':
|
||||
deploy_tier=2,
|
||||
groups={"all", "frontend", "core"}
|
||||
),
|
||||
'search-legacy': ServiceConfig(
|
||||
gradle_target=':code:services-application:search-service-legacy:docker',
|
||||
docker_name='search-service-legacy',
|
||||
instances=None,
|
||||
deploy_tier=3,
|
||||
groups={"all", "frontend", "core"}
|
||||
),
|
||||
'api': ServiceConfig(
|
||||
gradle_target=':code:services-application:api-service:docker',
|
||||
docker_name='api-service',
|
||||
|
Reference in New Issue
Block a user