1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

11 Commits

Author SHA1 Message Date
Viktor Lofgren
a84a06975c (ranking-params) Add disable penalties flag to ranking params
This will help with debugging ranking issues. Later it may be added to some filters.
2025-01-08 00:16:49 +01:00
Viktor Lofgren
d2864c13ec (query-params) Add additional permitted query params 2025-01-07 20:21:44 +01:00
Viktor Lofgren
03ba53ce51 (legacy-search) Update nav bar with correct links 2025-01-07 17:44:52 +01:00
Viktor Lofgren
d4a6684931 (specialization) Soften length requirements for wiki-specialized documents (incl. cppreference) 2025-01-07 15:53:25 +01:00
Viktor
6f0485287a Merge pull request #145 from MarginaliaSearch/cppreference_fixes
Cppreference fixes
2025-01-07 15:43:19 +01:00
Viktor Lofgren
59e2dd4c26 (specialization) Soften length requirements for wiki-specialized documents (incl. cppreference) 2025-01-07 15:41:30 +01:00
Viktor Lofgren
ca1807caae (specialization) Add new specialization for cppreference.com
Give this reference website some synthetically generated tokens to improve the likelihood of a good match.
2025-01-07 15:41:05 +01:00
Viktor Lofgren
26c20e18ac (keyword-extraction) Soften constraints on keyword patterns, allowing for longer segmented words 2025-01-07 15:20:50 +01:00
Viktor Lofgren
7c90b6b414 (query) Don't blindly make tokens containing a colon into a non-ranking advice term 2025-01-07 15:18:05 +01:00
Viktor Lofgren
b63c54c4ce (search) Update opensearch.xml to point to non-redirecting domains. 2025-01-07 00:23:09 +01:00
Viktor Lofgren
fecd2f4ec3 (deploy) Add legacy search service to deploy script 2025-01-07 00:21:13 +01:00
20 changed files with 240 additions and 25 deletions

View File

@@ -83,6 +83,11 @@ public class QueryParams {
if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story=");
}
// www.perseus.tufts.edu:
if (param.startsWith("collection=")) return true;
if (param.startsWith("doc=")) return true;
return false;
}
}

View File

@@ -121,6 +121,7 @@ public class IndexProtobufCodec {
params.getTcfProximityWeight(),
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
params.getTemporalBiasWeight(),
params.getDisablePenalties(),
params.getExportDebugData()
);
}
@@ -146,6 +147,7 @@ public class IndexProtobufCodec {
.setTcfProximityWeight(rankingParams.tcfProximity)
.setTcfVerbatimWeight(rankingParams.tcfVerbatim)
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
.setDisablePenalties(rankingParams.disablePenalties)
.setExportDebugData(rankingParams.exportDebugData);
if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) {

View File

@@ -42,12 +42,14 @@ public class ResultRankingParameters {
public double tcfVerbatim;
public double tcfProximity;
public TemporalBias temporalBias;
public double temporalBiasWeight;
public boolean disablePenalties;
public boolean exportDebugData;
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean disablePenalties, boolean exportDebugData) {
this.bm25Params = bm25Params;
this.shortDocumentThreshold = shortDocumentThreshold;
this.shortDocumentPenalty = shortDocumentPenalty;
@@ -61,6 +63,7 @@ public class ResultRankingParameters {
this.tcfProximity = tcfProximity;
this.temporalBias = temporalBias;
this.temporalBiasWeight = temporalBiasWeight;
this.disablePenalties = disablePenalties;
this.exportDebugData = exportDebugData;
}
@@ -80,6 +83,7 @@ public class ResultRankingParameters {
.temporalBias(TemporalBias.NONE)
.temporalBiasWeight(5.0)
.exportDebugData(false)
.disablePenalties(false)
.build();
}
@@ -139,6 +143,8 @@ public class ResultRankingParameters {
return this.temporalBiasWeight;
}
/**
 * Returns the value of the {@code disablePenalties} flag.
 * NOTE(review): per the introducing commit this flag exists to help debug ranking
 * issues; exactly which penalties it turns off is decided by the consumers of this
 * class, not here.
 */
public boolean isDisablePenalties() { return this.disablePenalties; }
public boolean isExportDebugData() {
return this.exportDebugData;
}
@@ -166,6 +172,7 @@ public class ResultRankingParameters {
result = 31 * result + Double.hashCode(tcfProximity);
result = 31 * result + Objects.hashCode(temporalBias);
result = 31 * result + Double.hashCode(temporalBiasWeight);
result = 31 * result + Boolean.hashCode(disablePenalties);
result = 31 * result + Boolean.hashCode(exportDebugData);
return result;
}
@@ -192,6 +199,7 @@ public class ResultRankingParameters {
private double tcfProximity;
private TemporalBias temporalBias;
private double temporalBiasWeight;
private boolean disablePenalties;
private boolean exportDebugData;
ResultRankingParametersBuilder() {
@@ -262,17 +270,20 @@ public class ResultRankingParameters {
return this;
}
/**
 * Sets the {@code disablePenalties} flag that will be passed on to the built parameters.
 *
 * @param disablePenalties the flag value to store in this builder
 * @return this builder, to allow call chaining
 */
public ResultRankingParametersBuilder disablePenalties(boolean disablePenalties) {
this.disablePenalties = disablePenalties;
return this;
}
public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
this.exportDebugData = exportDebugData;
return this;
}
public ResultRankingParameters build() {
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.disablePenalties, this.exportDebugData);
}
/**
 * Renders every value currently held by this builder, in field declaration order.
 * Intended for logging and debugging only; the format is not meant to be parsed.
 *
 * @return a human-readable dump of the builder state, including {@code disablePenalties}
 */
@Override
public String toString() {
    return "ResultRankingParameters.ResultRankingParametersBuilder(bm25Params=" + this.bm25Params
            + ", shortDocumentThreshold=" + this.shortDocumentThreshold
            + ", shortDocumentPenalty=" + this.shortDocumentPenalty
            + ", domainRankBonus=" + this.domainRankBonus
            + ", qualityPenalty=" + this.qualityPenalty
            + ", shortSentenceThreshold=" + this.shortSentenceThreshold
            + ", shortSentencePenalty=" + this.shortSentencePenalty
            + ", bm25Weight=" + this.bm25Weight
            + ", tcfFirstPosition=" + this.tcfFirstPosition
            + ", tcfVerbatim=" + this.tcfVerbatim
            + ", tcfProximity=" + this.tcfProximity
            + ", temporalBias=" + this.temporalBias
            + ", temporalBiasWeight=" + this.temporalBiasWeight
            // disablePenalties was added to the builder but had been left out of this dump
            + ", disablePenalties=" + this.disablePenalties
            + ", exportDebugData=" + this.exportDebugData
            + ")";
}
}
}

View File

@@ -162,6 +162,7 @@ message RpcResultRankingParameters {
double temporalBiasWeight = 17;
bool exportDebugData = 18;
bool disablePenalties = 19;
}

View File

@@ -233,9 +233,19 @@ public class QueryParser {
entity.replace(new QueryToken.RankTerm(limit, str));
} else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) {
} else if (str.startsWith("site:")
|| str.startsWith("format:")
|| str.startsWith("file:")
|| str.startsWith("tld:")
|| str.startsWith("ip:")
|| str.startsWith("as:")
|| str.startsWith("asn:")
|| str.startsWith("generator:")
)
{
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
}
}
private static SpecificationLimit parseSpecificationLimit(String str) {

View File

@@ -208,6 +208,12 @@ public class QueryFactoryTest {
System.out.println(subquery);
}
// Smoke test: parses a query containing C++ scope-resolution tokens ("::") and
// prints the resulting specs. It makes no assertions, so it only fails if
// parseAndGetSpecs throws.
@Test
public void testCplusPlus() {
var subquery = parseAndGetSpecs("std::vector::push_back vector");
System.out.println(subquery);
}
@Test
public void testQuotedApostrophe() {
var subquery = parseAndGetSpecs("\"bob's cars\"");

View File

@@ -248,6 +248,10 @@ public class IndexResultScoreCalculator {
ResultRankingParameters rankingParams,
@Nullable DebugRankingFactors debugRankingFactors) {
if (rankingParams.disablePenalties) {
return 0.;
}
int rank = DocumentMetadata.decodeRank(documentMetadata);
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
int quality = DocumentMetadata.decodeQuality(documentMetadata);

View File

@@ -152,7 +152,10 @@ public class DocumentPositionMapper {
}
boolean matchesWordPattern(String s) {
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
if (s.length() > 48)
return false;
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,8}
String wordPartSeparator = ".-_/:+*";
@@ -169,7 +172,7 @@ public class DocumentPositionMapper {
if (i == 0)
return false;
for (int j = 0; j < 5; j++) {
for (int j = 0; j < 8; j++) {
if (i == s.length()) return true;
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {

View File

@@ -30,9 +30,11 @@ class DocumentPositionMapperTest {
Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test"));
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back"));
Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));

View File

@@ -0,0 +1,113 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
 * Wiki specialization for cppreference.com pages.
 *
 * <p>On top of the inherited wiki behaviour this class
 * <ul>
 *   <li>removes a handful of boilerplate elements (selected by CSS class) before processing,</li>
 *   <li>appends synthetic tokens derived from the page title to the title, so that e.g.
 *       {@code std::multimap<Key, T, Compare, Allocator>::crend} can also be matched as
 *       {@code std::multimap::crend}, {@code multimap::crend}, {@code std::multimap}
 *       and {@code crend},</li>
 *   <li>prefers the element following the first {@code t-dcl-begin} element as summary.</li>
 * </ul>
 */
@Singleton
public class CppreferenceSpecialization extends WikiSpecialization {

    /** Separator between the documented names and the site name in a page title. */
    private static final String TITLE_SUFFIX_SEPARATOR = " - ";

    /** Keyword that introduces an operator name such as {@code operator<=>} or {@code operator->}. */
    private static final String OPERATOR_KEYWORD = "operator";

    /** Punctuation that may sit between {@code operator} and an angle bracket belonging to the operator symbol. */
    private static final String OPERATOR_SYMBOL_CHARS = "<>=-!";

    @Inject
    public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
        super(summaryExtractor, titleExtractor);
    }

    /**
     * Returns a cleaned-up copy of {@code original}; the input document is not modified.
     *
     * @param original the document to prune
     * @return a clone with boilerplate elements removed and the extra title tokens appended to the title
     */
    @Override
    public Document prune(Document original) {
        var doc = original.clone();

        doc.getElementsByClass("t-nv").remove();
        doc.getElementsByClass("toc").remove();
        doc.getElementsByClass("mw-head").remove();
        doc.getElementsByClass("printfooter").remove();
        doc.getElementsByClass("cpp-footer-base").remove();

        String title = doc.title();
        doc.title(title + " " + Strings.join(extractExtraTokens(title), ' '));

        return doc;
    }

    /**
     * Uses the text of the element that follows the first {@code t-dcl-begin} element as summary,
     * and falls back to the inherited summary logic when no such element exists.
     */
    @Override
    public String getSummary(Document doc, Set<String> importantWords) {
        Element declTable = doc.getElementsByClass("t-dcl-begin").first();

        if (declTable != null) {
            var nextPar = declTable.nextElementSibling();
            if (nextPar != null) {
                return nextPar.text();
            }
        }

        return super.getSummary(doc, importantWords);
    }

    /**
     * Derives additional search tokens from a page title of the form
     * {@code "name[, name...] - site name"}.
     *
     * <p>Template argument lists are removed from each name, and a parameter list such as
     * {@code foo(int)} is reduced to {@code foo} ({@code operator()} is left alone). Every
     * resulting name that contains {@code ::} is emitted. For names starting with {@code std::}
     * the name without that prefix is emitted as well and, when another {@code ::} follows,
     * also the part before and the part after it.
     *
     * @param title the page title
     * @return the extra tokens; empty when the title contains no {@code ::} or no {@code -}
     */
    public List<String> extractExtraTokens(String title) {
        if (!title.contains("::") || !title.contains("-")) {
            return List.of();
        }

        // Cut at the " - " that precedes the site name rather than at the first '-',
        // otherwise names such as operator-> or operator-= are chopped in half.
        int suffixStart = title.lastIndexOf(TITLE_SUFFIX_SEPARATOR);
        String names = suffixStart >= 0
                ? title.substring(0, suffixStart)
                : StringUtils.split(title, '-')[0]; // no " - " present: keep the historical behaviour

        names = stripTemplateArguments(names);

        List<String> tokens = new ArrayList<>();

        for (String part : names.split("\\s*,\\s*")) {
            // Trim first, so trailing whitespace cannot hide a closing parenthesis
            part = part.trim();

            if (part.endsWith(")") && !part.endsWith("()")) {
                int parenStart = part.indexOf('(');
                if (parenStart > 0) { // foo(...) -> foo
                    part = part.substring(0, parenStart);
                }
                else if (parenStart == 0) { // (foo) -> foo
                    part = part.substring(1, part.length() - 1);
                }
                part = part.trim();
            }

            if (!part.contains("::")) {
                continue;
            }

            tokens.add(part);

            if (part.startsWith("std::")) {
                tokens.add(part.substring(5));

                int ss = part.indexOf("::", 5);
                if (ss > 0) {
                    tokens.add(part.substring(0, ss));
                    tokens.add(part.substring(ss + 2));
                }
            }
        }

        return tokens;
    }

    /**
     * Removes balanced {@code <...>} groups, including nested ones, from {@code name}.
     * A {@code '<'} at index 0, one without a matching {@code '>'}, or one that belongs to an
     * operator symbol (see {@link #isPartOfOperatorSymbol}) is kept verbatim.
     */
    private static String stripTemplateArguments(String name) {
        StringBuilder stripped = new StringBuilder(name.length());

        int i = 0;
        while (i < name.length()) {
            char c = name.charAt(i);

            if (c == '<' && i > 0 && !isPartOfOperatorSymbol(name, i)) {
                int close = findClosingAngleBracket(name, i);
                if (close > i) {
                    i = close + 1; // skip the whole argument list
                    continue;
                }
            }

            stripped.append(c);
            i++;
        }

        return stripped.toString();
    }

    /** Returns the index of the {@code '>'} matching the {@code '<'} at {@code open}, or -1 if there is none. */
    private static int findClosingAngleBracket(String s, int open) {
        int depth = 0;

        for (int i = open; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '<') {
                depth++;
            }
            else if (c == '>' && --depth == 0) {
                return i;
            }
        }

        return -1;
    }

    /**
     * Tells whether the character at {@code idx} belongs to an operator symbol, i.e. whether it is
     * preceded, possibly via further operator punctuation, by the keyword {@code operator}.
     */
    private static boolean isPartOfOperatorSymbol(String s, int idx) {
        int symbolStart = idx;
        while (symbolStart > 0 && OPERATOR_SYMBOL_CHARS.indexOf(s.charAt(symbolStart - 1)) >= 0) {
            symbolStart--;
        }

        // String.startsWith(prefix, offset) is defined to return false for a negative offset
        return s.startsWith(OPERATOR_KEYWORD, symbolStart - OPERATOR_KEYWORD.length());
    }

}

View File

@@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
private final WikiSpecialization wikiSpecialization;
private final BlogSpecialization blogSpecialization;
private final GogStoreSpecialization gogStoreSpecialization;
private final CppreferenceSpecialization cppreferenceSpecialization;
private final DefaultSpecialization defaultSpecialization;
@Inject
@@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
WikiSpecialization wikiSpecialization,
BlogSpecialization blogSpecialization,
GogStoreSpecialization gogStoreSpecialization,
CppreferenceSpecialization cppreferenceSpecialization,
DefaultSpecialization defaultSpecialization) {
this.domainTypes = domainTypes;
this.lemmySpecialization = lemmySpecialization;
@@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
this.wikiSpecialization = wikiSpecialization;
this.blogSpecialization = blogSpecialization;
this.gogStoreSpecialization = gogStoreSpecialization;
this.cppreferenceSpecialization = cppreferenceSpecialization;
this.defaultSpecialization = defaultSpecialization;
}
@@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
return mariadbKbSpecialization;
}
if (url.domain.getTopDomain().equals("cppreference.com")) {
return cppreferenceSpecialization;
}
if (url.domain.toString().equals("store.steampowered.com")) {
return steamStoreSpecialization;
}
@@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
if (generator.keywords().contains("javadoc")) {
return javadocSpecialization;
}
// Must be toward the end, as some specializations are for
// wiki-generator content
if (generator.type() == GeneratorType.WIKI) {
return wikiSpecialization;
}
@@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
boolean shouldIndex(EdgeUrl url);
double lengthModifier();
void amendWords(Document doc, DocumentKeywordsBuilder words);
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
}
}

View File

@@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -93,6 +92,8 @@ public class WikiSpecialization extends DefaultSpecialization {
return true;
}
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
// Constant length modifier reported for wiki-style documents.
// NOTE(review): per the commit message this is meant to soften the length
// requirements for wiki-specialized documents (incl. cppreference); how the
// value is applied is decided by the caller — confirm there.
@Override
public double lengthModifier() {
return 2.5;
}
}

View File

@@ -0,0 +1,27 @@
package nu.marginalia.converting.processor.plugin.specialization;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.List;
/** Unit tests for the title-derived tokens produced by {@link CppreferenceSpecialization}. */
class CppreferenceSpecializationTest {
    // Only extractExtraTokens() is exercised here, and it does not use the injected extractors
    CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);

    /** Template arguments are dropped and the qualified name is split into its useful variants. */
    @Test
    public void testTitleMagic() {
        assertYieldsTokens(
                "std::multimap<Key, T, Compare, Allocator>::crend - cppreference.com",
                "std::multimap::crend",
                "multimap::crend",
                "std::multimap",
                "crend");

        assertYieldsTokens(
                "std::coroutine_handle<Promise>::operator(), std::coroutine_handle<Promise>::resume - cppreference.com",
                "std::coroutine_handle::operator()",
                "std::coroutine_handle::resume");
    }

    /** Asserts that every expected token is among the tokens extracted from {@code title}. */
    private void assertYieldsTokens(String title, String... expectedTokens) {
        List<String> extracted = specialization.extractExtraTokens(title);

        for (String expected : expectedTokens) {
            Assertions.assertTrue(extracted.contains(expected));
        }
    }
}

View File

@@ -8,8 +8,8 @@
<ShortName>Marginalia</ShortName>
<Description>Search Marginalia</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
<Image width="16" height="16" type="image/x-icon">https://old-search.marginalia.nu/favicon.ico</Image>
<Url type="text/html" method="get"
template="https://search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
template="https://old-search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://old-search.marginalia.nu/</moz:SearchForm>
</OpenSearchDescription>

View File

@@ -3,9 +3,9 @@
<nav>
<a href="#" class="screenreader-only" onClick="">Skip to content</a>
<a href="https://www.marginalia.nu/">Marginalia</a>
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Donate</a>
<a class="extra" href="https://search.marginalia.nu/explore/random">Random</a>
<a href="https://about.marginalia-search.com/">About</a>
<a href="https://about.marginalia-search.com/article/supporting/">Donate</a>
<a class="extra" href="https://old-search.marginalia.nu/explore/random">Random</a>
</nav>
<div id="theme">
<label for="theme-select" class="screenreader-only">Color Theme</label>

View File

@@ -8,8 +8,8 @@
<ShortName>Marginalia</ShortName>
<Description>Search Marginalia</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
<Image width="16" height="16" type="image/x-icon">https://marginalia-search.com/favicon.ico</Image>
<Url type="text/html" method="get"
template="https://search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
template="https://marginalia-search.com/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://marginalia-search.com/</moz:SearchForm>
</OpenSearchDescription>

View File

@@ -146,6 +146,7 @@ public class QueryBasicInterface {
.shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold))
.shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty))
.bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight))
.disablePenalties(boolFromRequest(request, "disablePenalties", sensibleDefaults.disablePenalties))
.exportDebugData(true)
.build();
}
@@ -154,6 +155,13 @@ public class QueryBasicInterface {
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Double.parseDouble(request.queryParams(param));
}
/**
 * Reads a boolean query parameter from the request.
 *
 * @param request      the incoming request
 * @param param        name of the query parameter; {@code null} yields the default
 * @param defaultValue value to use when the parameter is absent or empty
 * @return the parsed value ({@link Boolean#parseBoolean} semantics, i.e. case-insensitive
 *         "true" is {@code true}, anything else {@code false}), or {@code defaultValue}
 */
boolean boolFromRequest(Request request, String param, boolean defaultValue) {
    if (param == null) {
        return defaultValue;
    }

    String rawValue = request.queryParams(param);
    if (Strings.isNullOrEmpty(rawValue)) {
        return defaultValue;
    }

    return Boolean.parseBoolean(rawValue);
}
/**
 * Reads an integer query parameter from the request.
 *
 * @param request      the incoming request
 * @param param        name of the query parameter
 * @param defaultValue value to use when the parameter is absent or empty
 * @return the parsed value, or {@code defaultValue} when nothing was supplied
 */
int intFromRequest(Request request, String param, int defaultValue) {
    String rawValue = request.queryParams(param);

    if (Strings.isNullOrEmpty(rawValue)) {
        return defaultValue;
    }

    return parseInt(rawValue);
}

View File

@@ -67,6 +67,14 @@
<div class="row my-2">
<div class="col-sm-2"><label for="bm25FullWeight">BM25 Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25Weight" name="bm25Weight" value="{{bm25Weight}}"></div>
<div class="col-sm-2"><label for="disablePenalties">Disable Penalties</label></div>
<div class="col-sm-2">
<select class="form-select" id="disablePenalties" name="disablePenalties">
<option value="FALSE" {{#unless disablePenalties}}selected{{/unless}}>FALSE</option>
<option value="TRUE" {{#if disablePenalties}}selected{{/if}}>TRUE</option>
</select>
</div>
</div>
{{/with}}

View File

@@ -1,6 +1,3 @@
## This is a token file for automatic deployment
A master HEAD tagged with deploy-core*, deploy-executor*, or deploy-index* will trigger a commit.
2024-12-19-00002: Test deployment of executor
2024-12-19-00001: Test deployment of executor
2025-01-07: Deploy executor.

View File

@@ -258,6 +258,13 @@ if __name__ == '__main__':
deploy_tier=2,
groups={"all", "frontend", "core"}
),
'search-legacy': ServiceConfig(
gradle_target=':code:services-application:search-service-legacy:docker',
docker_name='search-service-legacy',
instances=None,
deploy_tier=3,
groups={"all", "frontend", "core"}
),
'api': ServiceConfig(
gradle_target=':code:services-application:api-service:docker',
docker_name='api-service',