1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor
6f0485287a Merge pull request #145 from MarginaliaSearch/cppreference_fixes
Cppreference fixes
2025-01-07 15:43:19 +01:00
Viktor Lofgren
59e2dd4c26 (specialization) Soften length requirements for wiki-specialized documents (incl. cppreference) 2025-01-07 15:41:30 +01:00
Viktor Lofgren
ca1807caae (specialization) Add new specialization for cppreference.com
Give this reference website some synthetically generated tokens to improve the likelihood of a good match.
2025-01-07 15:41:05 +01:00
Viktor Lofgren
26c20e18ac (keyword-extraction) Soften constraints on keyword patterns, allowing for longer segmented words 2025-01-07 15:20:50 +01:00
Viktor Lofgren
7c90b6b414 (query) Don't blindly make tokens containing a colon into a non-ranking advice term 2025-01-07 15:18:05 +01:00
Viktor Lofgren
b63c54c4ce (search) Update opensearch.xml to point to non-redirecting domains. 2025-01-07 00:23:09 +01:00
Viktor Lofgren
fecd2f4ec3 (deploy) Add legacy search service to deploy script 2025-01-07 00:21:13 +01:00
11 changed files with 192 additions and 13 deletions

View File

@@ -233,9 +233,19 @@ public class QueryParser {
entity.replace(new QueryToken.RankTerm(limit, str));
} else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) {
} else if (str.startsWith("site:")
|| str.startsWith("format:")
|| str.startsWith("file:")
|| str.startsWith("tld:")
|| str.startsWith("ip:")
|| str.startsWith("as:")
|| str.startsWith("asn:")
|| str.startsWith("generator:")
)
{
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
}
}
private static SpecificationLimit parseSpecificationLimit(String str) {

View File

@@ -208,6 +208,12 @@ public class QueryFactoryTest {
System.out.println(subquery);
}
@Test
public void testCplusPlus() {
var subquery = parseAndGetSpecs("std::vector::push_back vector");
System.out.println(subquery);
}
@Test
public void testQuotedApostrophe() {
var subquery = parseAndGetSpecs("\"bob's cars\"");

View File

@@ -152,7 +152,10 @@ public class DocumentPositionMapper {
}
boolean matchesWordPattern(String s) {
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
if (s.length() > 48)
return false;
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,8}
String wordPartSeparator = ".-_/:+*";
@@ -169,7 +172,7 @@ public class DocumentPositionMapper {
if (i == 0)
return false;
for (int j = 0; j < 5; j++) {
for (int j = 0; j < 8; j++) {
if (i == s.length()) return true;
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {

View File

@@ -30,9 +30,11 @@ class DocumentPositionMapperTest {
Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test"));
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back"));
Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));

View File

@@ -0,0 +1,113 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
@Singleton
public class CppreferenceSpecialization extends WikiSpecialization {
@Inject
public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override
public Document prune(Document original) {
var doc = original.clone();
doc.getElementsByClass("t-nv").remove();
doc.getElementsByClass("toc").remove();
doc.getElementsByClass("mw-head").remove();
doc.getElementsByClass("printfooter").remove();
doc.getElementsByClass("cpp-footer-base").remove();
doc.title(doc.title() + " " + Strings.join(extractExtraTokens(doc.title()), ' '));
return doc;
}
@Override
public String getSummary(Document doc, Set<String> importantWords) {
Element declTable = doc.getElementsByClass("t-dcl-begin").first();
if (declTable != null) {
var nextPar = declTable.nextElementSibling();
if (nextPar != null) {
return nextPar.text();
}
}
return super.getSummary(doc, importantWords);
}
public List<String> extractExtraTokens(String title) {
if (!title.contains("::")) {
return List.of();
}
if (!title.contains("-")) {
return List.of();
}
title = StringUtils.split(title, '-')[0];
String name = title;
for (;;) {
int lbidx = name.indexOf('<');
int rbidx = name.indexOf('>');
if (lbidx > 0 && rbidx > lbidx) {
String className = name.substring(0, lbidx);
String methodName = name.substring(rbidx + 1);
name = className + methodName;
} else {
break;
}
}
List<String> tokens = new ArrayList<>();
for (var part : name.split("\\s*,\\s*")) {
if (part.endsWith(")") && !part.endsWith("()")) {
int parenStart = part.indexOf('(');
if (parenStart > 0) { // foo(...) -> foo
part = part.substring(0, parenStart);
}
else if (parenStart == 0) { // (foo) -> foo
part = part.substring(1, part.length() - 1);
}
}
part = part.trim();
if (part.contains("::")) {
tokens.add(part);
if (part.startsWith("std::")) {
tokens.add(part.substring(5));
int ss = part.indexOf("::", 5);
if (ss > 0) {
tokens.add(part.substring(0, ss));
tokens.add(part.substring(ss+2));
}
}
}
}
return tokens;
}
}

View File

@@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
private final WikiSpecialization wikiSpecialization;
private final BlogSpecialization blogSpecialization;
private final GogStoreSpecialization gogStoreSpecialization;
private final CppreferenceSpecialization cppreferenceSpecialization;
private final DefaultSpecialization defaultSpecialization;
@Inject
@@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
WikiSpecialization wikiSpecialization,
BlogSpecialization blogSpecialization,
GogStoreSpecialization gogStoreSpecialization,
CppreferenceSpecialization cppreferenceSpecialization,
DefaultSpecialization defaultSpecialization) {
this.domainTypes = domainTypes;
this.lemmySpecialization = lemmySpecialization;
@@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
this.wikiSpecialization = wikiSpecialization;
this.blogSpecialization = blogSpecialization;
this.gogStoreSpecialization = gogStoreSpecialization;
this.cppreferenceSpecialization = cppreferenceSpecialization;
this.defaultSpecialization = defaultSpecialization;
}
@@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
return mariadbKbSpecialization;
}
if (url.domain.getTopDomain().equals("cppreference.com")) {
return cppreferenceSpecialization;
}
if (url.domain.toString().equals("store.steampowered.com")) {
return steamStoreSpecialization;
}
@@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
if (generator.keywords().contains("javadoc")) {
return javadocSpecialization;
}
// Must be toward the end, as some specializations are for
// wiki-generator content
if (generator.type() == GeneratorType.WIKI) {
return wikiSpecialization;
}
@@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
boolean shouldIndex(EdgeUrl url);
double lengthModifier();
void amendWords(Document doc, DocumentKeywordsBuilder words);
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
}
}

View File

@@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -93,6 +92,8 @@ public class WikiSpecialization extends DefaultSpecialization {
return true;
}
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
@Override
public double lengthModifier() {
return 2.5;
}
}

View File

@@ -0,0 +1,27 @@
package nu.marginalia.converting.processor.plugin.specialization;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.List;
class CppreferenceSpecializationTest {
CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
@Test
public void testTitleMagic() {
List<String> ret;
ret = specialization.extractExtraTokens("std::multimap<Key, T, Compare, Allocator>::crend - cppreference.com");
Assertions.assertTrue(ret.contains("std::multimap::crend"));
Assertions.assertTrue(ret.contains("multimap::crend"));
Assertions.assertTrue(ret.contains("std::multimap"));
Assertions.assertTrue(ret.contains("crend"));
ret = specialization.extractExtraTokens("std::coroutine_handle<Promise>::operator(), std::coroutine_handle<Promise>::resume - cppreference.com");
Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
}
}

View File

@@ -8,8 +8,8 @@
<ShortName>Marginalia</ShortName>
<Description>Search Marginalia</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
<Image width="16" height="16" type="image/x-icon">https://old-search.marginalia.nu/favicon.ico</Image>
<Url type="text/html" method="get"
template="https://search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
template="https://old-search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://old-search.marginalia.nu/</moz:SearchForm>
</OpenSearchDescription>

View File

@@ -8,8 +8,8 @@
<ShortName>Marginalia</ShortName>
<Description>Search Marginalia</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
<Image width="16" height="16" type="image/x-icon">https://marginalia-search.com/favicon.ico</Image>
<Url type="text/html" method="get"
template="https://search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
template="https://marginalia-search.com/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://marginalia-search.com/</moz:SearchForm>
</OpenSearchDescription>

View File

@@ -258,6 +258,13 @@ if __name__ == '__main__':
deploy_tier=2,
groups={"all", "frontend", "core"}
),
'search-legacy': ServiceConfig(
gradle_target=':code:services-application:search-service-legacy:docker',
docker_name='search-service-legacy',
instances=None,
deploy_tier=3,
groups={"all", "frontend", "core"}
),
'api': ServiceConfig(
gradle_target=':code:services-application:api-service:docker',
docker_name='api-service',