1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor Lofgren
8da74484f0 (search) Remove unused count modifier from the footer help 2025-04-27 12:08:34 +02:00
Viktor Lofgren
923d5a7234 (search) Add a note for TUI users pointing them to the old UI 2025-04-27 11:52:07 +02:00
Viktor Lofgren
58f88749b8 (deploy) assistant 2025-04-25 13:25:50 +02:00
Viktor Lofgren
77f727a5ba (crawler) Alter conditional request logic to avoid sending both If-None-Match and If-Modified-Since
It seems like some servers dislike this combination, and may turn a 304 into a 200.
2025-04-25 13:19:07 +02:00
Viktor Lofgren
667cfb53dc (assistant) Remove more link text junk from suggestions at loadtime. 2025-04-24 13:35:29 +02:00
Viktor Lofgren
fe36d4ed20 (deploy) Executor services 2025-04-24 13:23:51 +02:00
Viktor Lofgren
acf4bef98d (assistant) Improve search suggestions
Improve suggestions by loading a secondary suggestions set with link text data.
2025-04-24 13:10:59 +02:00
9 changed files with 86 additions and 41 deletions

View File

@@ -229,13 +229,15 @@ public class FeedFetcherService {
.timeout(Duration.ofSeconds(15))
;
if (ifModifiedSinceDate != null) {
// Set the If-Modified-Since or If-None-Match headers if we have them
// though since there are certain idiosyncrasies in server implementations,
// we avoid setting both at the same time as that may turn a 304 into a 200.
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
} else if (ifModifiedSinceDate != null) {
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
}
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
}
HttpRequest getRequest = requestBuilder.build();

View File

@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
/** Paints the tags onto the request builder. */
public void paint(HttpGet request) {
// Paint the ETag header if present,
// otherwise paint the Last-Modified header
// (but not both at the same time due to some servers not liking it)
if (etag != null) {
request.addHeader("If-None-Match", etag);
}
if (lastMod != null) {
} else if (lastMod != null) {
request.addHeader("If-Modified-Since", lastMod);
}
}

View File

@@ -26,4 +26,10 @@
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
</head>
</head>
<noscript>
<h1>Users of text-based browsers</h1>
<p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
as it uses fewer modern CSS tricks, and should work better than the new UI. It's functionally nearly identical, but just renders it using a different layout.</p>
<hr>
</noscript>

View File

@@ -80,10 +80,6 @@
<tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
<tr><td>count&gt;10</td><td> The search term must appear in at least 10 results form the domain</td></tr>
<tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>

View File

@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
public class AssistantModule extends AbstractModule {
public void configure() {
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.assistant.suggest;
import gnu.trove.list.array.TIntArrayList;
import org.jetbrains.annotations.NotNull;
import java.util.*;
@@ -434,7 +435,7 @@ public class PrefixSearchStructure {
/**
* Class representing a suggested completion.
*/
public static class ScoredSuggestion {
public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
private final String word;
private final int score;
@@ -455,5 +456,10 @@ public class PrefixSearchStructure {
public String toString() {
return word + " (" + score + ")";
}
@Override
public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
return Integer.compare(this.score, o.score);
}
}
}

View File

@@ -2,8 +2,6 @@ package nu.marginalia.assistant.suggest;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.functions.math.dict.SpellChecker;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -13,35 +11,27 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.*;
import java.util.zip.GZIPInputStream;
public class Suggestions {
private PrefixSearchStructure searchStructure = null;
private TermFrequencyDict termFrequencyDict = null;
private volatile boolean ready = false;
private final SpellChecker spellChecker;
List<PrefixSearchStructure> searchStructures = new ArrayList<>();
private volatile boolean ready = false;
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
private static final int MIN_SUGGEST_LENGTH = 3;
@Inject
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
SpellChecker spellChecker,
TermFrequencyDict dict
public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
@Named("suggestions-file2") Path suggestionsFile2
) {
this.spellChecker = spellChecker;
Thread.ofPlatform().start(() -> {
searchStructure = loadSuggestions(suggestionsFile);
termFrequencyDict = dict;
searchStructures.add(loadSuggestions(suggestionsFile1));
searchStructures.add(loadSuggestions(suggestionsFile2));
ready = true;
logger.info("Loaded {} suggestions", searchStructure.size());
logger.info("Loaded suggestions");
});
}
@@ -55,8 +45,8 @@ public class Suggestions {
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
String[] parts = StringUtils.split(line, " ", 2);
String line = scanner.nextLine().trim();
String[] parts = StringUtils.split(line, " ,", 2);
if (parts.length != 2) {
logger.warn("Invalid suggestion line: {}", line);
continue;
@@ -64,7 +54,30 @@ public class Suggestions {
int cnt = Integer.parseInt(parts[0]);
if (cnt > 1) {
String word = parts[1];
ret.insert(word, cnt);
// Remove quotes and trailing periods if this is a CSV
if (word.startsWith("\"") && word.endsWith("\"")) {
word = word.substring(1, word.length() - 1);
}
// Remove trailing periods
while (word.endsWith(".")) {
word = word.substring(0, word.length() - 1);
}
// Remove junk items we may have gotten from link extraction
if (word.startsWith("click here"))
continue;
if (word.contains("new window"))
continue;
if (word.contains("click to"))
continue;
if (word.startsWith("share "))
continue;
if (word.length() > 3) {
ret.insert(word, cnt);
}
}
}
return ret;
@@ -96,10 +109,22 @@ public class Suggestions {
return List.of();
}
var results = searchStructure.getTopCompletions(prefix, count);
List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
for (var searchStructure : searchStructures) {
resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
}
resultsAll.sort(Comparator.reverseOrder());
List<String> ret = new ArrayList<>(count);
for (var result : results) {
ret.add(result.getWord());
Set<String> seen = new HashSet<>();
for (var result : resultsAll) {
if (seen.add(result.getWord())) {
ret.add(result.getWord());
}
if (ret.size() >= count) {
break;
}
}
return ret;

View File

@@ -64,6 +64,11 @@ public class ControlMain extends MainClass {
download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
}
Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
if (!Files.exists(altSuggestionsFile)) {
download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
}
Path asnRawData = dataPath.resolve("asn-data-raw-table");
if (!Files.exists(asnRawData)) {
download(asnRawData, new URI("https://thyme.apnic.net/current/data-raw-table"));

View File

@@ -1,4 +1,6 @@
## This is a token file for automatic deployment
## This is a token file for triggering automatic deployment when no commit is made.
2025-01-08: Deploy executor.
2025-01-07: Deploy executor.
2025-01-07: Deploy executor.
2025-04-24: Deploy executor.
2025-04-24: Deploy assistant.