mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
4 Commits
deploy-014
...
deploy-014
Author | SHA1 | Date | |
---|---|---|---|
|
77f727a5ba | ||
|
667cfb53dc | ||
|
fe36d4ed20 | ||
|
acf4bef98d |
@@ -229,13 +229,15 @@ public class FeedFetcherService {
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
;
|
||||
|
||||
if (ifModifiedSinceDate != null) {
|
||||
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||
// though since there are certain idiosyncrasies in server implementations,
|
||||
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
} else if (ifModifiedSinceDate != null) {
|
||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
||||
}
|
||||
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
}
|
||||
|
||||
HttpRequest getRequest = requestBuilder.build();
|
||||
|
||||
|
@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
|
||||
/** Paints the tags onto the request builder. */
|
||||
public void paint(HttpGet request) {
|
||||
|
||||
// Paint the ETag header if present,
|
||||
// otherwise paint the Last-Modified header
|
||||
// (but not both at the same time due to some servers not liking it)
|
||||
|
||||
if (etag != null) {
|
||||
request.addHeader("If-None-Match", etag);
|
||||
}
|
||||
|
||||
if (lastMod != null) {
|
||||
} else if (lastMod != null) {
|
||||
request.addHeader("If-Modified-Since", lastMod);
|
||||
}
|
||||
}
|
||||
|
@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
|
||||
|
||||
public class AssistantModule extends AbstractModule {
|
||||
public void configure() {
|
||||
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.assistant.suggest;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@@ -434,7 +435,7 @@ public class PrefixSearchStructure {
|
||||
/**
|
||||
* Class representing a suggested completion.
|
||||
*/
|
||||
public static class ScoredSuggestion {
|
||||
public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
|
||||
private final String word;
|
||||
private final int score;
|
||||
|
||||
@@ -455,5 +456,10 @@ public class PrefixSearchStructure {
|
||||
public String toString() {
|
||||
return word + " (" + score + ")";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
|
||||
return Integer.compare(this.score, o.score);
|
||||
}
|
||||
}
|
||||
}
|
@@ -2,8 +2,6 @@ package nu.marginalia.assistant.suggest;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.functions.math.dict.SpellChecker;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -13,35 +11,27 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Scanner;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
public class Suggestions {
|
||||
private PrefixSearchStructure searchStructure = null;
|
||||
private TermFrequencyDict termFrequencyDict = null;
|
||||
private volatile boolean ready = false;
|
||||
private final SpellChecker spellChecker;
|
||||
List<PrefixSearchStructure> searchStructures = new ArrayList<>();
|
||||
|
||||
private volatile boolean ready = false;
|
||||
|
||||
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
|
||||
private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
|
||||
|
||||
private static final int MIN_SUGGEST_LENGTH = 3;
|
||||
@Inject
|
||||
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
|
||||
SpellChecker spellChecker,
|
||||
TermFrequencyDict dict
|
||||
public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
|
||||
@Named("suggestions-file2") Path suggestionsFile2
|
||||
) {
|
||||
this.spellChecker = spellChecker;
|
||||
|
||||
Thread.ofPlatform().start(() -> {
|
||||
searchStructure = loadSuggestions(suggestionsFile);
|
||||
termFrequencyDict = dict;
|
||||
searchStructures.add(loadSuggestions(suggestionsFile1));
|
||||
searchStructures.add(loadSuggestions(suggestionsFile2));
|
||||
ready = true;
|
||||
logger.info("Loaded {} suggestions", searchStructure.size());
|
||||
logger.info("Loaded suggestions");
|
||||
});
|
||||
}
|
||||
|
||||
@@ -55,8 +45,8 @@ public class Suggestions {
|
||||
|
||||
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
|
||||
while (scanner.hasNextLine()) {
|
||||
String line = scanner.nextLine();
|
||||
String[] parts = StringUtils.split(line, " ", 2);
|
||||
String line = scanner.nextLine().trim();
|
||||
String[] parts = StringUtils.split(line, " ,", 2);
|
||||
if (parts.length != 2) {
|
||||
logger.warn("Invalid suggestion line: {}", line);
|
||||
continue;
|
||||
@@ -64,7 +54,30 @@ public class Suggestions {
|
||||
int cnt = Integer.parseInt(parts[0]);
|
||||
if (cnt > 1) {
|
||||
String word = parts[1];
|
||||
ret.insert(word, cnt);
|
||||
|
||||
// Remove quotes and trailing periods if this is a CSV
|
||||
if (word.startsWith("\"") && word.endsWith("\"")) {
|
||||
word = word.substring(1, word.length() - 1);
|
||||
}
|
||||
|
||||
// Remove trailing periods
|
||||
while (word.endsWith(".")) {
|
||||
word = word.substring(0, word.length() - 1);
|
||||
}
|
||||
|
||||
// Remove junk items we may have gotten from link extraction
|
||||
if (word.startsWith("click here"))
|
||||
continue;
|
||||
if (word.contains("new window"))
|
||||
continue;
|
||||
if (word.contains("click to"))
|
||||
continue;
|
||||
if (word.startsWith("share "))
|
||||
continue;
|
||||
|
||||
if (word.length() > 3) {
|
||||
ret.insert(word, cnt);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
@@ -96,10 +109,22 @@ public class Suggestions {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
var results = searchStructure.getTopCompletions(prefix, count);
|
||||
List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
|
||||
|
||||
for (var searchStructure : searchStructures) {
|
||||
resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
|
||||
}
|
||||
resultsAll.sort(Comparator.reverseOrder());
|
||||
List<String> ret = new ArrayList<>(count);
|
||||
for (var result : results) {
|
||||
ret.add(result.getWord());
|
||||
|
||||
Set<String> seen = new HashSet<>();
|
||||
for (var result : resultsAll) {
|
||||
if (seen.add(result.getWord())) {
|
||||
ret.add(result.getWord());
|
||||
}
|
||||
if (ret.size() >= count) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@@ -64,6 +64,11 @@ public class ControlMain extends MainClass {
|
||||
download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
|
||||
}
|
||||
|
||||
Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
|
||||
if (!Files.exists(altSuggestionsFile)) {
|
||||
download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
|
||||
}
|
||||
|
||||
Path asnRawData = dataPath.resolve("asn-data-raw-table");
|
||||
if (!Files.exists(asnRawData)) {
|
||||
download(asnRawData, new URI("https://thyme.apnic.net/current/data-raw-table"));
|
||||
|
@@ -1,4 +1,5 @@
|
||||
## This is a token file for automatic deployment
|
||||
|
||||
2025-01-08: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
||||
2025-04-24: Deploy executor.
|
Reference in New Issue
Block a user