1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

13 Commits

Author SHA1 Message Date
Viktor Lofgren
1d693f0efa (build) Upgrade JIB to 3.4.5 2025-04-30 15:26:52 +02:00
Viktor Lofgren
5874a163dc (build) Upgrade gradle to 8.14 2025-04-30 15:26:37 +02:00
Viktor Lofgren
5ec7a1deab (crawler) Fix 80%-ish progress crawler stall
Since the crawl tasks are started in two phases, first when generating them in one loop, and then in a second loop that drains the task list; if the first loop contains a long-running crawl task that is triggered late, the rest of the crawl may halt until that task is finished.

Fixed the problem by draining and re-trying also in the first loop.
2025-04-29 12:23:51 +02:00
Viktor Lofgren
7fea2808ed (search) Fix error view
Fix rendering error when query was null

Fix border on error message.
2025-04-27 12:12:56 +02:00
Viktor Lofgren
8da74484f0 (search) Remove unused count modifier from the footer help 2025-04-27 12:08:34 +02:00
Viktor Lofgren
923d5a7234 (search) Add a note for TUI users pointing them to the old UI 2025-04-27 11:52:07 +02:00
Viktor Lofgren
58f88749b8 (deploy) assistant 2025-04-25 13:25:50 +02:00
Viktor Lofgren
77f727a5ba (crawler) Alter conditional request logic to avoid sending both If-None-Match and If-Modified-Since
It seems like some servers dislike this combination, and may turn a 304 into a 200.
2025-04-25 13:19:07 +02:00
Viktor Lofgren
667cfb53dc (assistant) Remove more link text junk from suggestions at loadtime. 2025-04-24 13:35:29 +02:00
Viktor Lofgren
fe36d4ed20 (deploy) Executor services 2025-04-24 13:23:51 +02:00
Viktor Lofgren
acf4bef98d (assistant) Improve search suggestions
Improve suggestions by loading a secondary suggestions set with link text data.
2025-04-24 13:10:59 +02:00
Viktor Lofgren
2a737c34bb (search) Improve suggestions UX
Fix the highlight colors when arrowing through search suggestions.  Also fix the suggestions box for dark mode.
2025-04-24 12:34:05 +02:00
Viktor Lofgren
90a577af82 (search) Improve suggestions UX 2025-04-24 00:32:25 +02:00
28 changed files with 126 additions and 75 deletions

View File

@@ -5,7 +5,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
}
group 'marginalia'
@@ -47,7 +47,7 @@ ext {
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
dockerImageTag='latest'
dockerImageRegistry='marginalia'
jibVersion = '3.4.4'
jibVersion = '3.4.5'
}
idea {

View File

@@ -229,13 +229,15 @@ public class FeedFetcherService {
.timeout(Duration.ofSeconds(15))
;
if (ifModifiedSinceDate != null) {
// Set the If-Modified-Since or If-None-Match headers if we have them
// though since there are certain idiosyncrasies in server implementations,
// we avoid setting both at the same time as that may turn a 304 into a 200.
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
} else if (ifModifiedSinceDate != null) {
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
}
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
}
HttpRequest getRequest = requestBuilder.build();

View File

@@ -264,17 +264,16 @@ public class CrawlerMain extends ProcessMainClass {
if (workLog.isJobFinished(crawlSpec.domain))
continue;
var task = new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
if (!trySubmitDeferredTask(task)) {
// Otherwise add to the taskList for deferred execution
// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
retryQueue.drainTo(taskList);
taskList.removeIf(this::trySubmitDeferredTask);
// Then add this new task to the taskList for deferred execution
taskList.add(task);
}
}

View File

@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
/** Paints the tags onto the request builder. */
public void paint(HttpGet request) {
// Paint the ETag header if present,
// otherwise paint the Last-Modified header
// (but not both at the same time due to some servers not liking it)
if (etag != null) {
request.addHeader("If-None-Match", etag);
}
if (lastMod != null) {
} else if (lastMod != null) {
request.addHeader("If-Modified-Since", lastMod);
}
}

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -5,7 +5,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -26,4 +26,10 @@
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
</head>
</head>
<noscript>
<h1>Users of text-based browsers</h1>
<p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
as it uses fewer modern CSS tricks, and should work better than the new UI. It's functionally nearly identical, but just renders it using a different layout.</p>
<hr>
</noscript>

View File

@@ -1,9 +1,16 @@
This is a bit of a hack!
This class exists to let tailwind know we're using these classes even though they aren't visible in the code,
as we sometimes generate classes from Java code!
as we sometimes generate classes from Java code or javascript!
<i class="text-blue-800 bg-blue-50 dark:text-blue-200 dark:bg-blue-950"></i>
<i class="text-green-800 bg-green-50 dark:text-green-200 dark:bg-green-950"></i>
<i class="text-purple-800 bg-purple-50 dark:text-purple-200 dark:bg-purple-950"></i>
<i class="text-blue-950 bg-gray-100 dark:text-blue-50 dark:bg-gray-900"></i>
<span class="hover:bg-gray-300 "></span>
<label class="suggestion group block relative">
<input type="radio" name="suggestion" class="peer hidden" checked>
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
</div>
</label>

View File

@@ -26,7 +26,7 @@
<!-- Main content -->
<main class="flex-1 p-4 max-w-2xl space-y-4">
<div class="border dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
<div class="flex space-x-3 place-items-baseline">
<i class="fa fa-circle-exclamation text-red-800"></i>
<div class="grow">${model.errorTitle()}</div>

View File

@@ -80,10 +80,6 @@
<tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
<tr><td>count&gt;10</td><td> The search term must appear in at least 10 results form the domain</td></tr>
<tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>

View File

@@ -7,7 +7,7 @@
<form class="flex-1 max-w-2xl" action="/search">
<div class="flex">
@if (query.isBlank())
@if (query != null && query.isBlank())
<%-- Add autofocus if the query is blank --%>
<input type="text"
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
@@ -27,7 +27,7 @@
id="searchInput" />
@endif
<div id="searchSuggestions" class="text-sm absolute top-2 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-200 rounded-lg shadow-lg hidden"></div>
<div aria-hidden="true" id="searchSuggestions" class="text-sm absolute top-3 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-300 rounded-lg shadow-lg hidden"></div>
<button class="px-4 py-2 bg-margeblue text-white ml-2 rounded whitespace-nowrap active:text-slate-200">
<i class="fas fa-search text-sm sm:mr-3"></i>

View File

@@ -43,13 +43,13 @@ function displaySuggestions(suggestions) {
}
suggestionsContainer.innerHTML = suggestions.map((suggestion, index) => `
<div
class="suggestion px-4 py-2 cursor-pointer hover:bg-gray-100 ${index === selectedIndex ? 'bg-blue-50' : ''}"
data-index="${index}"
>
${suggestion}
</div>
`).join('');
<label class="suggestion group block relative">
<input type="radio" name="suggestion" class="peer hidden" ${index === selectedIndex ? 'checked' : ''}>
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full" data-index="${index}">
${suggestion}
</div>
</label>
`).join('');
suggestionsContainer.classList.remove('hidden');

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
public class AssistantModule extends AbstractModule {
public void configure() {
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.assistant.suggest;
import gnu.trove.list.array.TIntArrayList;
import org.jetbrains.annotations.NotNull;
import java.util.*;
@@ -434,7 +435,7 @@ public class PrefixSearchStructure {
/**
* Class representing a suggested completion.
*/
public static class ScoredSuggestion {
public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
private final String word;
private final int score;
@@ -455,5 +456,10 @@ public class PrefixSearchStructure {
public String toString() {
return word + " (" + score + ")";
}
@Override
public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
return Integer.compare(this.score, o.score);
}
}
}

View File

@@ -2,8 +2,6 @@ package nu.marginalia.assistant.suggest;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.functions.math.dict.SpellChecker;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -13,35 +11,27 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.*;
import java.util.zip.GZIPInputStream;
public class Suggestions {
private PrefixSearchStructure searchStructure = null;
private TermFrequencyDict termFrequencyDict = null;
private volatile boolean ready = false;
private final SpellChecker spellChecker;
List<PrefixSearchStructure> searchStructures = new ArrayList<>();
private volatile boolean ready = false;
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
private static final int MIN_SUGGEST_LENGTH = 3;
@Inject
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
SpellChecker spellChecker,
TermFrequencyDict dict
public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
@Named("suggestions-file2") Path suggestionsFile2
) {
this.spellChecker = spellChecker;
Thread.ofPlatform().start(() -> {
searchStructure = loadSuggestions(suggestionsFile);
termFrequencyDict = dict;
searchStructures.add(loadSuggestions(suggestionsFile1));
searchStructures.add(loadSuggestions(suggestionsFile2));
ready = true;
logger.info("Loaded {} suggestions", searchStructure.size());
logger.info("Loaded suggestions");
});
}
@@ -55,8 +45,8 @@ public class Suggestions {
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
String[] parts = StringUtils.split(line, " ", 2);
String line = scanner.nextLine().trim();
String[] parts = StringUtils.split(line, " ,", 2);
if (parts.length != 2) {
logger.warn("Invalid suggestion line: {}", line);
continue;
@@ -64,7 +54,30 @@ public class Suggestions {
int cnt = Integer.parseInt(parts[0]);
if (cnt > 1) {
String word = parts[1];
ret.insert(word, cnt);
// Remove surrounding quotes if this is a CSV
if (word.startsWith("\"") && word.endsWith("\"")) {
word = word.substring(1, word.length() - 1);
}
// Remove trailing periods
while (word.endsWith(".")) {
word = word.substring(0, word.length() - 1);
}
// Remove junk items we may have gotten from link extraction
if (word.startsWith("click here"))
continue;
if (word.contains("new window"))
continue;
if (word.contains("click to"))
continue;
if (word.startsWith("share "))
continue;
if (word.length() > 3) {
ret.insert(word, cnt);
}
}
}
return ret;
@@ -96,10 +109,22 @@ public class Suggestions {
return List.of();
}
var results = searchStructure.getTopCompletions(prefix, count);
List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
for (var searchStructure : searchStructures) {
resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
}
resultsAll.sort(Comparator.reverseOrder());
List<String> ret = new ArrayList<>(count);
for (var result : results) {
ret.add(result.getWord());
Set<String> seen = new HashSet<>();
for (var result : resultsAll) {
if (seen.add(result.getWord())) {
ret.add(result.getWord());
}
if (ret.size() >= count) {
break;
}
}
return ret;

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -64,6 +64,11 @@ public class ControlMain extends MainClass {
download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
}
Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
if (!Files.exists(altSuggestionsFile)) {
download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
}
Path asnRawData = dataPath.resolve("asn-data-raw-table");
if (!Files.exists(asnRawData)) {
download(asnRawData, new URI("https://thyme.apnic.net/current/data-raw-table"));

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -1,4 +1,6 @@
## This is a token file for automatic deployment
## This is a token file for triggering automatic deployment when no commit is made.
2025-01-08: Deploy executor.
2025-01-07: Deploy executor.
2025-01-07: Deploy executor.
2025-04-24: Deploy executor.
2025-04-24: Deploy assistant.

View File

@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists