mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
45 Commits
deploy-003
...
deploy-006
Author | SHA1 | Date | |
---|---|---|---|
|
4342e42722 | ||
|
bc818056e6 | ||
|
de2feac238 | ||
|
1e770205a5 | ||
|
e44ecd6d69 | ||
|
5b93a0e633 | ||
|
08fb0e5efe | ||
|
bcf67782ea | ||
|
ef3f175ede | ||
|
bbe4b5d9fd | ||
|
c67a635103 | ||
|
20b24133fb | ||
|
f2567677e8 | ||
|
bc2c2061f2 | ||
|
1c7f5a31a5 | ||
|
59a8ea60f7 | ||
|
aa9b1244ea | ||
|
2d17233366 | ||
|
b245cc9f38 | ||
|
6614d05bdf | ||
|
55aeb03c4a | ||
|
faa589962f | ||
|
c7edd6b39f | ||
|
79da622e3b | ||
|
3da8337ba6 | ||
|
a32d230f0a | ||
|
3772bfd387 | ||
|
02a7900d1a | ||
|
a1fb92468f | ||
|
b7f0a2a98e | ||
|
5fb76b2e79 | ||
|
ad8c97f342 | ||
|
dc1b6373eb | ||
|
983d6d067c | ||
|
a84a06975c | ||
|
d2864c13ec | ||
|
03ba53ce51 | ||
|
d4a6684931 | ||
|
6f0485287a | ||
|
59e2dd4c26 | ||
|
ca1807caae | ||
|
26c20e18ac | ||
|
7c90b6b414 | ||
|
b63c54c4ce | ||
|
fecd2f4ec3 |
39
ROADMAP.md
39
ROADMAP.md
@@ -1,4 +1,4 @@
|
||||
# Roadmap 2024-2025
|
||||
# Roadmap 2025
|
||||
|
||||
This is a roadmap with major features planned for Marginalia Search.
|
||||
|
||||
@@ -30,12 +30,6 @@ Retaining the ability to independently crawl the web is still strongly desirable
|
||||
The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
|
||||
combined with a naive Bayesian filter would go a long way, or something more sophisticated...?
|
||||
|
||||
## Web Design Overhaul
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
||||
In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
|
||||
|
||||
## Additional Language Support
|
||||
|
||||
It would be desirable if the search engine supported more languages than English. This is partially about
|
||||
@@ -62,8 +56,31 @@ filter for any API consumer.
|
||||
|
||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||
|
||||
## Show favicons next to search results
|
||||
|
||||
Showing favicons is something users expect from search engines. A basic proof-of-concept sketch of fetching this data has been done, but the feature is still some way from being a reality.
|
||||
|
||||
## Specialized crawler for github
|
||||
|
||||
One of the search engine's biggest limitations right now is that it does not index GitHub at all. A specialized crawler that fetches at least the README.md would go a long way toward providing search capabilities in this domain.
|
||||
|
||||
# Completed
|
||||
|
||||
## Web Design Overhaul (COMPLETED 2025-01)
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
||||
PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
|
||||
|
||||
## Finalize RSS support (COMPLETED 2024-11)
|
||||
|
||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||
it should be extended to all domains. It would also be interesting to offer search of the
|
||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||
main dataset.
|
||||
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||
|
||||
## Proper Position Index (COMPLETED 2024-09)
|
||||
|
||||
The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
|
||||
@@ -76,11 +93,3 @@ list, as is the civilized way of doing this.
|
||||
|
||||
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
||||
|
||||
## Finalize RSS support (COMPLETED 2024-11)
|
||||
|
||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||
it should be extended to all domains. It would also be interesting to offer search of the
|
||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||
main dataset.
|
||||
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||
|
@@ -47,7 +47,7 @@ ext {
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
jibVersion = '3.4.4'
|
||||
|
||||
}
|
||||
|
||||
|
@@ -20,7 +20,10 @@ public class DbDomainQueries {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||
|
||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
@Inject
|
||||
public DbDomainQueries(HikariDataSource dataSource)
|
||||
@@ -30,16 +33,21 @@ public class DbDomainQueries {
|
||||
|
||||
|
||||
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try {
|
||||
return domainIdCache.get(domain, () -> {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
|
||||
stmt.setString(1, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
@@ -49,9 +57,6 @@ public class DbDomainQueries {
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||
@@ -84,46 +89,60 @@ public class DbDomainQueries {
|
||||
}
|
||||
|
||||
public Optional<EdgeDomain> getDomain(int id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
EdgeDomain existing = domainNameCache.getIfPresent(id);
|
||||
if (existing != null) {
|
||||
return Optional.of(existing);
|
||||
}
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, id);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeDomain(rsp.getString(1)));
|
||||
var val = new EdgeDomain(rsp.getString(1));
|
||||
domainNameCache.put(id, val);
|
||||
return Optional.of(val);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
public List<EdgeDomain> otherSubdomains(EdgeDomain domain, int cnt) {
|
||||
List<EdgeDomain> ret = new ArrayList<>();
|
||||
public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
|
||||
String topDomain = domain.topDomain;
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
|
||||
stmt.setString(1, domain.topDomain);
|
||||
stmt.setInt(2, cnt);
|
||||
return siblingsCache.get(topDomain, () -> {
|
||||
List<DomainWithNode> ret = new ArrayList<>();
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
var sibling = new EdgeDomain(rs.getString(1));
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
|
||||
stmt.setString(1, topDomain);
|
||||
stmt.setInt(2, cnt);
|
||||
|
||||
if (sibling.equals(domain))
|
||||
continue;
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
var sibling = new EdgeDomain(rs.getString(1));
|
||||
|
||||
ret.add(sibling);
|
||||
if (sibling.equals(domain))
|
||||
continue;
|
||||
|
||||
ret.add(new DomainWithNode(sibling, rs.getInt(2)));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to get domain neighbors");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to get domain neighbors");
|
||||
}
|
||||
return ret;
|
||||
});
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
|
||||
public boolean isIndexed() {
|
||||
return nodeAffinity > 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,118 +0,0 @@
|
||||
package nu.marginalia.db;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.OptionalInt;
|
||||
|
||||
/** Class used in exporting data. This is intended to be used for a brief time
|
||||
* and then discarded, not kept around as a service.
|
||||
*/
|
||||
public class DbDomainStatsExportMultitool implements AutoCloseable {
|
||||
private final Connection connection;
|
||||
private final int nodeId;
|
||||
private final PreparedStatement knownUrlsQuery;
|
||||
private final PreparedStatement visitedUrlsQuery;
|
||||
private final PreparedStatement goodUrlsQuery;
|
||||
private final PreparedStatement domainNameToId;
|
||||
|
||||
private final PreparedStatement allDomainsQuery;
|
||||
private final PreparedStatement crawlQueueDomains;
|
||||
private final PreparedStatement indexedDomainsQuery;
|
||||
|
||||
public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
|
||||
this.connection = dataSource.getConnection();
|
||||
this.nodeId = nodeId;
|
||||
|
||||
knownUrlsQuery = connection.prepareStatement("""
|
||||
SELECT KNOWN_URLS
|
||||
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
||||
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
visitedUrlsQuery = connection.prepareStatement("""
|
||||
SELECT VISITED_URLS
|
||||
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
||||
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
goodUrlsQuery = connection.prepareStatement("""
|
||||
SELECT GOOD_URLS
|
||||
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
||||
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
domainNameToId = connection.prepareStatement("""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
allDomainsQuery = connection.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
""");
|
||||
crawlQueueDomains = connection.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM CRAWL_QUEUE
|
||||
""");
|
||||
indexedDomainsQuery = connection.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
WHERE INDEXED > 0
|
||||
""");
|
||||
}
|
||||
|
||||
public OptionalInt getVisitedUrls(String domainName) throws SQLException {
|
||||
return executeNameToIntQuery(domainName, visitedUrlsQuery);
|
||||
}
|
||||
|
||||
public OptionalInt getDomainId(String domainName) throws SQLException {
|
||||
return executeNameToIntQuery(domainName, domainNameToId);
|
||||
}
|
||||
|
||||
public List<String> getCrawlQueueDomains() throws SQLException {
|
||||
return executeListQuery(crawlQueueDomains, 100);
|
||||
}
|
||||
public List<String> getAllIndexedDomains() throws SQLException {
|
||||
return executeListQuery(indexedDomainsQuery, 100_000);
|
||||
}
|
||||
|
||||
private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
|
||||
throws SQLException {
|
||||
statement.setString(1, domainName);
|
||||
var rs = statement.executeQuery();
|
||||
|
||||
if (rs.next()) {
|
||||
return OptionalInt.of(rs.getInt(1));
|
||||
}
|
||||
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
|
||||
private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
|
||||
List<String> ret = new ArrayList<>(sizeHint);
|
||||
|
||||
var rs = statement.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
ret.add(rs.getString(1));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws SQLException {
|
||||
knownUrlsQuery.close();
|
||||
goodUrlsQuery.close();
|
||||
visitedUrlsQuery.close();
|
||||
allDomainsQuery.close();
|
||||
crawlQueueDomains.close();
|
||||
domainNameToId.close();
|
||||
connection.close();
|
||||
}
|
||||
}
|
@@ -83,6 +83,11 @@ public class QueryParams {
|
||||
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||
return param.startsWith("project=") || param.startsWith("story=");
|
||||
}
|
||||
|
||||
// www.perseus.tufts.edu:
|
||||
if (param.startsWith("collection=")) return true;
|
||||
if (param.startsWith("doc=")) return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
|
||||
config.addDataSourceProperty("prepStmtCacheSize", "250");
|
||||
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
|
||||
|
||||
config.setMaximumPoolSize(5);
|
||||
config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
|
||||
config.setMinimumIdle(2);
|
||||
|
||||
config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
|
||||
|
@@ -29,6 +29,7 @@ dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation project(':third-party:rssreader')
|
||||
implementation libs.opencsv
|
||||
implementation libs.slop
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
|
@@ -15,7 +15,9 @@ import java.util.Map;
|
||||
|
||||
/** Client for local browserless.io API */
|
||||
public class BrowserlessClient implements AutoCloseable {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
||||
private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");
|
||||
|
||||
private final HttpClient httpClient = HttpClient.newBuilder()
|
||||
.version(HttpClient.Version.HTTP_1_1)
|
||||
@@ -36,7 +38,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(browserlessURI.resolve("/content"))
|
||||
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
@@ -63,7 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(browserlessURI.resolve("/screenshot"))
|
||||
.uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.rss.model;
|
||||
|
||||
import com.apptasticsoftware.rssreader.Item;
|
||||
import nu.marginalia.rss.svc.SimpleFeedParser;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.Jsoup;
|
||||
@@ -18,37 +18,33 @@ public record FeedItem(String title,
|
||||
public static final int MAX_DESC_LENGTH = 255;
|
||||
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
|
||||
|
||||
public static FeedItem fromItem(Item item, boolean keepFragment) {
|
||||
String title = item.getTitle().orElse("");
|
||||
public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
|
||||
String title = item.title();
|
||||
String date = getItemDate(item);
|
||||
String description = getItemDescription(item);
|
||||
String url;
|
||||
|
||||
if (keepFragment || item.getLink().isEmpty()) {
|
||||
url = item.getLink().orElse("");
|
||||
if (keepFragment) {
|
||||
url = item.url();
|
||||
}
|
||||
else {
|
||||
try {
|
||||
String link = item.getLink().get();
|
||||
String link = item.url();
|
||||
var linkUri = new URI(link);
|
||||
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
|
||||
url = cleanUri.toString();
|
||||
}
|
||||
catch (Exception e) {
|
||||
// fallback to original link if we can't clean it, this is not a very important step
|
||||
url = item.getLink().get();
|
||||
url = item.url();
|
||||
}
|
||||
}
|
||||
|
||||
return new FeedItem(title, date, description, url);
|
||||
}
|
||||
|
||||
private static String getItemDescription(Item item) {
|
||||
Optional<String> description = item.getDescription();
|
||||
if (description.isEmpty())
|
||||
return "";
|
||||
|
||||
String rawDescription = description.get();
|
||||
private static String getItemDescription(SimpleFeedParser.ItemData item) {
|
||||
String rawDescription = item.description();
|
||||
if (rawDescription.indexOf('<') >= 0) {
|
||||
rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
|
||||
}
|
||||
@@ -58,15 +54,18 @@ public record FeedItem(String title,
|
||||
|
||||
// e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
|
||||
private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
|
||||
private static String getItemDate(Item item) {
|
||||
private static String getItemDate(SimpleFeedParser.ItemData item) {
|
||||
Optional<ZonedDateTime> zonedDateTime = Optional.empty();
|
||||
try {
|
||||
zonedDateTime = item.getPubDateZonedDateTime();
|
||||
}
|
||||
catch (Exception e) {
|
||||
zonedDateTime = item.getPubDate()
|
||||
.map(extraFormatter::parse)
|
||||
.map(ZonedDateTime::from);
|
||||
try {
|
||||
zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
|
||||
}
|
||||
catch (Exception e2) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
|
||||
return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
|
||||
|
@@ -1,7 +1,5 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import com.apptasticsoftware.rssreader.Item;
|
||||
import com.apptasticsoftware.rssreader.RssReader;
|
||||
import com.google.inject.Inject;
|
||||
import com.opencsv.CSVReader;
|
||||
import nu.marginalia.WmsaHome;
|
||||
@@ -20,7 +18,6 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -32,7 +29,6 @@ import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.time.*;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
@@ -48,8 +44,6 @@ public class FeedFetcherService {
|
||||
private static final int MAX_FEED_ITEMS = 10;
|
||||
private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
|
||||
|
||||
private final RssReader rssReader = new RssReader();
|
||||
|
||||
private final FeedDb feedDb;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
@@ -72,17 +66,6 @@ public class FeedFetcherService {
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
this.executorClient = executorClient;
|
||||
|
||||
|
||||
// Add support for some alternate date tags for atom
|
||||
rssReader.addItemExtension("issued", this::setDateFallback);
|
||||
rssReader.addItemExtension("created", this::setDateFallback);
|
||||
}
|
||||
|
||||
private void setDateFallback(Item item, String value) {
|
||||
if (item.getPubDate().isEmpty()) {
|
||||
item.setPubDate(value);
|
||||
}
|
||||
}
|
||||
|
||||
public enum UpdateMode {
|
||||
@@ -96,6 +79,7 @@ public class FeedFetcherService {
|
||||
throw new IllegalStateException("Already updating feeds, refusing to start another update");
|
||||
}
|
||||
|
||||
|
||||
try (FeedDbWriter writer = feedDb.createWriter();
|
||||
HttpClient client = HttpClient.newBuilder()
|
||||
.connectTimeout(Duration.ofSeconds(15))
|
||||
@@ -103,6 +87,7 @@ public class FeedFetcherService {
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
FeedJournal feedJournal = FeedJournal.create();
|
||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||
) {
|
||||
updating = true;
|
||||
@@ -155,6 +140,8 @@ public class FeedFetcherService {
|
||||
case FetchResult.Success(String value, String etag) -> {
|
||||
writer.saveEtag(feed.domain(), etag);
|
||||
writer.saveFeed(parseFeed(value, feed));
|
||||
|
||||
feedJournal.record(feed.feedUrl(), value);
|
||||
}
|
||||
case FetchResult.NotModified() -> {
|
||||
writer.saveEtag(feed.domain(), ifNoneMatchTag);
|
||||
@@ -367,12 +354,7 @@ public class FeedFetcherService {
|
||||
|
||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||
try {
|
||||
feedData = sanitizeEntities(feedData);
|
||||
|
||||
List<Item> rawItems = rssReader.read(
|
||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
||||
).toList();
|
||||
List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);
|
||||
|
||||
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
||||
|
||||
@@ -395,33 +377,6 @@ public class FeedFetcherService {
|
||||
}
|
||||
}
|
||||
|
||||
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
||||
"»", "»",
|
||||
"«", "«",
|
||||
"—", "--",
|
||||
"–", "-",
|
||||
"’", "'",
|
||||
"‘", "'",
|
||||
""", "\"",
|
||||
" ", ""
|
||||
);
|
||||
|
||||
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
||||
* which is unfortunately relatively common. Replace them as far as is possible
|
||||
* with their corresponding characters
|
||||
*/
|
||||
static String sanitizeEntities(String feedData) {
|
||||
String result = feedData;
|
||||
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
||||
result = result.replace(entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
// Handle lone ampersands not part of a recognized XML entity
|
||||
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Decide whether to keep URI fragments in the feed items.
|
||||
* <p></p>
|
||||
* We keep fragments if there are multiple different fragments in the items.
|
||||
@@ -429,16 +384,16 @@ public class FeedFetcherService {
|
||||
* @param items The items to check
|
||||
* @return True if we should keep the fragments, false otherwise
|
||||
*/
|
||||
private boolean areFragmentsDisparate(List<Item> items) {
|
||||
private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
|
||||
Set<String> seenFragments = new HashSet<>();
|
||||
|
||||
try {
|
||||
for (var item : items) {
|
||||
if (item.getLink().isEmpty()) {
|
||||
if (item.url().isBlank()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var link = item.getLink().get();
|
||||
var link = item.url();
|
||||
if (!link.contains("#")) {
|
||||
continue;
|
||||
}
|
||||
|
@@ -0,0 +1,76 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.slop.SlopTable;
|
||||
import nu.marginalia.slop.column.string.StringColumn;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
|
||||
*/
|
||||
public interface FeedJournal extends AutoCloseable {
|
||||
StringColumn urlColumn = new StringColumn("url");
|
||||
StringColumn contentsColumn = new StringColumn("contents", StandardCharsets.UTF_8, StorageType.ZSTD);
|
||||
|
||||
void record(String url, String contents) throws IOException;
|
||||
void close() throws IOException;
|
||||
|
||||
|
||||
static FeedJournal create() throws IOException {
|
||||
if (Boolean.getBoolean("feedFetcher.persistJournal")) {
|
||||
Path journalPath = WmsaHome.getDataPath().resolve("feed-journal");
|
||||
if (Files.isDirectory(journalPath)) {
|
||||
FileUtils.deleteDirectory(journalPath.toFile());
|
||||
}
|
||||
Files.createDirectories(journalPath);
|
||||
return new RecordingFeedJournal(journalPath);
|
||||
}
|
||||
else {
|
||||
return new NoOpFeedJournal();
|
||||
}
|
||||
}
|
||||
|
||||
class NoOpFeedJournal implements FeedJournal {
|
||||
@Override
|
||||
public void record(String url, String contents) {}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
}
|
||||
|
||||
class RecordingFeedJournal extends SlopTable implements FeedJournal {
|
||||
|
||||
private final StringColumn.Writer urlWriter;
|
||||
private final StringColumn.Writer contentsWriter;
|
||||
|
||||
public RecordingFeedJournal(Path path) throws IOException {
|
||||
super(path, SlopTable.getNumPages(path, FeedJournal.urlColumn));
|
||||
|
||||
urlWriter = urlColumn.create(this);
|
||||
contentsWriter = contentsColumn.create(this);
|
||||
}
|
||||
|
||||
public synchronized void record(String url, String contents) throws IOException {
|
||||
urlWriter.put(url);
|
||||
contentsWriter.put(contents);
|
||||
}
|
||||
}
|
||||
|
||||
static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
|
||||
try (SlopTable table = new SlopTable(journalPath)) {
|
||||
final StringColumn.Reader urlReader = urlColumn.open(table);
|
||||
final StringColumn.Reader contentsReader = contentsColumn.open(table);
|
||||
|
||||
while (urlReader.hasRemaining()) {
|
||||
urlAndContent.accept(urlReader.get(), contentsReader.get());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,94 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import com.apptasticsoftware.rssreader.DateTimeParser;
|
||||
import com.apptasticsoftware.rssreader.util.Default;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.parser.Parser;
|
||||
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class SimpleFeedParser {
|
||||
|
||||
private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();
|
||||
|
||||
public record ItemData (
|
||||
String title,
|
||||
String description,
|
||||
String url,
|
||||
String pubDate
|
||||
) {
|
||||
public boolean isWellFormed() {
|
||||
return title != null && !title.isBlank() &&
|
||||
description != null && !description.isBlank() &&
|
||||
url != null && !url.isBlank() &&
|
||||
pubDate != null && !pubDate.isBlank();
|
||||
}
|
||||
|
||||
public Optional<ZonedDateTime> getPubDateZonedDateTime() {
|
||||
try {
|
||||
return Optional.ofNullable(dateTimeParser.parse(pubDate()));
|
||||
}
|
||||
catch (Exception e) {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static List<ItemData> parse(String content) {
|
||||
var doc = Jsoup.parse(content, Parser.xmlParser());
|
||||
List<ItemData> ret = new ArrayList<>();
|
||||
|
||||
doc.select("item, entry").forEach(element -> {
|
||||
String link = "";
|
||||
String title = "";
|
||||
String description = "";
|
||||
String pubDate = "";
|
||||
|
||||
for (String attr : List.of("title", "dc:title")) {
|
||||
if (!title.isBlank())
|
||||
break;
|
||||
var tag = element.getElementsByTag(attr).first();
|
||||
if (tag != null) {
|
||||
title = tag.text();
|
||||
}
|
||||
}
|
||||
|
||||
for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
|
||||
if (!description.isBlank())
|
||||
break;
|
||||
var tag = element.getElementsByTag(attr).first();
|
||||
if (tag != null) {
|
||||
description = tag.text();
|
||||
}
|
||||
}
|
||||
|
||||
for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
|
||||
if (!pubDate.isBlank())
|
||||
break;
|
||||
var tag = element.getElementsByTag(attr).first();
|
||||
if (tag != null) {
|
||||
pubDate = tag.text();
|
||||
}
|
||||
}
|
||||
|
||||
for (String attr : List.of("link", "url")) {
|
||||
if (!link.isBlank())
|
||||
break;
|
||||
var tag = element.getElementsByTag(attr).first();
|
||||
if (tag != null) {
|
||||
link = tag.text();
|
||||
}
|
||||
}
|
||||
|
||||
ret.add(new ItemData(title, description, link, pubDate));
|
||||
});
|
||||
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
@@ -2,16 +2,21 @@ package nu.marginalia.livecapture;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import org.testcontainers.utility.DockerImageName;
|
||||
|
||||
import java.net.URI;
|
||||
import java.util.Map;
|
||||
|
||||
@Testcontainers
|
||||
@Tag("slow")
|
||||
public class BrowserlessClientTest {
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
||||
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||
.withExposedPorts(3000);
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() {
|
||||
|
@@ -1,50 +0,0 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import com.apptasticsoftware.rssreader.Item;
|
||||
import com.apptasticsoftware.rssreader.RssReader;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class TestXmlSanitization {
|
||||
|
||||
@Test
|
||||
public void testPreservedEntities() {
|
||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNlnetTitleTag() {
|
||||
// The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
|
||||
|
||||
// Verify we're able to consume and strip out the HTML tags
|
||||
RssReader r = new RssReader();
|
||||
|
||||
List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();
|
||||
|
||||
Assertions.assertEquals(1, items.size());
|
||||
for (var item : items) {
|
||||
Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStrayAmpersand() {
|
||||
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTranslatedHtmlEntity() {
|
||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTranslatedHtmlEntityQuot() {
|
||||
Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities(""Bob""));
|
||||
}
|
||||
}
|
@@ -2,9 +2,6 @@ package nu.marginalia.api.searchquery;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
|
||||
@@ -27,37 +24,19 @@ public class IndexProtobufCodec {
|
||||
.build();
|
||||
}
|
||||
|
||||
public static QueryLimits convertQueryLimits(RpcQueryLimits queryLimits) {
|
||||
return new QueryLimits(
|
||||
queryLimits.getResultsByDomain(),
|
||||
queryLimits.getResultsTotal(),
|
||||
queryLimits.getTimeoutMs(),
|
||||
queryLimits.getFetchSize()
|
||||
);
|
||||
}
|
||||
|
||||
public static RpcQueryLimits convertQueryLimits(QueryLimits queryLimits) {
|
||||
return RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(queryLimits.resultsByDomain())
|
||||
.setResultsTotal(queryLimits.resultsTotal())
|
||||
.setTimeoutMs(queryLimits.timeoutMs())
|
||||
.setFetchSize(queryLimits.fetchSize())
|
||||
.build();
|
||||
}
|
||||
|
||||
public static SearchQuery convertRpcQuery(RpcQuery query) {
|
||||
List<SearchPhraseConstraint> phraeConstraints = new ArrayList<>();
|
||||
List<SearchPhraseConstraint> phraseConstraints = new ArrayList<>();
|
||||
|
||||
for (int j = 0; j < query.getPhrasesCount(); j++) {
|
||||
var coh = query.getPhrases(j);
|
||||
if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
|
||||
phraseConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
|
||||
phraseConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else if (coh.getType() == RpcPhrases.TYPE.FULL) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
|
||||
phraseConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
|
||||
@@ -70,7 +49,7 @@ public class IndexProtobufCodec {
|
||||
query.getExcludeList(),
|
||||
query.getAdviceList(),
|
||||
query.getPriorityList(),
|
||||
phraeConstraints
|
||||
phraseConstraints
|
||||
);
|
||||
}
|
||||
|
||||
@@ -103,60 +82,4 @@ public class IndexProtobufCodec {
|
||||
return subqueryBuilder.build();
|
||||
}
|
||||
|
||||
public static ResultRankingParameters convertRankingParameterss(RpcResultRankingParameters params) {
|
||||
if (params == null)
|
||||
return ResultRankingParameters.sensibleDefaults();
|
||||
|
||||
return new ResultRankingParameters(
|
||||
new Bm25Parameters(params.getBm25K(), params.getBm25B()),
|
||||
params.getShortDocumentThreshold(),
|
||||
params.getShortDocumentPenalty(),
|
||||
params.getDomainRankBonus(),
|
||||
params.getQualityPenalty(),
|
||||
params.getShortSentenceThreshold(),
|
||||
params.getShortSentencePenalty(),
|
||||
params.getBm25Weight(),
|
||||
params.getTcfFirstPositionWeight(),
|
||||
params.getTcfVerbatimWeight(),
|
||||
params.getTcfProximityWeight(),
|
||||
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
|
||||
params.getTemporalBiasWeight(),
|
||||
params.getExportDebugData()
|
||||
);
|
||||
}
|
||||
|
||||
public static RpcResultRankingParameters convertRankingParameterss(ResultRankingParameters rankingParams,
|
||||
RpcTemporalBias temporalBias)
|
||||
{
|
||||
if (rankingParams == null) {
|
||||
rankingParams = ResultRankingParameters.sensibleDefaults();
|
||||
}
|
||||
|
||||
var builder = RpcResultRankingParameters.newBuilder()
|
||||
.setBm25B(rankingParams.bm25Params.b())
|
||||
.setBm25K(rankingParams.bm25Params.k())
|
||||
.setShortDocumentThreshold(rankingParams.shortDocumentThreshold)
|
||||
.setShortDocumentPenalty(rankingParams.shortDocumentPenalty)
|
||||
.setDomainRankBonus(rankingParams.domainRankBonus)
|
||||
.setQualityPenalty(rankingParams.qualityPenalty)
|
||||
.setShortSentenceThreshold(rankingParams.shortSentenceThreshold)
|
||||
.setShortSentencePenalty(rankingParams.shortSentencePenalty)
|
||||
.setBm25Weight(rankingParams.bm25Weight)
|
||||
.setTcfFirstPositionWeight(rankingParams.tcfFirstPosition)
|
||||
.setTcfProximityWeight(rankingParams.tcfProximity)
|
||||
.setTcfVerbatimWeight(rankingParams.tcfVerbatim)
|
||||
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
|
||||
.setExportDebugData(rankingParams.exportDebugData);
|
||||
|
||||
if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) {
|
||||
builder.setTemporalBias(temporalBias);
|
||||
}
|
||||
else {
|
||||
builder.setTemporalBias(RpcTemporalBias.newBuilder()
|
||||
.setBias(RpcTemporalBias.Bias.valueOf(rankingParams.temporalBias.name())));
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,7 +5,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
|
||||
@@ -37,7 +37,7 @@ public class QueryProtobufCodec {
|
||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||
builder.setRank(IndexProtobufCodec.convertSpecLimit(query.specs.rank));
|
||||
|
||||
builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
|
||||
builder.setQueryLimits(query.specs.queryLimits);
|
||||
|
||||
// Query strategy may be overridden by the query, but if not, use the one from the request
|
||||
if (query.specs.queryStrategy != null && query.specs.queryStrategy != QueryStrategy.AUTO)
|
||||
@@ -45,9 +45,27 @@ public class QueryProtobufCodec {
|
||||
else
|
||||
builder.setQueryStrategy(request.getQueryStrategy());
|
||||
|
||||
if (query.specs.rankingParams != null) {
|
||||
builder.setParameters(IndexProtobufCodec.convertRankingParameterss(query.specs.rankingParams, request.getTemporalBias()));
|
||||
if (request.getTemporalBias().getBias() != RpcTemporalBias.Bias.NONE) {
|
||||
if (query.specs.rankingParams != null) {
|
||||
builder.setParameters(
|
||||
RpcResultRankingParameters.newBuilder(query.specs.rankingParams)
|
||||
.setTemporalBias(request.getTemporalBias())
|
||||
.build()
|
||||
);
|
||||
} else {
|
||||
builder.setParameters(
|
||||
RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults())
|
||||
.setTemporalBias(request.getTemporalBias())
|
||||
.build()
|
||||
);
|
||||
}
|
||||
} else if (query.specs.rankingParams != null) {
|
||||
builder.setParameters(query.specs.rankingParams);
|
||||
}
|
||||
// else {
|
||||
// if we have no ranking params, we don't need to set them; the client checks for this and uses the default values,
|
||||
// so we don't need to send this huge object over the wire
|
||||
// }
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
@@ -65,18 +83,13 @@ public class QueryProtobufCodec {
|
||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||
builder.setRank(IndexProtobufCodec.convertSpecLimit(query.specs.rank));
|
||||
|
||||
builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
|
||||
builder.setQueryLimits(query.specs.queryLimits);
|
||||
|
||||
// Query strategy may be overridden by the query, but if not, use the one from the request
|
||||
builder.setQueryStrategy(query.specs.queryStrategy.name());
|
||||
|
||||
if (query.specs.rankingParams != null) {
|
||||
builder.setParameters(IndexProtobufCodec.convertRankingParameterss(
|
||||
query.specs.rankingParams,
|
||||
RpcTemporalBias.newBuilder().setBias(
|
||||
RpcTemporalBias.Bias.NONE)
|
||||
.build())
|
||||
);
|
||||
builder.setParameters(query.specs.rankingParams);
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
@@ -95,10 +108,10 @@ public class QueryProtobufCodec {
|
||||
IndexProtobufCodec.convertSpecLimit(request.getSize()),
|
||||
IndexProtobufCodec.convertSpecLimit(request.getRank()),
|
||||
request.getDomainIdsList(),
|
||||
IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()),
|
||||
request.getQueryLimits(),
|
||||
request.getSearchSetIdentifier(),
|
||||
QueryStrategy.valueOf(request.getQueryStrategy()),
|
||||
ResultRankingParameters.TemporalBias.valueOf(request.getTemporalBias().getBias().name()),
|
||||
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
|
||||
request.getPagination().getPage()
|
||||
);
|
||||
}
|
||||
@@ -294,9 +307,9 @@ public class QueryProtobufCodec {
|
||||
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
|
||||
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
|
||||
IndexProtobufCodec.convertSpecLimit(specs.getRank()),
|
||||
IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()),
|
||||
specs.getQueryLimits(),
|
||||
QueryStrategy.valueOf(specs.getQueryStrategy()),
|
||||
IndexProtobufCodec.convertRankingParameterss(specs.getParameters())
|
||||
specs.hasParameters() ? specs.getParameters() : null
|
||||
);
|
||||
}
|
||||
|
||||
@@ -307,7 +320,7 @@ public class QueryProtobufCodec {
|
||||
.addAllTacitExcludes(params.tacitExcludes())
|
||||
.addAllTacitPriority(params.tacitPriority())
|
||||
.setHumanQuery(params.humanQuery())
|
||||
.setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits()))
|
||||
.setQueryLimits(params.limits())
|
||||
.setQuality(IndexProtobufCodec.convertSpecLimit(params.quality()))
|
||||
.setYear(IndexProtobufCodec.convertSpecLimit(params.year()))
|
||||
.setSize(IndexProtobufCodec.convertSpecLimit(params.size()))
|
||||
@@ -319,7 +332,7 @@ public class QueryProtobufCodec {
|
||||
.build())
|
||||
.setPagination(RpcQsQueryPagination.newBuilder()
|
||||
.setPage(params.page())
|
||||
.setPageSize(Math.min(100, params.limits().resultsTotal()))
|
||||
.setPageSize(Math.min(100, params.limits().getResultsTotal()))
|
||||
.build());
|
||||
|
||||
if (params.nearDomain() != null)
|
||||
|
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
|
||||
@@ -21,14 +21,14 @@ public record QueryParams(
|
||||
SpecificationLimit size,
|
||||
SpecificationLimit rank,
|
||||
List<Integer> domainIds,
|
||||
QueryLimits limits,
|
||||
RpcQueryLimits limits,
|
||||
String identifier,
|
||||
QueryStrategy queryStrategy,
|
||||
ResultRankingParameters.TemporalBias temporalBias,
|
||||
RpcTemporalBias.Bias temporalBias,
|
||||
int page
|
||||
)
|
||||
{
|
||||
public QueryParams(String query, QueryLimits limits, String identifier) {
|
||||
public QueryParams(String query, RpcQueryLimits limits, String identifier) {
|
||||
this(query, null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
@@ -42,7 +42,7 @@ public record QueryParams(
|
||||
limits,
|
||||
identifier,
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
1 // page
|
||||
);
|
||||
}
|
||||
|
@@ -1,10 +1,11 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
|
||||
public class SearchSpecification {
|
||||
@@ -24,11 +25,12 @@ public class SearchSpecification {
|
||||
public SpecificationLimit size;
|
||||
public SpecificationLimit rank;
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
public final RpcQueryLimits queryLimits;
|
||||
|
||||
public final QueryStrategy queryStrategy;
|
||||
|
||||
public final ResultRankingParameters rankingParams;
|
||||
@Nullable
|
||||
public final RpcResultRankingParameters rankingParams;
|
||||
|
||||
public SearchSpecification(SearchQuery query,
|
||||
List<Integer> domains,
|
||||
@@ -38,9 +40,9 @@ public class SearchSpecification {
|
||||
SpecificationLimit year,
|
||||
SpecificationLimit size,
|
||||
SpecificationLimit rank,
|
||||
QueryLimits queryLimits,
|
||||
RpcQueryLimits queryLimits,
|
||||
QueryStrategy queryStrategy,
|
||||
ResultRankingParameters rankingParams)
|
||||
@Nullable RpcResultRankingParameters rankingParams)
|
||||
{
|
||||
this.query = query;
|
||||
this.domains = domains;
|
||||
@@ -91,7 +93,7 @@ public class SearchSpecification {
|
||||
return this.rank;
|
||||
}
|
||||
|
||||
public QueryLimits getQueryLimits() {
|
||||
public RpcQueryLimits getQueryLimits() {
|
||||
return this.queryLimits;
|
||||
}
|
||||
|
||||
@@ -99,7 +101,7 @@ public class SearchSpecification {
|
||||
return this.queryStrategy;
|
||||
}
|
||||
|
||||
public ResultRankingParameters getRankingParams() {
|
||||
public RpcResultRankingParameters getRankingParams() {
|
||||
return this.rankingParams;
|
||||
}
|
||||
|
||||
@@ -120,9 +122,9 @@ public class SearchSpecification {
|
||||
private boolean size$set;
|
||||
private SpecificationLimit rank$value;
|
||||
private boolean rank$set;
|
||||
private QueryLimits queryLimits;
|
||||
private RpcQueryLimits queryLimits;
|
||||
private QueryStrategy queryStrategy;
|
||||
private ResultRankingParameters rankingParams;
|
||||
private RpcResultRankingParameters rankingParams;
|
||||
|
||||
SearchSpecificationBuilder() {
|
||||
}
|
||||
@@ -171,7 +173,7 @@ public class SearchSpecification {
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder queryLimits(QueryLimits queryLimits) {
|
||||
public SearchSpecificationBuilder queryLimits(RpcQueryLimits queryLimits) {
|
||||
this.queryLimits = queryLimits;
|
||||
return this;
|
||||
}
|
||||
@@ -181,7 +183,7 @@ public class SearchSpecification {
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder rankingParams(ResultRankingParameters rankingParams) {
|
||||
public SearchSpecificationBuilder rankingParams(RpcResultRankingParameters rankingParams) {
|
||||
this.rankingParams = rankingParams;
|
||||
return this;
|
||||
}
|
||||
|
@@ -0,0 +1,33 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
|
||||
public class PrototypeRankingParameters {
|
||||
|
||||
/** These are the default ranking parameters that are used when no parameters are specified. */
|
||||
|
||||
private static final RpcResultRankingParameters _sensibleDefaults = RpcResultRankingParameters.newBuilder()
|
||||
.setBm25B(0.5)
|
||||
.setBm25K(1.2)
|
||||
.setShortDocumentThreshold(2000)
|
||||
.setShortDocumentPenalty(2.)
|
||||
.setDomainRankBonus(1 / 100.)
|
||||
.setQualityPenalty(1 / 15.)
|
||||
.setShortSentenceThreshold(2)
|
||||
.setShortSentencePenalty(5)
|
||||
.setBm25Weight(1.)
|
||||
.setTcfVerbatimWeight(1.)
|
||||
.setTcfProximityWeight(1.)
|
||||
.setTcfFirstPositionWeight(5)
|
||||
.setTemporalBias(RpcTemporalBias.newBuilder().setBias(RpcTemporalBias.Bias.NONE))
|
||||
.setTemporalBiasWeight(5.0)
|
||||
.setExportDebugData(false)
|
||||
.setDisablePenalties(false)
|
||||
.build();
|
||||
|
||||
public static RpcResultRankingParameters sensibleDefaults() {
|
||||
return _sensibleDefaults;
|
||||
}
|
||||
|
||||
}
|
@@ -1,12 +1,13 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
public class ResultRankingContext {
|
||||
private final int docCount;
|
||||
public final ResultRankingParameters params;
|
||||
public final RpcResultRankingParameters params;
|
||||
|
||||
|
||||
public final BitSet regularMask;
|
||||
@@ -21,7 +22,7 @@ public class ResultRankingContext {
|
||||
public final CqDataInt priorityCounts;
|
||||
|
||||
public ResultRankingContext(int docCount,
|
||||
ResultRankingParameters params,
|
||||
RpcResultRankingParameters params,
|
||||
BitSet ngramsMask,
|
||||
BitSet regularMask,
|
||||
CqDataInt fullCounts,
|
||||
|
@@ -1,278 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class ResultRankingParameters {
|
||||
|
||||
/**
|
||||
* Tuning for BM25 when applied to full document matches
|
||||
*/
|
||||
public final Bm25Parameters bm25Params;
|
||||
|
||||
/**
|
||||
* Documents below this length are penalized
|
||||
*/
|
||||
public int shortDocumentThreshold;
|
||||
|
||||
public double shortDocumentPenalty;
|
||||
|
||||
|
||||
/**
|
||||
* Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good)
|
||||
*/
|
||||
public double domainRankBonus;
|
||||
|
||||
/**
|
||||
* Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad)
|
||||
*/
|
||||
public double qualityPenalty;
|
||||
|
||||
/**
|
||||
* Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want
|
||||
*/
|
||||
public int shortSentenceThreshold;
|
||||
|
||||
/**
|
||||
* Magnitude of penalty for documents with low average sentence length
|
||||
*/
|
||||
public double shortSentencePenalty;
|
||||
|
||||
public double bm25Weight;
|
||||
public double tcfFirstPosition;
|
||||
public double tcfVerbatim;
|
||||
public double tcfProximity;
|
||||
|
||||
public TemporalBias temporalBias;
|
||||
public double temporalBiasWeight;
|
||||
|
||||
public boolean exportDebugData;
|
||||
|
||||
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
|
||||
this.bm25Params = bm25Params;
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
this.domainRankBonus = domainRankBonus;
|
||||
this.qualityPenalty = qualityPenalty;
|
||||
this.shortSentenceThreshold = shortSentenceThreshold;
|
||||
this.shortSentencePenalty = shortSentencePenalty;
|
||||
this.bm25Weight = bm25Weight;
|
||||
this.tcfFirstPosition = tcfFirstPosition;
|
||||
this.tcfVerbatim = tcfVerbatim;
|
||||
this.tcfProximity = tcfProximity;
|
||||
this.temporalBias = temporalBias;
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
this.exportDebugData = exportDebugData;
|
||||
}
|
||||
|
||||
public static ResultRankingParameters sensibleDefaults() {
|
||||
return builder()
|
||||
.bm25Params(new Bm25Parameters(1.2, 0.5))
|
||||
.shortDocumentThreshold(2000)
|
||||
.shortDocumentPenalty(2.)
|
||||
.domainRankBonus(1 / 100.)
|
||||
.qualityPenalty(1 / 15.)
|
||||
.shortSentenceThreshold(2)
|
||||
.shortSentencePenalty(5)
|
||||
.bm25Weight(1.)
|
||||
.tcfVerbatim(1.)
|
||||
.tcfProximity(1.)
|
||||
.tcfFirstPosition(5)
|
||||
.temporalBias(TemporalBias.NONE)
|
||||
.temporalBiasWeight(5.0)
|
||||
.exportDebugData(false)
|
||||
.build();
|
||||
}
|
||||
|
||||
public static ResultRankingParametersBuilder builder() {
|
||||
return new ResultRankingParametersBuilder();
|
||||
}
|
||||
|
||||
public Bm25Parameters getBm25Params() {
|
||||
return this.bm25Params;
|
||||
}
|
||||
|
||||
public int getShortDocumentThreshold() {
|
||||
return this.shortDocumentThreshold;
|
||||
}
|
||||
|
||||
public double getShortDocumentPenalty() {
|
||||
return this.shortDocumentPenalty;
|
||||
}
|
||||
|
||||
public double getDomainRankBonus() {
|
||||
return this.domainRankBonus;
|
||||
}
|
||||
|
||||
public double getQualityPenalty() {
|
||||
return this.qualityPenalty;
|
||||
}
|
||||
|
||||
public int getShortSentenceThreshold() {
|
||||
return this.shortSentenceThreshold;
|
||||
}
|
||||
|
||||
public double getShortSentencePenalty() {
|
||||
return this.shortSentencePenalty;
|
||||
}
|
||||
|
||||
public double getBm25Weight() {
|
||||
return this.bm25Weight;
|
||||
}
|
||||
|
||||
public double getTcfFirstPosition() {
|
||||
return this.tcfFirstPosition;
|
||||
}
|
||||
|
||||
public double getTcfVerbatim() {
|
||||
return this.tcfVerbatim;
|
||||
}
|
||||
|
||||
public double getTcfProximity() {
|
||||
return this.tcfProximity;
|
||||
}
|
||||
|
||||
public TemporalBias getTemporalBias() {
|
||||
return this.temporalBias;
|
||||
}
|
||||
|
||||
public double getTemporalBiasWeight() {
|
||||
return this.temporalBiasWeight;
|
||||
}
|
||||
|
||||
public boolean isExportDebugData() {
|
||||
return this.exportDebugData;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof ResultRankingParameters that)) return false;
|
||||
|
||||
return shortDocumentThreshold == that.shortDocumentThreshold && Double.compare(shortDocumentPenalty, that.shortDocumentPenalty) == 0 && Double.compare(domainRankBonus, that.domainRankBonus) == 0 && Double.compare(qualityPenalty, that.qualityPenalty) == 0 && shortSentenceThreshold == that.shortSentenceThreshold && Double.compare(shortSentencePenalty, that.shortSentencePenalty) == 0 && Double.compare(bm25Weight, that.bm25Weight) == 0 && Double.compare(tcfFirstPosition, that.tcfFirstPosition) == 0 && Double.compare(tcfVerbatim, that.tcfVerbatim) == 0 && Double.compare(tcfProximity, that.tcfProximity) == 0 && Double.compare(temporalBiasWeight, that.temporalBiasWeight) == 0 && exportDebugData == that.exportDebugData && Objects.equals(bm25Params, that.bm25Params) && temporalBias == that.temporalBias;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = Objects.hashCode(bm25Params);
|
||||
result = 31 * result + shortDocumentThreshold;
|
||||
result = 31 * result + Double.hashCode(shortDocumentPenalty);
|
||||
result = 31 * result + Double.hashCode(domainRankBonus);
|
||||
result = 31 * result + Double.hashCode(qualityPenalty);
|
||||
result = 31 * result + shortSentenceThreshold;
|
||||
result = 31 * result + Double.hashCode(shortSentencePenalty);
|
||||
result = 31 * result + Double.hashCode(bm25Weight);
|
||||
result = 31 * result + Double.hashCode(tcfFirstPosition);
|
||||
result = 31 * result + Double.hashCode(tcfVerbatim);
|
||||
result = 31 * result + Double.hashCode(tcfProximity);
|
||||
result = 31 * result + Objects.hashCode(temporalBias);
|
||||
result = 31 * result + Double.hashCode(temporalBiasWeight);
|
||||
result = 31 * result + Boolean.hashCode(exportDebugData);
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters(bm25Params=" + this.getBm25Params() + ", shortDocumentThreshold=" + this.getShortDocumentThreshold() + ", shortDocumentPenalty=" + this.getShortDocumentPenalty() + ", domainRankBonus=" + this.getDomainRankBonus() + ", qualityPenalty=" + this.getQualityPenalty() + ", shortSentenceThreshold=" + this.getShortSentenceThreshold() + ", shortSentencePenalty=" + this.getShortSentencePenalty() + ", bm25Weight=" + this.getBm25Weight() + ", tcfFirstPosition=" + this.getTcfFirstPosition() + ", tcfVerbatim=" + this.getTcfVerbatim() + ", tcfProximity=" + this.getTcfProximity() + ", temporalBias=" + this.getTemporalBias() + ", temporalBiasWeight=" + this.getTemporalBiasWeight() + ", exportDebugData=" + this.isExportDebugData() + ")";
|
||||
}
|
||||
|
||||
public enum TemporalBias {
|
||||
RECENT, OLD, NONE
|
||||
}
|
||||
|
||||
public static class ResultRankingParametersBuilder {
|
||||
private Bm25Parameters bm25Params;
|
||||
private int shortDocumentThreshold;
|
||||
private double shortDocumentPenalty;
|
||||
private double domainRankBonus;
|
||||
private double qualityPenalty;
|
||||
private int shortSentenceThreshold;
|
||||
private double shortSentencePenalty;
|
||||
private double bm25Weight;
|
||||
private double tcfFirstPosition;
|
||||
private double tcfVerbatim;
|
||||
private double tcfProximity;
|
||||
private TemporalBias temporalBias;
|
||||
private double temporalBiasWeight;
|
||||
private boolean exportDebugData;
|
||||
|
||||
ResultRankingParametersBuilder() {
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder bm25Params(Bm25Parameters bm25Params) {
|
||||
this.bm25Params = bm25Params;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortDocumentThreshold(int shortDocumentThreshold) {
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortDocumentPenalty(double shortDocumentPenalty) {
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder domainRankBonus(double domainRankBonus) {
|
||||
this.domainRankBonus = domainRankBonus;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder qualityPenalty(double qualityPenalty) {
|
||||
this.qualityPenalty = qualityPenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortSentenceThreshold(int shortSentenceThreshold) {
|
||||
this.shortSentenceThreshold = shortSentenceThreshold;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortSentencePenalty(double shortSentencePenalty) {
|
||||
this.shortSentencePenalty = shortSentencePenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder bm25Weight(double bm25Weight) {
|
||||
this.bm25Weight = bm25Weight;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfFirstPosition(double tcfFirstPosition) {
|
||||
this.tcfFirstPosition = tcfFirstPosition;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfVerbatim(double tcfVerbatim) {
|
||||
this.tcfVerbatim = tcfVerbatim;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfProximity(double tcfProximity) {
|
||||
this.tcfProximity = tcfProximity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder temporalBias(TemporalBias temporalBias) {
|
||||
this.temporalBias = temporalBias;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder temporalBiasWeight(double temporalBiasWeight) {
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
|
||||
this.exportDebugData = exportDebugData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParameters build() {
|
||||
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters.ResultRankingParametersBuilder(bm25Params=" + this.bm25Params + ", shortDocumentThreshold=" + this.shortDocumentThreshold + ", shortDocumentPenalty=" + this.shortDocumentPenalty + ", domainRankBonus=" + this.domainRankBonus + ", qualityPenalty=" + this.qualityPenalty + ", shortSentenceThreshold=" + this.shortSentenceThreshold + ", shortSentencePenalty=" + this.shortSentencePenalty + ", bm25Weight=" + this.bm25Weight + ", tcfFirstPosition=" + this.tcfFirstPosition + ", tcfVerbatim=" + this.tcfVerbatim + ", tcfProximity=" + this.tcfProximity + ", temporalBias=" + this.temporalBias + ", temporalBiasWeight=" + this.temporalBiasWeight + ", exportDebugData=" + this.exportDebugData + ")";
|
||||
}
|
||||
}
|
||||
}
|
@@ -162,6 +162,7 @@ message RpcResultRankingParameters {
|
||||
double temporalBiasWeight = 17;
|
||||
|
||||
bool exportDebugData = 18;
|
||||
bool disablePenalties = 19;
|
||||
|
||||
}
|
||||
|
||||
|
@@ -3,8 +3,6 @@ package nu.marginalia.index.client;
|
||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@@ -22,18 +20,6 @@ class IndexProtobufCodecTest {
|
||||
verifyIsIdentityTransformation(SpecificationLimit.lessThan(1), l -> IndexProtobufCodec.convertSpecLimit(IndexProtobufCodec.convertSpecLimit(l)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRankingParameters() {
|
||||
verifyIsIdentityTransformation(ResultRankingParameters.sensibleDefaults(),
|
||||
p -> IndexProtobufCodec.convertRankingParameterss(IndexProtobufCodec.convertRankingParameterss(p, null)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQueryLimits() {
|
||||
verifyIsIdentityTransformation(new QueryLimits(1,2,3,4),
|
||||
l -> IndexProtobufCodec.convertQueryLimits(IndexProtobufCodec.convertQueryLimits(l))
|
||||
);
|
||||
}
|
||||
@Test
|
||||
public void testSubqery() {
|
||||
verifyIsIdentityTransformation(new SearchQuery(
|
||||
|
@@ -2,8 +2,9 @@ package nu.marginalia.functions.searchquery;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.query.*;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
@@ -36,7 +37,7 @@ public class QueryFactory {
|
||||
|
||||
|
||||
public ProcessedQuery createQuery(QueryParams params,
|
||||
@Nullable ResultRankingParameters rankingParams) {
|
||||
@Nullable RpcResultRankingParameters rankingParams) {
|
||||
final var query = params.humanQuery();
|
||||
|
||||
if (query.length() > 1000) {
|
||||
@@ -132,7 +133,9 @@ public class QueryFactory {
|
||||
var limits = params.limits();
|
||||
// Disable limits on number of results per domain if we're searching with a site:-type term
|
||||
if (domain != null) {
|
||||
limits = limits.forSingleDomain();
|
||||
limits = RpcQueryLimits.newBuilder(limits)
|
||||
.setResultsByDomain(limits.getResultsTotal())
|
||||
.build();
|
||||
}
|
||||
|
||||
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
|
||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.api.searchquery.*;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.api.IndexClient;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import org.slf4j.Logger;
|
||||
@@ -55,7 +55,7 @@ public class QueryGRPCService
|
||||
.time(() -> {
|
||||
|
||||
var params = QueryProtobufCodec.convertRequest(request);
|
||||
var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults());
|
||||
var query = queryFactory.createQuery(params, PrototypeRankingParameters.sensibleDefaults());
|
||||
|
||||
var indexRequest = QueryProtobufCodec.convertQuery(request, query);
|
||||
|
||||
@@ -102,7 +102,7 @@ public class QueryGRPCService
|
||||
String originalQuery,
|
||||
QueryParams params,
|
||||
IndexClient.Pagination pagination,
|
||||
ResultRankingParameters rankingParameters) {
|
||||
RpcResultRankingParameters rankingParameters) {
|
||||
|
||||
var query = queryFactory.createQuery(params, rankingParameters);
|
||||
IndexClient.AggregateQueryResponse response = indexClient.executeQueries(QueryProtobufCodec.convertQuery(originalQuery, query), pagination);
|
||||
|
@@ -233,9 +233,19 @@ public class QueryParser {
|
||||
entity.replace(new QueryToken.RankTerm(limit, str));
|
||||
} else if (str.startsWith("qs=")) {
|
||||
entity.replace(new QueryToken.QsTerm(str.substring(3)));
|
||||
} else if (str.contains(":")) {
|
||||
} else if (str.startsWith("site:")
|
||||
|| str.startsWith("format:")
|
||||
|| str.startsWith("file:")
|
||||
|| str.startsWith("tld:")
|
||||
|| str.startsWith("ip:")
|
||||
|| str.startsWith("as:")
|
||||
|| str.startsWith("asn:")
|
||||
|| str.startsWith("generator:")
|
||||
)
|
||||
{
|
||||
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static SpecificationLimit parseSpecificationLimit(String str) {
|
||||
|
@@ -1,12 +1,12 @@
|
||||
package nu.marginalia.query.svc;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
@@ -49,10 +49,15 @@ public class QueryFactoryTest {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
null,
|
||||
new QueryLimits(100, 100, 100, 100),
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(100)
|
||||
.setTimeoutMs(100)
|
||||
.setFetchSize(100)
|
||||
.build(),
|
||||
"NONE",
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
0), null).specs;
|
||||
}
|
||||
|
||||
@@ -208,6 +213,12 @@ public class QueryFactoryTest {
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCplusPlus() {
|
||||
var subquery = parseAndGetSpecs("std::vector::push_back vector");
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuotedApostrophe() {
|
||||
var subquery = parseAndGetSpecs("\"bob's cars\"");
|
||||
|
@@ -16,20 +16,19 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
import static java.lang.Math.clamp;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@Singleton
|
||||
public class IndexClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
||||
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
||||
private final DomainBlacklistImpl blacklist;
|
||||
private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
|
||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
|
||||
@@ -51,40 +50,37 @@ public class IndexClient {
|
||||
|
||||
/** Execute a query on the index partitions and return the combined results. */
|
||||
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
||||
List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
|
||||
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
|
||||
.async(executor)
|
||||
.runEach(indexRequest);
|
||||
|
||||
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
|
||||
final int resultsUpperBound = requestedMaxResults * channelPool.getNumNodes();
|
||||
|
||||
List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);
|
||||
AtomicInteger totalNumResults = new AtomicInteger(0);
|
||||
|
||||
for (var future : futures) {
|
||||
try {
|
||||
future.get().forEachRemaining(results::add);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Downstream exception", e);
|
||||
}
|
||||
}
|
||||
List<RpcDecoratedResultItem> results =
|
||||
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
|
||||
.async(executor)
|
||||
.runEach(indexRequest)
|
||||
.stream()
|
||||
.map(future -> future.thenApply(iterator -> {
|
||||
List<RpcDecoratedResultItem> ret = new ArrayList<>(requestedMaxResults);
|
||||
iterator.forEachRemaining(ret::add);
|
||||
totalNumResults.addAndGet(ret.size());
|
||||
return ret;
|
||||
}))
|
||||
.mapMulti((CompletableFuture<List<RpcDecoratedResultItem>> fut, Consumer<List<RpcDecoratedResultItem>> c) ->{
|
||||
try {
|
||||
c.accept(fut.join());
|
||||
} catch (Exception e) {
|
||||
logger.error("Error while fetching results", e);
|
||||
}
|
||||
})
|
||||
.flatMap(List::stream)
|
||||
.filter(item -> !isBlacklisted(item))
|
||||
.sorted(comparator)
|
||||
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
|
||||
.limit(pagination.pageSize)
|
||||
.toList();
|
||||
|
||||
// Sort the results by ranking score and remove blacklisted domains
|
||||
results.sort(comparator);
|
||||
results.removeIf(this::isBlacklisted);
|
||||
|
||||
int numReceivedResults = results.size();
|
||||
|
||||
// pagination is typically 1-indexed, so we need to adjust the start and end indices
|
||||
int indexStart = (pagination.page - 1) * pagination.pageSize;
|
||||
int indexEnd = (pagination.page) * pagination.pageSize;
|
||||
|
||||
results = results.subList(
|
||||
clamp(indexStart, 0, Math.max(0, results.size() - 1)), // from is inclusive, so subtract 1 from size()
|
||||
clamp(indexEnd, 0, results.size()));
|
||||
|
||||
return new AggregateQueryResponse(results, pagination.page(), numReceivedResults);
|
||||
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
|
||||
}
|
||||
|
||||
private boolean isBlacklisted(RpcDecoratedResultItem item) {
|
||||
|
@@ -10,12 +10,12 @@ import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
@@ -211,7 +211,7 @@ public class IndexGrpcService
|
||||
/** This class is responsible for ranking the results and adding the best results to the
|
||||
* resultHeap, which depending on the state of the indexLookup threads may or may not block
|
||||
*/
|
||||
private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
|
||||
private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
|
||||
CompiledQuery<String> compiledQuery,
|
||||
CompiledQueryLong compiledQueryIds)
|
||||
{
|
||||
|
@@ -2,12 +2,13 @@ package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
@@ -23,7 +24,7 @@ public class SearchParameters {
|
||||
public final IndexSearchBudget budget;
|
||||
public final SearchQuery query;
|
||||
public final QueryParams queryParams;
|
||||
public final ResultRankingParameters rankingParams;
|
||||
public final RpcResultRankingParameters rankingParams;
|
||||
|
||||
public final int limitByDomain;
|
||||
public final int limitTotal;
|
||||
@@ -41,11 +42,11 @@ public class SearchParameters {
|
||||
public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
|
||||
var limits = specsSet.queryLimits;
|
||||
|
||||
this.fetchSize = limits.fetchSize();
|
||||
this.budget = new IndexSearchBudget(limits.timeoutMs());
|
||||
this.fetchSize = limits.getFetchSize();
|
||||
this.budget = new IndexSearchBudget(limits.getTimeoutMs());
|
||||
this.query = specsSet.query;
|
||||
this.limitByDomain = limits.resultsByDomain();
|
||||
this.limitTotal = limits.resultsTotal();
|
||||
this.limitByDomain = limits.getResultsByDomain();
|
||||
this.limitTotal = limits.getResultsTotal();
|
||||
|
||||
queryParams = new QueryParams(
|
||||
specsSet.quality,
|
||||
@@ -62,17 +63,17 @@ public class SearchParameters {
|
||||
}
|
||||
|
||||
public SearchParameters(RpcIndexQuery request, SearchSet searchSet) {
|
||||
var limits = IndexProtobufCodec.convertQueryLimits(request.getQueryLimits());
|
||||
var limits = request.getQueryLimits();
|
||||
|
||||
this.fetchSize = limits.fetchSize();
|
||||
this.fetchSize = limits.getFetchSize();
|
||||
|
||||
// The time budget is halved because this is the point when we start to
|
||||
// wrap up the search and return the results.
|
||||
this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
|
||||
this.budget = new IndexSearchBudget(limits.getTimeoutMs() / 2);
|
||||
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
|
||||
|
||||
this.limitByDomain = limits.resultsByDomain();
|
||||
this.limitTotal = limits.resultsTotal();
|
||||
this.limitByDomain = limits.getResultsByDomain();
|
||||
this.limitTotal = limits.getResultsTotal();
|
||||
|
||||
queryParams = new QueryParams(
|
||||
convertSpecLimit(request.getQuality()),
|
||||
@@ -85,7 +86,7 @@ public class SearchParameters {
|
||||
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
|
||||
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
|
||||
|
||||
rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
|
||||
rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
|
||||
}
|
||||
|
||||
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.index.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
|
||||
import java.util.BitSet;
|
||||
@@ -24,14 +23,14 @@ public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
|
||||
|
||||
private final BitSet mask;
|
||||
|
||||
public Bm25GraphVisitor(Bm25Parameters bm25Parameters,
|
||||
public Bm25GraphVisitor(double k1, double b,
|
||||
float[] counts,
|
||||
int length,
|
||||
ResultRankingContext ctx) {
|
||||
this.length = length;
|
||||
|
||||
this.k1 = bm25Parameters.k();
|
||||
this.b = bm25Parameters.b();
|
||||
this.k1 = k1;
|
||||
this.b = b;
|
||||
|
||||
this.docCount = ctx.termFreqDocCount();
|
||||
this.counts = counts;
|
||||
|
@@ -156,7 +156,7 @@ public class IndexResultRankingService {
|
||||
// for the selected results, as this would be comically expensive to do for all the results we
|
||||
// discard along the way
|
||||
|
||||
if (params.rankingParams.exportDebugData) {
|
||||
if (params.rankingParams.getExportDebugData()) {
|
||||
var combinedIdsList = new LongArrayList(resultsList.size());
|
||||
for (var item : resultsList) {
|
||||
combinedIdsList.add(item.combinedId);
|
||||
|
@@ -2,10 +2,11 @@ package nu.marginalia.index.results;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
@@ -116,14 +117,14 @@ public class IndexResultScoreCalculator {
|
||||
|
||||
float proximitiyFac = getProximitiyFac(decodedPositions, searchTerms.phraseConstraints, verbatimMatches, unorderedMatches, spans);
|
||||
|
||||
double score_firstPosition = params.tcfFirstPosition * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
|
||||
double score_verbatim = params.tcfVerbatim * verbatimMatches.getScore();
|
||||
double score_proximity = params.tcfProximity * proximitiyFac;
|
||||
double score_bM25 = params.bm25Weight
|
||||
* wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.bm25Params, unorderedMatches.getWeightedCounts(), docSize, rankingContext))
|
||||
double score_firstPosition = params.getTcfFirstPositionWeight() * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
|
||||
double score_verbatim = params.getTcfVerbatimWeight() * verbatimMatches.getScore();
|
||||
double score_proximity = params.getTcfProximityWeight() * proximitiyFac;
|
||||
double score_bM25 = params.getBm25Weight()
|
||||
* wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.getBm25K(), params.getBm25B(), unorderedMatches.getWeightedCounts(), docSize, rankingContext))
|
||||
/ (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
|
||||
double score_bFlags = params.bm25Weight
|
||||
* wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.bm25Params, wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
|
||||
double score_bFlags = params.getBm25Weight()
|
||||
* wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
|
||||
/ (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
|
||||
|
||||
double score = normalize(
|
||||
@@ -245,9 +246,13 @@ public class IndexResultScoreCalculator {
|
||||
private double calculateDocumentBonus(long documentMetadata,
|
||||
int features,
|
||||
int length,
|
||||
ResultRankingParameters rankingParams,
|
||||
RpcResultRankingParameters rankingParams,
|
||||
@Nullable DebugRankingFactors debugRankingFactors) {
|
||||
|
||||
if (rankingParams.getDisablePenalties()) {
|
||||
return 0.;
|
||||
}
|
||||
|
||||
int rank = DocumentMetadata.decodeRank(documentMetadata);
|
||||
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
|
||||
int quality = DocumentMetadata.decodeQuality(documentMetadata);
|
||||
@@ -256,18 +261,18 @@ public class IndexResultScoreCalculator {
|
||||
int topology = DocumentMetadata.decodeTopology(documentMetadata);
|
||||
int year = DocumentMetadata.decodeYear(documentMetadata);
|
||||
|
||||
double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
|
||||
double averageSentenceLengthPenalty = (asl >= rankingParams.getShortSentenceThreshold() ? 0 : -rankingParams.getShortSentencePenalty());
|
||||
|
||||
final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
|
||||
final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
|
||||
final double rankingBonus = (255. - rank) * rankingParams.getDomainRankBonus();
|
||||
final double topologyBonus = Math.log(1 + topology);
|
||||
final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
|
||||
final double documentLengthPenalty = length > rankingParams.getShortDocumentThreshold() ? 0 : -rankingParams.getShortDocumentPenalty();
|
||||
final double temporalBias;
|
||||
|
||||
if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) {
|
||||
temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight;
|
||||
} else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) {
|
||||
temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight;
|
||||
if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.RECENT) {
|
||||
temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.getTemporalBiasWeight();
|
||||
} else if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.OLD) {
|
||||
temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.getTemporalBiasWeight();
|
||||
} else {
|
||||
temporalBias = 0;
|
||||
}
|
||||
@@ -506,14 +511,14 @@ public class IndexResultScoreCalculator {
|
||||
}
|
||||
|
||||
|
||||
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
|
||||
private double calculateQualityPenalty(int size, int quality, RpcResultRankingParameters rankingParams) {
|
||||
if (size < 400) {
|
||||
if (quality < 5)
|
||||
return 0;
|
||||
return -quality * rankingParams.qualityPenalty;
|
||||
return -quality * rankingParams.getQualityPenalty();
|
||||
}
|
||||
else {
|
||||
return -quality * rankingParams.qualityPenalty * 20;
|
||||
return -quality * rankingParams.getQualityPenalty() * 20;
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -3,7 +3,6 @@ package nu.marginalia.index.results;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
@@ -15,15 +14,14 @@ public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
|
||||
private final CqDataLong wordMetaData;
|
||||
private final CqDataInt frequencies;
|
||||
private final float[] counts;
|
||||
private final Bm25Parameters bm25Parameters;
|
||||
|
||||
private final double k1;
|
||||
private final int docCount;
|
||||
|
||||
public TermFlagsGraphVisitor(Bm25Parameters bm25Parameters,
|
||||
public TermFlagsGraphVisitor(double k1,
|
||||
CqDataLong wordMetaData,
|
||||
float[] counts,
|
||||
ResultRankingContext ctx) {
|
||||
this.bm25Parameters = bm25Parameters;
|
||||
this.k1 = k1;
|
||||
this.counts = counts;
|
||||
this.docCount = ctx.termFreqDocCount();
|
||||
this.wordMetaData = wordMetaData;
|
||||
@@ -55,7 +53,7 @@ public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
|
||||
int freq = frequencies.get(idx);
|
||||
|
||||
// note we override b to zero for priority terms as they are independent of document length
|
||||
return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
|
||||
return invFreq(docCount, freq) * f(k1, 0, count, 0);
|
||||
}
|
||||
|
||||
private double evaluatePriorityScore(int idx) {
|
||||
|
@@ -1,7 +0,0 @@
|
||||
package nu.marginalia.index.query.limit;
|
||||
|
||||
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
|
||||
public QueryLimits forSingleDomain() {
|
||||
return new QueryLimits(resultsTotal, resultsTotal, timeoutMs, fetchSize);
|
||||
}
|
||||
}
|
@@ -4,10 +4,11 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
|
||||
@@ -17,7 +18,6 @@ import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
@@ -115,9 +115,16 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryLimits(
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(10)
|
||||
.setResultsTotal(10)
|
||||
.setTimeoutMs(Integer.MAX_VALUE)
|
||||
.setFetchSize(4000)
|
||||
.build()
|
||||
)
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.rankingParams(PrototypeRankingParameters.sensibleDefaults())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier("NONE")
|
||||
.query(
|
||||
@@ -171,9 +178,16 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryLimits(
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(10)
|
||||
.setResultsTotal(10)
|
||||
.setTimeoutMs(Integer.MAX_VALUE)
|
||||
.setFetchSize(4000)
|
||||
.build()
|
||||
)
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.rankingParams(PrototypeRankingParameters.sensibleDefaults())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier("NONE")
|
||||
.query(
|
||||
@@ -225,8 +239,15 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.queryLimits(
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(10)
|
||||
.setResultsTotal(10)
|
||||
.setTimeoutMs(Integer.MAX_VALUE)
|
||||
.setFetchSize(4000)
|
||||
.build()
|
||||
)
|
||||
.rankingParams(PrototypeRankingParameters.sensibleDefaults())
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.domains(List.of(2))
|
||||
.query(
|
||||
@@ -282,11 +303,18 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryLimits(
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(10)
|
||||
.setResultsTotal(10)
|
||||
.setTimeoutMs(Integer.MAX_VALUE)
|
||||
.setFetchSize(4000)
|
||||
.build()
|
||||
)
|
||||
.year(SpecificationLimit.equals(1998))
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.searchSetIdentifier("NONE")
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.rankingParams(PrototypeRankingParameters.sensibleDefaults())
|
||||
.query(
|
||||
SearchQuery.builder()
|
||||
.compiledQuery("4")
|
||||
|
@@ -4,10 +4,11 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.full.FullIndexConstructor;
|
||||
@@ -18,7 +19,6 @@ import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
@@ -389,13 +389,20 @@ public class IndexQueryServiceIntegrationTest {
|
||||
SearchSpecification basicQuery(Function<SearchSpecification.SearchSpecificationBuilder, SearchSpecification.SearchSpecificationBuilder> mutator)
|
||||
{
|
||||
var builder = SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryLimits(
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(10)
|
||||
.setResultsTotal(10)
|
||||
.setTimeoutMs(Integer.MAX_VALUE)
|
||||
.setFetchSize(4000)
|
||||
.build()
|
||||
)
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.year(SpecificationLimit.none())
|
||||
.quality(SpecificationLimit.none())
|
||||
.size(SpecificationLimit.none())
|
||||
.rank(SpecificationLimit.none())
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.rankingParams(PrototypeRankingParameters.sensibleDefaults())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier("NONE");
|
||||
|
||||
|
@@ -152,7 +152,10 @@ public class DocumentPositionMapper {
|
||||
}
|
||||
|
||||
boolean matchesWordPattern(String s) {
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
|
||||
if (s.length() > 48)
|
||||
return false;
|
||||
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,8}
|
||||
|
||||
String wordPartSeparator = ".-_/:+*";
|
||||
|
||||
@@ -169,7 +172,7 @@ public class DocumentPositionMapper {
|
||||
if (i == 0)
|
||||
return false;
|
||||
|
||||
for (int j = 0; j < 5; j++) {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
|
@@ -30,9 +30,11 @@ class DocumentPositionMapperTest {
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
|
||||
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back"));
|
||||
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
|
||||
|
@@ -0,0 +1,113 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class CppreferenceSpecialization extends WikiSpecialization {
|
||||
|
||||
@Inject
|
||||
public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
|
||||
super(summaryExtractor, titleExtractor);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document prune(Document original) {
|
||||
var doc = original.clone();
|
||||
|
||||
doc.getElementsByClass("t-nv").remove();
|
||||
doc.getElementsByClass("toc").remove();
|
||||
doc.getElementsByClass("mw-head").remove();
|
||||
doc.getElementsByClass("printfooter").remove();
|
||||
doc.getElementsByClass("cpp-footer-base").remove();
|
||||
|
||||
doc.title(doc.title() + " " + Strings.join(extractExtraTokens(doc.title()), ' '));
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSummary(Document doc, Set<String> importantWords) {
|
||||
|
||||
Element declTable = doc.getElementsByClass("t-dcl-begin").first();
|
||||
if (declTable != null) {
|
||||
var nextPar = declTable.nextElementSibling();
|
||||
if (nextPar != null) {
|
||||
return nextPar.text();
|
||||
}
|
||||
}
|
||||
|
||||
return super.getSummary(doc, importantWords);
|
||||
}
|
||||
|
||||
|
||||
public List<String> extractExtraTokens(String title) {
|
||||
|
||||
if (!title.contains("::")) {
|
||||
return List.of();
|
||||
}
|
||||
if (!title.contains("-")) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
title = StringUtils.split(title, '-')[0];
|
||||
|
||||
String name = title;
|
||||
for (;;) {
|
||||
int lbidx = name.indexOf('<');
|
||||
int rbidx = name.indexOf('>');
|
||||
|
||||
if (lbidx > 0 && rbidx > lbidx) {
|
||||
String className = name.substring(0, lbidx);
|
||||
String methodName = name.substring(rbidx + 1);
|
||||
name = className + methodName;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
List<String> tokens = new ArrayList<>();
|
||||
|
||||
for (var part : name.split("\\s*,\\s*")) {
|
||||
if (part.endsWith(")") && !part.endsWith("()")) {
|
||||
int parenStart = part.indexOf('(');
|
||||
if (parenStart > 0) { // foo(...) -> foo
|
||||
part = part.substring(0, parenStart);
|
||||
}
|
||||
else if (parenStart == 0) { // (foo) -> foo
|
||||
part = part.substring(1, part.length() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
part = part.trim();
|
||||
if (part.contains("::")) {
|
||||
tokens.add(part);
|
||||
if (part.startsWith("std::")) {
|
||||
tokens.add(part.substring(5));
|
||||
|
||||
int ss = part.indexOf("::", 5);
|
||||
if (ss > 0) {
|
||||
tokens.add(part.substring(0, ss));
|
||||
tokens.add(part.substring(ss+2));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
|
||||
private final WikiSpecialization wikiSpecialization;
|
||||
private final BlogSpecialization blogSpecialization;
|
||||
private final GogStoreSpecialization gogStoreSpecialization;
|
||||
private final CppreferenceSpecialization cppreferenceSpecialization;
|
||||
private final DefaultSpecialization defaultSpecialization;
|
||||
|
||||
@Inject
|
||||
@@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
|
||||
WikiSpecialization wikiSpecialization,
|
||||
BlogSpecialization blogSpecialization,
|
||||
GogStoreSpecialization gogStoreSpecialization,
|
||||
CppreferenceSpecialization cppreferenceSpecialization,
|
||||
DefaultSpecialization defaultSpecialization) {
|
||||
this.domainTypes = domainTypes;
|
||||
this.lemmySpecialization = lemmySpecialization;
|
||||
@@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
|
||||
this.wikiSpecialization = wikiSpecialization;
|
||||
this.blogSpecialization = blogSpecialization;
|
||||
this.gogStoreSpecialization = gogStoreSpecialization;
|
||||
this.cppreferenceSpecialization = cppreferenceSpecialization;
|
||||
this.defaultSpecialization = defaultSpecialization;
|
||||
}
|
||||
|
||||
@@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
|
||||
return mariadbKbSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.getTopDomain().equals("cppreference.com")) {
|
||||
return cppreferenceSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.toString().equals("store.steampowered.com")) {
|
||||
return steamStoreSpecialization;
|
||||
}
|
||||
@@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
|
||||
if (generator.keywords().contains("javadoc")) {
|
||||
return javadocSpecialization;
|
||||
}
|
||||
|
||||
// Must be toward the end, as some specializations are for
|
||||
// wiki-generator content
|
||||
if (generator.type() == GeneratorType.WIKI) {
|
||||
return wikiSpecialization;
|
||||
}
|
||||
@@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
|
||||
|
||||
boolean shouldIndex(EdgeUrl url);
|
||||
double lengthModifier();
|
||||
void amendWords(Document doc, DocumentKeywordsBuilder words);
|
||||
|
||||
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
|
||||
}
|
||||
}
|
||||
|
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@@ -93,6 +92,8 @@ public class WikiSpecialization extends DefaultSpecialization {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
|
||||
@Override
|
||||
public double lengthModifier() {
|
||||
return 2.5;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,27 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
class CppreferenceSpecializationTest {
|
||||
CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
|
||||
|
||||
@Test
|
||||
public void testTitleMagic() {
|
||||
|
||||
List<String> ret;
|
||||
|
||||
ret = specialization.extractExtraTokens("std::multimap<Key, T, Compare, Allocator>::crend - cppreference.com");
|
||||
Assertions.assertTrue(ret.contains("std::multimap::crend"));
|
||||
Assertions.assertTrue(ret.contains("multimap::crend"));
|
||||
Assertions.assertTrue(ret.contains("std::multimap"));
|
||||
Assertions.assertTrue(ret.contains("crend"));
|
||||
|
||||
ret = specialization.extractExtraTokens("std::coroutine_handle<Promise>::operator(), std::coroutine_handle<Promise>::resume - cppreference.com");
|
||||
Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
|
||||
Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
|
||||
}
|
||||
|
||||
}
|
@@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
|
||||
@@ -167,6 +168,19 @@ public class WarcRecorder implements AutoCloseable {
|
||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcRequest);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048)
|
||||
{
|
||||
// Fast detection and mitigation of crawler traps that respond with slow
|
||||
// small responses, with a high branching factor
|
||||
|
||||
// Note we bail *after* writing the warc records, this will effectively only
|
||||
// prevent link extraction from the document.
|
||||
|
||||
logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri);
|
||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||
}
|
||||
|
||||
return new HttpFetchResult.ResultOk(responseUri,
|
||||
response.code(),
|
||||
inputBuffer.headers(),
|
||||
|
@@ -44,6 +44,7 @@ dependencies {
|
||||
implementation libs.bundles.jetty
|
||||
implementation libs.opencsv
|
||||
implementation libs.trove
|
||||
implementation libs.protobuf
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.bundles.mariadb
|
||||
|
@@ -6,10 +6,10 @@ import nu.marginalia.api.model.ApiSearchResult;
|
||||
import nu.marginalia.api.model.ApiSearchResultQueryDetails;
|
||||
import nu.marginalia.api.model.ApiSearchResults;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@@ -47,11 +47,12 @@ public class ApiSearchOperator {
|
||||
|
||||
return new QueryParams(
|
||||
query,
|
||||
new QueryLimits(
|
||||
2,
|
||||
Math.min(100, count),
|
||||
150,
|
||||
8192),
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(2)
|
||||
.setResultsTotal(Math.min(100, count))
|
||||
.setTimeoutMs(150)
|
||||
.setFetchSize(8192)
|
||||
.build(),
|
||||
searchSet.name());
|
||||
}
|
||||
|
||||
|
@@ -5,11 +5,11 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@@ -155,15 +155,15 @@ public class SearchOperator {
|
||||
|
||||
|
||||
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
|
||||
final QueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
final RpcQueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.getResultsByDomain());
|
||||
|
||||
// Update the query count (this is what you see on the front page)
|
||||
searchVisitorCount.registerQuery();
|
||||
|
||||
return queryResponse.results().stream()
|
||||
.filter(deduplicator::shouldRetain)
|
||||
.limit(limits.resultsTotal())
|
||||
.limit(limits.getResultsTotal())
|
||||
.map(SearchOperator::createDetails)
|
||||
.toList();
|
||||
}
|
||||
|
@@ -1,10 +1,10 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
@@ -12,6 +12,21 @@ import nu.marginalia.search.command.SearchParameters;
|
||||
import java.util.List;
|
||||
|
||||
public class SearchQueryParamFactory {
|
||||
static final RpcQueryLimits defaultLimits = RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(5)
|
||||
.setTimeoutMs(200)
|
||||
.setFetchSize(8192)
|
||||
.build();
|
||||
|
||||
|
||||
static final RpcQueryLimits shallowLimit = RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(100)
|
||||
.setTimeoutMs(100)
|
||||
.setFetchSize(512)
|
||||
.build();
|
||||
|
||||
|
||||
public QueryParams forRegularSearch(SearchParameters userParams) {
|
||||
SearchQuery prototype = new SearchQuery();
|
||||
@@ -33,7 +48,7 @@ public class SearchQueryParamFactory {
|
||||
profile.getSizeLimit(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(5, 100, 200, 8192),
|
||||
defaultLimits,
|
||||
profile.searchSetIdentifier.name(),
|
||||
userParams.strategy(),
|
||||
userParams.temporalBias(),
|
||||
@@ -54,10 +69,15 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(domainId),
|
||||
new QueryLimits(count, count, 100, 512),
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(count)
|
||||
.setResultsByDomain(count)
|
||||
.setTimeoutMs(100)
|
||||
.setFetchSize(512)
|
||||
.build(),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
@@ -74,10 +94,10 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
shallowLimit,
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
@@ -94,10 +114,10 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
shallowLimit,
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.model.SearchProfile;
|
||||
@@ -78,15 +78,15 @@ public record SearchParameters(String query,
|
||||
return baseUrl.withPath(path);
|
||||
}
|
||||
|
||||
public ResultRankingParameters.TemporalBias temporalBias() {
|
||||
public RpcTemporalBias.Bias temporalBias() {
|
||||
if (recent == RECENT) {
|
||||
return ResultRankingParameters.TemporalBias.RECENT;
|
||||
return RpcTemporalBias.Bias.RECENT;
|
||||
}
|
||||
else if (profile == SearchProfile.VINTAGE) {
|
||||
return ResultRankingParameters.TemporalBias.OLD;
|
||||
return RpcTemporalBias.Bias.OLD;
|
||||
}
|
||||
|
||||
return ResultRankingParameters.TemporalBias.NONE;
|
||||
return RpcTemporalBias.Bias.NONE;
|
||||
}
|
||||
|
||||
public QueryStrategy strategy() {
|
||||
|
@@ -8,8 +8,8 @@
|
||||
<ShortName>Marginalia</ShortName>
|
||||
<Description>Search Marginalia</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
|
||||
<Image width="16" height="16" type="image/x-icon">https://old-search.marginalia.nu/favicon.ico</Image>
|
||||
<Url type="text/html" method="get"
|
||||
template="https://search.marginalia.nu/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
|
||||
template="https://old-search.marginalia.nu/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://old-search.marginalia.nu/</moz:SearchForm>
|
||||
</OpenSearchDescription>
|
@@ -3,9 +3,9 @@
|
||||
<nav>
|
||||
<a href="#" class="screenreader-only" onClick="">Skip to content</a>
|
||||
<a href="https://www.marginalia.nu/">Marginalia</a>
|
||||
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
|
||||
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Donate</a>
|
||||
<a class="extra" href="https://search.marginalia.nu/explore/random">Random</a>
|
||||
<a href="https://about.marginalia-search.com/">About</a>
|
||||
<a href="https://about.marginalia-search.com/article/supporting/">Donate</a>
|
||||
<a class="extra" href="https://old-search.marginalia.nu/explore/random">Random</a>
|
||||
</nav>
|
||||
<div id="theme">
|
||||
<label for="theme-select" class="screenreader-only">Color Theme</label>
|
||||
|
@@ -2,14 +2,13 @@ package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@@ -47,7 +46,6 @@ public class SearchOperator {
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final QueryClient queryClient;
|
||||
private final SearchQueryParamFactory paramFactory;
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchUnitConversionService searchUnitConversionService;
|
||||
private final SearchQueryCountService searchVisitorCount;
|
||||
|
||||
@@ -57,7 +55,6 @@ public class SearchOperator {
|
||||
DbDomainQueries domainQueries,
|
||||
QueryClient queryClient,
|
||||
SearchQueryParamFactory paramFactory,
|
||||
WebsiteUrl websiteUrl,
|
||||
SearchUnitConversionService searchUnitConversionService,
|
||||
SearchQueryCountService searchVisitorCount
|
||||
)
|
||||
@@ -67,7 +64,6 @@ public class SearchOperator {
|
||||
this.domainQueries = domainQueries;
|
||||
this.queryClient = queryClient;
|
||||
this.paramFactory = paramFactory;
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.searchUnitConversionService = searchUnitConversionService;
|
||||
this.searchVisitorCount = searchVisitorCount;
|
||||
}
|
||||
@@ -154,8 +150,8 @@ public class SearchOperator {
|
||||
|
||||
|
||||
public SimpleSearchResults getResultsFromQuery(QueryResponse queryResponse) {
|
||||
final QueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
final RpcQueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.getResultsByDomain());
|
||||
|
||||
// Update the query count (this is what you see on the front page)
|
||||
searchVisitorCount.registerQuery();
|
||||
@@ -164,7 +160,7 @@ public class SearchOperator {
|
||||
.sorted(this::retentionSortOrder) // Sort in an order that makes us more likely to discard the "bad" duplicates
|
||||
.filter(deduplicator::shouldRetain)
|
||||
.sorted() // Return to the presentation sort order before limiting so we don't throw out good results over schema and "ip-ness"
|
||||
.limit(limits.resultsTotal())
|
||||
.limit(limits.getResultsTotal())
|
||||
.map(SearchOperator::createDetails)
|
||||
.toList();
|
||||
|
||||
|
@@ -1,10 +1,10 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
@@ -13,6 +13,22 @@ import java.util.List;
|
||||
|
||||
public class SearchQueryParamFactory {
|
||||
|
||||
|
||||
static final RpcQueryLimits defaultLimits = RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(5)
|
||||
.setTimeoutMs(200)
|
||||
.setFetchSize(8192)
|
||||
.build();
|
||||
|
||||
|
||||
static final RpcQueryLimits shallowLimit = RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(100)
|
||||
.setTimeoutMs(100)
|
||||
.setFetchSize(512)
|
||||
.build();
|
||||
|
||||
public QueryParams forRegularSearch(SearchParameters userParams) {
|
||||
SearchQuery prototype = new SearchQuery();
|
||||
var profile = userParams.profile();
|
||||
@@ -29,11 +45,11 @@ public class SearchQueryParamFactory {
|
||||
prototype.searchTermsPriority,
|
||||
prototype.searchTermsAdvice,
|
||||
profile.getQualityLimit(),
|
||||
profile.getYearLimit(),
|
||||
userParams.yearLimit(),
|
||||
profile.getSizeLimit(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(5, 100, 200, 8192),
|
||||
defaultLimits,
|
||||
profile.searchSetIdentifier.name(),
|
||||
userParams.strategy(),
|
||||
userParams.temporalBias(),
|
||||
@@ -54,10 +70,15 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(domainId),
|
||||
new QueryLimits(count, count, 100, 512),
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(count)
|
||||
.setResultsByDomain(count)
|
||||
.setTimeoutMs(100)
|
||||
.setFetchSize(512)
|
||||
.build(),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
page
|
||||
);
|
||||
}
|
||||
@@ -74,10 +95,10 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
shallowLimit,
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
page
|
||||
);
|
||||
}
|
||||
@@ -94,10 +115,10 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
shallowLimit,
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
@@ -1,15 +1,14 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.svc.*;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.JoobyService;
|
||||
import nu.marginalia.service.server.StaticResources;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -34,8 +33,6 @@ public class SearchService extends JoobyService {
|
||||
|
||||
@Inject
|
||||
public SearchService(BaseServiceParams params,
|
||||
WebsiteUrl websiteUrl,
|
||||
StaticResources staticResources,
|
||||
SearchFrontPageService frontPageService,
|
||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||
SearchSiteSubscriptionService siteSubscriptionService,
|
||||
@@ -62,7 +59,25 @@ public class SearchService extends JoobyService {
|
||||
public void startJooby(Jooby jooby) {
|
||||
super.startJooby(jooby);
|
||||
|
||||
final String startTimeAttribute = "start-time";
|
||||
|
||||
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
|
||||
jooby.before((Context ctx) -> {
|
||||
ctx.setAttribute(startTimeAttribute, System.nanoTime());
|
||||
});
|
||||
|
||||
jooby.after((Context ctx, Object result, Throwable failure) -> {
|
||||
if (failure != null) {
|
||||
wmsa_search_service_error_count.labels(ctx.getRoute().getPattern(), ctx.getMethod()).inc();
|
||||
}
|
||||
else {
|
||||
Long startTime = ctx.getAttribute(startTimeAttribute);
|
||||
if (startTime != null) {
|
||||
wmsa_search_service_request_time.labels(ctx.getRoute().getPattern(), ctx.getMethod())
|
||||
.observe((System.nanoTime() - startTime) / 1e9);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@@ -84,29 +84,44 @@ public record SearchParameters(WebsiteUrl url,
|
||||
}
|
||||
|
||||
public String renderUrl() {
|
||||
String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
|
||||
URLEncoder.encode(query, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(js.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
|
||||
Boolean.valueOf(newFilter).toString(),
|
||||
page
|
||||
);
|
||||
|
||||
return path;
|
||||
StringBuilder pathBuilder = new StringBuilder("/search?");
|
||||
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
|
||||
|
||||
if (profile != SearchProfile.NO_FILTER) {
|
||||
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (js != SearchJsParameter.DEFAULT) {
|
||||
pathBuilder.append("&js=").append(URLEncoder.encode(js.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (adtech != SearchAdtechParameter.DEFAULT) {
|
||||
pathBuilder.append("&adtech=").append(URLEncoder.encode(adtech.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (recent != SearchRecentParameter.DEFAULT) {
|
||||
pathBuilder.append("&recent=").append(URLEncoder.encode(recent.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (searchTitle != SearchTitleParameter.DEFAULT) {
|
||||
pathBuilder.append("&searchTitle=").append(URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (page != 1) {
|
||||
pathBuilder.append("&page=").append(page);
|
||||
}
|
||||
if (newFilter) {
|
||||
pathBuilder.append("&newfilter=").append(Boolean.valueOf(newFilter).toString());
|
||||
}
|
||||
|
||||
return pathBuilder.toString();
|
||||
}
|
||||
|
||||
public ResultRankingParameters.TemporalBias temporalBias() {
|
||||
public RpcTemporalBias.Bias temporalBias() {
|
||||
if (recent == RECENT) {
|
||||
return ResultRankingParameters.TemporalBias.RECENT;
|
||||
return RpcTemporalBias.Bias.RECENT;
|
||||
}
|
||||
else if (profile == SearchProfile.VINTAGE) {
|
||||
return ResultRankingParameters.TemporalBias.OLD;
|
||||
return RpcTemporalBias.Bias.OLD;
|
||||
}
|
||||
|
||||
return ResultRankingParameters.TemporalBias.NONE;
|
||||
return RpcTemporalBias.Bias.NONE;
|
||||
}
|
||||
|
||||
public QueryStrategy strategy() {
|
||||
|
@@ -3,27 +3,22 @@ package nu.marginalia.search.command.commands;
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import nu.marginalia.search.JteRenderer;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.search.model.NavbarModel;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class SearchCommand implements SearchCommandInterface {
|
||||
private final SearchOperator searchOperator;
|
||||
private final JteRenderer jteRenderer;
|
||||
|
||||
|
||||
@Inject
|
||||
public SearchCommand(SearchOperator searchOperator,
|
||||
JteRenderer jteRenderer) throws IOException {
|
||||
public SearchCommand(SearchOperator searchOperator){
|
||||
this.searchOperator = searchOperator;
|
||||
this.jteRenderer = jteRenderer;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -47,18 +47,23 @@ public class SearchAddToCrawlQueueService {
|
||||
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
|
||||
}
|
||||
|
||||
private void addToCrawlQueue(int id) throws SQLException {
|
||||
/** Mark a domain for crawling by setting node affinity to zero,
|
||||
* unless it is already marked for crawling, then node affinity should
|
||||
* be left unchanged.
|
||||
* */
|
||||
void addToCrawlQueue(int domainId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
|
||||
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
|
||||
UPDATE EC_DOMAIN
|
||||
SET WMSA_prod.EC_DOMAIN.NODE_AFFINITY = 0
|
||||
WHERE ID=? AND WMSA_prod.EC_DOMAIN.NODE_AFFINITY < 0
|
||||
""")) {
|
||||
stmt.setInt(1, id);
|
||||
stmt.setInt(1, domainId);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
private String getDomainName(int id) {
|
||||
String getDomainName(int id) {
|
||||
var domain = domainQueries.getDomain(id);
|
||||
if (domain.isEmpty())
|
||||
throw new IllegalArgumentException();
|
||||
|
@@ -37,7 +37,7 @@ public class SearchQueryService {
|
||||
@QueryParam String profile,
|
||||
@QueryParam String js,
|
||||
@QueryParam String recent,
|
||||
@QueryParam String title,
|
||||
@QueryParam String searchTitle,
|
||||
@QueryParam String adtech,
|
||||
@QueryParam Integer page
|
||||
) {
|
||||
@@ -47,7 +47,7 @@ public class SearchQueryService {
|
||||
SearchProfile.getSearchProfile(profile),
|
||||
SearchJsParameter.parse(js),
|
||||
SearchRecentParameter.parse(recent),
|
||||
SearchTitleParameter.parse(title),
|
||||
SearchTitleParameter.parse(searchTitle),
|
||||
SearchAdtechParameter.parse(adtech),
|
||||
false,
|
||||
Objects.requireNonNullElse(page,1));
|
||||
|
@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
@@ -67,8 +68,12 @@ public class SearchSiteInfoService {
|
||||
this.screenshotService = screenshotService;
|
||||
this.dataSource = dataSource;
|
||||
this.searchSiteSubscriptions = searchSiteSubscriptions;
|
||||
|
||||
Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
|
||||
}
|
||||
|
||||
private volatile SiteOverviewModel cachedOverviewModel = new SiteOverviewModel(List.of());
|
||||
|
||||
@GET
|
||||
@Path("/site")
|
||||
public ModelAndView<?> handleOverview(@QueryParam String domain) {
|
||||
@@ -77,23 +82,48 @@ public class SearchSiteInfoService {
|
||||
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
|
||||
}
|
||||
|
||||
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException();
|
||||
}
|
||||
|
||||
return new MapModelAndView("siteinfo/start.jte",
|
||||
Map.of("navbar", NavbarModel.SITEINFO,
|
||||
"model", new SiteOverviewModel(domains)));
|
||||
"model", cachedOverviewModel));
|
||||
}
|
||||
|
||||
private void modelUpdater() {
|
||||
while (!Thread.interrupted()) {
|
||||
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
|
||||
|
||||
// This query can be quite expensive, so we can't run it on demand
|
||||
// for every request. Instead, we run it every 15 minutes and cache
|
||||
// the result.
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME, DISCOVER_DATE
|
||||
FROM EC_DOMAIN
|
||||
WHERE NODE_AFFINITY = 0
|
||||
ORDER BY ID DESC
|
||||
LIMIT 10
|
||||
"""))
|
||||
{
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
domains.add(new SiteOverviewModel.DiscoveredDomain(
|
||||
rs.getString("DOMAIN_NAME"),
|
||||
rs.getString("DISCOVER_DATE"))
|
||||
);
|
||||
}
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("Failed to get recently added domains: {}", ex.getMessage());
|
||||
}
|
||||
|
||||
cachedOverviewModel = new SiteOverviewModel(domains);
|
||||
|
||||
try {
|
||||
TimeUnit.MINUTES.sleep(15);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public record SiteOverviewModel(List<DiscoveredDomain> domains) {
|
||||
@@ -107,7 +137,7 @@ public class SearchSiteInfoService {
|
||||
@PathParam String domainName,
|
||||
@QueryParam String view,
|
||||
@QueryParam Integer page
|
||||
) throws SQLException {
|
||||
) throws SQLException, ExecutionException {
|
||||
|
||||
if (null == domainName || domainName.isBlank()) {
|
||||
return null;
|
||||
@@ -193,7 +223,7 @@ public class SearchSiteInfoService {
|
||||
);
|
||||
}
|
||||
|
||||
private SiteInfoWithContext listInfo(Context context, String domainName) {
|
||||
private SiteInfoWithContext listInfo(Context context, String domainName) throws ExecutionException {
|
||||
|
||||
var domain = new EdgeDomain(domainName);
|
||||
final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1);
|
||||
@@ -352,7 +382,7 @@ public class SearchSiteInfoService {
|
||||
|
||||
public record SiteInfoWithContext(String domain,
|
||||
boolean isSubscribed,
|
||||
List<EdgeDomain> siblingDomains,
|
||||
List<DbDomainQueries.DomainWithNode> siblingDomains,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
|
@@ -2,13 +2,24 @@
|
||||
|
||||
This service handles search traffic and is the service
|
||||
you're most directly interacting with when visiting
|
||||
[search.marginalia.nu](https://search.marginalia.nu).
|
||||
[marginalia-search.com](https://marginalia-search.com).
|
||||
|
||||
It interprets a "human" query and translates it into a
|
||||
request that gets passed on to the index service, which finds
|
||||
related documents, which this service then ranks and returns
|
||||
to the user.
|
||||
|
||||
The UI is built using [JTE templates](https://jte.gg/syntax/) and the [Jooby framework](https://jooby.io), primarily using
|
||||
its MVC facilities.
|
||||
|
||||
When developing, it's possible to set up a mock version of the UI by running
|
||||
the gradle command
|
||||
|
||||
```$ ./gradlew paperDoll -i```
|
||||
|
||||
The UI will be available at http://localhost:9999/, and has hot reloading of JTE classes
|
||||
and static resources.
|
||||
|
||||
|
||||

|
||||
|
||||
|
@@ -36,10 +36,11 @@
|
||||
|
||||
</div>
|
||||
|
||||
@if (filters.showRecentOption.isSet()) <input type="hidden" name="js" value="${filters.removeJsOption.value()}"> @endif
|
||||
@if (filters.reduceAdtechOption.isSet()) <input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}"> @endif
|
||||
@if (filters.searchTitleOption.isSet()) <input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}"> @endif
|
||||
@if (filters.showRecentOption.isSet()) <input type="hidden" name="recent" value="${filters.showRecentOption.value()}"> @endif
|
||||
|
||||
<input type="hidden" name="js" value="${filters.removeJsOption.value()}">
|
||||
<input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}">
|
||||
<input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}">
|
||||
<input type="hidden" name="profile" value="${profile}">
|
||||
<input type="hidden" name="recent" value="${filters.showRecentOption.value()}">
|
||||
|
||||
</form>
|
||||
|
@@ -34,12 +34,12 @@
|
||||
<div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
|
||||
<div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
||||
<div class="text-slate-700 dark:text-white text-sm p-4">
|
||||
<div class="fas fa-wrench mr-1 text-margeblue dark:text-slate-200"></div>
|
||||
This is the new design and home of Marginalia Search. Migration to the new domain <pre class="inline text-red-800 dark:text-red-100">marginalia-search.com</pre> is currently <em>in progress</em>,
|
||||
so mind that some things may be a bit broken for a day or two. <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">Read more</a>.
|
||||
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
||||
This is the new design and home of Marginalia Search.
|
||||
You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||
<p class="my-4"></p>
|
||||
If you have any issues or feedback regarding this change, please email
|
||||
<a href="mailto:contact@marginalia-search.com" class="underline text-liteblue dark:text-blue-200">contact@marginalia-search.com</a>.
|
||||
The old version of Marginalia Search remains available at
|
||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
||||
</div>
|
||||
</div>
|
||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
||||
|
@@ -1,4 +1,4 @@
|
||||
@import nu.marginalia.model.EdgeDomain
|
||||
@import nu.marginalia.db.DbDomainQueries
|
||||
@import nu.marginalia.search.svc.SearchSiteInfoService
|
||||
@import nu.marginalia.search.svc.SearchSiteInfoService.*
|
||||
@import nu.marginalia.search.model.UrlDetails
|
||||
@@ -80,31 +80,6 @@
|
||||
|
||||
@endif
|
||||
|
||||
|
||||
@if (!siteInfo.siblingDomains().isEmpty())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fas fa-globe"></i>
|
||||
<span>Related Subdomains</span>
|
||||
</div>
|
||||
|
||||
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
|
||||
<thead>
|
||||
<tr class="bg-gray-50 dark:bg-gray-700">
|
||||
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
|
||||
@for (EdgeDomain sibling : siteInfo.siblingDomains())
|
||||
<tr>
|
||||
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
|
||||
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.toString()}">${sibling.toString()}</a>
|
||||
</td>
|
||||
</tr>
|
||||
@endfor
|
||||
</tbody>
|
||||
</table>
|
||||
@endif
|
||||
|
||||
@if (siteInfo.domainInformation().isUnknownDomain())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fa-regular fa-circle-question"></i>
|
||||
@@ -173,6 +148,36 @@
|
||||
</form>
|
||||
@endif
|
||||
|
||||
|
||||
@if (!siteInfo.siblingDomains().isEmpty())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fas fa-globe"></i>
|
||||
<span>Related Subdomains</span>
|
||||
</div>
|
||||
|
||||
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
|
||||
<thead>
|
||||
<tr class="bg-gray-50 dark:bg-gray-700">
|
||||
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
|
||||
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
|
||||
<tr>
|
||||
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
|
||||
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
|
||||
|
||||
@if (!sibling.isIndexed())
|
||||
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
|
||||
@endif
|
||||
</td>
|
||||
</tr>
|
||||
@endfor
|
||||
</tbody>
|
||||
</table>
|
||||
@endif
|
||||
|
||||
|
||||
@if (siteInfo.isKnown())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fas fa-chart-simple"></i>
|
||||
|
@@ -8,8 +8,8 @@
|
||||
<ShortName>Marginalia</ShortName>
|
||||
<Description>Search Marginalia</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
|
||||
<Image width="16" height="16" type="image/x-icon">https://marginalia-search.com/favicon.ico</Image>
|
||||
<Url type="text/html" method="get"
|
||||
template="https://search.marginalia.nu/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
|
||||
template="https://marginalia-search.com/search?query={searchTerms}&ref=opensearch"/>
|
||||
<moz:SearchForm>https://marginalia-search.com/</moz:SearchForm>
|
||||
</OpenSearchDescription>
|
@@ -6,6 +6,7 @@ import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.browse.model.BrowseResultSet;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@@ -132,8 +133,9 @@ public class MockedSearchResults {
|
||||
return new SearchSiteInfoService.SiteInfoWithContext(
|
||||
"www.example.com",
|
||||
false,
|
||||
List.of(new EdgeDomain("example.com"),
|
||||
new EdgeDomain("about.example.com")
|
||||
List.of(
|
||||
new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 1),
|
||||
new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 0)
|
||||
),
|
||||
14,
|
||||
"https://www.example.com",
|
||||
|
@@ -0,0 +1,85 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
@Tag("slow")
|
||||
@Testcontainers
|
||||
class SearchAddToCrawlQueueServiceTest {
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
|
||||
private DbDomainQueries domainQueries;
|
||||
private SearchAddToCrawlQueueService addToCrawlQueueService;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement()) {
|
||||
stmt.executeQuery("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
|
||||
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('known.example.com', 'example.com', -1)");
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('added.example.com', 'example.com', 0)");
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('indexed.example.com', 'example.com', 1)");
|
||||
}
|
||||
|
||||
domainQueries = new DbDomainQueries(dataSource);
|
||||
addToCrawlQueueService = new SearchAddToCrawlQueueService(domainQueries, dataSource);
|
||||
}
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() {
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
}
|
||||
|
||||
private int getNodeAffinity(String domainName) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
stmt.setString(1, domainName);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Test
|
||||
void addToCrawlQueue() throws SQLException {
|
||||
int knownId = domainQueries.getDomainId(new EdgeDomain("known.example.com"));
|
||||
int addedId = domainQueries.getDomainId(new EdgeDomain("added.example.com"));
|
||||
int indexedId = domainQueries.getDomainId(new EdgeDomain("indexed.example.com"));
|
||||
|
||||
addToCrawlQueueService.addToCrawlQueue(knownId);
|
||||
addToCrawlQueueService.addToCrawlQueue(addedId);
|
||||
addToCrawlQueueService.addToCrawlQueue(indexedId);
|
||||
|
||||
Assertions.assertEquals(0, getNodeAffinity("known.example.com"));
|
||||
Assertions.assertEquals(0, getNodeAffinity("added.example.com"));
|
||||
Assertions.assertEquals(1, getNodeAffinity("indexed.example.com"));
|
||||
}
|
||||
|
||||
}
|
@@ -55,6 +55,7 @@ dependencies {
|
||||
|
||||
implementation libs.duckdb
|
||||
implementation libs.jsoup
|
||||
implementation libs.protobuf
|
||||
|
||||
implementation libs.trove
|
||||
implementation dependencies.create(libs.spark.get()) {
|
||||
|
@@ -2,11 +2,10 @@ package nu.marginalia.control.app.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.control.ControlRendererFactory;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
@@ -22,18 +21,16 @@ public class SearchToBanService {
|
||||
private final ControlRendererFactory rendererFactory;
|
||||
private final QueryClient queryClient;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
|
||||
@Inject
|
||||
public SearchToBanService(ControlBlacklistService blacklistService,
|
||||
ControlRendererFactory rendererFactory,
|
||||
QueryClient queryClient, NodeConfigurationService nodeConfigurationService)
|
||||
QueryClient queryClient)
|
||||
{
|
||||
|
||||
this.blacklistService = blacklistService;
|
||||
this.rendererFactory = rendererFactory;
|
||||
this.queryClient = queryClient;
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
}
|
||||
|
||||
public void register() throws IOException {
|
||||
@@ -76,7 +73,14 @@ public class SearchToBanService {
|
||||
|
||||
private Object executeQuery(String query) {
|
||||
return queryClient.search(new QueryParams(
|
||||
query, new QueryLimits(2, 200, 250, 8192),
|
||||
query,
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(2)
|
||||
.setTimeoutMs(200)
|
||||
.setFetchSize(8192)
|
||||
.build()
|
||||
,
|
||||
"NONE"
|
||||
));
|
||||
}
|
||||
|
@@ -3,12 +3,13 @@ package nu.marginalia.query;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.QueryGRPCService;
|
||||
import nu.marginalia.index.api.IndexClient;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
@@ -53,9 +54,14 @@ public class QueryBasicInterface {
|
||||
int domainCount = parseInt(requireNonNullElse(request.queryParams("domainCount"), "5"));
|
||||
String set = requireNonNullElse(request.queryParams("set"), "");
|
||||
|
||||
var params = new QueryParams(queryString, new QueryLimits(
|
||||
domainCount, min(100, count * 10), 250, 8192
|
||||
), set);
|
||||
var params = new QueryParams(queryString,
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(domainCount)
|
||||
.setResultsTotal(min(100, count * 10))
|
||||
.setTimeoutMs(250)
|
||||
.setFetchSize(8192)
|
||||
.build()
|
||||
, set);
|
||||
|
||||
var pagination = new IndexClient.Pagination(page, count);
|
||||
|
||||
@@ -63,7 +69,7 @@ public class QueryBasicInterface {
|
||||
queryString,
|
||||
params,
|
||||
pagination,
|
||||
ResultRankingParameters.sensibleDefaults()
|
||||
PrototypeRankingParameters.sensibleDefaults()
|
||||
);
|
||||
|
||||
var results = detailedDirectResult.result();
|
||||
@@ -92,7 +98,7 @@ public class QueryBasicInterface {
|
||||
String queryString = request.queryParams("q");
|
||||
if (queryString == null) {
|
||||
// Show the default query form if no query is given
|
||||
return qdebugRenderer.render(Map.of("rankingParams", ResultRankingParameters.sensibleDefaults())
|
||||
return qdebugRenderer.render(Map.of("rankingParams", PrototypeRankingParameters.sensibleDefaults())
|
||||
);
|
||||
}
|
||||
|
||||
@@ -101,9 +107,14 @@ public class QueryBasicInterface {
|
||||
int domainCount = parseInt(requireNonNullElse(request.queryParams("domainCount"), "5"));
|
||||
String set = requireNonNullElse(request.queryParams("set"), "");
|
||||
|
||||
var queryParams = new QueryParams(queryString, new QueryLimits(
|
||||
domainCount, min(100, count * 10), 250, 8192
|
||||
), set);
|
||||
var queryParams = new QueryParams(queryString,
|
||||
RpcQueryLimits.newBuilder()
|
||||
.setResultsByDomain(domainCount)
|
||||
.setResultsTotal(min(100, count * 10))
|
||||
.setTimeoutMs(250)
|
||||
.setFetchSize(8192)
|
||||
.build(),
|
||||
set);
|
||||
|
||||
var pagination = new IndexClient.Pagination(page, count);
|
||||
|
||||
@@ -126,27 +137,28 @@ public class QueryBasicInterface {
|
||||
);
|
||||
}
|
||||
|
||||
private ResultRankingParameters debugRankingParamsFromRequest(Request request) {
|
||||
var sensibleDefaults = ResultRankingParameters.sensibleDefaults();
|
||||
private RpcResultRankingParameters debugRankingParamsFromRequest(Request request) {
|
||||
var sensibleDefaults = PrototypeRankingParameters.sensibleDefaults();
|
||||
|
||||
return ResultRankingParameters.builder()
|
||||
.domainRankBonus(doubleFromRequest(request, "domainRankBonus", sensibleDefaults.domainRankBonus))
|
||||
.qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty))
|
||||
.shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold))
|
||||
.shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty))
|
||||
.tcfFirstPosition(doubleFromRequest(request, "tcfFirstPosition", sensibleDefaults.tcfFirstPosition))
|
||||
.tcfVerbatim(doubleFromRequest(request, "tcfVerbatim", sensibleDefaults.tcfVerbatim))
|
||||
.tcfProximity(doubleFromRequest(request, "tcfProximity", sensibleDefaults.tcfProximity))
|
||||
.bm25Params(new Bm25Parameters(
|
||||
doubleFromRequest(request, "bm25.k1", sensibleDefaults.bm25Params.k()),
|
||||
doubleFromRequest(request, "bm25.b", sensibleDefaults.bm25Params.b())
|
||||
))
|
||||
.temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", sensibleDefaults.temporalBias.toString())))
|
||||
.temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight))
|
||||
.shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold))
|
||||
.shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty))
|
||||
.bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight))
|
||||
.exportDebugData(true)
|
||||
var bias = RpcTemporalBias.Bias.valueOf(stringFromRequest(request, "temporalBias", "NONE"));
|
||||
|
||||
return RpcResultRankingParameters.newBuilder()
|
||||
.setDomainRankBonus(doubleFromRequest(request, "domainRankBonus", sensibleDefaults.getDomainRankBonus()))
|
||||
.setQualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.getQualityPenalty()))
|
||||
.setShortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.getShortDocumentThreshold()))
|
||||
.setShortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.getShortDocumentPenalty()))
|
||||
.setTcfFirstPositionWeight(doubleFromRequest(request, "tcfFirstPositionWeight", sensibleDefaults.getTcfFirstPositionWeight()))
|
||||
.setTcfVerbatimWeight(doubleFromRequest(request, "tcfVerbatimWeight", sensibleDefaults.getTcfVerbatimWeight()))
|
||||
.setTcfProximityWeight(doubleFromRequest(request, "tcfProximityWeight", sensibleDefaults.getTcfProximityWeight()))
|
||||
.setBm25B(doubleFromRequest(request, "bm25b", sensibleDefaults.getBm25B()))
|
||||
.setBm25K(doubleFromRequest(request, "bm25k", sensibleDefaults.getBm25K()))
|
||||
.setTemporalBias(RpcTemporalBias.newBuilder().setBias(bias).build())
|
||||
.setTemporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.getTemporalBiasWeight()))
|
||||
.setShortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.getShortSentenceThreshold()))
|
||||
.setShortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.getShortSentencePenalty()))
|
||||
.setBm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.getBm25Weight()))
|
||||
.setDisablePenalties(boolFromRequest(request, "disablePenalties", sensibleDefaults.getDisablePenalties()))
|
||||
.setExportDebugData(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
@@ -154,6 +166,13 @@ public class QueryBasicInterface {
|
||||
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Double.parseDouble(request.queryParams(param));
|
||||
}
|
||||
|
||||
boolean boolFromRequest(Request request, String param, boolean defaultValue) {
|
||||
if (param == null)
|
||||
return defaultValue;
|
||||
|
||||
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Boolean.parseBoolean(request.queryParams(param));
|
||||
}
|
||||
|
||||
int intFromRequest(Request request, String param, int defaultValue) {
|
||||
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : parseInt(request.queryParams(param));
|
||||
}
|
||||
|
@@ -31,20 +31,20 @@
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="shortDocumentPenalty" name="shortDocumentPenalty" value="{{shortDocumentPenalty}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="tcfFirstPosition">TCF First Position Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfFirstPosition" name="tcfFirstPosition" value="{{tcfFirstPosition}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfFirstPositionWeight">TCF First Position Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfFirstPositionWeight" name="tcfFirstPositionWeight" value="{{tcfFirstPositionWeight}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="tcfVerbatim">TCF Verbatim</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfVerbatim" name="tcfVerbatim" value="{{tcfVerbatim}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfProximity">TCF Proximity</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfProximity" name="tcfProximity" value="{{tcfProximity}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfVerbatimWeight">TCF Verbatim</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfVerbatimWeight" name="tcfVerbatimWeight" value="{{tcfVerbatimWeight}}"></div>
|
||||
<div class="col-sm-2"><label for="tcfProximityWeight">TCF Proximity</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="tcfProximityWeight" name="tcfProximityWeight" value="{{tcfProximityWeight}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="bm25.k1">BM25 K1</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25.k1" name="bm25.k1" value="{{bm25Params.k}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25.b">BM25 B</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25.b" name="bm25.b" value="{{bm25Params.b}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25k">BM25 K1</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25k" name="bm25k" value="{{bm25K}}"></div>
|
||||
<div class="col-sm-2"><label for="bm25b">BM25 B</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25b" name="bm25b" value="{{bm25B}}"></div>
|
||||
</div>
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="temporalBias">Temporal Bias</label></div>
|
||||
@@ -67,6 +67,14 @@
|
||||
<div class="row my-2">
|
||||
<div class="col-sm-2"><label for="bm25FullWeight">BM25 Weight</label></div>
|
||||
<div class="col-sm-2"><input type="text" class="form-control" id="bm25Weight" name="bm25Weight" value="{{bm25Weight}}"></div>
|
||||
|
||||
<div class="col-sm-2"><label for="disablePenalties">Disable Penalties</label></div>
|
||||
<div class="col-sm-2">
|
||||
<select class="form-select" id="disablePenalties" name="disablePenalties">
|
||||
<option value="FALSE" {{#unless disablePenalties}}selected{{/unless}}>FALSE</option>
|
||||
<option value="TRUE" {{#if disablePenalties}}selected{{/if}}>TRUE</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{{/with}}
|
||||
|
@@ -5,7 +5,8 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.api.searchquery.QueryProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.RpcQsQuery;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
@@ -211,8 +212,7 @@ public class IntegrationTest {
|
||||
|
||||
var params = QueryProtobufCodec.convertRequest(request);
|
||||
|
||||
var p = ResultRankingParameters.sensibleDefaults();
|
||||
p.exportDebugData = true;
|
||||
var p = RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults()).setExportDebugData(true).build();
|
||||
var query = queryFactory.createQuery(params, p);
|
||||
|
||||
|
||||
|
@@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit;
|
||||
public class ScreenshotCaptureToolMain {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
|
||||
|
||||
private static final String BROWSERLESS_TOKEN = System.getenv("live-capture.browserless-token");
|
||||
public static void main(String[] args) {
|
||||
DatabaseModule databaseModule = new DatabaseModule(false);
|
||||
var ds = databaseModule.provideConnection();
|
||||
@@ -107,7 +107,7 @@ public class ScreenshotCaptureToolMain {
|
||||
);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(new URI("http://browserless:3000/screenshot"))
|
||||
.uri(new URI("http://browserless:3000/screenshot?token=" + BROWSERLESS_TOKEN))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
|
@@ -1,6 +1,4 @@
|
||||
## This is a token file for automatic deployment
|
||||
|
||||
A master HEAD tagged with deploy-core*, deploy-executor*, or deploy-index* will trigger a commit.
|
||||
|
||||
2024-12-19-00002: Test deployment of executor
|
||||
2024-12-19-00001: Test deployment of executor
|
||||
2025-01-08: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
@@ -72,11 +72,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -103,11 +103,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -129,11 +129,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -3,11 +3,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -258,6 +258,13 @@ if __name__ == '__main__':
|
||||
deploy_tier=2,
|
||||
groups={"all", "frontend", "core"}
|
||||
),
|
||||
'search-legacy': ServiceConfig(
|
||||
gradle_target=':code:services-application:search-service-legacy:docker',
|
||||
docker_name='search-service-legacy',
|
||||
instances=None,
|
||||
deploy_tier=3,
|
||||
groups={"all", "frontend", "core"}
|
||||
),
|
||||
'api': ServiceConfig(
|
||||
gradle_target=':code:services-application:api-service:docker',
|
||||
docker_name='api-service',
|
||||
|
Reference in New Issue
Block a user