1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
599534806b (search) Soften domain limit constraints in URL deduplication 2025-05-17 00:00:42 +02:00
Viktor Lofgren
7e8253dac7 (search) Clean up debug logging 2025-05-17 00:00:28 +02:00
4 changed files with 9 additions and 46 deletions

View File

@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
return topDomain;
}
/**
 * Returns the first label of {@code topDomain}, i.e. the text before its
 * first {@code '.'}, converted to lower case.
 * <p>
 * When {@code topDomain} contains no dot it is returned unchanged; no case
 * conversion is applied on that path.
 *
 * @return the leading label of the top domain in lower case, or the whole
 *         top domain when it contains no dot
 */
public String getDomainKey() {
    int cutPoint = topDomain.indexOf('.');
    if (cutPoint < 0) {
        return topDomain;
    }
    // Locale.ROOT keeps the case mapping independent of the JVM's default
    // locale; the no-argument overload maps 'I' to a dotless 'ı' under e.g.
    // a Turkish default locale, which would yield a different key for the
    // same host name.
    return topDomain.substring(0, cutPoint).toLowerCase(java.util.Locale.ROOT);
}
/** If possible, try to provide an alias domain,
* i.e. a domain name that is very likely to link to this one
* */

View File

@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class EdgeDomainTest {
/** The domain key is expected to be "skepdic" both with and without the "www." prefix. */
@Test
public void testSkepdic() throws URISyntaxException {
    String[] addresses = {
        "http://www.skepdic.com/astrology.html",
        "http://skepdic.com/astrology.html"
    };

    for (String address : addresses) {
        var url = new EdgeUrl(address);
        assertEquals("skepdic", url.getDomain().getDomainKey());
    }
}
@Test
public void testHkDomain() throws URISyntaxException {
var domain = new EdgeUrl("http://l7072i3.l7c.net");

View File

@@ -112,13 +112,6 @@ public class SearchOperator {
.selectStrategy(queryResponse)
.clusterResults(queryResults, 25);
if (queryParams.humanQuery().equals("slackware linux")) {
logger.info("Query response: {}", queryResponse.results().subList(0, 5));
logger.info("Query results: {}", queryResults.subList(0, 5));
logger.info("Clustered results: {}", clusteredResults.subList(0, 5));
}
// Log the query and results
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));

View File

@@ -25,28 +25,14 @@ public class UrlDeduplicator {
}
/**
 * Decides whether a search result should be dropped from the result set.
 * <p>
 * The three checks run in a fixed order and stop at the first one that
 * rejects the item, so later checks never see an item that an earlier
 * check turned down. The order matters: {@code limitResultsPerDomain}
 * updates a per-key counter on every call, and the two deduplication
 * helpers presumably record what they have seen as well (their bodies are
 * not visible here; confirm before reordering).
 *
 * @param details the candidate search result
 * @return {@code true} if any of the checks rejects the item,
 *         {@code false} if it passes all of them
 */
public boolean shouldRemove(DecoratedSearchResultItem details) {
    // A former debug branch special-cased "slackware.com" only to log which
    // check rejected the item; it duplicated this exact chain, so a single
    // chain now serves every domain.
    if (!deduplicateOnSuperficialHash(details)) {
        return true;
    }
    if (!deduplicateOnLSH(details)) {
        return true;
    }
    return !limitResultsPerDomain(details);
}
@@ -76,7 +62,7 @@ public class UrlDeduplicator {
/**
 * Counts this result against its domain and reports whether the domain is
 * still within its per-key allowance.
 * <p>
 * Every call bumps the counter stored in {@code keyCount} for the item's
 * key, including calls that end up returning {@code false}.
 *
 * @param details the candidate search result
 * @return {@code true} while the value returned by {@code keyCount} for
 *         this key does not exceed {@code resultsPerKey}
 */
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
    final var domain = details.getUrl().getDomain();

    // The key is the domain's string form. `key` was previously declared
    // twice in this method (once from getDomainKey(), once from toString()),
    // which does not compile; only the toString() form is kept.
    // NOTE(review): presumably toString() yields the full domain name, so
    // different subdomains get separate counters; confirm against EdgeDomain.
    final String key = domain.toString();

    // NOTE(review): adjustOrPutValue(key, 1, 1) is assumed to add 1 to an
    // existing count, or store 1 for a new key, and return the updated count;
    // confirm against the map type of keyCount.
    return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
}