Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-05 21:22:39 +02:00
(refac) Merge IndexResultScoreCalculator into IndexResultRankingService
@@ -170,7 +170,7 @@ public class PerfTestMain {
             int iter;
             for (iter = 0;; iter++) {
                 long start = System.nanoTime();
-                sum2 += rankingService.rankResults(rankingContext, rankingData, false).size();
+                sum2 += rankingService.rankResults(rankingContext, rankingData).size();
                 long end = System.nanoTime();
                 times.add((end - start)/1_000_000.);
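
The only change above is the dropped boolean flag; the timing harness itself is untouched. Since the loop gathers per-iteration latencies in milliseconds into times, here is a minimal sketch of summarizing them after the loop (a hypothetical helper inside PerfTestMain, not repository code; assumes times is a List<Double>):

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    // Hypothetical summary helper for the collected millisecond timings.
    static double percentile(List<Double> samples, double p) {
        List<Double> sorted = new ArrayList<>(samples);
        Collections.sort(sorted);
        int idx = (int) Math.round(p * (sorted.size() - 1)); // nearest-rank style index
        return sorted.get(idx);
    }

    // After the loop:
    // System.out.printf("p50=%.2f ms, p99=%.2f ms%n", percentile(times, 0.50), percentile(times, 0.99));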
@@ -228,7 +228,7 @@ public class IndexQueryExecution {

        try (rankingData) {
            long st = System.nanoTime();
-           resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData, false));
+           resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData));
            long et = System.nanoTime();

            metric_index_rank_time_s
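
The metric call is truncated here (metric_index_rank_time_s with no invocation visible). If it is a Prometheus-style histogram measured in seconds — an assumption, since the declaration is not part of this diff — the elapsed nanoseconds would be recorded roughly as:

    // Sketch only: assumes metric_index_rank_time_s is an io.prometheus.client.Histogram.
    // observe(double) records one sample; the _s suffix suggests seconds, hence the conversion.
    metric_index_rank_time_s.observe((et - st) / 1_000_000_000.0);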
@@ -4,33 +4,45 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.StatefulIndex;
import nu.marginalia.index.forward.spans.DocumentSpans;
-import nu.marginalia.index.model.CombinedDocIdList;
-import nu.marginalia.index.model.SearchContext;
-import nu.marginalia.index.model.TermMetadataList;
+import nu.marginalia.index.model.*;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;

import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;

@Singleton
public class IndexResultRankingService {
    private static final Logger logger = LoggerFactory.getLogger(IndexResultRankingService.class);
@@ -132,12 +144,10 @@ public class IndexResultRankingService {

    public List<SearchResultItem> rankResults(
            SearchContext rankingContext,
-           RankingData rankingData,
-           boolean exportDebugData)
+           RankingData rankingData)
    {
-       IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
-
        List<SearchResultItem> results = new ArrayList<>(rankingData.size());
+       CombinedIndexReader index = statefulIndex.get();

        // Iterate over documents by their index in the combinedDocIds, as we need the index for the
        // term data arrays as well
@@ -149,37 +159,25 @@
                continue;
            }

-           if (!exportDebugData) {
-               var score = resultRanker.calculateScore(null, rankingData.resultId(), rankingContext, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
-               if (score != null) {
-                   results.add(score);
-               }
-           }
-           else {
-               var rankingFactors = new DebugRankingFactors();
-               var score = resultRanker.calculateScore(rankingFactors, rankingData.resultId(), rankingContext, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
-
-               if (score != null) {
-                   score.debugRankingFactors = rankingFactors;
-                   results.add(score);
-               }
-           }
+           SearchResultItem score = calculateScore(null, index, rankingData.resultId(), rankingContext, rankingData);
+           if (score != null) {
+               results.add(score);
+           }
        }

        return results;
    }

    public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
                                                          int limitTotal,
                                                          SearchContext searchContext,
                                                          List<SearchResultItem> results) throws SQLException {

-       var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);
+       List<SearchResultItem> resultsList = new ArrayList<>(results.size());
+       TLongList idsList = new TLongArrayList(limitTotal);
+
+       IndexResultDomainDeduplicator domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);

        for (var item : results) {
            if (domainCountFilter.test(item)) {
@@ -209,12 +207,23 @@ public class IndexResultRankingService {

            resultsList.clear();

-           try (var data = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList))) {
-               resultsList.addAll(this.rankResults(
-                       searchContext,
-                       data,
-                       true)
-               );
+           // Re-rank the results while gathering debugging data
+           try (RankingData rankingData = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList))) {
+               CombinedIndexReader index = statefulIndex.get();
+
+               // Iterate over documents by their index in the combinedDocIds, as we need the index for the
+               // term data arrays as well
+
+               while (rankingData.next()) {
+                   if (!searchContext.phraseConstraints.testMandatory(rankingData.positions())) {
+                       continue;
+                   }
+
+                   SearchResultItem score = calculateScore(new DebugRankingFactors(), index, rankingData.resultId(), searchContext, rankingData);
+                   if (score != null) {
+                       resultsList.add(score);
+                   }
+               }
            }
            catch (TimeoutException ex) {
                // this won't happen since we passed null for budget
@@ -327,4 +336,556 @@ public class IndexResultRankingService {

    @Nullable
    public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
                                           CombinedIndexReader index,
                                           long combinedId,
                                           SearchContext rankingContext,
                                           IndexResultRankingService.RankingData rankingData)
    {
        long[] wordFlags = rankingData.flags();
        CodedSequence[] positions = rankingData.positions();
        DocumentSpans spans = rankingData.documentSpans();

        QueryParams queryParams = rankingContext.queryParams;
        CompiledQuery<String> compiledQuery = rankingContext.compiledQuery;

        CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);

        // If the document is not relevant to the query, abort early to reduce allocations and
        // avoid unnecessary calculations

        CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
        if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams)) {
            return null;
        }

        boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
        int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff));
        int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount());

        if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) {
            return null;
        }

        long docId = UrlIdCodec.removeRank(combinedId);
        long docMetadata = index.getDocumentMetadata(docId);
        int htmlFeatures = index.getHtmlFeatures(docId);

        int docSize = index.getDocumentSize(docId);
        if (docSize <= 0) docSize = 5000;

        if (debugRankingFactors != null) {
            debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId));
            debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId));
        }

        // Decode the coded positions lists into plain IntLists as at this point we will be
        // going over them multiple times
        IntList[] decodedPositions = new IntList[positions.length];
        for (int i = 0; i < positions.length; i++) {
            if (positions[i] != null) {
                decodedPositions[i] = positions[i].values();
            }
            else {
                decodedPositions[i] = IntList.of();
            }
        }

        var params = rankingContext.params;

        double documentBonus = calculateDocumentBonus(docMetadata, htmlFeatures, docSize, params, debugRankingFactors);

        VerbatimMatches verbatimMatches = new VerbatimMatches(decodedPositions, rankingContext.phraseConstraints, spans);
        UnorderedMatches unorderedMatches = new UnorderedMatches(decodedPositions, compiledQuery, rankingContext.regularMask, spans);

        float proximitiyFac = getProximitiyFac(decodedPositions, rankingContext.phraseConstraints, verbatimMatches, unorderedMatches, spans);

        double score_firstPosition = params.getTcfFirstPositionWeight() * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
        double score_verbatim = params.getTcfVerbatimWeight() * verbatimMatches.getScore();
        double score_proximity = params.getTcfProximityWeight() * proximitiyFac;
        double score_bM25 = params.getBm25Weight()
                * wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.getBm25K(), params.getBm25B(), unorderedMatches.getWeightedCounts(), docSize, rankingContext))
                / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
        double score_bFlags = params.getBm25Weight()
                * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
                / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));

        double rankingAdjustment = domainRankingOverrides.getRankingFactor(UrlIdCodec.getDomainId(combinedId));

        double score = normalize(
                rankingAdjustment * (score_firstPosition + score_proximity + score_verbatim + score_bM25 + score_bFlags),
                -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
        );

        if (Double.isNaN(score)) { // This should never happen but if it does, we want to know about it
            if (getClass().desiredAssertionStatus()) {
                throw new IllegalStateException("NaN in result value calculation");
            }
            score = Double.MAX_VALUE;
        }

        // Capture ranking factors for debugging
        if (debugRankingFactors != null) {
            debugRankingFactors.addDocumentFactor("score.bm25-main", Double.toString(score_bM25));
            debugRankingFactors.addDocumentFactor("score.bm25-flags", Double.toString(score_bFlags));
            debugRankingFactors.addDocumentFactor("score.verbatim", Double.toString(score_verbatim));
            debugRankingFactors.addDocumentFactor("score.proximity", Double.toString(score_proximity));
            debugRankingFactors.addDocumentFactor("score.firstPosition", Double.toString(score_firstPosition));

            for (int i = 0; i < rankingContext.termIdsAll.size(); i++) {
                long termId = rankingContext.termIdsAll.at(i);

                var flags = wordFlagsQuery.at(i);

                debugRankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags));

                for (var flag : WordFlags.values()) {
                    if (flag.isPresent((byte) flags)) {
                        debugRankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
                    }
                }

                for (HtmlTag tag : HtmlTag.includedTags) {
                    if (verbatimMatches.get(tag)) {
                        debugRankingFactors.addTermFactor(termId, "verbatim." + tag.name().toLowerCase(), "true");
                    }
                }

                if (positions[i] != null) {
                    debugRankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.heading", SequenceOperations.findIntersections(spans.heading.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.anchor", SequenceOperations.findIntersections(spans.anchor.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.code", SequenceOperations.findIntersections(spans.code.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.nav", SequenceOperations.findIntersections(spans.nav.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.body", SequenceOperations.findIntersections(spans.body.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.externalLinkText", SequenceOperations.findIntersections(spans.externalLinkText.positionValues(), decodedPositions[i]).iterator());
                }
            }
        }

        SearchResultItem ret = new SearchResultItem(combinedId,
                docMetadata,
                htmlFeatures,
                score,
                calculatePositionsMask(decodedPositions, rankingContext.phraseConstraints)
        );

        if (null != debugRankingFactors) {
            ret.debugRankingFactors = debugRankingFactors;
        }

        return ret;
    }

    private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
                                                   QueryParams queryParams)
    {
        QueryStrategy queryStrategy = queryParams.queryStrategy();
        if (queryStrategy == QueryStrategy.AUTO ||
                queryStrategy == QueryStrategy.SENTENCE ||
                queryStrategy == QueryStrategy.TOPIC) {
            return true;
        }

        return booleanAggregate(queryGraphScores,
                flags -> meetsQueryStrategyRequirements((byte) flags, queryStrategy));
    }

    private boolean meetsQueryStrategyRequirements(byte flags, QueryStrategy queryStrategy) {
        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
            return WordFlags.Site.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
            return WordFlags.Subjects.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
            return WordFlags.Title.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
            return WordFlags.UrlPath.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
            return WordFlags.UrlDomain.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
            return WordFlags.ExternalLink.isPresent(flags);
        }
        return true;
    }

    /** Calculate a bitmask illustrating the intersected positions of the search terms in the document.
     * This is used in the GUI.
     * */
    private long calculatePositionsMask(IntList[] positions, PhraseConstraintGroupList phraseConstraints) {

        long result = 0;
        int bit = 0;

        IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions, 64).intIterator();

        while (intersection.hasNext() && bit < 64) {
            bit = (int) (Math.sqrt(intersection.nextInt()));
            result |= 1L << bit;
        }

        return result;
    }

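The square root above compresses document positions non-linearly into at most 64 bits: early positions receive fine-grained bits, later positions increasingly coarse ones. A self-contained illustration with invented positions:

    // Invented positions for illustration only.
    long mask = 0;
    for (int pos : new int[] {0, 9, 100, 3000}) {
        int bit = (int) Math.sqrt(pos); // 0, 3, 10, 54
        if (bit < 64) mask |= 1L << bit;
    }
    // mask now has bits 0, 3, 10 and 54 set.
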
    private double calculateDocumentBonus(long documentMetadata,
                                          int features,
                                          int length,
                                          RpcResultRankingParameters rankingParams,
                                          @Nullable DebugRankingFactors debugRankingFactors) {

        if (rankingParams.getDisablePenalties()) {
            return 0.;
        }

        int rank = DocumentMetadata.decodeRank(documentMetadata);
        int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
        int quality = DocumentMetadata.decodeQuality(documentMetadata);
        int size = DocumentMetadata.decodeSize(documentMetadata);
        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
        int topology = DocumentMetadata.decodeTopology(documentMetadata);
        int year = DocumentMetadata.decodeYear(documentMetadata);

        double averageSentenceLengthPenalty
                = (asl >= rankingParams.getShortSentenceThreshold() ? 0 : -rankingParams.getShortSentencePenalty());

        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
        final double rankingBonus = (255. - rank) * rankingParams.getDomainRankBonus();
        final double topologyBonus = Math.log(1 + topology);
        final double documentLengthPenalty
                = length > rankingParams.getShortDocumentThreshold() ? 0 : -rankingParams.getShortDocumentPenalty();
        final double temporalBias;

        if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.RECENT) {
            temporalBias = -Math.abs(year - PubDate.MAX_YEAR) * rankingParams.getTemporalBiasWeight();
        } else if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.OLD) {
            temporalBias = -Math.abs(year - PubDate.MIN_YEAR) * rankingParams.getTemporalBiasWeight();
        } else {
            temporalBias = 0;
        }

        if (debugRankingFactors != null) {
            debugRankingFactors.addDocumentFactor("documentBonus.averageSentenceLengthPenalty", Double.toString(averageSentenceLengthPenalty));
            debugRankingFactors.addDocumentFactor("documentBonus.documentLengthPenalty", Double.toString(documentLengthPenalty));
            debugRankingFactors.addDocumentFactor("documentBonus.qualityPenalty", Double.toString(qualityPenalty));
            debugRankingFactors.addDocumentFactor("documentBonus.rankingBonus", Double.toString(rankingBonus));
            debugRankingFactors.addDocumentFactor("documentBonus.topologyBonus", Double.toString(topologyBonus));
            debugRankingFactors.addDocumentFactor("documentBonus.temporalBias", Double.toString(temporalBias));
            debugRankingFactors.addDocumentFactor("documentBonus.flagsPenalty", Double.toString(flagsPenalty));
        }

        return averageSentenceLengthPenalty
                + documentLengthPenalty
                + qualityPenalty
                + rankingBonus
                + topologyBonus
                + temporalBias
                + flagsPenalty;
    }

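To make the temporal bias concrete: with Bias.RECENT, a document year of 2010, and an illustrative bias weight of 1.0, the term is -|2010 - PubDate.MAX_YEAR| * 1.0, which would contribute -15 if MAX_YEAR were 2025. The actual weight and year bounds come from the ranking parameters and PubDate, neither of which is shown in this diff.
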
    /** Calculate the proximity factor for the document.
     * <p></p>
     * The proximity factor is a bonus based on how close the search terms are to each other in the document,
     * which turns into a penalty if the distance is too large.
     * */
    private static float getProximitiyFac(IntList[] positions,
                                          PhraseConstraintGroupList constraintGroups,
                                          VerbatimMatches verbatimMatches,
                                          UnorderedMatches unorderedMatches,
                                          DocumentSpans spans
    ) {
        float proximitiyFac = 0;

        if (positions.length > 2) {
            int minDist = constraintGroups.getFullGroup().minDistance(positions);
            if (minDist > 0 && minDist < Integer.MAX_VALUE) {
                if (minDist < 32) {
                    // If min-dist is sufficiently small, we give a tapering reward to the document
                    proximitiyFac = 2.0f / (0.1f + (float) Math.sqrt(minDist));
                } else {
                    // if it is too large, we add a mounting penalty
                    proximitiyFac = -1.0f * (float) Math.sqrt(minDist);
                }
            }
        }

        // Give a bonus proximity score if all keywords are in the title
        if (!verbatimMatches.get(HtmlTag.TITLE)
                && unorderedMatches.searchableKeywordCount > 2
                && unorderedMatches.getObservationCount(HtmlTag.TITLE) == unorderedMatches.searchableKeywordCount) {
            proximitiyFac += unorderedMatches.getObservationCount(HtmlTag.TITLE)
                    * (2.5f + 2.f / Math.max(1, spans.title.length()));
        }

        // Give a bonus proximity score if all keywords are in a heading
        if (spans.heading.length() < 64
                && !verbatimMatches.get(HtmlTag.HEADING)
                && unorderedMatches.getObservationCount(HtmlTag.HEADING) == unorderedMatches.searchableKeywordCount)
        {
            proximitiyFac += 1.0f * unorderedMatches.getObservationCount(HtmlTag.HEADING);
        }

        return proximitiyFac;
    }

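Concretely: minDist = 4 gives a reward of 2.0 / (0.1 + 2.0) ≈ 0.95 and minDist = 16 gives 2.0 / (0.1 + 4.0) ≈ 0.49, while past the cutoff minDist = 100 gives a penalty of -sqrt(100) = -10.
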
    /** A helper class for capturing the verbatim phrase matches in the document */
    private static class VerbatimMatches {
        private final BitSet matches;
        private float score = 0.f;

        private static final float[] weights_full;
        private static final float[] weights_partial;

        static {
            weights_full = new float[HtmlTag.includedTags.length];
            weights_partial = new float[HtmlTag.includedTags.length];

            for (int i = 0; i < weights_full.length; i++) {
                weights_full[i] = switch(HtmlTag.includedTags[i]) {
                    case TITLE -> 4.0f;
                    case HEADING -> 1.5f;
                    case ANCHOR -> 0.2f;
                    case NAV -> 0.1f;
                    case CODE -> 0.25f;
                    case EXTERNAL_LINKTEXT -> 3.0f;
                    case BODY -> 1.0f;
                    default -> 0.0f;
                };
            }

            for (int i = 0; i < weights_partial.length; i++) {
                weights_partial[i] = switch(HtmlTag.includedTags[i]) {
                    case TITLE -> 1.5f;
                    case HEADING -> 1.f;
                    case ANCHOR -> 0.2f;
                    case NAV -> 0.1f;
                    case CODE -> 0.25f;
                    case EXTERNAL_LINKTEXT -> 2.0f;
                    case BODY -> 0.25f;
                    default -> 0.0f;
                };
            }
        }

        public VerbatimMatches(IntList[] positions, PhraseConstraintGroupList constraints, DocumentSpans spans) {
            matches = new BitSet(HtmlTag.includedTags.length);

            var fullGroup = constraints.getFullGroup();
            IntList fullGroupIntersections = fullGroup.findIntersections(positions);

            int largestOptional = constraints.getFullGroup().size;
            if (largestOptional < 2) {
                var titleSpan = spans.getSpan(HtmlTag.TITLE);
                if (titleSpan.length() == fullGroup.size
                        && titleSpan.containsRange(fullGroupIntersections, fullGroup.size))
                {
                    score += 4; // If the title is a single word and the same as the query, we give it a verbatim bonus
                }

                var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
                if (extLinkSpan.length() >= fullGroup.size) {
                    int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
                    if (cnt > 0) {
                        score += 2 * cnt;
                    }
                }

                return;
            }

            // Capture full query matches
            for (var tag : HtmlTag.includedTags) {
                int cnts = spans.getSpan(tag).countRangeMatches(fullGroupIntersections, fullGroup.size);
                if (cnts > 0) {
                    matches.set(tag.ordinal());
                    score += (float) (weights_full[tag.ordinal()] * fullGroup.size + (1 + Math.log(2 + cnts)));
                }
            }

            // Bonus if there's a perfect match with an atag span
            var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
            if (extLinkSpan.length() >= fullGroup.size) {
                int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
                score += 2 * cnt;
            }

            // For optional groups, we scale the score by the size of the group relative to the full group
            for (var optionalGroup : constraints.getOptionalGroups()) {
                int groupSize = optionalGroup.size;
                float sizeScalingFactor = groupSize / (float) largestOptional;

                IntList intersections = optionalGroup.findIntersections(positions);

                for (var tag : HtmlTag.includedTags) {
                    int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
                    if (cnts > 0) {
                        score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
                    }
                }
            }
        }

        public boolean get(HtmlTag tag) {
            assert !tag.exclude;
            return matches.get(tag.ordinal());
        }

        public float getScore() {
            return score;
        }
    }

    /** A helper class for capturing the counts of unordered matches in the document */
    private static class UnorderedMatches {
        private final int[] observationsByTag;
        private final float[] valuesByWordIdx;
        private static final float[] weights;

        private int firstPosition = 1;
        private int searchableKeywordCount = 0;

        static {
            weights = new float[HtmlTag.includedTags.length];

            for (int i = 0; i < weights.length; i++) {
                weights[i] = switch(HtmlTag.includedTags[i]) {
                    case TITLE -> 2.5f;
                    case HEADING -> 1.25f;
                    case ANCHOR -> 0.2f;
                    case NAV -> 0.1f;
                    case CODE -> 0.25f;
                    case BODY -> 1.0f;
                    case EXTERNAL_LINKTEXT -> 1.5f;
                    default -> 0.0f;
                };
            }
        }

        public UnorderedMatches(IntList[] positions, CompiledQuery<String> compiledQuery,
                                BitSet regularMask,
                                DocumentSpans spans) {
            observationsByTag = new int[HtmlTag.includedTags.length];
            valuesByWordIdx = new float[compiledQuery.size()];

            for (int i = 0; i < compiledQuery.size(); i++) {

                if (!regularMask.get(i))
                    continue;

                if (positions[i] == null || positions[i].isEmpty()) {
                    firstPosition = Integer.MAX_VALUE;
                    continue;
                }

                firstPosition = Math.max(firstPosition, positions[i].getInt(0));
                searchableKeywordCount++;

                for (var tag : HtmlTag.includedTags) {
                    int cnt = spans.getSpan(tag).countIntersections(positions[i]);
                    observationsByTag[tag.ordinal()] += cnt;
                    valuesByWordIdx[i] += cnt * weights[tag.ordinal()];
                }
            }
        }

        public int getObservationCount(HtmlTag tag) {
            return observationsByTag[tag.ordinal()];
        }

        public float[] getWeightedCounts() {
            return valuesByWordIdx;
        }

        public int size() {
            return valuesByWordIdx.length;
        }
    }

    private double calculateQualityPenalty(int size, int quality, RpcResultRankingParameters rankingParams) {
        if (size < 400) {
            if (quality < 5)
                return 0;
            return -quality * rankingParams.getQualityPenalty();
        }
        else {
            return -quality * rankingParams.getQualityPenalty() * 20;
        }
    }

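As a worked example, with an illustrative getQualityPenalty() of 0.1: a small document (size 300) at quality 3 falls under the quality-5 floor and takes no penalty, at quality 8 it takes -0.8, while a large document (size 500) at quality 8 takes -0.8 * 20 = -16, since the floor only applies to small documents.
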
    private int flagsPenalty(int featureFlags, long docFlags, int size) {

        // Short-circuit for index-service, which does not have the feature flags
        if (featureFlags == 0)
            return 0;

        double penalty = 0;

        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
        boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
        boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);

        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
        double largeSiteFactor = 1.;

        if (!isForum && !isWiki && !isDocs && size > 400) {
            // Long urls-that-look-like-this tend to be poor search results
            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
                penalty += 30.0;
            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
                penalty += 30.;
            else penalty += 5.;

            largeSiteFactor = 2;
        }

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.ADVERTISEMENT.getFeatureBit()))
            penalty += 7.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.CONSENT.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.POPOVER.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
            penalty += 5.0 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
            penalty += 5.0 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.SHORT_DOCUMENT.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (isForum || isWiki) {
            penalty = Math.min(0, penalty - 2);
        }

        return (int) -penalty;
    }

    /** Normalize a value to the range 0...15, where 0 is the best possible score
     *
     * @param value The value to normalize, must be positive or zero
     * @param penalty Any negative component of the value
     * */
    public static double normalize(double value, double penalty) {
        if (value < 0)
            value = 0;

        return Math.sqrt((1.0 + 500. + 10 * penalty) / (1.0 + value));
    }

}

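Reading the formula: normalize(value, penalty) = sqrt((501 + 10 * penalty) / (1 + value)). With zero penalty, value = 0 maps to sqrt(501) ≈ 22.4 while value = 2000 maps to sqrt(501 / 2001) ≈ 0.5, so lower output means a stronger result; the 0...15 range in the comment is approximate rather than a hard bound.
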
@@ -1,592 +0,0 @@
-package nu.marginalia.index.results;
-
-import it.unimi.dsi.fastutil.ints.IntIterator;
-import it.unimi.dsi.fastutil.ints.IntList;
-import nu.marginalia.api.searchquery.RpcResultRankingParameters;
-import nu.marginalia.api.searchquery.RpcTemporalBias;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
-import nu.marginalia.api.searchquery.model.query.QueryStrategy;
-import nu.marginalia.api.searchquery.model.results.SearchResultItem;
-import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
-import nu.marginalia.index.CombinedIndexReader;
-import nu.marginalia.index.StatefulIndex;
-import nu.marginalia.index.forward.spans.DocumentSpans;
-import nu.marginalia.index.model.PhraseConstraintGroupList;
-import nu.marginalia.index.model.QueryParams;
-import nu.marginalia.index.model.SearchContext;
-import nu.marginalia.language.sentence.tag.HtmlTag;
-import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.DocumentFlags;
-import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.sequence.CodedSequence;
-import nu.marginalia.sequence.SequenceOperations;
-
-import javax.annotation.Nullable;
-import java.util.BitSet;
-
-import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
-import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
-
-/** This class is responsible for calculating the score of a search result.
- * It holds the data required to perform the scoring, as there are strong
- * reasons to cache this data, and performs the calculations */
-public class IndexResultScoreCalculator {
-    private final CombinedIndexReader index;
-    private final QueryParams queryParams;
-
-    private final DomainRankingOverrides domainRankingOverrides;
-    private final SearchContext rankingContext;
-    private final CompiledQuery<String> compiledQuery;
-
-    public IndexResultScoreCalculator(StatefulIndex statefulIndex,
-                                      DomainRankingOverrides domainRankingOverrides,
-                                      SearchContext rankingContext)
-    {
-        this.index = statefulIndex.get();
-        this.domainRankingOverrides = domainRankingOverrides;
-        this.rankingContext = rankingContext;
-
-        this.queryParams = rankingContext.queryParams;
-        this.compiledQuery = rankingContext.compiledQuery;
-    }
-
-    @Nullable
-    public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
-                                           long combinedId,
-                                           SearchContext rankingContext,
-                                           long[] wordFlags,
-                                           CodedSequence[] positions,
-                                           DocumentSpans spans)
-    {
-        [… the remaining deleted lines reproduce, near-verbatim, the scoring code shown above after its move into IndexResultRankingService: the body of calculateScore, the meetsQueryStrategyRequirements overloads, calculatePositionsMask, calculateDocumentBonus, getProximitiyFac, the VerbatimMatches and UnorderedMatches helper classes, calculateQualityPenalty, flagsPenalty, and normalize …]
-}