Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-05 21:22:39 +02:00
(refac) Merge IndexResultScoreCalculator into IndexResultRankingService
@@ -170,7 +170,7 @@ public class PerfTestMain {
             int iter;
             for (iter = 0;; iter++) {
                 long start = System.nanoTime();
-                sum2 += rankingService.rankResults(rankingContext, rankingData, false).size();
+                sum2 += rankingService.rankResults(rankingContext, rankingData).size();
                 long end = System.nanoTime();
                 times.add((end - start)/1_000_000.);
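
The only change above is the dropped boolean flag; the timing harness itself is untouched. Since the loop gathers per-iteration latencies in milliseconds into times, here is a minimal sketch of summarizing them after the loop (a hypothetical helper inside PerfTestMain, not repository code; assumes times is a List<Double>):

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    // Hypothetical summary helper for the collected millisecond timings.
    static double percentile(List<Double> samples, double p) {
        List<Double> sorted = new ArrayList<>(samples);
        Collections.sort(sorted);
        int idx = (int) Math.round(p * (sorted.size() - 1)); // nearest-rank style index
        return sorted.get(idx);
    }

    // After the loop:
    // System.out.printf("p50=%.2f ms, p99=%.2f ms%n", percentile(times, 0.50), percentile(times, 0.99));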
@@ -228,7 +228,7 @@ public class IndexQueryExecution {

        try (rankingData) {
            long st = System.nanoTime();
-           resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData, false));
+           resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData));
            long et = System.nanoTime();

            metric_index_rank_time_s
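
The metric call is truncated here (metric_index_rank_time_s with no invocation visible). If it is a Prometheus-style histogram measured in seconds — an assumption, since the declaration is not part of this diff — the elapsed nanoseconds would be recorded roughly as:

    // Sketch only: assumes metric_index_rank_time_s is an io.prometheus.client.Histogram.
    // observe(double) records one sample; the _s suffix suggests seconds, hence the conversion.
    metric_index_rank_time_s.observe((et - st) / 1_000_000_000.0);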
@@ -4,33 +4,45 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.StatefulIndex;
import nu.marginalia.index.forward.spans.DocumentSpans;
-import nu.marginalia.index.model.CombinedDocIdList;
-import nu.marginalia.index.model.SearchContext;
-import nu.marginalia.index.model.TermMetadataList;
+import nu.marginalia.index.model.*;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;

import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;

@Singleton
public class IndexResultRankingService {
    private static final Logger logger = LoggerFactory.getLogger(IndexResultRankingService.class);
@@ -132,12 +144,10 @@ public class IndexResultRankingService {

    public List<SearchResultItem> rankResults(
            SearchContext rankingContext,
-           RankingData rankingData,
-           boolean exportDebugData)
+           RankingData rankingData)
    {
-       IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
-
        List<SearchResultItem> results = new ArrayList<>(rankingData.size());
+       CombinedIndexReader index = statefulIndex.get();

        // Iterate over documents by their index in the combinedDocIds, as we need the index for the
        // term data arrays as well
@@ -149,37 +159,25 @@
                continue;
            }

-           if (!exportDebugData) {
-               var score = resultRanker.calculateScore(null, rankingData.resultId(), rankingContext, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
-               if (score != null) {
-                   results.add(score);
-               }
-           }
-           else {
-               var rankingFactors = new DebugRankingFactors();
-               var score = resultRanker.calculateScore(rankingFactors, rankingData.resultId(), rankingContext, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
-
-               if (score != null) {
-                   score.debugRankingFactors = rankingFactors;
-                   results.add(score);
-               }
-           }
+           SearchResultItem score = calculateScore(null, index, rankingData.resultId(), rankingContext, rankingData);
+           if (score != null) {
+               results.add(score);
+           }
        }

        return results;
    }

    public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
                                                          int limitTotal,
                                                          SearchContext searchContext,
                                                          List<SearchResultItem> results) throws SQLException {

-       var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);
+       List<SearchResultItem> resultsList = new ArrayList<>(results.size());
+       TLongList idsList = new TLongArrayList(limitTotal);
+
+       IndexResultDomainDeduplicator domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);

        for (var item : results) {
            if (domainCountFilter.test(item)) {
@@ -209,12 +207,23 @@ public class IndexResultRankingService {

            resultsList.clear();

-           try (var data = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList))) {
-               resultsList.addAll(this.rankResults(
-                       searchContext,
-                       data,
-                       true)
-               );
+           // Re-rank the results while gathering debugging data
+           try (RankingData rankingData = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList))) {
+               CombinedIndexReader index = statefulIndex.get();
+
+               // Iterate over documents by their index in the combinedDocIds, as we need the index for the
+               // term data arrays as well
+
+               while (rankingData.next()) {
+                   if (!searchContext.phraseConstraints.testMandatory(rankingData.positions())) {
+                       continue;
+                   }
+
+                   SearchResultItem score = calculateScore(new DebugRankingFactors(), index, rankingData.resultId(), searchContext, rankingData);
+                   if (score != null) {
+                       resultsList.add(score);
+                   }
+               }
            }
            catch (TimeoutException ex) {
                // this won't happen since we passed null for budget
@@ -327,4 +336,556 @@ public class IndexResultRankingService {

    @Nullable
    public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
                                           CombinedIndexReader index,
                                           long combinedId,
                                           SearchContext rankingContext,
                                           IndexResultRankingService.RankingData rankingData)
    {
        long[] wordFlags = rankingData.flags();
        CodedSequence[] positions = rankingData.positions();
        DocumentSpans spans = rankingData.documentSpans();

        QueryParams queryParams = rankingContext.queryParams;
        CompiledQuery<String> compiledQuery = rankingContext.compiledQuery;

        CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);

        // If the document is not relevant to the query, abort early to reduce allocations and
        // avoid unnecessary calculations

        CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
        if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams)) {
            return null;
        }

        boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
        int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff));
        int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount());

        if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) {
            return null;
        }

        long docId = UrlIdCodec.removeRank(combinedId);
        long docMetadata = index.getDocumentMetadata(docId);
        int htmlFeatures = index.getHtmlFeatures(docId);

        int docSize = index.getDocumentSize(docId);
        if (docSize <= 0) docSize = 5000;

        if (debugRankingFactors != null) {
            debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId));
            debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId));
        }

        // Decode the coded positions lists into plain IntLists as at this point we will be
        // going over them multiple times
        IntList[] decodedPositions = new IntList[positions.length];
        for (int i = 0; i < positions.length; i++) {
            if (positions[i] != null) {
                decodedPositions[i] = positions[i].values();
            }
            else {
                decodedPositions[i] = IntList.of();
            }
        }

        var params = rankingContext.params;

        double documentBonus = calculateDocumentBonus(docMetadata, htmlFeatures, docSize, params, debugRankingFactors);

        VerbatimMatches verbatimMatches = new VerbatimMatches(decodedPositions, rankingContext.phraseConstraints, spans);
        UnorderedMatches unorderedMatches = new UnorderedMatches(decodedPositions, compiledQuery, rankingContext.regularMask, spans);

        float proximitiyFac = getProximitiyFac(decodedPositions, rankingContext.phraseConstraints, verbatimMatches, unorderedMatches, spans);

        double score_firstPosition = params.getTcfFirstPositionWeight() * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
        double score_verbatim = params.getTcfVerbatimWeight() * verbatimMatches.getScore();
        double score_proximity = params.getTcfProximityWeight() * proximitiyFac;
        double score_bM25 = params.getBm25Weight()
                * wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.getBm25K(), params.getBm25B(), unorderedMatches.getWeightedCounts(), docSize, rankingContext))
                / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
        double score_bFlags = params.getBm25Weight()
                * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
                / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));

        double rankingAdjustment = domainRankingOverrides.getRankingFactor(UrlIdCodec.getDomainId(combinedId));

        double score = normalize(
                rankingAdjustment * (score_firstPosition + score_proximity + score_verbatim + score_bM25 + score_bFlags),
                -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
        );

        if (Double.isNaN(score)) { // This should never happen but if it does, we want to know about it
            if (getClass().desiredAssertionStatus()) {
                throw new IllegalStateException("NaN in result value calculation");
            }
            score = Double.MAX_VALUE;
        }

        // Capture ranking factors for debugging
        if (debugRankingFactors != null) {
            debugRankingFactors.addDocumentFactor("score.bm25-main", Double.toString(score_bM25));
            debugRankingFactors.addDocumentFactor("score.bm25-flags", Double.toString(score_bFlags));
            debugRankingFactors.addDocumentFactor("score.verbatim", Double.toString(score_verbatim));
            debugRankingFactors.addDocumentFactor("score.proximity", Double.toString(score_proximity));
            debugRankingFactors.addDocumentFactor("score.firstPosition", Double.toString(score_firstPosition));

            for (int i = 0; i < rankingContext.termIdsAll.size(); i++) {
                long termId = rankingContext.termIdsAll.at(i);

                var flags = wordFlagsQuery.at(i);

                debugRankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags));

                for (var flag : WordFlags.values()) {
                    if (flag.isPresent((byte) flags)) {
                        debugRankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
                    }
                }

                for (HtmlTag tag : HtmlTag.includedTags) {
                    if (verbatimMatches.get(tag)) {
                        debugRankingFactors.addTermFactor(termId, "verbatim." + tag.name().toLowerCase(), "true");
                    }
                }

                if (positions[i] != null) {
                    debugRankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.heading", SequenceOperations.findIntersections(spans.heading.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.anchor", SequenceOperations.findIntersections(spans.anchor.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.code", SequenceOperations.findIntersections(spans.code.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.nav", SequenceOperations.findIntersections(spans.nav.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.body", SequenceOperations.findIntersections(spans.body.positionValues(), decodedPositions[i]).iterator());
                    debugRankingFactors.addTermFactor(termId, "positions.externalLinkText", SequenceOperations.findIntersections(spans.externalLinkText.positionValues(), decodedPositions[i]).iterator());
                }
            }
        }

        SearchResultItem ret = new SearchResultItem(combinedId,
                docMetadata,
                htmlFeatures,
                score,
                calculatePositionsMask(decodedPositions, rankingContext.phraseConstraints)
        );

        if (null != debugRankingFactors) {
            ret.debugRankingFactors = debugRankingFactors;
        }

        return ret;
    }

    private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
                                                   QueryParams queryParams)
    {
        QueryStrategy queryStrategy = queryParams.queryStrategy();
        if (queryStrategy == QueryStrategy.AUTO ||
                queryStrategy == QueryStrategy.SENTENCE ||
                queryStrategy == QueryStrategy.TOPIC) {
            return true;
        }

        return booleanAggregate(queryGraphScores,
                flags -> meetsQueryStrategyRequirements((byte) flags, queryStrategy));
    }

    private boolean meetsQueryStrategyRequirements(byte flags, QueryStrategy queryStrategy) {
        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
            return WordFlags.Site.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
            return WordFlags.Subjects.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
            return WordFlags.Title.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
            return WordFlags.UrlPath.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
            return WordFlags.UrlDomain.isPresent(flags);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
            return WordFlags.ExternalLink.isPresent(flags);
        }
        return true;
    }

    /** Calculate a bitmask illustrating the intersected positions of the search terms in the document.
     * This is used in the GUI.
     * */
    private long calculatePositionsMask(IntList[] positions, PhraseConstraintGroupList phraseConstraints) {

        long result = 0;
        int bit = 0;

        IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions, 64).intIterator();

        while (intersection.hasNext() && bit < 64) {
            bit = (int) (Math.sqrt(intersection.nextInt()));
            result |= 1L << bit;
        }

        return result;
    }

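The square root above compresses document positions non-linearly into at most 64 bits: early positions receive fine-grained bits, later positions increasingly coarse ones. A self-contained illustration with invented positions:

    // Invented positions for illustration only.
    long mask = 0;
    for (int pos : new int[] {0, 9, 100, 3000}) {
        int bit = (int) Math.sqrt(pos); // 0, 3, 10, 54
        if (bit < 64) mask |= 1L << bit;
    }
    // mask now has bits 0, 3, 10 and 54 set.
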
    private double calculateDocumentBonus(long documentMetadata,
                                          int features,
                                          int length,
                                          RpcResultRankingParameters rankingParams,
                                          @Nullable DebugRankingFactors debugRankingFactors) {

        if (rankingParams.getDisablePenalties()) {
            return 0.;
        }

        int rank = DocumentMetadata.decodeRank(documentMetadata);
        int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
        int quality = DocumentMetadata.decodeQuality(documentMetadata);
        int size = DocumentMetadata.decodeSize(documentMetadata);
        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
        int topology = DocumentMetadata.decodeTopology(documentMetadata);
        int year = DocumentMetadata.decodeYear(documentMetadata);

        double averageSentenceLengthPenalty
                = (asl >= rankingParams.getShortSentenceThreshold() ? 0 : -rankingParams.getShortSentencePenalty());

        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
        final double rankingBonus = (255. - rank) * rankingParams.getDomainRankBonus();
        final double topologyBonus = Math.log(1 + topology);
        final double documentLengthPenalty
                = length > rankingParams.getShortDocumentThreshold() ? 0 : -rankingParams.getShortDocumentPenalty();
        final double temporalBias;

        if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.RECENT) {
            temporalBias = -Math.abs(year - PubDate.MAX_YEAR) * rankingParams.getTemporalBiasWeight();
        } else if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.OLD) {
            temporalBias = -Math.abs(year - PubDate.MIN_YEAR) * rankingParams.getTemporalBiasWeight();
        } else {
            temporalBias = 0;
        }

        if (debugRankingFactors != null) {
            debugRankingFactors.addDocumentFactor("documentBonus.averageSentenceLengthPenalty", Double.toString(averageSentenceLengthPenalty));
            debugRankingFactors.addDocumentFactor("documentBonus.documentLengthPenalty", Double.toString(documentLengthPenalty));
            debugRankingFactors.addDocumentFactor("documentBonus.qualityPenalty", Double.toString(qualityPenalty));
            debugRankingFactors.addDocumentFactor("documentBonus.rankingBonus", Double.toString(rankingBonus));
            debugRankingFactors.addDocumentFactor("documentBonus.topologyBonus", Double.toString(topologyBonus));
            debugRankingFactors.addDocumentFactor("documentBonus.temporalBias", Double.toString(temporalBias));
            debugRankingFactors.addDocumentFactor("documentBonus.flagsPenalty", Double.toString(flagsPenalty));
        }

        return averageSentenceLengthPenalty
                + documentLengthPenalty
                + qualityPenalty
                + rankingBonus
                + topologyBonus
                + temporalBias
                + flagsPenalty;
    }

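To make the temporal bias concrete: with Bias.RECENT, a document year of 2010, and an illustrative bias weight of 1.0, the term is -|2010 - PubDate.MAX_YEAR| * 1.0, which would contribute -15 if MAX_YEAR were 2025. The actual weight and year bounds come from the ranking parameters and PubDate, neither of which is shown in this diff.
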
    /** Calculate the proximity factor for the document.
     * <p></p>
     * The proximity factor is a bonus based on how close the search terms are to each other in the document,
     * which turns into a penalty if the distance is too large.
     * */
    private static float getProximitiyFac(IntList[] positions,
                                          PhraseConstraintGroupList constraintGroups,
                                          VerbatimMatches verbatimMatches,
                                          UnorderedMatches unorderedMatches,
                                          DocumentSpans spans
    ) {
        float proximitiyFac = 0;

        if (positions.length > 2) {
            int minDist = constraintGroups.getFullGroup().minDistance(positions);
            if (minDist > 0 && minDist < Integer.MAX_VALUE) {
                if (minDist < 32) {
                    // If min-dist is sufficiently small, we give a tapering reward to the document
                    proximitiyFac = 2.0f / (0.1f + (float) Math.sqrt(minDist));
                } else {
                    // if it is too large, we add a mounting penalty
                    proximitiyFac = -1.0f * (float) Math.sqrt(minDist);
                }
            }
        }

        // Give a bonus proximity score if all keywords are in the title
        if (!verbatimMatches.get(HtmlTag.TITLE)
                && unorderedMatches.searchableKeywordCount > 2
                && unorderedMatches.getObservationCount(HtmlTag.TITLE) == unorderedMatches.searchableKeywordCount) {
            proximitiyFac += unorderedMatches.getObservationCount(HtmlTag.TITLE)
                    * (2.5f + 2.f / Math.max(1, spans.title.length()));
        }

        // Give a bonus proximity score if all keywords are in a heading
        if (spans.heading.length() < 64
                && !verbatimMatches.get(HtmlTag.HEADING)
                && unorderedMatches.getObservationCount(HtmlTag.HEADING) == unorderedMatches.searchableKeywordCount)
        {
            proximitiyFac += 1.0f * unorderedMatches.getObservationCount(HtmlTag.HEADING);
        }

        return proximitiyFac;
    }

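Concretely: minDist = 4 gives a reward of 2.0 / (0.1 + 2.0) ≈ 0.95 and minDist = 16 gives 2.0 / (0.1 + 4.0) ≈ 0.49, while past the cutoff minDist = 100 gives a penalty of -sqrt(100) = -10.
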
    /** A helper class for capturing the verbatim phrase matches in the document */
    private static class VerbatimMatches {
        private final BitSet matches;
        private float score = 0.f;

        private static final float[] weights_full;
        private static final float[] weights_partial;

        static {
            weights_full = new float[HtmlTag.includedTags.length];
            weights_partial = new float[HtmlTag.includedTags.length];

            for (int i = 0; i < weights_full.length; i++) {
                weights_full[i] = switch(HtmlTag.includedTags[i]) {
                    case TITLE -> 4.0f;
                    case HEADING -> 1.5f;
                    case ANCHOR -> 0.2f;
                    case NAV -> 0.1f;
                    case CODE -> 0.25f;
                    case EXTERNAL_LINKTEXT -> 3.0f;
                    case BODY -> 1.0f;
                    default -> 0.0f;
                };
            }

            for (int i = 0; i < weights_partial.length; i++) {
                weights_partial[i] = switch(HtmlTag.includedTags[i]) {
                    case TITLE -> 1.5f;
                    case HEADING -> 1.f;
                    case ANCHOR -> 0.2f;
                    case NAV -> 0.1f;
                    case CODE -> 0.25f;
                    case EXTERNAL_LINKTEXT -> 2.0f;
                    case BODY -> 0.25f;
                    default -> 0.0f;
                };
            }
        }

        public VerbatimMatches(IntList[] positions, PhraseConstraintGroupList constraints, DocumentSpans spans) {
            matches = new BitSet(HtmlTag.includedTags.length);

            var fullGroup = constraints.getFullGroup();
            IntList fullGroupIntersections = fullGroup.findIntersections(positions);

            int largestOptional = constraints.getFullGroup().size;
            if (largestOptional < 2) {
                var titleSpan = spans.getSpan(HtmlTag.TITLE);
                if (titleSpan.length() == fullGroup.size
                        && titleSpan.containsRange(fullGroupIntersections, fullGroup.size))
                {
                    score += 4; // If the title is a single word and the same as the query, we give it a verbatim bonus
                }

                var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
                if (extLinkSpan.length() >= fullGroup.size) {
                    int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
                    if (cnt > 0) {
                        score += 2 * cnt;
                    }
                }

                return;
            }

            // Capture full query matches
            for (var tag : HtmlTag.includedTags) {
                int cnts = spans.getSpan(tag).countRangeMatches(fullGroupIntersections, fullGroup.size);
                if (cnts > 0) {
                    matches.set(tag.ordinal());
                    score += (float) (weights_full[tag.ordinal()] * fullGroup.size + (1 + Math.log(2 + cnts)));
                }
            }

            // Bonus if there's a perfect match with an atag span
            var extLinkSpan = spans.getSpan(HtmlTag.EXTERNAL_LINKTEXT);
            if (extLinkSpan.length() >= fullGroup.size) {
                int cnt = extLinkSpan.containsRangeExact(fullGroupIntersections, fullGroup.size);
                score += 2 * cnt;
            }

            // For optional groups, we scale the score by the size of the group relative to the full group
            for (var optionalGroup : constraints.getOptionalGroups()) {
                int groupSize = optionalGroup.size;
                float sizeScalingFactor = groupSize / (float) largestOptional;

                IntList intersections = optionalGroup.findIntersections(positions);

                for (var tag : HtmlTag.includedTags) {
                    int cnts = spans.getSpan(tag).countRangeMatches(intersections, fullGroup.size);
                    if (cnts > 0) {
                        score += (float) (weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor * (1 + Math.log(2 + cnts)));
                    }
                }
            }
        }

        public boolean get(HtmlTag tag) {
            assert !tag.exclude;
            return matches.get(tag.ordinal());
        }

        public float getScore() {
            return score;
        }
    }

    /** A helper class for capturing the counts of unordered matches in the document */
    private static class UnorderedMatches {
        private final int[] observationsByTag;
        private final float[] valuesByWordIdx;
        private static final float[] weights;

        private int firstPosition = 1;
        private int searchableKeywordCount = 0;

        static {
            weights = new float[HtmlTag.includedTags.length];

            for (int i = 0; i < weights.length; i++) {
                weights[i] = switch(HtmlTag.includedTags[i]) {
                    case TITLE -> 2.5f;
                    case HEADING -> 1.25f;
                    case ANCHOR -> 0.2f;
                    case NAV -> 0.1f;
                    case CODE -> 0.25f;
                    case BODY -> 1.0f;
                    case EXTERNAL_LINKTEXT -> 1.5f;
                    default -> 0.0f;
                };
            }
        }

        public UnorderedMatches(IntList[] positions, CompiledQuery<String> compiledQuery,
                                BitSet regularMask,
                                DocumentSpans spans) {
            observationsByTag = new int[HtmlTag.includedTags.length];
            valuesByWordIdx = new float[compiledQuery.size()];

            for (int i = 0; i < compiledQuery.size(); i++) {

                if (!regularMask.get(i))
                    continue;

                if (positions[i] == null || positions[i].isEmpty()) {
                    firstPosition = Integer.MAX_VALUE;
                    continue;
                }

                firstPosition = Math.max(firstPosition, positions[i].getInt(0));
                searchableKeywordCount++;

                for (var tag : HtmlTag.includedTags) {
                    int cnt = spans.getSpan(tag).countIntersections(positions[i]);
                    observationsByTag[tag.ordinal()] += cnt;
                    valuesByWordIdx[i] += cnt * weights[tag.ordinal()];
                }
            }
        }

        public int getObservationCount(HtmlTag tag) {
            return observationsByTag[tag.ordinal()];
        }

        public float[] getWeightedCounts() {
            return valuesByWordIdx;
        }

        public int size() {
            return valuesByWordIdx.length;
        }
    }

    private double calculateQualityPenalty(int size, int quality, RpcResultRankingParameters rankingParams) {
        if (size < 400) {
            if (quality < 5)
                return 0;
            return -quality * rankingParams.getQualityPenalty();
        }
        else {
            return -quality * rankingParams.getQualityPenalty() * 20;
        }
    }

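As a worked example, with an illustrative getQualityPenalty() of 0.1: a small document (size 300) at quality 3 falls under the quality-5 floor and takes no penalty, at quality 8 it takes -0.8, while a large document (size 500) at quality 8 takes -0.8 * 20 = -16, since the floor only applies to small documents.
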
    private int flagsPenalty(int featureFlags, long docFlags, int size) {

        // Short-circuit for index-service, which does not have the feature flags
        if (featureFlags == 0)
            return 0;

        double penalty = 0;

        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
        boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
        boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);

        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
        double largeSiteFactor = 1.;

        if (!isForum && !isWiki && !isDocs && size > 400) {
            // Long urls-that-look-like-this tend to be poor search results
            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
                penalty += 30.0;
            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
                penalty += 30.;
            else penalty += 5.;

            largeSiteFactor = 2;
        }

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.ADVERTISEMENT.getFeatureBit()))
            penalty += 7.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.CONSENT.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.POPOVER.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
            penalty += 5.0 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
            penalty += 5.0 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.SHORT_DOCUMENT.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (isForum || isWiki) {
            penalty = Math.min(0, penalty - 2);
        }

        return (int) -penalty;
    }

    /** Normalize a value to the range 0...15, where 0 is the best possible score
     *
     * @param value The value to normalize, must be positive or zero
     * @param penalty Any negative component of the value
     * */
    public static double normalize(double value, double penalty) {
        if (value < 0)
            value = 0;

        return Math.sqrt((1.0 + 500. + 10 * penalty) / (1.0 + value));
    }

}

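Reading the formula: normalize(value, penalty) = sqrt((501 + 10 * penalty) / (1 + value)). With zero penalty, value = 0 maps to sqrt(501) ≈ 22.4 while value = 2000 maps to sqrt(501 / 2001) ≈ 0.5, so lower output means a stronger result; the 0...15 range in the comment is approximate rather than a hard bound.
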
@@ -1,592 +0,0 @@
-package nu.marginalia.index.results;
-
-import it.unimi.dsi.fastutil.ints.IntIterator;
-import it.unimi.dsi.fastutil.ints.IntList;
-import nu.marginalia.api.searchquery.RpcResultRankingParameters;
-import nu.marginalia.api.searchquery.RpcTemporalBias;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
-import nu.marginalia.api.searchquery.model.query.QueryStrategy;
-import nu.marginalia.api.searchquery.model.results.SearchResultItem;
-import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
-import nu.marginalia.index.CombinedIndexReader;
-import nu.marginalia.index.StatefulIndex;
-import nu.marginalia.index.forward.spans.DocumentSpans;
-import nu.marginalia.index.model.PhraseConstraintGroupList;
-import nu.marginalia.index.model.QueryParams;
-import nu.marginalia.index.model.SearchContext;
-import nu.marginalia.language.sentence.tag.HtmlTag;
-import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.DocumentFlags;
-import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.sequence.CodedSequence;
-import nu.marginalia.sequence.SequenceOperations;
-
-import javax.annotation.Nullable;
-import java.util.BitSet;
-
-import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
-import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
-
-/** This class is responsible for calculating the score of a search result.
- * It holds the data required to perform the scoring, as there are strong
- * reasons to cache this data, and performs the calculations */
-public class IndexResultScoreCalculator {
-    private final CombinedIndexReader index;
-    private final QueryParams queryParams;
-
-    private final DomainRankingOverrides domainRankingOverrides;
-    private final SearchContext rankingContext;
-    private final CompiledQuery<String> compiledQuery;
-
-    public IndexResultScoreCalculator(StatefulIndex statefulIndex,
-                                      DomainRankingOverrides domainRankingOverrides,
-                                      SearchContext rankingContext)
-    {
-        this.index = statefulIndex.get();
-        this.domainRankingOverrides = domainRankingOverrides;
-        this.rankingContext = rankingContext;
-
-        this.queryParams = rankingContext.queryParams;
-        this.compiledQuery = rankingContext.compiledQuery;
-    }
-
-    @Nullable
-    public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
-                                           long combinedId,
-                                           SearchContext rankingContext,
-                                           long[] wordFlags,
-                                           CodedSequence[] positions,
-                                           DocumentSpans spans)
-    {
-        [… the remaining deleted lines reproduce, near-verbatim, the scoring code shown above after its move into IndexResultRankingService: the body of calculateScore, the meetsQueryStrategyRequirements overloads, calculatePositionsMask, calculateDocumentBonus, getProximitiyFac, the VerbatimMatches and UnorderedMatches helper classes, calculateQualityPenalty, flagsPenalty, and normalize …]
-}