
(converter) Clean up spans-handling

This code was unnecessarily difficult to follow with repeated packing and re-packing of the same data.
Author: Viktor Lofgren
Date:   2025-08-17 09:41:53 +02:00
parent fa685bf1f4
commit 338d300e1a
13 changed files with 99 additions and 104 deletions
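
In short: the builder used to pack span data into a List<CodedWordSpan>, which ConverterBatchWriter then immediately unpacked into the parallel spanCodes / spanSequences arrays that SlopDocumentRecord stores. After this commit, DocumentKeywordsBuilder.build() produces that final representation directly and the intermediate CodedWordSpan step drops out of the keyword model. A condensed before/after sketch of the writer's side (stand-in types for illustration, not the real classes):

    import java.util.ArrayList;
    import java.util.List;

    class SpanRepackingSketch {
        // Stand-ins for VarintCodedSequence and CodedWordSpan, simplified
        // so the sketch is self-contained.
        record Sequence(int... values) {}
        record CodedWordSpan(byte code, Sequence spans) {}

        // Before: the writer re-packed what the builder had just packed.
        static void oldWriterPath(List<CodedWordSpan> spans) {
            byte[] spanCodes = new byte[spans.size()];
            List<Sequence> spanSequences = new ArrayList<>(spans.size());
            for (int i = 0; i < spans.size(); i++) {
                spanCodes[i] = spans.get(i).code();
                spanSequences.add(spans.get(i).spans());
            }
            // spanCodes and spanSequences then went into SlopDocumentRecord
        }

        // After: DocumentKeywords already carries the two arrays, so the
        // writer simply forwards wb.spanCodes() and wb.spanSequences().
    }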


@@ -193,48 +193,4 @@ public class DocumentPositionMapper {
         return false;
     }
 
-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (sentence.htmlTags.contains(htmlTag)) {
-                if (start <= 0) start = pos;
-            }
-            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
-            {
-                // special case for body tag, we match against no tag on the sentence
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public void endCurrentSpan(int pos) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                start = 0;
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-    }
 }


@@ -0,0 +1,52 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentWordSpan;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Helper class to record spans of words
+ */
+class SpanRecorder {
+    private final List<DocumentWordSpan> spans = new ArrayList<>();
+    private final HtmlTag htmlTag;
+    private int start = 0;
+
+    public SpanRecorder(HtmlTag htmlTag) {
+        this.htmlTag = htmlTag;
+    }
+
+    public void update(DocumentSentence sentence, int pos) {
+        assert pos > 0;
+
+        if (sentence.htmlTags.contains(htmlTag)) {
+            if (start <= 0) start = pos;
+        } else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) {
+            // special case for body tag, we match against no tag on the sentence
+            if (start <= 0) start = pos;
+        } else {
+            if (start > 0) {
+                spans.add(new DocumentWordSpan(htmlTag, start, pos));
+                start = 0;
+            }
+        }
+    }
+
+    public void endCurrentSpan(int pos) {
+        if (start > 0) {
+            spans.add(new DocumentWordSpan(htmlTag, start, pos));
+            start = 0;
+        }
+    }
+
+    public List<DocumentWordSpan> finish(int length) {
+        if (start > 0) {
+            spans.add(new DocumentWordSpan(htmlTag, start, length));
+            start = 0;
+        }
+        return spans;
+    }
+}
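
For orientation, SpanRecorder is driven positionally: update() is called once per word position as DocumentPositionMapper walks the sentences, and finish() closes any span still open at document end. A rough driver loop, illustrative only (the sentence iteration and position bookkeeping here are assumptions, not code from this diff):

    SpanRecorder recorder = new SpanRecorder(HtmlTag.HEADING);

    int pos = 1;  // positions are 1-based; update() asserts pos > 0
    for (DocumentSentence sentence : sentences) {
        for (int i = 0; i < sentence.length(); i++) {
            recorder.update(sentence, pos++);
        }
    }
    List<DocumentWordSpan> spans = recorder.finish(pos);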


@@ -3,7 +3,6 @@ package nu.marginalia.keyword.extractors;
 import it.unimi.dsi.fastutil.objects.Object2IntMap;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.model.WordRep;


@@ -2,7 +2,6 @@ package nu.marginalia.keyword.extractors;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;


@@ -1,7 +1,6 @@
 package nu.marginalia.keyword.extractors;
 
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.sentence.tag.HtmlTag;


@@ -1,4 +1,4 @@
-package nu.marginalia.keyword;
+package nu.marginalia.keyword.extractors;
 
 import nu.marginalia.language.model.WordRep;


@@ -2,7 +2,6 @@ package nu.marginalia.keyword.extractors;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;


@@ -1,29 +1,14 @@
 package nu.marginalia.keyword.model;
 
-import nu.marginalia.model.idx.CodedWordSpan;
 import nu.marginalia.sequence.VarintCodedSequence;
 
 import java.util.List;
 
-public final class DocumentKeywords {
-    public final List<String> keywords;
-    public final byte[] metadata;
-    public final List<VarintCodedSequence> positions;
-    public final List<CodedWordSpan> spans;
-
-    public DocumentKeywords(List<String> keywords,
-                            byte[] metadata,
-                            List<VarintCodedSequence> positions,
-                            List<CodedWordSpan> spans)
-    {
-        this.keywords = keywords;
-        this.metadata = metadata;
-        this.positions = positions;
-        this.spans = spans;
-
-        assert keywords.size() == metadata.length;
-    }
-
+public record DocumentKeywords(List<String> keywords,
+                               byte[] metadata,
+                               List<VarintCodedSequence> positions,
+                               byte[] spanCodes,
+                               List<VarintCodedSequence> spanSequences) {
     public boolean isEmpty() {
         return keywords.isEmpty();


@@ -5,7 +5,6 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
 import nu.marginalia.language.sentence.tag.HtmlTag;
-import nu.marginalia.model.idx.CodedWordSpan;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
@@ -28,6 +27,7 @@ public class DocumentKeywordsBuilder {
     // be plenty. The lexicon writer has another limit that's higher.
     private final int MAX_WORD_LENGTH = 64;
     private final int MAX_POSITIONS_PER_WORD = 512;
+    private final int MAX_SPANS_PER_TYPE = 8192;
 
     private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);
@@ -35,13 +35,22 @@ public class DocumentKeywordsBuilder {
         this(1600);
     }
 
+    public DocumentKeywordsBuilder(int capacity) {
+        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
+        wordToPos = new HashMap<>(capacity);
+    }
+
     public DocumentKeywords build() {
         final List<String> wordArray = new ArrayList<>(wordToMeta.size());
         final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
         final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
+
+        final List<VarintCodedSequence> spanSequences = new ArrayList<>(wordSpans.size());
+        final byte[] spanCodes = new byte[wordSpans.size()];
 
         var iter = wordToMeta.object2ByteEntrySet().fastIterator();
 
+        // Encode positions
         while (iter.hasNext()) {
             var entry = iter.next();
@@ -58,27 +67,26 @@ public class DocumentKeywordsBuilder {
         }
 
         // Encode spans
-        List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
-
         wordSpans.forEach((tag, spansForTag) -> {
             spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
 
             var positionsForTag = new IntArrayList(spansForTag.size() * 2);
             for (var span : spansForTag) {
                 positionsForTag.add(span.start());
                 positionsForTag.add(span.end());
+
+                if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
+                    break;
             }
 
-            spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
+            spanCodes[spanSequences.size()] = tag.code;
+            spanSequences.add(VarintCodedSequence.generate(positionsForTag));
         });
 
-        return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
+        return new DocumentKeywords(wordArray, meta.toArray(), positions, spanCodes, spanSequences);
     }
 
-    public DocumentKeywordsBuilder(int capacity) {
-        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
-        wordToPos = new HashMap<>(capacity);
-    }
-
     public void addMeta(String word, byte meta) {
         if (word.length() > MAX_WORD_LENGTH)
@@ -173,6 +181,4 @@ public class DocumentKeywordsBuilder {
         return this.importantWords;
     }
-
-    public record DocumentWordSpan(HtmlTag tag, int start, int end) {
-    }
 }
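
Note the pairing invariant in the new encoding: spanCodes[i] names the HTML tag for spanSequences.get(i), and each sequence stores alternating start/end word positions for that tag. A hypothetical consumer-side loop (not from this commit; it assumes the sequence exposes an int iterator):

    for (int i = 0; i < spanCodes.length; i++) {
        byte tagCode = spanCodes[i];                  // HtmlTag code for this group
        var values = spanSequences.get(i).iterator(); // alternating start, end values
        while (values.hasNext()) {
            int start = values.nextInt();             // span start position
            int end = values.nextInt();               // span end position
            // words in [start, end) carried the tag identified by tagCode
        }
    }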


@@ -0,0 +1,6 @@
+package nu.marginalia.keyword.model;
+
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+public record DocumentWordSpan(HtmlTag tag, int start, int end) {
+}


@@ -61,13 +61,13 @@ class DocumentKeywordExtractorTest {
         Map<String, CodedSequence> positions = new HashMap<>();
 
         for (int i = 0; i < keywordsBuilt.size(); i++) {
-            String keyword = keywordsBuilt.keywords.get(i);
-            byte metadata = keywordsBuilt.metadata[i];
+            String keyword = keywordsBuilt.keywords().get(i);
+            byte metadata = keywordsBuilt.metadata()[i];
 
             if (Set.of("dirty", "blues").contains(keyword)) {
                 flags.put(keyword, metadata);
-                positions.put(keyword, keywordsBuilt.positions.get(i));
+                positions.put(keyword, keywordsBuilt.positions().get(i));
             }
         }


@@ -5,6 +5,7 @@ import gnu.trove.list.array.TIntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.keyword.model.DocumentWordSpan;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.sentence.SentenceExtractor;
@@ -105,7 +106,7 @@ class DocumentPositionMapperTest {
         var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
         assertEquals(2, linkTextSpans.size());
 
-        DocumentKeywordsBuilder.DocumentWordSpan span;
+        DocumentWordSpan span;
         span = linkTextSpans.get(0);
 
         assertEquals(6, span.start());
@@ -134,7 +135,7 @@ class DocumentPositionMapperTest {
         var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
         assertEquals(2, linkTextSpans.size());
 
-        DocumentKeywordsBuilder.DocumentWordSpan span;
+        DocumentWordSpan span;
         span = linkTextSpans.get(0);
 
         assertEquals(6, span.start());
@@ -170,7 +171,7 @@ class DocumentPositionMapperTest {
         var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
         assertEquals(2, linkTextSpans.size());
 
-        DocumentKeywordsBuilder.DocumentWordSpan span;
+        DocumentWordSpan span;
        span = linkTextSpans.get(0);
 
         assertEquals(6, span.start());


@@ -4,6 +4,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.sideload.SideloadSource;
 import nu.marginalia.io.processed.ProcessedDataFileNames;
+import nu.marginalia.keyword.model.DocumentKeywords;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -11,14 +12,16 @@ import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.model.processed.SlopDomainLinkRecord;
 import nu.marginalia.model.processed.SlopDomainRecord;
-import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Optional;
+import java.util.Set;
 
 /** Writer for a single batch of converter parquet files */
 public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
@@ -100,17 +103,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 continue;
             }
 
-            var wb = document.words.build();
-
-            List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
-            byte[] spanCodes = new byte[wb.spans.size()];
-
-            for (int i = 0; i < wb.spans.size(); i++) {
-                var span = wb.spans.get(i);
-
-                spanCodes[i] = span.code();
-                spanSequences.add(span.spans());
-            }
+            DocumentKeywords wb = document.words.build();
 
             documentWriter.write(new SlopDocumentRecord(
                 domainName,
@@ -127,11 +120,11 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 (float) document.details.quality,
                 document.details.metadata.encode(),
                 document.details.pubYear,
-                wb.keywords,
-                wb.metadata,
-                wb.positions,
-                spanCodes,
-                spanSequences
+                wb.keywords(),
+                wb.metadata(),
+                wb.positions(),
+                wb.spanCodes(),
+                wb.spanSequences()
             ));
         }