mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(converter) Clean up spans-handling
This code was unnecessarily difficult to follow, with repeated packing and re-packing of the same data.
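For context, this is the round-trip the message refers to, in miniature: DocumentKeywordsBuilder.build() packed each span type's tag code and position sequence into a CodedWordSpan wrapper, and ConverterBatchWriter immediately took the wrappers apart again into the parallel byte[]/List form that SlopDocumentRecord expects. A self-contained illustration of that shape (CodedSpan and integer lists stand in for the real CodedWordSpan and VarintCodedSequence types):

    import java.util.ArrayList;
    import java.util.List;

    public class RepackingSketch {
        // Stand-in for nu.marginalia.model.idx.CodedWordSpan
        record CodedSpan(byte code, List<Integer> positions) {}

        public static void main(String[] args) {
            // Producer side: pack two parallel structures into one wrapper per span type...
            byte[] codes = {1, 2};
            List<List<Integer>> sequences = List.of(List.of(6, 9), List.of(12, 14));

            List<CodedSpan> packed = new ArrayList<>();
            for (int i = 0; i < codes.length; i++) {
                packed.add(new CodedSpan(codes[i], sequences.get(i)));
            }

            // ...consumer side: immediately unpack back into the parallel form
            byte[] spanCodes = new byte[packed.size()];
            List<List<Integer>> spanSequences = new ArrayList<>();
            for (CodedSpan span : packed) {
                spanCodes[spanSequences.size()] = span.code();
                spanSequences.add(span.positions());
            }
            System.out.println(spanSequences); // [[6, 9], [12, 14]]
        }
    }

After this commit, build() produces the code array and sequence list directly, so the consumer-side loop disappears (see the ConverterBatchWriter hunks at the end of the diff).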
@@ -193,48 +193,4 @@ public class DocumentPositionMapper {
         return false;
     }
 
-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (sentence.htmlTags.contains(htmlTag)) {
-                if (start <= 0) start = pos;
-            }
-            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
-            {
-                // special case for body tag, we match against no tag on the sentence
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public void endCurrentSpan(int pos) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                start = 0;
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-    }
 }
@@ -0,0 +1,52 @@
+package nu.marginalia.keyword;
+
+import nu.marginalia.keyword.model.DocumentWordSpan;
+import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Helper class to record spans of words
+ */
+class SpanRecorder {
+    private final List<DocumentWordSpan> spans = new ArrayList<>();
+    private final HtmlTag htmlTag;
+    private int start = 0;
+
+    public SpanRecorder(HtmlTag htmlTag) {
+        this.htmlTag = htmlTag;
+    }
+
+    public void update(DocumentSentence sentence, int pos) {
+        assert pos > 0;
+
+        if (sentence.htmlTags.contains(htmlTag)) {
+            if (start <= 0) start = pos;
+        } else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) {
+            // special case for body tag, we match against no tag on the sentence
+            if (start <= 0) start = pos;
+        } else {
+            if (start > 0) {
+                spans.add(new DocumentWordSpan(htmlTag, start, pos));
+                start = 0;
+            }
+        }
+    }
+
+    public void endCurrentSpan(int pos) {
+        if (start > 0) {
+            spans.add(new DocumentWordSpan(htmlTag, start, pos));
+            start = 0;
+        }
+    }
+
+    public List<DocumentWordSpan> finish(int length) {
+        if (start > 0) {
+            spans.add(new DocumentWordSpan(htmlTag, start, length));
+            start = 0;
+        }
+        return spans;
+    }
+}
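The extracted SpanRecorder is a small state machine: update() is fed each word position in document order, opens a span at the first position whose sentence carries the tag (or, for BODY, carries no tag at all), and closes it at the first position that does not. finish() flushes any span still open at the end of the document. A runnable sketch of just that logic, with tag membership reduced to a boolean array in place of sentence.htmlTags.contains(htmlTag):

    import java.util.ArrayList;
    import java.util.List;

    public class SpanRecorderSketch {
        public static void main(String[] args) {
            // inTag[pos - 1]: is word position pos inside the tag? (positions are 1-based)
            boolean[] inTag = {false, true, true, true, false, true, false};

            List<int[]> spans = new ArrayList<>();
            int start = 0;
            for (int pos = 1; pos <= inTag.length; pos++) {
                if (inTag[pos - 1]) {
                    if (start <= 0) start = pos;        // lazily open a span
                } else if (start > 0) {
                    spans.add(new int[] {start, pos});  // close it; end is exclusive
                    start = 0;
                }
            }
            if (start > 0) spans.add(new int[] {start, inTag.length + 1}); // finish()

            for (int[] s : spans) System.out.println(s[0] + ".." + s[1]); // 2..5, 6..7
        }
    }

Note that positions are 1-based (hence the assert pos > 0 and the start <= 0 sentinel), and that the recorded end is the first position past the span.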
@@ -3,7 +3,6 @@ package nu.marginalia.keyword.extractors;
 import it.unimi.dsi.fastutil.objects.Object2IntMap;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.model.WordRep;
@@ -2,7 +2,6 @@ package nu.marginalia.keyword.extractors;
 
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;
@@ -1,7 +1,6 @@
 package nu.marginalia.keyword.extractors;
 
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.sentence.tag.HtmlTag;
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword;
+package nu.marginalia.keyword.extractors;
 
 import nu.marginalia.language.model.WordRep;
@@ -2,7 +2,6 @@ package nu.marginalia.keyword.extractors;
 
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
@@ -1,29 +1,14 @@
 package nu.marginalia.keyword.model;
 
-import nu.marginalia.model.idx.CodedWordSpan;
 import nu.marginalia.sequence.VarintCodedSequence;
 
 import java.util.List;
 
-public final class DocumentKeywords {
-
-    public final List<String> keywords;
-    public final byte[] metadata;
-    public final List<VarintCodedSequence> positions;
-    public final List<CodedWordSpan> spans;
-
-    public DocumentKeywords(List<String> keywords,
-                            byte[] metadata,
-                            List<VarintCodedSequence> positions,
-                            List<CodedWordSpan> spans)
-    {
-        this.keywords = keywords;
-        this.metadata = metadata;
-        this.positions = positions;
-        this.spans = spans;
-
-        assert keywords.size() == metadata.length;
-    }
+public record DocumentKeywords(List<String> keywords,
+                               byte[] metadata,
+                               List<VarintCodedSequence> positions,
+                               byte[] spanCodes,
+                               List<VarintCodedSequence> spanSequences) {
 
     public boolean isEmpty() {
         return keywords.isEmpty();
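The new record carries two groups of index-aligned components: keywords, metadata and positions line up per keyword (the old assert keywords.size() == metadata.length invariant), while spanCodes and spanSequences line up per span type. A sketch with made-up values, using VarintCodedSequence.generate(IntArrayList) and HtmlTag.BODY.code the same way DocumentKeywordsBuilder does below (imports of IntArrayList, List and the nu.marginalia types assumed):

    VarintCodedSequence wordPositions = VarintCodedSequence.generate(new IntArrayList(new int[] {6, 9}));
    VarintCodedSequence bodySpans = VarintCodedSequence.generate(new IntArrayList(new int[] {1, 10}));

    DocumentKeywords kw = new DocumentKeywords(
            List.of("dirty"),               // keyword i...
            new byte[] {0},                 // ...has flag byte metadata()[i]...
            List.of(wordPositions),         // ...and positions().get(i)
            new byte[] {HtmlTag.BODY.code}, // span type j...
            List.of(bodySpans));            // ...has start/end pairs in spanSequences().get(j)

    String word = kw.keywords().get(0); // record accessors replace the old public
    byte flags = kw.metadata()[0];      // fields, hence the test changes further down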
@@ -5,7 +5,6 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
 import nu.marginalia.language.sentence.tag.HtmlTag;
-import nu.marginalia.model.idx.CodedWordSpan;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
@@ -28,6 +27,7 @@ public class DocumentKeywordsBuilder {
     // be plenty. The lexicon writer has another limit that's higher.
     private final int MAX_WORD_LENGTH = 64;
     private final int MAX_POSITIONS_PER_WORD = 512;
+    private final int MAX_SPANS_PER_TYPE = 8192;
 
     private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);
@@ -35,13 +35,22 @@ public class DocumentKeywordsBuilder {
         this(1600);
     }
 
+    public DocumentKeywordsBuilder(int capacity) {
+        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
+        wordToPos = new HashMap<>(capacity);
+    }
+
     public DocumentKeywords build() {
         final List<String> wordArray = new ArrayList<>(wordToMeta.size());
         final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
         final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
+        final List<VarintCodedSequence> spanSequences = new ArrayList<>(wordSpans.size());
+        final byte[] spanCodes = new byte[wordSpans.size()];
 
         var iter = wordToMeta.object2ByteEntrySet().fastIterator();
 
         // Encode positions
         while (iter.hasNext()) {
             var entry = iter.next();
@@ -58,27 +67,26 @@ public class DocumentKeywordsBuilder {
         }
 
         // Encode spans
-        List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
-
         wordSpans.forEach((tag, spansForTag) -> {
             spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
 
             var positionsForTag = new IntArrayList(spansForTag.size() * 2);
 
             for (var span : spansForTag) {
                 positionsForTag.add(span.start());
                 positionsForTag.add(span.end());
 
                 if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
                     break;
             }
 
-            spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
+            spanCodes[spanSequences.size()] = tag.code;
+            spanSequences.add(VarintCodedSequence.generate(positionsForTag));
         });
 
-        return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
+        return new DocumentKeywords(wordArray, meta.toArray(), positions, spanCodes, spanSequences);
     }
 
-    public DocumentKeywordsBuilder(int capacity) {
-        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
-        wordToPos = new HashMap<>(capacity);
-    }
-
     public void addMeta(String word, byte meta) {
         if (word.length() > MAX_WORD_LENGTH)
@@ -173,6 +181,4 @@ public class DocumentKeywordsBuilder {
         return this.importantWords;
     }
 
-    public record DocumentWordSpan(HtmlTag tag, int start, int end) {
-    }
 }
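Restating the span encoding that build() uses: for each tag type, the recorded spans are sorted by start position and flattened into one interleaved start,end,start,end,... list, which is then handed to VarintCodedSequence.generate() for compression, with MAX_SPANS_PER_TYPE capping how many values one tag may contribute. A self-contained sketch with plain JDK types in place of IntArrayList and VarintCodedSequence:

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    public class SpanEncodingSketch {
        record Span(int start, int end) {}

        static final int MAX_SPANS_PER_TYPE = 8192; // same cap as the builder

        public static void main(String[] args) {
            // Spans for one tag type, in whatever order they were recorded
            List<Span> spansForTag = new ArrayList<>(List.of(new Span(6, 9), new Span(1, 4)));
            spansForTag.sort(Comparator.comparingInt(Span::start));

            // Flatten to interleaved start/end positions
            List<Integer> positionsForTag = new ArrayList<>(spansForTag.size() * 2);
            for (Span span : spansForTag) {
                positionsForTag.add(span.start());
                positionsForTag.add(span.end());
                if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
                    break;
            }

            System.out.println(positionsForTag); // [1, 4, 6, 9]
        }
    }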
@@ -0,0 +1,6 @@
+package nu.marginalia.keyword.model;
+
+import nu.marginalia.language.sentence.tag.HtmlTag;
+
+public record DocumentWordSpan(HtmlTag tag, int start, int end) {
+}
@@ -61,13 +61,13 @@ class DocumentKeywordExtractorTest {
         Map<String, CodedSequence> positions = new HashMap<>();
 
         for (int i = 0; i < keywordsBuilt.size(); i++) {
-            String keyword = keywordsBuilt.keywords.get(i);
-            byte metadata = keywordsBuilt.metadata[i];
+            String keyword = keywordsBuilt.keywords().get(i);
+            byte metadata = keywordsBuilt.metadata()[i];
 
             if (Set.of("dirty", "blues").contains(keyword)) {
                 flags.put(keyword, metadata);
-                positions.put(keyword, keywordsBuilt.positions.get(i));
+                positions.put(keyword, keywordsBuilt.positions().get(i));
             }
         }
@@ -5,6 +5,7 @@ import gnu.trove.list.array.TIntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.keyword.model.DocumentWordSpan;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.sentence.SentenceExtractor;
@@ -105,7 +106,7 @@ class DocumentPositionMapperTest {
         var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
         assertEquals(2, linkTextSpans.size());
 
-        DocumentKeywordsBuilder.DocumentWordSpan span;
+        DocumentWordSpan span;
         span = linkTextSpans.get(0);
 
         assertEquals(6, span.start());
@@ -134,7 +135,7 @@ class DocumentPositionMapperTest {
         var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
         assertEquals(2, linkTextSpans.size());
 
-        DocumentKeywordsBuilder.DocumentWordSpan span;
+        DocumentWordSpan span;
         span = linkTextSpans.get(0);
 
         assertEquals(6, span.start());
@@ -170,7 +171,7 @@ class DocumentPositionMapperTest {
         var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
         assertEquals(2, linkTextSpans.size());
 
-        DocumentKeywordsBuilder.DocumentWordSpan span;
+        DocumentWordSpan span;
         span = linkTextSpans.get(0);
 
         assertEquals(6, span.start());
@@ -4,6 +4,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.sideload.SideloadSource;
 import nu.marginalia.io.processed.ProcessedDataFileNames;
+import nu.marginalia.keyword.model.DocumentKeywords;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -11,14 +12,16 @@ import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.processed.SlopDocumentRecord;
 import nu.marginalia.model.processed.SlopDomainLinkRecord;
 import nu.marginalia.model.processed.SlopDomainRecord;
-import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Optional;
+import java.util.Set;
 
 /** Writer for a single batch of converter parquet files */
 public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf {
@@ -100,17 +103,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 continue;
             }
 
-            var wb = document.words.build();
-
-            List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
-            byte[] spanCodes = new byte[wb.spans.size()];
-
-            for (int i = 0; i < wb.spans.size(); i++) {
-                var span = wb.spans.get(i);
-
-                spanCodes[i] = span.code();
-                spanSequences.add(span.spans());
-            }
+            DocumentKeywords wb = document.words.build();
 
             documentWriter.write(new SlopDocumentRecord(
                 domainName,
@@ -127,11 +120,11 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 (float) document.details.quality,
                 document.details.metadata.encode(),
                 document.details.pubYear,
-                wb.keywords,
-                wb.metadata,
-                wb.positions,
-                spanCodes,
-                spanSequences
+                wb.keywords(),
+                wb.metadata(),
+                wb.positions(),
+                wb.spanCodes(),
+                wb.spanSequences()
             ));
         }