1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/functions/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java
Viktor Lofgren c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00

68 lines
1.8 KiB
Java

package nu.marginalia.language.model;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.lsh.EasyLSH;
import org.jetbrains.annotations.NotNull;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Stream;
/**
 * The extracted language content of a single document: the language it was
 * processed as, its sentences (which carry HTML tags, lower-cased words and
 * other per-sentence information), and the document text.
 *
 * <p>The sentence list is wrapped in an unmodifiable view at construction time,
 * so it cannot be modified through this record. The caller's list is not copied.
 *
 * @param language  the language definition associated with the document
 * @param sentences the document's sentences, exposed as an unmodifiable view
 * @param text      the text of the document
 *
 * @see SentenceExtractor
 */
public record DocumentLanguageData(LanguageDefinition language,
                                   List<DocumentSentence> sentences,
                                   String text) implements Iterable<DocumentSentence> {

    /**
     * Canonical constructor; wraps {@code sentences} in an unmodifiable view
     * before it is stored in the record.
     */
    public DocumentLanguageData {
        sentences = Collections.unmodifiableList(sentences);
    }

    /**
     * Collects every sentence whose {@code htmlTags} contains the given tag.
     *
     * @param tag the HTML tag to look for
     * @return an unmodifiable list of the matching sentences, in document order
     */
    public List<DocumentSentence> findSentencesForTag(HtmlTag tag) {
        return sentences.stream()
                .filter(sentence -> sentence.htmlTags.contains(tag))
                .toList();
    }

    /** @return the number of sentences in the document */
    public int numSentences() {
        return sentences.size();
    }

    /** @return the sum of {@code length()} over all sentences in the document */
    public int totalNumWords() {
        int wordCount = 0;

        for (DocumentSentence sentence : sentences) {
            wordCount += sentence.length();
        }

        return wordCount;
    }

    /**
     * Feeds every lower-cased word of every sentence into an {@link EasyLSH}
     * and returns the resulting hash value.
     *
     * <p>NOTE(review): words are added through {@code addUnordered}, which
     * presumably makes the hash insensitive to word order; confirm against
     * {@link EasyLSH}.
     *
     * @return the hash produced by {@link EasyLSH} for this document's words
     */
    public long localitySensitiveHashCode() {
        EasyLSH lsh = new EasyLSH();

        for (DocumentSentence sentence : sentences) {
            for (var lowerCasedWord : sentence.wordsLowerCase) {
                lsh.addUnordered(lowerCasedWord);
            }
        }

        return lsh.get();
    }

    /** @return an iterator over the sentences, in document order */
    @NotNull
    @Override
    public Iterator<DocumentSentence> iterator() {
        return sentences.iterator();
    }

    /** @return a sequential stream over the sentences, in document order */
    public Stream<DocumentSentence> stream() {
        return sentences.stream();
    }
}