mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
It has long since outgrown the single-responsibility library it once was; as such, it is out of place in its original location and fits better among the function-type modules.
68 lines
1.8 KiB
Java
68 lines
1.8 KiB
Java
package nu.marginalia.language.model;
|
|
|
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
|
import nu.marginalia.lsh.EasyLSH;
|
|
import org.jetbrains.annotations.NotNull;
|
|
|
|
import java.util.Collections;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.stream.Stream;
|
|
|
|
/** Holds the sentences and text of a document, decorated with
|
|
* HTML tags, POS tags, and other information.
|
|
*
|
|
* @see SentenceExtractor
|
|
*/
|
|
public record DocumentLanguageData(LanguageDefinition language,
|
|
List<DocumentSentence> sentences,
|
|
String text) implements Iterable<DocumentSentence> {
|
|
|
|
public DocumentLanguageData(LanguageDefinition language, List<DocumentSentence> sentences, String text)
|
|
{
|
|
this.language = language;
|
|
this.sentences = Collections.unmodifiableList(sentences);
|
|
this.text = text;
|
|
}
|
|
|
|
public List<DocumentSentence> findSentencesForTag(HtmlTag tag) {
|
|
return stream().filter(s -> s.htmlTags.contains(tag)).toList();
|
|
}
|
|
|
|
public int numSentences() {
|
|
return sentences.size();
|
|
}
|
|
|
|
public int totalNumWords() {
|
|
int ret = 0;
|
|
|
|
for (DocumentSentence sent : sentences) {
|
|
ret += sent.length();
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
public long localitySensitiveHashCode() {
|
|
var hash = new EasyLSH();
|
|
|
|
for (var sent : sentences) {
|
|
for (var word : sent.wordsLowerCase) {
|
|
hash.addUnordered(word);
|
|
}
|
|
}
|
|
return hash.get();
|
|
}
|
|
|
|
@NotNull
|
|
@Override
|
|
public Iterator<DocumentSentence> iterator() {
|
|
return sentences.iterator();
|
|
}
|
|
|
|
public Stream<DocumentSentence> stream() {
|
|
return sentences.stream();
|
|
}
|
|
}
|