1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00
Files
MarginaliaSearch/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java

66 lines
1.7 KiB
Java
Raw Normal View History

2023-03-04 13:19:01 +01:00
package nu.marginalia.language.model;
2022-05-19 17:45:26 +02:00
2023-03-04 13:19:01 +01:00
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
2023-03-04 13:19:01 +01:00
import nu.marginalia.lsh.EasyLSH;
import org.jetbrains.annotations.NotNull;
2022-05-19 17:45:26 +02:00
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Stream;
2022-05-19 17:45:26 +02:00
/** Holds the sentences and text of a document, decorated with
 * HTML tags, POS tags, and other information.
 *
 * @see SentenceExtractor
 */
public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
2022-05-19 17:45:26 +02:00
public DocumentLanguageData(List<DocumentSentence> sentences,
String text)
{
this.sentences = Collections.unmodifiableList(sentences);
this.text = text;
}
public List<DocumentSentence> findSentencesForTag(HtmlTag tag) {
return stream().filter(s -> s.htmlTags.contains(tag)).toList();
}
public int numSentences() {
return sentences.size();
}
2022-05-19 17:45:26 +02:00
public int totalNumWords() {
int ret = 0;
for (DocumentSentence sent : sentences) {
ret += sent.length();
2022-05-19 17:45:26 +02:00
}
return ret;
2022-05-19 17:45:26 +02:00
}
2023-03-04 13:19:01 +01:00
public long localitySensitiveHashCode() {
var hash = new EasyLSH();
for (var sent : sentences) {
for (var word : sent.wordsLowerCase) {
hash.addUnordered(word);
2023-03-04 13:19:01 +01:00
}
}
return hash.get();
}
@NotNull
@Override
public Iterator<DocumentSentence> iterator() {
return sentences.iterator();
}
public Stream<DocumentSentence> stream() {
return sentences.stream();
}
2022-05-19 17:45:26 +02:00
}