2023-03-04 13:19:01 +01:00
|
|
|
package nu.marginalia.language.model;
|
2022-05-19 17:45:26 +02:00
|
|
|
|
2023-03-04 13:19:01 +01:00
|
|
|
import nu.marginalia.language.sentence.SentenceExtractor;
|
2024-07-18 15:57:48 +02:00
|
|
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
2023-03-04 13:19:01 +01:00
|
|
|
import nu.marginalia.lsh.EasyLSH;
|
2024-07-19 12:24:55 +02:00
|
|
|
import org.jetbrains.annotations.NotNull;
|
2022-05-19 17:45:26 +02:00
|
|
|
|
2024-07-19 12:24:55 +02:00
|
|
|
import java.util.Collections;
|
|
|
|
import java.util.Iterator;
|
2024-07-18 15:57:48 +02:00
|
|
|
import java.util.List;
|
2024-07-19 12:24:55 +02:00
|
|
|
import java.util.stream.Stream;
|
2022-05-19 17:45:26 +02:00
|
|
|
|
2024-07-18 15:57:48 +02:00
|
|
|
/** Holds the sentences and text of a document, decorated with
|
|
|
|
* HTML tags, POS tags, and other information.
|
|
|
|
*
|
2023-01-30 09:36:11 +01:00
|
|
|
* @see SentenceExtractor
|
2022-08-08 15:18:04 +02:00
|
|
|
*/
|
2024-07-19 12:24:55 +02:00
|
|
|
public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
|
2022-05-19 17:45:26 +02:00
|
|
|
|
2024-07-18 15:57:48 +02:00
|
|
|
public DocumentLanguageData(List<DocumentSentence> sentences,
|
2024-07-19 12:24:55 +02:00
|
|
|
String text)
|
|
|
|
{
|
|
|
|
this.sentences = Collections.unmodifiableList(sentences);
|
2024-07-18 15:57:48 +02:00
|
|
|
this.text = text;
|
|
|
|
}
|
|
|
|
|
|
|
|
public List<DocumentSentence> findSentencesForTag(HtmlTag tag) {
|
2024-07-19 12:24:55 +02:00
|
|
|
return stream().filter(s -> s.htmlTags.contains(tag)).toList();
|
|
|
|
}
|
|
|
|
|
|
|
|
public int numSentences() {
|
|
|
|
return sentences.size();
|
2023-07-10 17:36:12 +02:00
|
|
|
}
|
|
|
|
|
2022-05-19 17:45:26 +02:00
|
|
|
public int totalNumWords() {
|
|
|
|
int ret = 0;
|
2024-07-18 15:57:48 +02:00
|
|
|
|
2024-07-19 12:24:55 +02:00
|
|
|
for (DocumentSentence sent : sentences) {
|
|
|
|
ret += sent.length();
|
2022-05-19 17:45:26 +02:00
|
|
|
}
|
|
|
|
|
2024-07-18 15:57:48 +02:00
|
|
|
return ret;
|
2022-05-19 17:45:26 +02:00
|
|
|
}
|
2023-03-04 13:19:01 +01:00
|
|
|
|
|
|
|
public long localitySensitiveHashCode() {
|
|
|
|
var hash = new EasyLSH();
|
|
|
|
|
|
|
|
for (var sent : sentences) {
|
2024-07-19 12:24:55 +02:00
|
|
|
for (var word : sent.wordsLowerCase) {
|
|
|
|
hash.addUnordered(word);
|
2023-03-04 13:19:01 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return hash.get();
|
|
|
|
}
|
2024-07-19 12:24:55 +02:00
|
|
|
|
|
|
|
@NotNull
|
|
|
|
@Override
|
|
|
|
public Iterator<DocumentSentence> iterator() {
|
|
|
|
return sentences.iterator();
|
|
|
|
}
|
|
|
|
|
|
|
|
public Stream<DocumentSentence> stream() {
|
|
|
|
return sentences.stream();
|
|
|
|
}
|
2022-05-19 17:45:26 +02:00
|
|
|
}
|