package nu.marginalia.segmentation;

import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;

public class NgramExtractorMain {

    public static void main(String... args) throws IOException, InterruptedException {
    }
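
    // main() is left empty in this revision.  A minimal sketch of how the
    // extractor could be invoked, assuming a "<zim file> <counts file>"
    // argument convention (hypothetical wiring, not part of the original code):
    //
    //     public static void main(String... args) throws IOException, InterruptedException {
    //         dumpCounts(Path.of(args[0]), Path.of(args[1]));
    //     }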

    private static List<String> getNgramTitleTerms(String title) {
        List<String> terms = new ArrayList<>();

        // Add the title itself, e.g. "Treaty of Versailles" -> "treaty of versailles"
        if (title.contains(" ")) { // Only add multi-word titles since we're chasing ngrams
            terms.add(title.toLowerCase());
        }

        return cleanTerms(terms);
    }

    private static List<String> getNgramBodyTerms(Document document) {
        List<String> terms = new ArrayList<>();

        // Grab all internal links
        document.select("a[href]").forEach(e -> {
            var href = e.attr("href");
            if (href.contains(":")) // skip namespaced and external links
                return;
            if (href.contains("/")) // skip links that aren't plain article references
                return;

            var text = e.text().toLowerCase();
            if (!text.contains(" ")) // only multi-word anchor texts are ngram candidates
                return;

            terms.add(text);
        });

        // Grab all italicized text
        document.getElementsByTag("i").forEach(e -> {
            var text = e.text().toLowerCase();
            if (!text.contains(" "))
                return;

            terms.add(text);
        });

        return cleanTerms(terms);
    }

    private static List<String> cleanTerms(List<String> terms) {
        // Trim the discovered terms
        terms.replaceAll(s -> {
            // Remove trailing parentheses and their contents
            if (s.endsWith(")")) {
                int idx = s.lastIndexOf('(');
                if (idx > 0) {
                    return s.substring(0, idx).trim();
                }
            }

            return s;
        });

        terms.replaceAll(s -> {
            // Remove leading "list of "
            if (s.startsWith("list of ")) {
                return s.substring("list of ".length());
            }

            return s;
        });

        terms.replaceAll(s -> {
            // Remove trailing punctuation
            if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) {
                return s.substring(0, s.length() - 1);
            }

            return s;
        });

        // Remove terms that are too short or too long
        terms.removeIf(s -> {
            if (!s.contains(" "))
                return true;
            if (s.length() > 64)
                return true;
            return false;
        });

        return terms;
    }

    public static void dumpCounts(Path zimInputFile,
                                  Path countsOutputFile) throws IOException, InterruptedException
    {
        ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString()));

        NgramLexicon lexicon = new NgramLexicon();

        var orderedHasher = HasherGroup.ordered();

        // Use between 2 and 32 worker threads, depending on available processors
        try (var pool = new ForkJoinPool(Math.clamp(Runtime.getRuntime().availableProcessors(), 2, 32))) {

            reader.forEachTitles((title) -> {
                pool.submit(() -> {
                    LongArrayList orderedHashesTitle = new LongArrayList();

                    String normalizedTitle = title.replace('_', ' ');

                    for (var sent : getNgramTitleTerms(normalizedTitle)) {
                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                        orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                    }

                    // The lexicon is shared between pool threads, so updates are synchronized
                    synchronized (lexicon) {
                        for (var hash : orderedHashesTitle) {
                            lexicon.incOrderedTitle(hash);
                        }
                    }
                });
            });

            reader.forEachArticles((title, body) -> {
                pool.submit(() -> {
                    LongArrayList orderedHashesBody = new LongArrayList();

                    for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                        orderedHashesBody.add(orderedHasher.rollingHash(terms));
                    }

                    synchronized (lexicon) {
                        for (var hash : orderedHashesBody) {
                            lexicon.incOrderedBody(hash);
                        }
                    }
                });
            }, p -> true);

            // Closing the pool waits for the submitted tasks to complete
        }

        lexicon.saveCounts(countsOutputFile);
    }

}