1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/functions/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java
Viktor Lofgren c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00

37 lines
903 B
Java

package nu.marginalia.language.sentence;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Cleans up an array of raw sentence strings: blank entries are dropped, the
 * number of sentences considered is capped, and sentences containing dash or
 * pipe separators are broken into fragments.
 */
public class SentencePreCleaner {
    /** Separators a sentence is split on: {@code " -"}, {@code "- "} or {@code "|"}. */
    private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");

    /**
     * Filters and splits the given sentences.
     *
     * <p>Blank sentences are skipped and do not count toward the limit.
     * Processing stops as soon as the number of non-blank sentences already
     * accepted exceeds {@code SentenceExtractor.MAX_SENTENCE_COUNT}. A sentence
     * containing {@code '-'} or {@code '|'} is split on {@link #splitPattern}
     * and every resulting fragment is added as-is (fragments are not trimmed or
     * re-checked for blankness); all other sentences are passed through
     * unchanged. Because of the splitting, the result may hold more elements
     * than the number of input sentences that were consumed.
     *
     * @param sentences the raw sentences, in order
     * @return the retained sentences and fragments, in input order
     */
    public String[] clean(String[] sentences) {
        int sentenceCount = 0;

        List<String> sentenceList = new ArrayList<>();
        for (var s : sentences) {
            if (s.isBlank()) continue;

            // Count each non-blank sentence exactly once.  The post-increment
            // compares the number of sentences accepted *before* this one, so
            // the counter must not also be bumped separately -- doing so makes
            // it advance by two per sentence and halves the effective limit.
            if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) {
                break;
            }

            // Cheap substring checks avoid running the regex on sentences that
            // cannot contain a separator.
            if (s.contains("-") || s.contains("|")) {
                sentenceList.addAll(Arrays.asList(splitPattern.split(s)));
            }
            else {
                sentenceList.add(s);
            }
        }

        return sentenceList.toArray(String[]::new);
    }
}