mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
37 lines
903 B
Java
37 lines
903 B
Java
package nu.marginalia.language.sentence;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.Arrays;
|
|
import java.util.List;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class SentencePreCleaner {
|
|
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
|
|
|
public String[] clean(String[] sentences) {
|
|
|
|
int sentenceCount = 0;
|
|
|
|
List<String> sentenceList = new ArrayList<>();
|
|
for (var s : sentences) {
|
|
|
|
if (s.isBlank()) continue;
|
|
|
|
sentenceCount++;
|
|
|
|
if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) {
|
|
break;
|
|
}
|
|
|
|
if (s.contains("-") || s.contains("|")) {
|
|
sentenceList.addAll(Arrays.asList(splitPattern.split(s)));
|
|
}
|
|
else {
|
|
sentenceList.add(s);
|
|
}
|
|
}
|
|
|
|
return sentenceList.toArray(String[]::new);
|
|
}
|
|
}
|