1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/functions/language-processing/java/nu/marginalia/language/WordPatterns.java
Viktor Lofgren c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00

63 lines
1.6 KiB
Java

package nu.marginalia.language;
/**
 * Logic for deciding which words are eligible to be keywords.
 *
 * <p>All members are static; the class holds no state.
 */
public class WordPatterns {
    /** Minimum word length constant. Not enforced by the checks in this class. */
    public static final int MIN_WORD_LENGTH = 1;

    /** Maximum word length constant. Not enforced by the checks in this class. */
    public static final int MAX_WORD_LENGTH = 64;

    /** Separator string constant for joined word tokens. Not referenced within this class. */
    public static final String WORD_TOKEN_JOINER = "_";

    /**
     * Runs checks on the word and excludes terms with too many special characters.
     *
     * <p>A word is considered junk when any of the following holds:
     * <ul>
     *   <li>it is {@code null} or blank,</li>
     *   <li>it contains more than 4 {@code '-'} characters,</li>
     *   <li>it contains more than 2 {@code '+'} characters,</li>
     *   <li>it starts or ends with {@code '-'},</li>
     *   <li>it contains more than 16 digits (as defined by {@link Character#isDigit(char)}).</li>
     * </ul>
     *
     * @param word the candidate word; may be {@code null}
     * @return {@code true} if the word passes every check, {@code false} if it is junk
     */
    public static boolean isNotJunkWord(String word) {
        if (word == null || word.isBlank()) {
            return false;
        }
        if (hasMoreThanN(word, '-', 4)) {
            return false;
        }
        if (hasMoreThanN(word, '+', 2)) {
            return false;
        }
        if (word.startsWith("-") || word.endsWith("-")) {
            return false;
        }

        int numDigits = 0;
        for (int i = 0; i < word.length(); i++) {
            // Bail out as soon as the digit budget is exceeded.
            if (Character.isDigit(word.charAt(i)) && ++numDigits > 16) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code c} occurs strictly more than {@code max} times in {@code s}.
     *
     * <p>Every position of the string is considered, including the first and
     * the last character, so leading and trailing occurrences count as well.
     *
     * @param s   the string to scan
     * @param c   the character to count
     * @param max the largest number of occurrences that is still acceptable
     * @return {@code true} if the number of occurrences exceeds {@code max}
     */
    private static boolean hasMoreThanN(String s, char c, int max) {
        if (max < 0) {
            // Any count, even zero, exceeds a negative limit.
            return true;
        }
        int count = 0;
        for (int idx = s.indexOf(c); idx >= 0; idx = s.indexOf(c, idx + 1)) {
            if (++count > max) {
                return true;
            }
        }
        return false;
    }

    // Stopword exclusion has been moved to the index. We just filter out
    // junk words here now.
    /**
     * Reports whether the word should be excluded; this is exactly the
     * negation of {@link #isNotJunkWord(String)}.
     *
     * @param s the candidate word
     * @return {@code true} if the word is junk, {@code false} otherwise
     */
    public static boolean isStopWord(String s) {
        return !isNotJunkWord(s);
    }
}