1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Fix Unicode issues in keyword extraction

This commit is contained in:
Viktor Lofgren
2025-09-21 13:54:01 +02:00
parent a40c2a8146
commit 9d008390ae
2 changed files with 24 additions and 16 deletions

View File

@@ -163,11 +163,14 @@ public class DocumentPositionMapper {
int i = 0;
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
for (int run = 0; run < 15 && i < s.length(); run++) {
int cp = s.charAt(i);
i += Character.charCount(cp);
if (Character.isAlphabetic(cp)) continue;
if (Character.isDigit(cp)) continue;
break;
}
@@ -183,12 +186,13 @@ public class DocumentPositionMapper {
i++;
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
for (int run = 0; run < 10 && i < s.length(); run++) {
int cp = s.charAt(i);
i += Character.charCount(cp);
if (Character.isAlphabetic(cp)) continue;
if (Character.isDigit(cp)) continue;
}
}

View File

@@ -25,7 +25,9 @@ public interface UnicodeNormalization {
if ("\u201C\u201D".indexOf(c) >= 0) {
sb.append('"');
}
sb.appendCodePoint(c);
else {
sb.appendCodePoint(c);
}
}
return sb.toString();
@@ -55,8 +57,9 @@ public interface UnicodeNormalization {
else if ("é".indexOf(c) >= 0) {
sb.append('e');
}
sb.appendCodePoint(c);
else {
sb.appendCodePoint(c);
}
}
return sb.toString();
@@ -83,8 +86,9 @@ public interface UnicodeNormalization {
} else if ('ß' == c) {
sb.append("ss");
}
sb.appendCodePoint(c);
else {
sb.appendCodePoint(c);
}
}
return sb.toString();