From 9d008390aefb33c290b90dd4cc082c6a9b3194a0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 21 Sep 2025 13:54:01 +0200 Subject: [PATCH] (language) Fix unicode issues in keyword extraction --- .../keyword/DocumentPositionMapper.java | 26 +++++++++++-------- .../encoding/UnicodeNormalization.java | 14 ++++++---- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/code/functions/language-processing/java/nu/marginalia/keyword/DocumentPositionMapper.java b/code/functions/language-processing/java/nu/marginalia/keyword/DocumentPositionMapper.java index 526601d13..7d2421911 100644 --- a/code/functions/language-processing/java/nu/marginalia/keyword/DocumentPositionMapper.java +++ b/code/functions/language-processing/java/nu/marginalia/keyword/DocumentPositionMapper.java @@ -163,11 +163,14 @@ public class DocumentPositionMapper { int i = 0; - for (int run = 0; run < 15 && i < s.length(); run++, i++) { - char c = s.charAt(i); - if (c >= 'a' && c <= 'z') continue; - if (c >= 'A' && c <= 'Z') continue; - if (c >= '0' && c <= '9') continue; + for (int run = 0; run < 15 && i < s.length(); run++) { + int cp = s.charAt(i); + + i += Character.charCount(cp); + + if (Character.isAlphabetic(cp)) continue; + if (Character.isDigit(cp)) continue; + break; } @@ -183,12 +186,13 @@ public class DocumentPositionMapper { i++; - for (int run = 0; run < 10 && i < s.length(); run++, i++) { - char c = s.charAt(i); - if (c >= 'a' && c <= 'z') continue; - if (c >= 'A' && c <= 'Z') continue; - if (c >= '0' && c <= '9') continue; - break; + for (int run = 0; run < 10 && i < s.length(); run++) { + int cp = s.charAt(i); + + i += Character.charCount(cp); + + if (Character.isAlphabetic(cp)) continue; + if (Character.isDigit(cp)) continue; } } diff --git a/code/functions/language-processing/java/nu/marginalia/language/encoding/UnicodeNormalization.java b/code/functions/language-processing/java/nu/marginalia/language/encoding/UnicodeNormalization.java index 60628d9d3..a59f8b9c2 100644 --- a/code/functions/language-processing/java/nu/marginalia/language/encoding/UnicodeNormalization.java +++ b/code/functions/language-processing/java/nu/marginalia/language/encoding/UnicodeNormalization.java @@ -25,7 +25,9 @@ public interface UnicodeNormalization { if ("\u201C\u201D".indexOf(c) >= 0) { sb.append('"'); } - sb.appendCodePoint(c); + else { + sb.appendCodePoint(c); + } } return sb.toString(); @@ -55,8 +57,9 @@ public interface UnicodeNormalization { else if ("é".indexOf(c) >= 0) { sb.append('e'); } - - sb.appendCodePoint(c); + else { + sb.appendCodePoint(c); + } } return sb.toString(); @@ -83,8 +86,9 @@ public interface UnicodeNormalization { } else if ('ß' == c) { sb.append("ss"); } - - sb.appendCodePoint(c); + else { + sb.appendCodePoint(c); + } } return sb.toString();