Implement "ngram" suggestions

This is the last part of the `suggester`. Hunspell has a bespoke
string similarity measurement called "ngram similarity." Conceptually
it's like Jaro or Levenshtein similarity - a measurement for how close
two strings are.

The suggester resorts to ngram suggestions when it believes that the
simple string edits in `suggest_low` are not high quality. Ngram
suggestions are a pipeline:

* Iterate on all stems in the wordlist. Take the 100 most promising
  according to a basic ngram similarity score.
* Expand all affixes for each stem and give each expanded form a score
  based on another ngram-similarity-based metric. Take up to the top 200
  most promising candidates.
* Determine a threshold to eliminate lower quality candidates.
* Return the remaining most promising candidates.

It's notable that, because we iterate on the entire wordlist, ngram
suggestions are far, far slower than the basic edit-based suggestions.
This commit is contained in:
Michael Davis
2024-11-10 15:52:54 -05:00
parent 8455774155
commit 3f0aa5cab0
8 changed files with 823 additions and 53 deletions

View File

@@ -7,8 +7,8 @@ This example doesn't check whether the input word is in the dictionary first.
```
$ cargo run --example suggest ansi
Compiled the dictionary in 138ms
Suggestions for "ansi": "ANSI", "ans" (checked in 23µs)
Compiled the dictionary in 127ms
Suggestions for "ansi": "ANSI", "ans", "anti", "ans i" (checked in 1367µs)
```
*/
use std::time::Instant;

View File

@@ -124,7 +124,7 @@ pub(crate) struct Affix<K> {
/// a prefix and a suffix, both the prefix and suffix should have `crossproduct: true`.
pub crossproduct: bool,
/// What is stripped from the stem when the affix is applied.
strip: Option<String>,
pub strip: Option<String>,
/// What should be added when the affix is applied.
pub add: String,
/// Condition that the stem should be checked against to query if the affix is relevant.
@@ -522,6 +522,10 @@ impl<C: AffixKind> AffixIndex<C> {
pub fn len(&self) -> usize {
self.table.len()
}
pub fn iter(&self) -> core::slice::Iter<Affix<C>> {
self.table.iter()
}
}
/// An iterator over the prefixes/suffixes of a given word.
@@ -1120,6 +1124,21 @@ impl CaseHandling {
}
}
pub fn lowercase_into_utf32(&self, word: &str, out: &mut Vec<char>) {
out.extend(
word.chars()
.map(match self {
Self::Turkic => |ch| match ch {
'I' => 'ı',
'İ' => 'i',
_ => ch,
},
Self::Standard => |ch| ch,
})
.flat_map(|ch| ch.to_lowercase()),
)
}
pub fn uppercase(&self, word: &str) -> String {
match self {
Self::Turkic => word.replace('i', "İ").replace('ı', "I").to_uppercase(),
@@ -1176,6 +1195,22 @@ impl CaseHandling {
}
output
}
/// Checks whether `left` is equal to `right` when `right` is lowercase.
pub fn is_char_eq_lowercase(&self, left: char, right: char) -> bool {
match (self, left, right) {
(Self::Turkic, 'ı', 'I') => return true,
(Self::Turkic, 'i', 'İ') => return true,
_ => (),
}
let mut lower_iter = right.to_lowercase();
if lower_iter.len() != 1 {
return false;
}
let lower = lower_iter.next().unwrap();
left == lower
}
}
#[derive(Debug, Clone)]

View File

@@ -511,7 +511,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> {
}
// Reversed form of Nuspell's `outer_affix_NOT_valid`
fn is_outer_affix_valid<K: AffixKind, const MODE: AffixingMode>(
pub(crate) fn is_outer_affix_valid<K: AffixKind, const MODE: AffixingMode>(
&self,
affix: &Affix<K>,
) -> bool {
@@ -526,7 +526,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> {
true
}
fn is_circumfix<K: AffixKind>(&self, affix: &Affix<K>) -> bool {
pub(crate) fn is_circumfix<K: AffixKind>(&self, affix: &Affix<K>) -> bool {
has_flag!(affix.flags, self.aff.options.circumfix_flag)
}

View File

@@ -33,7 +33,7 @@ mod umbra_slice;
pub use aff::parser::{
ParseDictionaryError, ParseDictionaryErrorKind, ParseDictionaryErrorSource, ParseFlagError,
};
use suggester::Suggester;
pub use suggester::Suggester;
use crate::alloc::{borrow::Cow, slice, string::String, vec::Vec};
use aff::AffData;
@@ -192,8 +192,18 @@ impl<S: BuildHasher> Dictionary<S> {
}
/// Fills the given vec with possible corrections from the dictionary for the given word.
///
/// This is the same as [Suggester::suggest] but uses the default Suggester behavior.
pub fn suggest(&self, word: &str, out: &mut Vec<String>) {
Suggester::new(Checker::new(self)).suggest(word, out)
self.suggester().suggest(word, out)
}
/// Creates a Suggester that borrows this dictionary.
///
/// The [Suggester] type can be used to customize the suggestion behavior (for example to
/// disable ngram suggestions). See the [Suggester] docs.
pub fn suggester(&self) -> Suggester<S> {
Suggester::new(Checker::new(self))
}
/// Adds a word to the dictionary.
@@ -450,6 +460,7 @@ const MAX_WORD_LEN: usize = 360;
/// The casing of a word.
// Hunspell: <https://github.com/hunspell/hunspell/blob/8f9bb2957bfd74ca153fad96083a54488b518ca5/src/hunspell/csutil.hxx#L91-L96>
// Nuspell: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/src/nuspell/utils.hxx#L91-L104>
#[derive(Debug, Clone, Copy)]
enum Casing {
/// All letters are lowercase. For example "foobar".
///

View File

@@ -1,3 +1,6 @@
mod ngram;
use core::fmt;
use core::hash::BuildHasher;
use crate::{
@@ -15,15 +18,44 @@ macro_rules! has_flag {
}};
}
pub(crate) struct Suggester<'a, S: BuildHasher> {
/// A wrapper struct for a dictionary that allows customizing suggestion behavior.
///
/// Currently only [ngram suggestions](Suggester::with_ngram_suggestions) may be configured.
pub struct Suggester<'a, S: BuildHasher> {
checker: Checker<'a, S>,
ngram_suggest: bool,
}
impl<'a, S: BuildHasher> fmt::Debug for Suggester<'a, S> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Suggester")
.field("ngram_suggest", &self.ngram_suggest)
.finish_non_exhaustive()
}
}
impl<'a, S: BuildHasher> Suggester<'a, S> {
pub fn new(checker: Checker<'a, S>) -> Self {
Self { checker }
pub(crate) fn new(checker: Checker<'a, S>) -> Self {
Self {
checker,
ngram_suggest: true,
}
}
/// Enables or disables the suggester from finding suggestions based on "ngram similarity."
///
/// Ngram similarity is a bespoke string similarity metric used by the suggester to find
/// suggestions when string edits don't produce any likely candidates. Finding a suggestion
/// with ngram similarity involves iterating through the dictionary's wordlist and therefore
/// ngram suggestion can be very slow depending on the dictionary size.
///
/// Ngram suggestion is enabled by default.
pub fn with_ngram_suggestions(mut self, ngram_suggest: bool) -> Self {
self.ngram_suggest = ngram_suggest;
self
}
/// Fills the given vec with possible corrections from the dictionary for the given word.
pub fn suggest(&self, word: &str, out: &mut Vec<String>) {
out.clear();
if word.len() >= MAX_WORD_LEN {
@@ -76,9 +108,10 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
if matches!(casing_after_dot, Casing::Init) {
let mut buffer = String::from(word.as_ref());
unsafe {
// SAFETY: '.' and ' ' are both one byte so we can swap the characters
// SAFETY: b' ' is ASCII and therefore a valid UTF-8 character, so
// we can safely assume it after the '.' character's right boundary
// without invalidating the UTF-8.
buffer.as_bytes_mut()[dot_idx] = b' ';
buffer.as_mut_vec().insert(dot_idx + 1, b' ');
}
// Nuspell inserts suggestions at the beginning of the list in this block.
// `insert_sug_first(word, out)`
@@ -102,11 +135,13 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
out.insert(0, lowered);
}
}
let lowercase = self.checker.aff.options.case_handling.lowercase(&word);
hq_suggestions |= self.suggest_low(&lowercase, out);
if self.checker.check(&lowercase) {
out.insert(0, lowercase);
}
if matches!(casing, Casing::Pascal) {
let titlecase = self.checker.aff.options.case_handling.titlecase(&word);
hq_suggestions |= self.suggest_low(&titlecase, out);
@@ -124,7 +159,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
if len > word.len() {
continue;
}
if suggestion[after_space_idx..] == word[..word.len() - len] {
if suggestion[after_space_idx..] == word[word.len() - len..] {
continue;
}
let titled = self
@@ -134,9 +169,9 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
.case_handling
.upper_char_at(suggestion, after_space_idx);
out[i] = titled;
// Rotate this suggestion to the front. (I think? TODO)
// Rotate this suggestion to the front.
if i > 0 {
out[..i].rotate_right(1);
out[..=i].rotate_right(1);
}
}
}
@@ -155,7 +190,10 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
}
}
if !hq_suggestions && self.checker.aff.options.max_ngram_suggestions != 0 {
if self.ngram_suggest
&& !hq_suggestions
&& self.checker.aff.options.max_ngram_suggestions != 0
{
let buffer = if matches!(casing, Casing::None) {
Cow::Borrowed(word.as_ref())
} else {
@@ -166,7 +204,7 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
self.ngram_suggest(&buffer, out);
if matches!(casing, Casing::All) {
for suggestion in &mut out[..old_len] {
for suggestion in &mut out[old_len..] {
let upper = self.checker.aff.options.case_handling.uppercase(suggestion);
*suggestion = upper;
}
@@ -188,17 +226,16 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
None => &word[i..],
};
if !self.checker.check(part) {
suggestions_tmp.clear();
self.suggest_impl(part, &mut suggestions_tmp);
let mut buffer = String::with_capacity(word.len());
for s in &suggestions_tmp {
for s in suggestions_tmp.drain(..) {
buffer.clear();
buffer.push_str(&word[..i]);
buffer.push_str(s);
buffer.push_str(&s);
if let Some(j) = j {
buffer.push_str(&word[j..]);
}
if self
if !self
.checker
.check_word(&buffer, Forceucase::default(), HiddenHomonym::default())
.is_some_and(|flags| {
@@ -219,8 +256,13 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
if matches!(casing, Casing::Init | Casing::Pascal) {
for suggestion in out.iter_mut() {
let titlecased = self.checker.aff.options.case_handling.titlecase(suggestion);
*suggestion = titlecased;
let uppered = self
.checker
.aff
.options
.case_handling
.upper_char_at(suggestion, 0);
*suggestion = uppered;
}
}
@@ -911,11 +953,6 @@ impl<'a, S: BuildHasher> Suggester<'a, S> {
.try_into()
.expect("clamping and divisions should ensure this can fit into usize")
}
#[allow(clippy::ptr_arg)]
fn ngram_suggest(&self, _word: &str, _out: &mut Vec<String>) {
// TODO this is a lot.
}
}
/// Removes all duplicate items in a vector while preserving order.
@@ -1158,7 +1195,8 @@ mod test {
fn suggest(dict: &Dictionary, word: &str) -> Vec<String> {
let mut suggestions = Vec::new();
dict.suggest(word, &mut suggestions);
let suggester = dict.suggester().with_ngram_suggestions(false);
suggester.suggest(word, &mut suggestions);
suggestions
}

677
src/suggester/ngram.rs Normal file
View File

@@ -0,0 +1,677 @@
// The parent module covers suggestions made by editing the input word (for example swapping two
// characters.) This module instead covers "ngram suggestions" - a fancier and more expensive
// procedure.
//
// The basic idea of ngram suggestion is to find words in the dictionary similar to the input
// word. To do that we try to filter down the wordlist words in a multi step process.
//
// 1. Find 100 stems in the word list with the highest "ngram similarity" score to the input word.
// 2. Expand the prefixes and suffixes for those 100 stems and find the 200 expanded words with
// the highest ngram similarity score to the input word.
// 3. Re-score the (up to) 200 best candidates based on weighted ngram similarity and other
// bespoke metrics.
// 4. Push the most promising candidates to the `out` vec.
//
// Note that this is **very** expensive compared to regular edit based suggestions since we need
// to iterate on the word list and expand affixes.
//
// Ngram suggestions are also criticized as not very smart:
// <https://battlepenguin.com/tech/aspell-and-hunspell-a-tale-of-two-spell-checkers/>. Ngram
// suggestions are implemented for compatibility with Nuspell/Hunspell but we could consider
// adding other strategies as well, for example looking at the Aspell code.
use core::hash::BuildHasher;
use crate::alloc::{collections::BinaryHeap, string::String, vec::Vec};
use crate::aff::{CaseHandling, HIDDEN_HOMONYM_FLAG, MAX_SUGGESTIONS};
use crate::{FlagSet, Stem, FULL_WORD};
use super::Suggester;
// For ngram suggestions we'll be switching to UTF-32. UTF-32 uses 32-bit integers to represent
// every char.
//
// Compare to UTF-8 - the representation of `String` and `str` - in which a character could be
// represented by one through four bytes.
//
// UTF-32 is handy because indices are intuitive. `utf32_str[3]` is the third character. The same
// isn't true of UTF-8 - you index into the bytes so `utf8_str[3]` could be right in the middle of
// the bytes used to represent a character.
//
// `String` is a newtype wrapper around a `Vec<u8>` which is asserted to contain valid UTF-8. In
// contrast we won't use a newtype wrapper for our Utf32String below since any sequence of
// `char`s is valid UTF-32.
//
// This is getting off-topic but UTF-8 is generally preferred because it has good compression -
// for ASCII (which very common) you only need a byte to represent a character. UTF-32 strings
// take a constant 4 bytes per character which is relatively expensive.
type Utf32String = Vec<char>;
type Utf32Str = [char];
macro_rules! has_flag {
( $flags:expr, $flag:expr ) => {{
match $flag {
Some(flag) => $flags.contains(&flag),
None => false,
}
}};
}
/// This struct is used as a wrapper for other data (for example stem+flagset) to organize a
/// min-heap with BinaryHeap. (`BinaryHeap` from the standard library is a max-heap and you need
/// to reverse the Ord of the type to use it as a min-heap.)
#[derive(Debug, PartialEq, Eq)]
struct MinScored<T: PartialEq + Eq> {
score: isize,
inner: T,
}
impl<T: PartialEq + Eq> Ord for MinScored<T> {
fn cmp(&self, other: &Self) -> core::cmp::Ordering {
self.score.cmp(&other.score).reverse()
}
}
impl<T: PartialEq + Eq> PartialOrd<Self> for MinScored<T> {
fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<'a, S: BuildHasher> Suggester<'a, S> {
pub(super) fn ngram_suggest(&self, word: &str, out: &mut Vec<String>) {
// First step: find 100 stems in the word list with the best ngram score.
let wrong_word: Utf32String = word.chars().collect();
let mut wide_buf = Vec::new();
let mut roots = BinaryHeap::with_capacity(100);
let mut stem_utf32 = Vec::new();
for (stem_utf8, flagset) in self.checker.words.iter() {
if has_flag!(flagset, self.checker.aff.options.forbidden_word_flag)
|| has_flag!(flagset, self.checker.aff.options.no_suggest_flag)
|| has_flag!(flagset, self.checker.aff.options.only_in_compound_flag)
|| flagset.contains(&HIDDEN_HOMONYM_FLAG)
{
continue;
}
// Convert the dictionary stem to UTF-32.
stem_utf32.clear();
stem_utf32.extend(stem_utf8.as_ref().chars());
let mut score = left_common_substring_length(
&self.checker.aff.options.case_handling,
&wrong_word,
&stem_utf32,
) as isize;
wide_buf.clear();
self.checker
.aff
.options
.case_handling
.lowercase_into_utf32(stem_utf8.as_ref(), &mut wide_buf);
score += ngram_similarity_longer_worse(3, &wrong_word, &wide_buf);
let root = MinScored {
score,
inner: (stem_utf8, flagset),
};
if roots.len() != 100 {
roots.push(root);
} else if roots.peek().is_some_and(|entry| score > entry.score) {
// The heap has hit capacity. Drop the lowest scoring root and push this new
// higher scored root.
roots.pop();
roots.push(root);
}
}
// Calculate a somewhat low threshold score so that we can ignore bad suggestions in the
// next steps.
let mut threshold = 0isize;
for k in 1..=3 {
let mangled_word = &mut wide_buf;
mangled_word.clear();
mangled_word.extend_from_slice(&wrong_word);
let mut i = k;
while i < mangled_word.len() {
mangled_word[i] = '*';
i += 4;
}
threshold += ngram_similarity_any_mismatch(wrong_word.len(), &wrong_word, mangled_word);
}
threshold /= 3;
// Step two: expand the affixes for these wordlist entries, gathering the 200 highest
// scoring candidates.
let mut expanded_list = Vec::new();
let mut expanded_cross_affix = Vec::new();
let mut expanded_word_utf32 = Vec::new();
let mut guess_words = BinaryHeap::new();
for MinScored {
inner: (stem, flags),
..
} in roots
{
expanded_cross_affix.clear();
self.expand_stem_for_ngram(
stem,
flags,
word,
&mut expanded_list,
&mut expanded_cross_affix,
);
for expanded_word_utf8 in expanded_list.drain(..) {
expanded_word_utf32.clear();
expanded_word_utf32.extend(expanded_word_utf8.chars());
let mut score = left_common_substring_length(
&self.checker.aff.options.case_handling,
&wrong_word,
&expanded_word_utf32,
) as isize;
let lower_expanded_word = &mut wide_buf;
lower_expanded_word.clear();
self.checker
.aff
.options
.case_handling
.lowercase_into_utf32(&expanded_word_utf8, lower_expanded_word);
score += ngram_similarity_any_mismatch(
wrong_word.len(),
&wrong_word,
lower_expanded_word,
);
if score < threshold {
continue;
}
// Nuspell stores the UTF-32 word in `guess_words`, but then later converts from
// UTF-32 into UTF-8 when pushing into `out`. For our sake it's easier to store
// the UTF-8 instead and avoid the conversion later.
let guess_word = MinScored {
score,
inner: expanded_word_utf8,
};
if guess_words.len() != 200 {
guess_words.push(guess_word);
} else if guess_words.peek().is_some_and(|entry| score > entry.score) {
guess_words.pop();
guess_words.push(guess_word);
}
}
}
// Step three: rescore these up to 200 potential matches based on a weighted ngram
// calculation and other bespoke measurements.
// Scratchpad vector used for calculating longest common subsequences. See
// `longest_common_subsequence_length`.
let mut lcs_state = Vec::new();
// Nuspell questions whether or not the heap needs to be sorted before iterating.
// For now, they sort the heap. I think Nuspell is correct to do so because the `break`
// below could cause a different end behavior based on whether we're iterating on a sorted
// or unsorted vec.
let mut guess_words = guess_words.into_sorted_vec();
for MinScored {
score,
inner: guess_word,
} in guess_words.iter_mut()
{
let lower_guess_word = &mut wide_buf;
lower_guess_word.clear();
self.checker
.aff
.options
.case_handling
.lowercase_into_utf32(guess_word, lower_guess_word);
let lcs =
longest_common_subsequence_length(&wrong_word, lower_guess_word, &mut lcs_state);
if wrong_word.len() == lower_guess_word.len() && wrong_word.len() == lcs {
*score += 2000;
break;
}
let mut ngram2 =
ngram_similarity_any_mismatch_weighted(2, &wrong_word, lower_guess_word);
ngram2 += ngram_similarity_any_mismatch_weighted(2, lower_guess_word, &wrong_word);
let ngram4 = ngram_similarity_any_mismatch(4, &wrong_word, lower_guess_word);
let left_common = left_common_substring_length(
&self.checker.aff.options.case_handling,
&wrong_word,
lower_guess_word,
);
let (num_eq_chars_same_pos, eq_char_is_swapped) =
count_eq_at_same_pos(&wrong_word, lower_guess_word);
*score = 2 * lcs as isize;
*score -= (wrong_word.len() as isize - lower_guess_word.len() as isize).abs();
*score += left_common as isize + ngram2 + ngram4;
if num_eq_chars_same_pos != 0 {
*score += 1;
}
if eq_char_is_swapped {
*score += 10;
}
if 5 * ngram2
< ((wrong_word.len() + lower_guess_word.len())
* (10 - self.checker.aff.options.max_diff_factor as usize))
as isize
{
*score -= 1000;
}
}
// We've updated the scores (`iter_mut` above) so we need to re-sort the Vec.
// Note that because of `MinScored<T>`'s `Ord` implementation the Vec is ordered by
// score descending. (Normally a sort would be ascending.)
guess_words.sort_unstable();
// Step four: push the most promising of the candidates to `out`.
let be_more_selective = guess_words.first().is_some_and(|guess| guess.score > 1000);
let old_num_suggestions = out.len();
let max_suggestions = MAX_SUGGESTIONS
.min(old_num_suggestions + self.checker.aff.options.max_ngram_suggestions as usize);
for MinScored {
score,
inner: guess_word,
} in guess_words.into_iter()
{
if out.len() == max_suggestions {
break;
}
// Note that we are iterating in descending score order, so this sets a minimum.
if be_more_selective && score <= 1000 {
break;
}
if score < -100
&& (old_num_suggestions != out.len() || self.checker.aff.options.only_max_diff)
{
break;
}
// Nuspell is converting back to UTF-8 but we store the `guess_word` in UTF-8 form.
// I think Nuspell only carries UTF-32 because of templates allowing easy conversion
// to lowercase. In Spellbook's case we need an explicit function for this and we
// already have one for UTF-8, so it's easier to carry UTF-8. It's nearly always less
// memory plus we save a conversion to UTF-8 right here:
if out
.iter()
.any(|sug| contains_subslice(guess_word.as_bytes(), sug.as_bytes()))
{
if score < -100 {
break;
} else {
continue;
}
}
out.push(guess_word);
}
}
fn expand_stem_for_ngram(
&self,
stem: &Stem,
flags: &FlagSet,
word: &str,
expanded_list: &mut Vec<String>,
cross_affix: &mut Vec<bool>,
) {
expanded_list.clear();
cross_affix.clear();
if !has_flag!(flags, self.checker.aff.options.need_affix_flag) {
expanded_list.push(String::from(stem.as_ref()));
cross_affix.push(false);
}
if flags.is_empty() {
return;
}
// TODO: investigate collecting `all_flags` (like we do for compounds IIRC) on the
// prefixes and suffixes tables to see if we can disqualify flagsets faster?
for suffix in self.checker.aff.suffixes.iter() {
// Nuspell:
// if (!cross_valid_inner_outer(flags, suffix))
// continue;
if !flags.contains(&suffix.flag) {
continue;
}
if !self.checker.is_outer_affix_valid::<_, FULL_WORD>(suffix) {
continue;
}
if self.checker.is_circumfix(suffix) {
continue;
}
// Nuspell has a todo here:
// > Suffixes marked with needaffix or circumfix should not just be skipped as we can
// > later add prefix. This is not handled in hunspell, too.
if suffix
.strip
.as_ref()
.is_some_and(|suf| !stem.as_str().ends_with(suf))
{
continue;
}
if !suffix.condition_matches(stem.as_str()) {
continue;
}
if !suffix.add.is_empty() && !word.ends_with(&suffix.add) {
continue;
}
let expanded = suffix.to_derived(stem.as_str());
expanded_list.push(expanded);
cross_affix.push(suffix.crossproduct);
}
// Expand crossproduct words - prefixes for suffix-expanded words marked with
// `crossproduct: true`.
for i in 0..expanded_list.len() {
if !cross_affix[i] {
continue;
}
for prefix in self.checker.aff.prefixes.iter() {
let suffixed_stem = &expanded_list[i];
// if (!cross_valid_inner_outer(flags, prefix))
// continue;
if !flags.contains(&prefix.flag) {
continue;
}
if !self.checker.is_outer_affix_valid::<_, FULL_WORD>(prefix) {
continue;
}
if self.checker.is_circumfix(prefix) {
continue;
}
if prefix
.strip
.as_ref()
.is_some_and(|pre| !suffixed_stem.starts_with(pre))
{
continue;
}
if !prefix.condition_matches(suffixed_stem) {
continue;
}
if !prefix.add.is_empty() && !word.starts_with(&prefix.add) {
continue;
}
let expanded = prefix.to_derived(stem.as_str());
expanded_list.push(expanded);
}
}
for prefix in self.checker.aff.prefixes.iter() {
// Nuspell:
// if (!cross_valid_inner_outer(flags, prefix))
// continue;
if !flags.contains(&prefix.flag) {
continue;
}
if !self.checker.is_outer_affix_valid::<_, FULL_WORD>(prefix) {
continue;
}
if self.checker.is_circumfix(prefix) {
continue;
}
if prefix
.strip
.as_ref()
.is_some_and(|pre| !stem.as_str().starts_with(pre))
{
continue;
}
if !prefix.condition_matches(stem.as_str()) {
continue;
}
if !prefix.add.is_empty() && !word.starts_with(&prefix.add) {
continue;
}
let expanded = prefix.to_derived(stem.as_str());
expanded_list.push(expanded);
}
}
}
fn left_common_substring_length(
case_handling: &CaseHandling,
left: &Utf32Str,
right: &Utf32Str,
) -> usize {
if left.is_empty() || right.is_empty() {
return 0;
}
if left[0] != right[0] && !case_handling.is_char_eq_lowercase(left[0], right[0]) {
return 0;
}
index_of_mismatch(&left[1..], &right[1..])
.map(|idx| idx + 1)
.unwrap_or(left.len())
}
fn index_of_mismatch<T: Eq>(left: &[T], right: &[T]) -> Option<usize> {
left.iter()
.enumerate()
.find_map(|(idx, l)| match right.get(idx) {
Some(r) if r == l => None,
_ => Some(idx),
})
}
fn ngram_similarity_longer_worse(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
if right.is_empty() {
return 0;
}
let mut score = ngram_similarity(n, left, right);
let d = (left.len() as isize - right.len() as isize) - 2;
if d > 0 {
score -= d;
}
score
}
// Nuspell calls this `ngram_similarity_low_level`.
fn ngram_similarity(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
let n = n.min(left.len());
let mut score = 0;
for k in 1..=n {
let mut k_score = 0;
for i in 0..=left.len() - k {
let kgram = &left[i..i + k];
if contains_subslice(right, kgram) {
k_score += 1;
}
}
score += k_score;
if k_score < 2 {
break;
}
}
score
}
fn contains_subslice<T: Eq>(slice: &[T], subslice: &[T]) -> bool {
if subslice.len() > slice.len() {
return false;
}
let window = slice.len() - subslice.len();
for i in 0..=window {
if slice[i..].starts_with(subslice) {
return true;
}
}
false
}
fn ngram_similarity_any_mismatch(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
if right.is_empty() {
return 0;
}
let mut score = ngram_similarity(n, left, right);
let d = (right.len() as isize - left.len() as isize).abs() - 2;
if d > 0 {
score -= d;
}
score
}
// Nuspell returns an isize.
fn longest_common_subsequence_length<T: Eq>(
left: &[T],
right: &[T],
state_buffer: &mut Vec<usize>,
) -> usize {
state_buffer.clear();
state_buffer.resize(right.len(), 0);
let mut row1_prev = 0;
for l in left.iter() {
row1_prev = 0;
let mut row2_prev = 0;
for j in 0..right.len() {
let row1_current = state_buffer[j];
let row2_current = &mut state_buffer[j];
*row2_current = if *l == right[j] {
row1_prev + 1
} else {
row1_current.max(row2_prev)
};
row1_prev = row1_current;
row2_prev = *row2_current;
}
row1_prev = row2_prev;
}
row1_prev
}
fn ngram_similarity_any_mismatch_weighted(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
if right.is_empty() {
return 0;
}
let mut score = ngram_similarity_weighted(n, left, right);
let d = (right.len() as isize - left.len() as isize).abs() - 2;
if d > 0 {
score -= d;
}
score
}
fn ngram_similarity_weighted(n: usize, left: &Utf32Str, right: &Utf32Str) -> isize {
let n = n.min(left.len());
let mut score = 0;
for k in 1..=n {
let mut k_score = 0;
for i in 0..=left.len() - k {
let kgram = &left[i..i + k];
if contains_subslice(right, kgram) {
k_score += 1;
} else {
k_score -= 1;
if i == 0 || i == left.len() - k {
k_score -= 1;
}
}
}
score += k_score;
}
score
}
fn count_eq_at_same_pos<T: Eq + Copy>(left: &[T], right: &[T]) -> (usize, bool) {
let n = left.len().min(right.len());
let count = left
.iter()
.zip(right.iter())
.filter(|(l, r)| l == r)
.count();
let mut is_swap = false;
// Only two characters are not equal. Check if they were swapped.
if left.len() == right.len() && n - count == 2 {
let mut first_mismatch = None;
for (l, r) in left.iter().zip(right.iter()) {
if l != r {
if let Some((l1, r1)) = first_mismatch {
is_swap = l1 == r && r1 == l;
break;
}
first_mismatch = Some((l, r));
}
}
}
(count, is_swap)
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn index_of_mismatch_test() {
assert_eq!(index_of_mismatch(b"abcd", b"abcd"), None);
assert_eq!(index_of_mismatch(b"abcd", b"abxy"), Some(2));
assert_eq!(index_of_mismatch(b"abcd", b"abc"), Some(3));
assert_eq!(index_of_mismatch(b"abc", b"abcd"), None);
}
#[test]
fn contains_subslice_test() {
assert!(contains_subslice(b"abcd", b"abcd"));
assert!(contains_subslice(b"abcd", b"abc"));
assert!(contains_subslice(b"abcd", b"bcd"));
assert!(contains_subslice(b"abcd", b"b"));
}
#[test]
fn nagrm_similarity_test() {
// Rebuilding the Spellchecker:
// > ngram(3, 'actually', 'akchualy')
// > 11 = a, c, u, a, l, l, y, ua, al, ly, ual
let left: Utf32String = "actually".chars().collect();
let right: Utf32String = "akchualy".chars().collect();
assert_eq!(ngram_similarity(3, &left, &right), 11);
}
#[test]
fn longest_common_subsequence_length_test() {
let mut state_buffer = Vec::new();
assert_eq!(
longest_common_subsequence_length(b"aaa", b"aaa", &mut state_buffer),
3
);
assert_eq!(
longest_common_subsequence_length(b"aaaaa", b"bbbaa", &mut state_buffer),
2
);
}
#[test]
fn count_eq_at_same_pos_test() {
assert_eq!(count_eq_at_same_pos(b"abcd", b"abcd"), (4, false));
assert_eq!(count_eq_at_same_pos(b"abcd", b"acbd"), (2, true));
}
}

View File

@@ -386,11 +386,15 @@ impl UmbraString {
pub fn as_bytes(&self) -> &[u8] {
self.0.as_slice()
}
pub fn as_str(&self) -> &str {
unsafe { core::str::from_utf8_unchecked(self.as_bytes()) }
}
}
impl AsRef<str> for UmbraString {
fn as_ref(&self) -> &str {
unsafe { core::str::from_utf8_unchecked(self.as_bytes()) }
self.as_str()
}
}

View File

@@ -209,6 +209,7 @@ fn do_suggest_case(case: &str) {
for word in fs::read_to_string(path.with_extension("wrong"))
.iter()
.flat_map(|text| text.lines())
.filter(|line| !line.is_empty())
{
assert!(
!dict.check(word),
@@ -226,6 +227,7 @@ fn do_suggest_case(case: &str) {
for line in fs::read_to_string(path.with_extension("sug"))
.iter()
.flat_map(|text| text.lines())
.filter(|line| !line.is_empty())
{
let sugs: Vec<_> = line.split(", ").map(ToOwned::to_owned).collect();
if !sugs.is_empty() {
@@ -233,42 +235,45 @@ fn do_suggest_case(case: &str) {
}
}
assert_eq!(list_sugs, expected_list_sugs);
assert_eq!(
expected_list_sugs, list_sugs,
"(left: expected, right: actual)"
);
}
// TODO: fix and uncomment the broken tests.
// suggest!(suggest_1463589);
// suggest!(suggest_1463589_utf);
suggest!(suggest_1463589);
suggest!(suggest_1463589_utf);
suggest!(suggest_1695964);
// suggest!(suggest_IJ);
// suggest!(suggest_allcaps);
// suggest!(suggest_allcaps_utf);
suggest!(suggest_IJ);
suggest!(suggest_allcaps);
suggest!(suggest_allcaps_utf);
suggest!(suggest_allcaps2);
// suggest!(suggest_base);
// suggest!(suggest_base_utf);
// suggest!(suggest_breakdefault);
// suggest!(suggest_checksharps);
// suggest!(suggest_checksharpsutf);
suggest!(suggest_base);
suggest!(suggest_base_utf);
suggest!(suggest_breakdefault);
suggest!(suggest_forceucase);
// suggest!(suggest_i35725);
// suggest!(suggest_i54633);
// suggest!(suggest_i58202);
// suggest!(suggest_keepcase);
suggest!(suggest_i35725);
suggest!(suggest_i54633);
suggest!(suggest_i58202);
suggest!(suggest_keepcase);
suggest!(suggest_map);
suggest!(suggest_maputf);
suggest!(suggest_ngram_utf_fix);
// suggest!(suggest_nosuggest);
// suggest!(suggest_oconv);
suggest!(suggest_oconv);
suggest!(suggest_onlyincompound);
suggest!(suggest_opentaal_forbiddenword1);
suggest!(suggest_opentaal_forbiddenword2);
// suggest!(suggest_opentaal_keepcase);
// suggest!(suggest_phone);
suggest!(suggest_opentaal_keepcase);
suggest!(suggest_rep);
suggest!(suggest_reputf);
// suggest!(suggest_sug);
// suggest!(suggest_sugutf);
suggest!(suggest_sug);
suggest!(suggest_sugutf);
// These are marked as failing in Nuspell:
// suggest!(suggest_checksharps);
// suggest!(suggest_checksharpsutf);
// suggest!(suggest_nosuggest);
// suggest!(suggest_phone);
// suggest!(suggest_utf8_nonbmp);
/// Reads the contents of a file into a String, handling detecting and decoding of non-UTF-8