1using System; 2using System.Collections.Generic; 3using System.IO; 4using System.Text; 5 6namespace WindowCapture.Helpers 7{ 8 /// <summary> 9 /// CompactSpell: memory-efficient spelling correction for millions of words. 10 /// Uses sorted arrays + binary search instead of Dictionary — ~5x less RAM. 11 /// 12 /// Architecture: 13 /// - words[]: all dictionary words, sorted by frequency 14 /// - deleteHashes[]: sorted int32 hashes of all delete-1 combinations 15 /// - deleteWordIdx[]: corresponding word indices (parallel array) 16 /// - Binary search on deleteHashes to find candidates 17 /// 18 /// RAM: ~20MB for 1.5M words (vs ~100MB with Dictionary) 19 /// Speed: O(log n) per lookup instead of O(1), but n=10M so log(n)=23 comparisons 20 /// </summary> 21 public class CompactSpell 22 { 23 private string[] words; // frequency-ordered (for FreqIdx) 24 private HashSet<string> wordSet; // ALL words for lookup candidates 25 private HashSet<string> trustedSet; // top N frequent words — "definitely correct" 26 private int wordCount; 27 28 // Compact delete index: sorted parallel arrays 29 private int[] deleteHashes; // sorted hash values 30 private int[] deleteWordIdxs; // word index for each hash 31 // For duplicate hashes, they're adjacent in the sorted array 32 33 // Phonetic skeleton index (vowel-class collapsed) over trusted words — recall for 34 // vowel-confusion typos (малако→молоко) the delete index can't bridge (2 substitutions). 35 private Dictionary<string, List<int>> phoneticIdx; 36 37 private volatile bool ready; 38 public bool IsReady { get { return ready; } } 39 public int WordCount { get { return wordCount; } } 40 41 // Bloom filter for "is word correct?" — shared 42 public BloomFilter Bloom; 43 44 public bool IsCorrect(string lower) 45 { 46 if (Bloom != null) return Bloom.MayContain(lower); 47 // Fallback when the Bloom filter hasn't been attached yet (init-order race): 48 // 'words' is frequency-ordered, NOT lexicographically sorted, so BinarySearch on it 49 // returns garbage. Use the exact-match HashSet instead. 50 return wordSet != null && wordSet.Contains(lower); 51 } 52 53 /// <summary>Check if word is in full dictionary (1.5M).</summary> 54 public bool ContainsExact(string lower) 55 { 56 return wordSet != null && wordSet.Contains(lower); 57 } 58 59 // Rare short words that should NEVER win over common alternatives 60 private static readonly HashSet<string> rareBlacklist = new HashSet<string> { 61 "риза","ска","омон","дот","шкод","катер", 62 "нина","сводня","кое","слом","тагор", 63 "пресет","превед","компутер","локаут", 64 }; 65 66 // Real 1-2 char words in Russian (everything else is corpus garbage) 67 private static readonly HashSet<string> realShortWords = new HashSet<string> { 68 "а","в","и","к","о","с","у","я","аж","ай","ан","ау", 69 "бы","вы","да","до","ей","ем","ей","её","за","из","их","ко", 70 "ли","мы","на","не","ни","но","ну","об","он","от","по","со", 71 "та","те","то","ту","ты","уж","уз","ум","уч","чё", 72 "же","мне","все","вас","нас","вам","нам","ещё","его","нет", 73 "кто","что","как","так","тут","там","вот","уже","это","она","они", 74 }; 75 76 /// <summary>Check if word is in trusted dictionary. Short words use whitelist.</summary> 77 public bool ContainsTrusted(string lower) 78 { 79 if (lower.Length <= 2) return realShortWords.Contains(lower); 80 return trustedSet != null && trustedSet.Contains(lower); 81 } 82 83 /// <summary>Build compact index from frequency-ordered dictionary.</summary> 84 public void Build(string[] dictionary) 85 { 86 var sw = System.Diagnostics.Stopwatch.StartNew(); 87 words = dictionary; 88 wordCount = words.Length; 89 90 wordSet = new HashSet<string>(StringComparer.Ordinal); 91 trustedSet = new HashSet<string>(StringComparer.Ordinal); 92 phoneticIdx = new Dictionary<string, List<int>>(StringComparer.Ordinal); 93 for (int i = 0; i < wordCount; i++) 94 { 95 wordSet.Add(words[i]); 96 // First 87k words = frequency-ordered from Leeds corpus = trusted 97 if (i < 87000) 98 { 99 trustedSet.Add(words[i]); 100 if (words[i].Length >= 3) 101 { 102 string sk = Skeleton(words[i]); 103 List<int> pl; 104 if (!phoneticIdx.TryGetValue(sk, out pl)) { pl = new List<int>(2); phoneticIdx[sk] = pl; } 105 pl.Add(i); 106 } 107 } 108 } 109 Logger.Log("textproc", "CompactSpell: wordSet=" + wordSet.Count + " trusted=" + trustedSet.Count); 110 111 // Phase 1: count total deletes to pre-allocate arrays 112 long totalDeletes = 0; 113 for (int i = 0; i < wordCount; i++) 114 { 115 int len = words[i].Length; 116 if (len >= 3 && len <= 25) totalDeletes += len; 117 } 118 119 Logger.Log("textproc", "CompactSpell: " + wordCount + " words, ~" + totalDeletes + " deletes, allocating..."); 120 121 // Phase 2: generate all delete-1 hashes 122 // Use List then convert to sorted array 123 var hashes = new int[totalDeletes]; 124 var idxs = new int[totalDeletes]; 125 int pos = 0; 126 127 for (int wi = 0; wi < wordCount; wi++) 128 { 129 string w = words[wi]; 130 if (w.Length < 3 || w.Length > 25) continue; 131 132 for (int i = 0; i < w.Length; i++) 133 { 134 // Hash the delete string without actually creating it (saves GC pressure) 135 int hash = HashDelete(w, i); 136 hashes[pos] = hash; 137 idxs[pos] = wi; 138 pos++; 139 } 140 } 141 142 // Trim if needed 143 if (pos < totalDeletes) 144 { 145 Array.Resize(ref hashes, pos); 146 Array.Resize(ref idxs, pos); 147 } 148 149 // Phase 3: sort by hash (parallel sort of both arrays) 150 SortParallel(hashes, idxs, 0, pos - 1); 151 152 deleteHashes = hashes; 153 deleteWordIdxs = idxs; 154 155 sw.Stop(); 156 Logger.Log("textproc", "CompactSpell built: " + pos + " delete entries, " + 157 (pos * 8 / 1024 / 1024) + "MB arrays, " + sw.ElapsedMilliseconds + "ms"); 158 ready = true; 159 } 160 161 /// <summary>Find best correction candidates.</summary> 162 public List<SymSpell.Candidate> Lookup(string input, int topN) { return Lookup(input, topN, false); } 163 164 /// <summary>force=true generates candidates even for an in-dictionary word — used for the 165 /// frequency-override (a rare corpus word may still have a far more frequent correct neighbour).</summary> 166 public List<SymSpell.Candidate> Lookup(string input, int topN, bool force) 167 { 168 var result = new List<SymSpell.Candidate>(); 169 if (!ready || input == null || input.Length < 2) return result; 170 string lower = input.ToLower(); 171 if (!force && IsCorrect(lower)) return result; // already correct 172 173 var candidateIdxs = new HashSet<int>(); 174 175 // Phonetic recall: trusted words sharing the vowel-collapsed skeleton (малако→молоко). 176 if (phoneticIdx != null && lower.Length >= 3) 177 { 178 List<int> pl; 179 if (phoneticIdx.TryGetValue(Skeleton(lower), out pl) && pl.Count <= 300) 180 foreach (int idx in pl) candidateIdxs.Add(idx); 181 } 182 183 // Check input as-is (it might be a delete of some dict word → insertion error) 184 int inputHash = HashString(lower); 185 FindByHash(inputHash, candidateIdxs); 186 187 // All distance-1 deletes from input 188 for (int i = 0; i < lower.Length; i++) 189 { 190 int h = HashDelete(lower, i); 191 FindByHash(h, candidateIdxs); 192 193 // Distance-2 deletes from input 194 // Build the delete-1 string, then hash each delete-1 of that 195 string d1 = lower.Remove(i, 1); 196 for (int j = 0; j < d1.Length; j++) 197 { 198 int h2 = HashDelete(d1, j); 199 FindByHash(h2, candidateIdxs); 200 } 201 } 202 203 // Score candidates 204 foreach (int idx in candidateIdxs) 205 { 206 if (idx < 0 || idx >= wordCount) continue; 207 string candidate = words[idx]; 208 if (Math.Abs(candidate.Length - lower.Length) > 2) continue; 209 210 int dist = DamerauLevenshtein(lower, candidate); 211 if (dist <= 0 || dist > 2) continue; 212 if (dist == 2 && candidate[0] != lower[0]) continue; // first char must match for dist=2 213 214 result.Add(new SymSpell.Candidate { Word = candidate, Distance = dist, FreqIdx = idx }); 215 } 216 217 // Noisy-channel score: P(candidate) [frequency + trusted] weighted by how natural the 218 // typed->candidate edit is [ё/phonetic/keyboard/double-letter via SpellScore], with the 219 // edit DISTANCE folded into the score (not a hard primary sort key) so a frequent, 220 // plausible real word can outrank a rare/garbage word that is merely one edit closer. 221 foreach (var c in result) 222 { 223 int score = 0; 224 int minL = Math.Min(lower.Length, c.Word.Length); 225 226 if (c.Word.Length > 0 && lower.Length > 0 && c.Word[0] == lower[0]) score += SpellScore.FirstCharBonus; 227 if (c.Word.Length > 0 && lower.Length > 0 && c.Word[c.Word.Length - 1] == lower[lower.Length - 1]) score += SpellScore.LastCharBonus; 228 229 int overlap = 0; 230 for (int p = 0; p < minL; p++) 231 if (lower[p] == c.Word[p]) overlap++; 232 score += overlap * 200; 233 234 if (c.Word.Length == lower.Length) score += SpellScore.SameLenBonus; 235 236 bool isTrusted = trustedSet != null && trustedSet.Contains(c.Word); 237 if (isTrusted) score += SpellScore.TrustedBonus; 238 239 score += SpellScore.FreqBonus(c.FreqIdx); 240 241 // How plausible is this exact edit (ё↔е, о↔а, voicing, keyboard, doubled letter…)? 242 score += SpellScore.EditPlausibility(lower, c.Word); 243 244 // Rare words that historically caused hallucinations (kept as a safety net). 245 if (rareBlacklist.Contains(c.Word)) score -= 8000; 246 247 // Fold distance in — the core fix for the distance-primary "компутер vs компьютер" bug. 248 score -= c.Distance * SpellScore.DistancePenalty; 249 250 c.ContextScore = score; 251 } 252 253 result.Sort(delegate(SymSpell.Candidate a, SymSpell.Candidate b) 254 { 255 return b.ContextScore.CompareTo(a.ContextScore); // combined noisy-channel score 256 }); 257 258 if (result.Count > topN) result.RemoveRange(topN, result.Count - topN); 259 return result; 260 } 261 262 // Find all word indices matching a given hash via binary search 263 private void FindByHash(int hash, HashSet<int> results) 264 { 265 int idx = Array.BinarySearch(deleteHashes, hash); 266 if (idx < 0) return; 267 268 // Found — scan left and right for all entries with same hash 269 for (int i = idx; i >= 0 && deleteHashes[i] == hash; i--) 270 results.Add(deleteWordIdxs[i]); 271 for (int i = idx + 1; i < deleteHashes.Length && deleteHashes[i] == hash; i++) 272 results.Add(deleteWordIdxs[i]); 273 } 274 275 // Normalize: ё→е for hashing (treats them as same char) 276 private static char NormChar(char c) { return c == 'ё' ? 'е' : c; } 277 278 private static int HashDelete(string s, int skipIdx) 279 { 280 unchecked 281 { 282 int hash = 17; 283 for (int i = 0; i < s.Length; i++) 284 { 285 if (i == skipIdx) continue; 286 hash = hash * 31 + NormChar(s[i]); 287 } 288 return hash; 289 } 290 } 291 292 private static int HashString(string s) 293 { 294 unchecked 295 { 296 int hash = 17; 297 for (int i = 0; i < s.Length; i++) 298 hash = hash * 31 + NormChar(s[i]); 299 return hash; 300 } 301 } 302 303 // Quicksort two parallel arrays by the first 304 private static void SortParallel(int[] keys, int[] vals, int lo, int hi) 305 { 306 if (lo >= hi) return; 307 // Use iterative approach with stack to avoid stack overflow on large arrays 308 var stack = new Stack<int>(); 309 stack.Push(lo); stack.Push(hi); 310 311 while (stack.Count > 0) 312 { 313 int h = stack.Pop(); 314 int l = stack.Pop(); 315 if (l >= h) continue; 316 317 int pivot = keys[(l + h) / 2]; 318 int i = l, j = h; 319 while (i <= j) 320 { 321 while (keys[i] < pivot) i++; 322 while (keys[j] > pivot) j--; 323 if (i <= j) 324 { 325 int tk = keys[i]; keys[i] = keys[j]; keys[j] = tk; 326 int tv = vals[i]; vals[i] = vals[j]; vals[j] = tv; 327 i++; j--; 328 } 329 } 330 if (l < j) { stack.Push(l); stack.Push(j); } 331 if (i < h) { stack.Push(i); stack.Push(h); } 332 } 333 } 334 335 private static int DamerauLevenshtein(string a, string b) 336 { 337 int n = a.Length, m = b.Length; 338 if (n == 0) return m; if (m == 0) return n; 339 int[] prev2 = new int[m + 1]; 340 int[] prev = new int[m + 1]; 341 int[] curr = new int[m + 1]; 342 for (int j = 0; j <= m; j++) prev[j] = j; 343 for (int i = 1; i <= n; i++) 344 { 345 curr[0] = i; 346 for (int j = 1; j <= m; j++) 347 { 348 int cost = (a[i - 1] == b[j - 1]) ? 0 : 1; 349 curr[j] = Math.Min(Math.Min(curr[j - 1] + 1, prev[j] + 1), prev[j - 1] + cost); 350 if (i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1]) 351 curr[j] = Math.Min(curr[j], prev2[j - 2] + cost); 352 } 353 var tmp = prev2; prev2 = prev; prev = curr; curr = tmp; 354 } 355 return prev[m]; 356 } 357 358 // Vowel-class collapsed skeleton for phonetic matching (ё→е, о/а→а, е/и/я/э→и, у/ю→у, ы→и). 359 private static string Skeleton(string w) 360 { 361 var sb = new StringBuilder(w.Length); 362 foreach (char c0 in w) 363 { 364 char c = c0 == 'ё' ? 'е' : c0; 365 switch (c) 366 { 367 case 'о': case 'а': sb.Append('а'); break; 368 case 'е': case 'и': case 'я': case 'э': sb.Append('и'); break; 369 case 'у': case 'ю': sb.Append('у'); break; 370 case 'ы': sb.Append('и'); break; 371 default: sb.Append(c); break; 372 } 373 } 374 return sb.ToString(); 375 } 376 } 377}