1using System; 2using System.Collections.Generic; 3 4namespace WindowCapture.Helpers 5{ 6 /// <summary> 7 /// SymSpell: Symmetric Delete spelling correction. 8 /// Precomputes delete-combinations for O(1) fuzzy lookup. 9 /// Uses word frequency (index order) to prefer common words. 10 /// </summary> 11 public class SymSpell 12 { 13 private readonly int maxDist; 14 private string[] words; 15 private HashSet<string> wordSet; 16 private Dictionary<int, List<int>> deletesMap; 17 private int freqCutoff; // words below this index are "common" (preferred) 18 private volatile bool ready; 19 20 public bool IsReady { get { return ready; } } 21 public int WordCount { get { return words != null ? words.Length : 0; } } 22 23 public SymSpell(int maxEditDistance) 24 { 25 maxDist = maxEditDistance; 26 } 27 28 public bool Contains(string word) 29 { 30 return wordSet != null && wordSet.Contains(word); 31 } 32 33 /// <summary>Set an external Bloom filter for "is word correct?" checks (1.5M words in 3MB).</summary> 34 public BloomFilter ExternalBloom; 35 36 private bool IsCorrectWord(string lower) 37 { 38 if (ExternalBloom != null && ExternalBloom.MayContain(lower)) return true; 39 return wordSet != null && wordSet.Contains(lower); 40 } 41 42 /// <summary>Build index. Dictionary should be ordered by frequency (most common first).</summary> 43 public void Build(string[] dictionary) 44 { 45 var sw = System.Diagnostics.Stopwatch.StartNew(); 46 words = dictionary; 47 wordSet = new HashSet<string>(StringComparer.Ordinal); 48 freqCutoff = Math.Min(words.Length, 80000); // top 80k = "common" 49 50 var delMap = new Dictionary<int, List<int>>(); 51 52 for (int idx = 0; idx < words.Length; idx++) 53 { 54 string w = words[idx]; 55 wordSet.Add(w); 56 57 if (w.Length < 3 || w.Length > 25) continue; 58 59 // Generate distance-1 deletes for ALL words 60 for (int i = 0; i < w.Length; i++) 61 { 62 string del = w.Remove(i, 1); 63 AddToMap(delMap, del, idx); 64 } 65 66 // Distance-2 deletes for top 40k words (good coverage, ~15MB extra RAM) 67 if (idx < 40000 && w.Length >= 4 && w.Length <= 15) 68 { 69 for (int i = 0; i < w.Length; i++) 70 { 71 string d1 = w.Remove(i, 1); 72 for (int j = 0; j < d1.Length; j++) 73 AddToMap(delMap, d1.Remove(j, 1), idx); 74 } 75 } 76 } 77 78 deletesMap = delMap; 79 sw.Stop(); 80 Logger.Log("textproc", "SymSpell built: " + words.Length + " words, " + 81 deletesMap.Count + " deletes, " + sw.ElapsedMilliseconds + "ms, freq<" + freqCutoff); 82 ready = true; 83 } 84 85 public class Candidate 86 { 87 public string Word; 88 public int Distance; 89 public int FreqIdx; 90 public int ContextScore; // set by TextProcessor 91 } 92 93 /// <summary>Return top N correction candidates sorted by distance then frequency.</summary> 94 public List<Candidate> LookupTop(string input, int topN, out bool isCorrect) 95 { 96 isCorrect = false; 97 var result = new List<Candidate>(); 98 if (!ready || input == null || input.Length < 2) return result; 99 string lower = input.ToLower(); 100 if (IsCorrectWord(lower)) { isCorrect = true; return result; } 101 102 var candidateIdxs = new HashSet<int>(); 103 AddCandidates(lower, candidateIdxs); 104 for (int i = 0; i < lower.Length; i++) 105 { 106 string d1 = lower.Remove(i, 1); 107 AddCandidates(d1, candidateIdxs); 108 for (int j = 0; j < d1.Length; j++) 109 AddCandidates(d1.Remove(j, 1), candidateIdxs); 110 } 111 112 foreach (int idx in candidateIdxs) 113 { 114 if (idx < 0 || idx >= words.Length) continue; 115 string c = words[idx]; 116 if (Math.Abs(c.Length - lower.Length) > maxDist) continue; 117 int dist = DamerauLevenshtein(lower, c); 118 if (dist <= 0 || dist > maxDist) continue; 119 if (dist == 2 && c.Length > 0 && lower.Length > 0 && c[0] != lower[0]) continue; 120 // All words in SymSpell are top-80k, no need for freq filter 121 result.Add(new Candidate { Word = c, Distance = dist, FreqIdx = idx }); 122 } 123 124 // Sort: distance ASC, then freq ASC 125 result.Sort(delegate(Candidate a, Candidate b) 126 { 127 int cmp = a.Distance.CompareTo(b.Distance); 128 if (cmp != 0) return cmp; 129 return a.FreqIdx.CompareTo(b.FreqIdx); 130 }); 131 132 if (result.Count > topN) result.RemoveRange(topN, result.Count - topN); 133 return result; 134 } 135 136 /// <summary>Find best correction. Returns null if word is correct or no match found.</summary> 137 public string Lookup(string input, out int editDist) 138 { 139 editDist = 0; 140 if (!ready || input == null || input.Length < 2) return null; 141 string lower = input.ToLower(); 142 if (IsCorrectWord(lower)) return null; // correct word 143 144 var candidates = new HashSet<int>(); 145 146 // 1. Input as-is might be a delete of some dict word (handles insertions) 147 AddCandidates(lower, candidates); 148 149 // 2. All distance-1 deletes from input 150 for (int i = 0; i < lower.Length; i++) 151 { 152 string d1 = lower.Remove(i, 1); 153 AddCandidates(d1, candidates); 154 155 // 3. Distance-2 deletes from input (for effective dist up to 2) 156 for (int j = 0; j < d1.Length; j++) 157 { 158 string d2 = d1.Remove(j, 1); 159 AddCandidates(d2, candidates); 160 } 161 } 162 163 if (candidates.Count == 0) return null; 164 165 Logger.Log("textproc", " SymSpell candidates: " + candidates.Count); 166 167 // Score candidates: lower distance wins, then frequency (lower index = more common) 168 string best = null; 169 int bestDist = maxDist + 1; 170 int bestIdx = int.MaxValue; 171 bool bestIsCommon = false; 172 173 foreach (int idx in candidates) 174 { 175 if (idx < 0 || idx >= words.Length) continue; 176 string candidate = words[idx]; 177 if (Math.Abs(candidate.Length - lower.Length) > maxDist) continue; 178 179 int dist = DamerauLevenshtein(lower, candidate); 180 if (dist <= 0 || dist > maxDist) continue; 181 182 bool isCommon = idx < freqCutoff; 183 184 // Dist=2: first char must match 185 if (dist == 2 && candidate.Length > 0 && lower.Length > 0 && candidate[0] != lower[0]) 186 continue; 187 188 // Reject very rare words as corrections (idx > 200k = too obscure) 189 if (idx > 200000) continue; 190 191 // Better distance always wins 192 if (dist < bestDist) 193 { 194 bestDist = dist; best = candidate; bestIdx = idx; bestIsCommon = isCommon; 195 } 196 // Same distance: prefer common word, then closer index (more frequent) 197 else if (dist == bestDist) 198 { 199 if (isCommon && !bestIsCommon) 200 { best = candidate; bestIdx = idx; bestIsCommon = true; } 201 else if (isCommon == bestIsCommon && idx < bestIdx) 202 { best = candidate; bestIdx = idx; } 203 } 204 } 205 206 editDist = bestDist; 207 return best; 208 } 209 210 private static void AddToMap(Dictionary<int, List<int>> map, string del, int idx) 211 { 212 int hash = del.GetHashCode(); 213 List<int> bucket; 214 if (!map.TryGetValue(hash, out bucket)) 215 { 216 bucket = new List<int>(2); 217 map[hash] = bucket; 218 } 219 if (bucket.Count < 100) 220 bucket.Add(idx); 221 } 222 223 private void AddCandidates(string key, HashSet<int> candidates) 224 { 225 int hash = key.GetHashCode(); 226 List<int> bucket; 227 if (deletesMap.TryGetValue(hash, out bucket)) 228 { 229 foreach (int idx in bucket) 230 candidates.Add(idx); 231 } 232 } 233 234 // Damerau-Levenshtein: handles transpositions too 235 private static int DamerauLevenshtein(string a, string b) 236 { 237 int n = a.Length, m = b.Length; 238 if (n == 0) return m; 239 if (m == 0) return n; 240 241 int[] prev2 = new int[m + 1]; 242 int[] prev = new int[m + 1]; 243 int[] curr = new int[m + 1]; 244 for (int j = 0; j <= m; j++) prev[j] = j; 245 246 for (int i = 1; i <= n; i++) 247 { 248 curr[0] = i; 249 for (int j = 1; j <= m; j++) 250 { 251 int cost = (a[i - 1] == b[j - 1]) ? 0 : 1; 252 curr[j] = Math.Min(Math.Min(curr[j - 1] + 1, prev[j] + 1), prev[j - 1] + cost); 253 if (i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1]) 254 curr[j] = Math.Min(curr[j], prev2[j - 2] + cost); 255 } 256 var tmp = prev2; prev2 = prev; prev = curr; curr = tmp; 257 } 258 return prev[m]; 259 } 260 } 261}