windowcapture
исходный код / Helpers/SymSpell.cs

SymSpell.cs

261 строк · 9,886 байт · модуль Helpers
  1using System;
  2using System.Collections.Generic;
  3
  4namespace WindowCapture.Helpers
  5{
  6    /// <summary>
  7    /// SymSpell: Symmetric Delete spelling correction.
  8    /// Precomputes delete-combinations for O(1) fuzzy lookup.
  9    /// Uses word frequency (index order) to prefer common words.
 10    /// </summary>
 11    public class SymSpell
 12    {
 13        private readonly int maxDist;
 14        private string[] words;
 15        private HashSet<string> wordSet;
 16        private Dictionary<int, List<int>> deletesMap;
 17        private int freqCutoff; // words below this index are "common" (preferred)
 18        private volatile bool ready;
 19
 20        public bool IsReady { get { return ready; } }
 21        public int WordCount { get { return words != null ? words.Length : 0; } }
 22
 23        public SymSpell(int maxEditDistance)
 24        {
 25            maxDist = maxEditDistance;
 26        }
 27
 28        public bool Contains(string word)
 29        {
 30            return wordSet != null && wordSet.Contains(word);
 31        }
 32
 33        /// <summary>Set an external Bloom filter for "is word correct?" checks (1.5M words in 3MB).</summary>
 34        public BloomFilter ExternalBloom;
 35
 36        private bool IsCorrectWord(string lower)
 37        {
 38            if (ExternalBloom != null && ExternalBloom.MayContain(lower)) return true;
 39            return wordSet != null && wordSet.Contains(lower);
 40        }
 41
 42        /// <summary>Build index. Dictionary should be ordered by frequency (most common first).</summary>
 43        public void Build(string[] dictionary)
 44        {
 45            var sw = System.Diagnostics.Stopwatch.StartNew();
 46            words = dictionary;
 47            wordSet = new HashSet<string>(StringComparer.Ordinal);
 48            freqCutoff = Math.Min(words.Length, 80000); // top 80k = "common"
 49
 50            var delMap = new Dictionary<int, List<int>>();
 51
 52            for (int idx = 0; idx < words.Length; idx++)
 53            {
 54                string w = words[idx];
 55                wordSet.Add(w);
 56
 57                if (w.Length < 3 || w.Length > 25) continue;
 58
 59                // Generate distance-1 deletes for ALL words
 60                for (int i = 0; i < w.Length; i++)
 61                {
 62                    string del = w.Remove(i, 1);
 63                    AddToMap(delMap, del, idx);
 64                }
 65
 66                // Distance-2 deletes for top 40k words (good coverage, ~15MB extra RAM)
 67                if (idx < 40000 && w.Length >= 4 && w.Length <= 15)
 68                {
 69                    for (int i = 0; i < w.Length; i++)
 70                    {
 71                        string d1 = w.Remove(i, 1);
 72                        for (int j = 0; j < d1.Length; j++)
 73                            AddToMap(delMap, d1.Remove(j, 1), idx);
 74                    }
 75                }
 76            }
 77
 78            deletesMap = delMap;
 79            sw.Stop();
 80            Logger.Log("textproc", "SymSpell built: " + words.Length + " words, " +
 81                deletesMap.Count + " deletes, " + sw.ElapsedMilliseconds + "ms, freq<" + freqCutoff);
 82            ready = true;
 83        }
 84
 85        public class Candidate
 86        {
 87            public string Word;
 88            public int Distance;
 89            public int FreqIdx;
 90            public int ContextScore; // set by TextProcessor
 91        }
 92
 93        /// <summary>Return top N correction candidates sorted by distance then frequency.</summary>
 94        public List<Candidate> LookupTop(string input, int topN, out bool isCorrect)
 95        {
 96            isCorrect = false;
 97            var result = new List<Candidate>();
 98            if (!ready || input == null || input.Length < 2) return result;
 99            string lower = input.ToLower();
100            if (IsCorrectWord(lower)) { isCorrect = true; return result; }
101
102            var candidateIdxs = new HashSet<int>();
103            AddCandidates(lower, candidateIdxs);
104            for (int i = 0; i < lower.Length; i++)
105            {
106                string d1 = lower.Remove(i, 1);
107                AddCandidates(d1, candidateIdxs);
108                for (int j = 0; j < d1.Length; j++)
109                    AddCandidates(d1.Remove(j, 1), candidateIdxs);
110            }
111
112            foreach (int idx in candidateIdxs)
113            {
114                if (idx < 0 || idx >= words.Length) continue;
115                string c = words[idx];
116                if (Math.Abs(c.Length - lower.Length) > maxDist) continue;
117                int dist = DamerauLevenshtein(lower, c);
118                if (dist <= 0 || dist > maxDist) continue;
119                if (dist == 2 && c.Length > 0 && lower.Length > 0 && c[0] != lower[0]) continue;
120                // All words in SymSpell are top-80k, no need for freq filter
121                result.Add(new Candidate { Word = c, Distance = dist, FreqIdx = idx });
122            }
123
124            // Sort: distance ASC, then freq ASC
125            result.Sort(delegate(Candidate a, Candidate b)
126            {
127                int cmp = a.Distance.CompareTo(b.Distance);
128                if (cmp != 0) return cmp;
129                return a.FreqIdx.CompareTo(b.FreqIdx);
130            });
131
132            if (result.Count > topN) result.RemoveRange(topN, result.Count - topN);
133            return result;
134        }
135
136        /// <summary>Find best correction. Returns null if word is correct or no match found.</summary>
137        public string Lookup(string input, out int editDist)
138        {
139            editDist = 0;
140            if (!ready || input == null || input.Length < 2) return null;
141            string lower = input.ToLower();
142            if (IsCorrectWord(lower)) return null; // correct word
143
144            var candidates = new HashSet<int>();
145
146            // 1. Input as-is might be a delete of some dict word (handles insertions)
147            AddCandidates(lower, candidates);
148
149            // 2. All distance-1 deletes from input
150            for (int i = 0; i < lower.Length; i++)
151            {
152                string d1 = lower.Remove(i, 1);
153                AddCandidates(d1, candidates);
154
155                // 3. Distance-2 deletes from input (for effective dist up to 2)
156                for (int j = 0; j < d1.Length; j++)
157                {
158                    string d2 = d1.Remove(j, 1);
159                    AddCandidates(d2, candidates);
160                }
161            }
162
163            if (candidates.Count == 0) return null;
164
165            Logger.Log("textproc", "  SymSpell candidates: " + candidates.Count);
166
167            // Score candidates: lower distance wins, then frequency (lower index = more common)
168            string best = null;
169            int bestDist = maxDist + 1;
170            int bestIdx = int.MaxValue;
171            bool bestIsCommon = false;
172
173            foreach (int idx in candidates)
174            {
175                if (idx < 0 || idx >= words.Length) continue;
176                string candidate = words[idx];
177                if (Math.Abs(candidate.Length - lower.Length) > maxDist) continue;
178
179                int dist = DamerauLevenshtein(lower, candidate);
180                if (dist <= 0 || dist > maxDist) continue;
181
182                bool isCommon = idx < freqCutoff;
183
184                // Dist=2: first char must match
185                if (dist == 2 && candidate.Length > 0 && lower.Length > 0 && candidate[0] != lower[0])
186                    continue;
187
188                // Reject very rare words as corrections (idx > 200k = too obscure)
189                if (idx > 200000) continue;
190
191                // Better distance always wins
192                if (dist < bestDist)
193                {
194                    bestDist = dist; best = candidate; bestIdx = idx; bestIsCommon = isCommon;
195                }
196                // Same distance: prefer common word, then closer index (more frequent)
197                else if (dist == bestDist)
198                {
199                    if (isCommon && !bestIsCommon)
200                    { best = candidate; bestIdx = idx; bestIsCommon = true; }
201                    else if (isCommon == bestIsCommon && idx < bestIdx)
202                    { best = candidate; bestIdx = idx; }
203                }
204            }
205
206            editDist = bestDist;
207            return best;
208        }
209
210        private static void AddToMap(Dictionary<int, List<int>> map, string del, int idx)
211        {
212            int hash = del.GetHashCode();
213            List<int> bucket;
214            if (!map.TryGetValue(hash, out bucket))
215            {
216                bucket = new List<int>(2);
217                map[hash] = bucket;
218            }
219            if (bucket.Count < 100)
220                bucket.Add(idx);
221        }
222
223        private void AddCandidates(string key, HashSet<int> candidates)
224        {
225            int hash = key.GetHashCode();
226            List<int> bucket;
227            if (deletesMap.TryGetValue(hash, out bucket))
228            {
229                foreach (int idx in bucket)
230                    candidates.Add(idx);
231            }
232        }
233
234        // Damerau-Levenshtein: handles transpositions too
235        private static int DamerauLevenshtein(string a, string b)
236        {
237            int n = a.Length, m = b.Length;
238            if (n == 0) return m;
239            if (m == 0) return n;
240
241            int[] prev2 = new int[m + 1];
242            int[] prev = new int[m + 1];
243            int[] curr = new int[m + 1];
244            for (int j = 0; j <= m; j++) prev[j] = j;
245
246            for (int i = 1; i <= n; i++)
247            {
248                curr[0] = i;
249                for (int j = 1; j <= m; j++)
250                {
251                    int cost = (a[i - 1] == b[j - 1]) ? 0 : 1;
252                    curr[j] = Math.Min(Math.Min(curr[j - 1] + 1, prev[j] + 1), prev[j - 1] + cost);
253                    if (i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1])
254                        curr[j] = Math.Min(curr[j], prev2[j - 2] + cost);
255                }
256                var tmp = prev2; prev2 = prev; prev = curr; curr = tmp;
257            }
258            return prev[m];
259        }
260    }
261}