windowcapture
исходный код / Helpers/CompactSpell.cs

CompactSpell.cs

377 строк · 16,098 байт · модуль Helpers
  1using System;
  2using System.Collections.Generic;
  3using System.IO;
  4using System.Text;
  5
  6namespace WindowCapture.Helpers
  7{
  8    /// <summary>
  9    /// CompactSpell: memory-efficient spelling correction for millions of words.
 10    /// Uses sorted arrays + binary search instead of Dictionary — ~5x less RAM.
 11    ///
 12    /// Architecture:
 13    /// - words[]: all dictionary words, sorted by frequency
 14    /// - deleteHashes[]: sorted int32 hashes of all delete-1 combinations
 15    /// - deleteWordIdx[]: corresponding word indices (parallel array)
 16    /// - Binary search on deleteHashes to find candidates
 17    ///
 18    /// RAM: ~20MB for 1.5M words (vs ~100MB with Dictionary)
 19    /// Speed: O(log n) per lookup instead of O(1), but n=10M so log(n)=23 comparisons
 20    /// </summary>
 21    public class CompactSpell
 22    {
 23        private string[] words;         // frequency-ordered (for FreqIdx)
 24        private HashSet<string> wordSet;  // ALL words for lookup candidates
 25        private HashSet<string> trustedSet; // top N frequent words — "definitely correct"
 26        private int wordCount;
 27
 28        // Compact delete index: sorted parallel arrays
 29        private int[] deleteHashes;   // sorted hash values
 30        private int[] deleteWordIdxs; // word index for each hash
 31        // For duplicate hashes, they're adjacent in the sorted array
 32
 33        // Phonetic skeleton index (vowel-class collapsed) over trusted words — recall for
 34        // vowel-confusion typos (малако→молоко) the delete index can't bridge (2 substitutions).
 35        private Dictionary<string, List<int>> phoneticIdx;
 36
 37        private volatile bool ready;
 38        public bool IsReady { get { return ready; } }
 39        public int WordCount { get { return wordCount; } }
 40
 41        // Bloom filter for "is word correct?" — shared
 42        public BloomFilter Bloom;
 43
 44        public bool IsCorrect(string lower)
 45        {
 46            if (Bloom != null) return Bloom.MayContain(lower);
 47            // Fallback when the Bloom filter hasn't been attached yet (init-order race):
 48            // 'words' is frequency-ordered, NOT lexicographically sorted, so BinarySearch on it
 49            // returns garbage. Use the exact-match HashSet instead.
 50            return wordSet != null && wordSet.Contains(lower);
 51        }
 52
 53        /// <summary>Check if word is in full dictionary (1.5M).</summary>
 54        public bool ContainsExact(string lower)
 55        {
 56            return wordSet != null && wordSet.Contains(lower);
 57        }
 58
 59        // Rare short words that should NEVER win over common alternatives
 60        private static readonly HashSet<string> rareBlacklist = new HashSet<string> {
 61            "риза","ска","омон","дот","шкод","катер",
 62            "нина","сводня","кое","слом","тагор",
 63            "пресет","превед","компутер","локаут",
 64        };
 65
 66        // Real 1-2 char words in Russian (everything else is corpus garbage)
 67        private static readonly HashSet<string> realShortWords = new HashSet<string> {
 68            "а","в","и","к","о","с","у","я","аж","ай","ан","ау",
 69            "бы","вы","да","до","ей","ем","ей","её","за","из","их","ко",
 70            "ли","мы","на","не","ни","но","ну","об","он","от","по","со",
 71            "та","те","то","ту","ты","уж","уз","ум","уч","чё",
 72            "же","мне","все","вас","нас","вам","нам","ещё","его","нет",
 73            "кто","что","как","так","тут","там","вот","уже","это","она","они",
 74        };
 75
 76        /// <summary>Check if word is in trusted dictionary. Short words use whitelist.</summary>
 77        public bool ContainsTrusted(string lower)
 78        {
 79            if (lower.Length <= 2) return realShortWords.Contains(lower);
 80            return trustedSet != null && trustedSet.Contains(lower);
 81        }
 82
 83        /// <summary>Build compact index from frequency-ordered dictionary.</summary>
 84        public void Build(string[] dictionary)
 85        {
 86            var sw = System.Diagnostics.Stopwatch.StartNew();
 87            words = dictionary;
 88            wordCount = words.Length;
 89
 90            wordSet = new HashSet<string>(StringComparer.Ordinal);
 91            trustedSet = new HashSet<string>(StringComparer.Ordinal);
 92            phoneticIdx = new Dictionary<string, List<int>>(StringComparer.Ordinal);
 93            for (int i = 0; i < wordCount; i++)
 94            {
 95                wordSet.Add(words[i]);
 96                // First 87k words = frequency-ordered from Leeds corpus = trusted
 97                if (i < 87000)
 98                {
 99                    trustedSet.Add(words[i]);
100                    if (words[i].Length >= 3)
101                    {
102                        string sk = Skeleton(words[i]);
103                        List<int> pl;
104                        if (!phoneticIdx.TryGetValue(sk, out pl)) { pl = new List<int>(2); phoneticIdx[sk] = pl; }
105                        pl.Add(i);
106                    }
107                }
108            }
109            Logger.Log("textproc", "CompactSpell: wordSet=" + wordSet.Count + " trusted=" + trustedSet.Count);
110
111            // Phase 1: count total deletes to pre-allocate arrays
112            long totalDeletes = 0;
113            for (int i = 0; i < wordCount; i++)
114            {
115                int len = words[i].Length;
116                if (len >= 3 && len <= 25) totalDeletes += len;
117            }
118
119            Logger.Log("textproc", "CompactSpell: " + wordCount + " words, ~" + totalDeletes + " deletes, allocating...");
120
121            // Phase 2: generate all delete-1 hashes
122            // Use List then convert to sorted array
123            var hashes = new int[totalDeletes];
124            var idxs = new int[totalDeletes];
125            int pos = 0;
126
127            for (int wi = 0; wi < wordCount; wi++)
128            {
129                string w = words[wi];
130                if (w.Length < 3 || w.Length > 25) continue;
131
132                for (int i = 0; i < w.Length; i++)
133                {
134                    // Hash the delete string without actually creating it (saves GC pressure)
135                    int hash = HashDelete(w, i);
136                    hashes[pos] = hash;
137                    idxs[pos] = wi;
138                    pos++;
139                }
140            }
141
142            // Trim if needed
143            if (pos < totalDeletes)
144            {
145                Array.Resize(ref hashes, pos);
146                Array.Resize(ref idxs, pos);
147            }
148
149            // Phase 3: sort by hash (parallel sort of both arrays)
150            SortParallel(hashes, idxs, 0, pos - 1);
151
152            deleteHashes = hashes;
153            deleteWordIdxs = idxs;
154
155            sw.Stop();
156            Logger.Log("textproc", "CompactSpell built: " + pos + " delete entries, " +
157                (pos * 8 / 1024 / 1024) + "MB arrays, " + sw.ElapsedMilliseconds + "ms");
158            ready = true;
159        }
160
161        /// <summary>Find best correction candidates.</summary>
162        public List<SymSpell.Candidate> Lookup(string input, int topN) { return Lookup(input, topN, false); }
163
164        /// <summary>force=true generates candidates even for an in-dictionary word — used for the
165        /// frequency-override (a rare corpus word may still have a far more frequent correct neighbour).</summary>
166        public List<SymSpell.Candidate> Lookup(string input, int topN, bool force)
167        {
168            var result = new List<SymSpell.Candidate>();
169            if (!ready || input == null || input.Length < 2) return result;
170            string lower = input.ToLower();
171            if (!force && IsCorrect(lower)) return result; // already correct
172
173            var candidateIdxs = new HashSet<int>();
174
175            // Phonetic recall: trusted words sharing the vowel-collapsed skeleton (малако→молоко).
176            if (phoneticIdx != null && lower.Length >= 3)
177            {
178                List<int> pl;
179                if (phoneticIdx.TryGetValue(Skeleton(lower), out pl) && pl.Count <= 300)
180                    foreach (int idx in pl) candidateIdxs.Add(idx);
181            }
182
183            // Check input as-is (it might be a delete of some dict word → insertion error)
184            int inputHash = HashString(lower);
185            FindByHash(inputHash, candidateIdxs);
186
187            // All distance-1 deletes from input
188            for (int i = 0; i < lower.Length; i++)
189            {
190                int h = HashDelete(lower, i);
191                FindByHash(h, candidateIdxs);
192
193                // Distance-2 deletes from input
194                // Build the delete-1 string, then hash each delete-1 of that
195                string d1 = lower.Remove(i, 1);
196                for (int j = 0; j < d1.Length; j++)
197                {
198                    int h2 = HashDelete(d1, j);
199                    FindByHash(h2, candidateIdxs);
200                }
201            }
202
203            // Score candidates
204            foreach (int idx in candidateIdxs)
205            {
206                if (idx < 0 || idx >= wordCount) continue;
207                string candidate = words[idx];
208                if (Math.Abs(candidate.Length - lower.Length) > 2) continue;
209
210                int dist = DamerauLevenshtein(lower, candidate);
211                if (dist <= 0 || dist > 2) continue;
212                if (dist == 2 && candidate[0] != lower[0]) continue; // first char must match for dist=2
213
214                result.Add(new SymSpell.Candidate { Word = candidate, Distance = dist, FreqIdx = idx });
215            }
216
217            // Noisy-channel score: P(candidate) [frequency + trusted] weighted by how natural the
218            // typed->candidate edit is [ё/phonetic/keyboard/double-letter via SpellScore], with the
219            // edit DISTANCE folded into the score (not a hard primary sort key) so a frequent,
220            // plausible real word can outrank a rare/garbage word that is merely one edit closer.
221            foreach (var c in result)
222            {
223                int score = 0;
224                int minL = Math.Min(lower.Length, c.Word.Length);
225
226                if (c.Word.Length > 0 && lower.Length > 0 && c.Word[0] == lower[0]) score += SpellScore.FirstCharBonus;
227                if (c.Word.Length > 0 && lower.Length > 0 && c.Word[c.Word.Length - 1] == lower[lower.Length - 1]) score += SpellScore.LastCharBonus;
228
229                int overlap = 0;
230                for (int p = 0; p < minL; p++)
231                    if (lower[p] == c.Word[p]) overlap++;
232                score += overlap * 200;
233
234                if (c.Word.Length == lower.Length) score += SpellScore.SameLenBonus;
235
236                bool isTrusted = trustedSet != null && trustedSet.Contains(c.Word);
237                if (isTrusted) score += SpellScore.TrustedBonus;
238
239                score += SpellScore.FreqBonus(c.FreqIdx);
240
241                // How plausible is this exact edit (ё↔е, о↔а, voicing, keyboard, doubled letter…)?
242                score += SpellScore.EditPlausibility(lower, c.Word);
243
244                // Rare words that historically caused hallucinations (kept as a safety net).
245                if (rareBlacklist.Contains(c.Word)) score -= 8000;
246
247                // Fold distance in — the core fix for the distance-primary "компутер vs компьютер" bug.
248                score -= c.Distance * SpellScore.DistancePenalty;
249
250                c.ContextScore = score;
251            }
252
253            result.Sort(delegate(SymSpell.Candidate a, SymSpell.Candidate b)
254            {
255                return b.ContextScore.CompareTo(a.ContextScore); // combined noisy-channel score
256            });
257
258            if (result.Count > topN) result.RemoveRange(topN, result.Count - topN);
259            return result;
260        }
261
262        // Find all word indices matching a given hash via binary search
263        private void FindByHash(int hash, HashSet<int> results)
264        {
265            int idx = Array.BinarySearch(deleteHashes, hash);
266            if (idx < 0) return;
267
268            // Found — scan left and right for all entries with same hash
269            for (int i = idx; i >= 0 && deleteHashes[i] == hash; i--)
270                results.Add(deleteWordIdxs[i]);
271            for (int i = idx + 1; i < deleteHashes.Length && deleteHashes[i] == hash; i++)
272                results.Add(deleteWordIdxs[i]);
273        }
274
275        // Normalize: ё→е for hashing (treats them as same char)
276        private static char NormChar(char c) { return c == 'ё' ? 'е' : c; }
277
278        private static int HashDelete(string s, int skipIdx)
279        {
280            unchecked
281            {
282                int hash = 17;
283                for (int i = 0; i < s.Length; i++)
284                {
285                    if (i == skipIdx) continue;
286                    hash = hash * 31 + NormChar(s[i]);
287                }
288                return hash;
289            }
290        }
291
292        private static int HashString(string s)
293        {
294            unchecked
295            {
296                int hash = 17;
297                for (int i = 0; i < s.Length; i++)
298                    hash = hash * 31 + NormChar(s[i]);
299                return hash;
300            }
301        }
302
303        // Quicksort two parallel arrays by the first
304        private static void SortParallel(int[] keys, int[] vals, int lo, int hi)
305        {
306            if (lo >= hi) return;
307            // Use iterative approach with stack to avoid stack overflow on large arrays
308            var stack = new Stack<int>();
309            stack.Push(lo); stack.Push(hi);
310
311            while (stack.Count > 0)
312            {
313                int h = stack.Pop();
314                int l = stack.Pop();
315                if (l >= h) continue;
316
317                int pivot = keys[(l + h) / 2];
318                int i = l, j = h;
319                while (i <= j)
320                {
321                    while (keys[i] < pivot) i++;
322                    while (keys[j] > pivot) j--;
323                    if (i <= j)
324                    {
325                        int tk = keys[i]; keys[i] = keys[j]; keys[j] = tk;
326                        int tv = vals[i]; vals[i] = vals[j]; vals[j] = tv;
327                        i++; j--;
328                    }
329                }
330                if (l < j) { stack.Push(l); stack.Push(j); }
331                if (i < h) { stack.Push(i); stack.Push(h); }
332            }
333        }
334
335        private static int DamerauLevenshtein(string a, string b)
336        {
337            int n = a.Length, m = b.Length;
338            if (n == 0) return m; if (m == 0) return n;
339            int[] prev2 = new int[m + 1];
340            int[] prev = new int[m + 1];
341            int[] curr = new int[m + 1];
342            for (int j = 0; j <= m; j++) prev[j] = j;
343            for (int i = 1; i <= n; i++)
344            {
345                curr[0] = i;
346                for (int j = 1; j <= m; j++)
347                {
348                    int cost = (a[i - 1] == b[j - 1]) ? 0 : 1;
349                    curr[j] = Math.Min(Math.Min(curr[j - 1] + 1, prev[j] + 1), prev[j - 1] + cost);
350                    if (i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1])
351                        curr[j] = Math.Min(curr[j], prev2[j - 2] + cost);
352                }
353                var tmp = prev2; prev2 = prev; prev = curr; curr = tmp;
354            }
355            return prev[m];
356        }
357
358        // Vowel-class collapsed skeleton for phonetic matching (ё→е, о/а→а, е/и/я/э→и, у/ю→у, ы→и).
359        private static string Skeleton(string w)
360        {
361            var sb = new StringBuilder(w.Length);
362            foreach (char c0 in w)
363            {
364                char c = c0 == 'ё' ? 'е' : c0;
365                switch (c)
366                {
367                    case 'о': case 'а': sb.Append('а'); break;
368                    case 'е': case 'и': case 'я': case 'э': sb.Append('и'); break;
369                    case 'у': case 'ю': sb.Append('у'); break;
370                    case 'ы': sb.Append('и'); break;
371                    default: sb.Append(c); break;
372                }
373            }
374            return sb.ToString();
375        }
376    }
377}