using System;
using System.Collections.Generic;

namespace WindowCapture.Helpers
{
    /// <summary>
    /// Noisy-channel scoring helpers for spell correction (Stage 0 of the smarter-autocorrect plan).
    ///
    /// Not every typed→intended character substitution is equally likely. ё/е, unstressed-vowel
    /// confusions (о/а, и/е), consonant voicing pairs (б/п, з/с …) and slips onto a neighbouring
    /// key are treated as cheap, natural typos; any other substitution earns nothing. Combined
    /// with word frequency this lets a frequent real word (e.g. "компьютер") outrank a rare or
    /// garbage near-neighbour ("компутер") even when the latter is one edit closer.
    /// Replaces the misranking bag-of-chars CharEmbedNet.
    /// </summary>
    public static class SpellScore
    {
        // Bonus tiers for a single-character substitution, most natural first.
        private const int YoYeBonus = 1800;          // ё/е
        private const int YoOBonus = 1500;           // о/ё (пошол/пошёл)
        private const int VowelReductionBonus = 1200;
        private const int VoicingBonus = 1000;
        private const int KeyboardSlipBonus = 800;

        // Bonuses for edits other than a substitution.
        private const int TranspositionBonus = 900;
        private const int DoubledCharBonus = 1400;
        private const int SignOrShortIBonus = 1100;
        private const int VowelIndelBonus = 900;
        private const int OtherIndelBonus = 300;

        // Unordered character pair -> substitution bonus. Filled once by the static constructor.
        private static readonly Dictionary<long, int> substitutionBonus = new Dictionary<long, int>();

        static SpellScore()
        {
            // Registration order is priority order: the first bonus registered for a pair is kept.
            RegisterPairs(YoYeBonus, "её");
            RegisterPairs(YoOBonus, "оё");
            // Unstressed-vowel reduction (аканье/иканье).
            RegisterPairs(VowelReductionBonus, "оа", "ие", "ея", "ая", "уо", "ыи");
            // Consonant voicing pairs.
            RegisterPairs(VoicingBonus, "бп", "вф", "гк", "дт", "зс", "жш", "сц", "чщ");

            // ЙЦУКЕН layout: only horizontally neighbouring keys within a row count as a slip.
            string[] keyboardRows = { "йцукенгшщзхъ", "фывапролджэ", "ячсмитьбю" };
            foreach (string row in keyboardRows)
            {
                for (int i = 1; i < row.Length; i++)
                {
                    Register(row[i - 1], row[i], KeyboardSlipBonus);
                }
            }
        }

        /// <summary>Packs an unordered pair of chars into one key (smaller char in the high bits).</summary>
        private static long PairKey(char a, char b)
        {
            char low = a;
            char high = b;
            if (b <= a)
            {
                low = b;
                high = a;
            }
            return ((long)low << 16) | high;
        }

        /// <summary>Stores <paramref name="bonus"/> for the pair unless the pair already has one.</summary>
        private static void Register(char a, char b, int bonus)
        {
            long key = PairKey(a, b);
            if (!substitutionBonus.ContainsKey(key))
            {
                substitutionBonus.Add(key, bonus);
            }
        }

        /// <summary>Registers each two-character string in <paramref name="pairs"/> as an unordered pair.</summary>
        private static void RegisterPairs(int bonus, params string[] pairs)
        {
            foreach (string pair in pairs)
            {
                Register(pair[0], pair[1], bonus);
            }
        }

        /// <summary>
        /// Bonus for a single-character substitution between the typed and the candidate char;
        /// a higher value means a more natural typo. The pair is unordered.
        /// </summary>
        /// <param name="a">One character of the pair (expected lowercase).</param>
        /// <param name="b">The other character of the pair (expected lowercase).</param>
        /// <returns>
        /// 1800 for ё/е, 1500 for о/ё, 1200 for a vowel-reduction pair, 1000 for a voicing pair,
        /// 800 for horizontally adjacent ЙЦУКЕН keys, and 0 for identical chars or any other pair.
        /// </returns>
        public static int SubBonus(char a, char b)
        {
            if (a == b)
            {
                return 0;
            }

            int bonus;
            if (substitutionBonus.TryGetValue(PairKey(a, b), out bonus))
            {
                return bonus;
            }
            return 0;
        }

        /// <summary>
        /// Total plausibility bonus for turning <paramref name="typed"/> into
        /// <paramref name="cand"/> (both lowercase). It rates the kind of edit rather than the
        /// edit count: natural substitutions, an adjacent swap, a doubled character, or a
        /// dropped/added sign or vowel score well.
        /// </summary>
        /// <param name="typed">The word as typed; null yields 0.</param>
        /// <param name="cand">The correction candidate; null yields 0.</param>
        /// <returns>
        /// For equal lengths, the sum of <see cref="SubBonus"/> over differing positions, or 900
        /// for a lone adjacent transposition; for a length difference of one, a bonus chosen by
        /// the character at the first mismatch; otherwise 0.
        /// </returns>
        public static int EditPlausibility(string typed, string cand)
        {
            if (typed == null || cand == null)
            {
                return 0;
            }

            int lengthDelta = typed.Length - cand.Length;
            if (lengthDelta == 0)
            {
                return SameLengthBonus(typed, cand);
            }
            if (lengthDelta == 1 || lengthDelta == -1)
            {
                return SingleIndelBonus(typed, cand);
            }
            return 0;
        }

        /// <summary>Scores two equal-length words: summed substitution bonuses, or a transposition.</summary>
        private static int SameLengthBonus(string typed, string cand)
        {
            int total = 0;
            int mismatches = 0;
            for (int i = 0; i < typed.Length; i++)
            {
                char t = typed[i];
                char c = cand[i];
                if (t == c)
                {
                    continue;
                }
                mismatches++;
                total += SubBonus(t, c);
            }

            // The transposition check only runs when exactly two positions differ and neither
            // earned a substitution bonus.
            if (mismatches != 2 || total != 0)
            {
                return total;
            }

            // Swapped neighbouring characters (e.g. "тсол"→"стол") are a natural slip as well.
            for (int i = 1; i < typed.Length; i++)
            {
                char first = typed[i - 1];
                char second = typed[i];
                if (first != second && first == cand[i] && second == cand[i - 1])
                {
                    return TranspositionBonus;
                }
            }
            return total;
        }

        /// <summary>Scores two words whose lengths differ by one, by the char at the first mismatch.</summary>
        private static int SingleIndelBonus(string typed, string cand)
        {
            bool typedIsLonger = typed.Length > cand.Length;
            string longer = typedIsLonger ? typed : cand;
            string shorter = typedIsLonger ? cand : typed;

            // Skip the common prefix; the index always stays inside the longer string.
            int at = 0;
            while (at < shorter.Length && shorter[at] == longer[at])
            {
                at++;
            }
            char extra = longer[at];
            // NOTE(review): the tails after this position are never compared, so a pair that is
            // more than one insert/delete apart is still scored by this character — confirm
            // that callers rely on this.

            // A character repeating its neighbour (программа/грамотный). The test applies to any
            // repeated character, not only consonants.
            bool repeatsPrevious = at > 0 && longer[at - 1] == extra;
            bool repeatsNext = at + 1 < longer.Length && longer[at + 1] == extra;
            if (repeatsPrevious || repeatsNext)
            {
                return DoubledCharBonus;
            }

            // Dropped/added soft sign, hard sign or й.
            switch (extra)
            {
                case 'ь':
                case 'ъ':
                case 'й':
                    return SignOrShortIBonus;
            }

            // Dropped/added vowel.
            if ("аеёиоуыэюя".IndexOf(extra) >= 0)
            {
                return VowelIndelBonus;
            }

            // Any other insert/delete is still somewhat plausible.
            return OtherIndelBonus;
        }

        // ===== Combined noisy-channel score for a candidate =====
        // Tuned against the word-level battery (Tools/TestSpellCheck). Higher = better.
        public const int DistancePenalty = 2600; // each extra edit costs this; freq+plausibility can overcome it
        public const int FirstCharBonus = 3000;  // first char rarely mistyped
        public const int LastCharBonus = 1800;   // Russian endings matter
        public const int SameLenBonus = 900;
        public const int TrustedBonus = 2500;
        public const int FreqMaxBonus = 9000;    // the most frequent word (rank 0) gets exactly this

        /// <summary>Frequency bonus from a 0-based frequency rank (lower rank = more frequent).</summary>
        /// <param name="freqIdx">0-based rank; a negative value yields 0.</param>
        /// <returns>
        /// <see cref="FreqMaxBonus"/> at rank 0, decaying logarithmically to 0 at rank 80000 and beyond.
        /// </returns>
        public static int FreqBonus(int freqIdx)
        {
            if (freqIdx < 0)
            {
                return 0;
            }

            // Ranks past 80000 are clamped, so they all share the minimum bonus.
            int cappedRank = freqIdx > 80000 ? 80000 : freqIdx;
            double decay = Math.Log(1 + cappedRank) / Math.Log(80001);
            double share = 1.0 - decay;
            return (int)(share * FreqMaxBonus);
        }
    }
}