windowcapture
исходный код / Helpers/SpellScore.cs

SpellScore.cs

118 строк · 6,135 байт · модуль Helpers
  1using System;
  2using System.Collections.Generic;
  3
  4namespace WindowCapture.Helpers
  5{
  6    /// <summary>
  7    /// Noisy-channel scoring helpers for spell correction (Stage 0 of the smarter-autocorrect plan).
  8    ///
  9    /// The idea: a typo→word substitution is not equally likely for every char pair. ё/е, unstressed
 10    /// vowel confusions (о/а, и/е), consonant voicing (б/п, з/с …) and keyboard-adjacent slips are
 11    /// CHEAP (very natural typos); random substitutions are expensive. This lets the ranker prefer
 12    /// the phonetically/ergonomically plausible candidate, and — combined with word frequency — lets a
 13    /// frequent real word (e.g. "компьютер") beat a rare/garbage near-neighbour ("компутер") even when
 14    /// the latter is one edit closer. Replaces the misranking bag-of-chars CharEmbedNet.
 15    /// </summary>
 16    public static class SpellScore
 17    {
 18        // ЙЦУКЕН horizontal neighbours (most common adjacency slips).
 19        private static readonly HashSet<long> kbdAdj = new HashSet<long>();
 20        static SpellScore()
 21        {
 22            string[] rows = { "йцукенгшщзхъ", "фывапролджэ", "ячсмитьбю" };
 23            foreach (var r in rows)
 24                for (int i = 0; i + 1 < r.Length; i++)
 25                {
 26                    AddPair(r[i], r[i + 1]);
 27                }
 28        }
 29        private static long Key(char a, char b) { return a < b ? ((long)a << 16) | b : ((long)b << 16) | a; }
 30        private static void AddPair(char a, char b) { kbdAdj.Add(Key(a, b)); }
 31        private static bool KeyboardAdjacent(char a, char b) { return kbdAdj.Contains(Key(a, b)); }
 32
 33        // Phonetic / typo-plausibility classes (unordered pairs).
 34        private static bool In(char a, char b, char x, char y) { return (a == x && b == y) || (a == y && b == x); }
 35
 36        /// <summary>Bonus for a single-char substitution typed→cand: higher = more natural typo.</summary>
 37        public static int SubBonus(char a, char b)
 38        {
 39            if (a == b) return 0;
 40            // ё/е are the same key for most typists; о↔ё (пошол/пошёл) almost as common.
 41            if (In(a, b, 'е', 'ё')) return 1800;
 42            if (In(a, b, 'о', 'ё')) return 1500;
 43            // Unstressed-vowel reduction (аканье/иканье) — the #1 class of Russian misspellings.
 44            if (In(a, b, 'о', 'а') || In(a, b, 'и', 'е') || In(a, b, 'е', 'я') ||
 45                In(a, b, 'а', 'я') || In(a, b, 'у', 'о') || In(a, b, 'ы', 'и')) return 1200;
 46            // Consonant voicing pairs.
 47            if (In(a, b, 'б', 'п') || In(a, b, 'в', 'ф') || In(a, b, 'г', 'к') ||
 48                In(a, b, 'д', 'т') || In(a, b, 'з', 'с') || In(a, b, 'ж', 'ш') ||
 49                In(a, b, 'с', 'ц') || In(a, b, 'ч', 'щ')) return 1000;
 50            // Physical keyboard slip.
 51            if (KeyboardAdjacent(a, b)) return 800;
 52            return 0;
 53        }
 54
 55        /// <summary>
 56        /// Total plausibility bonus for turning `typed` into `cand` (both lowercase). Looks at the
 57        /// kind of edits, not just their count: natural substitutions, a doubled consonant, or a
 58        /// dropped soft-sign/vowel score well; arbitrary edits score 0.
 59        /// </summary>
 60        public static int EditPlausibility(string typed, string cand)
 61        {
 62            if (typed == null || cand == null) return 0;
 63            int lt = typed.Length, lc = cand.Length;
 64
 65            if (lt == lc)
 66            {
 67                int bonus = 0, diffs = 0;
 68                for (int i = 0; i < lt; i++)
 69                    if (typed[i] != cand[i]) { bonus += SubBonus(typed[i], cand[i]); diffs++; }
 70                // Adjacent transposition (e.g. "пошло"→"пошёл"-style ordering slips) is also natural.
 71                if (diffs == 2 && bonus == 0)
 72                {
 73                    for (int i = 0; i + 1 < lt; i++)
 74                        if (typed[i] == cand[i + 1] && typed[i + 1] == cand[i] && typed[i] != typed[i + 1])
 75                            return 900;
 76                }
 77                return bonus;
 78            }
 79
 80            if (Math.Abs(lt - lc) == 1)
 81            {
 82                // Single insert/delete: classify the added/removed char.
 83                string lng = lt > lc ? typed : cand;
 84                string sht = lt > lc ? cand : typed;
 85                int p = 0;
 86                while (p < sht.Length && sht[p] == lng[p]) p++;
 87                char ch = lng[p];
 88                // Doubled consonant (программа/грамотный) — extremely common.
 89                if (p > 0 && lng[p] == lng[p - 1]) return 1400;
 90                if (p + 1 < lng.Length && lng[p] == lng[p + 1]) return 1400;
 91                // Dropped/added soft or hard sign, or a vowel (frequent omissions).
 92                if (ch == 'ь' || ch == 'ъ' || ch == 'й') return 1100;
 93                if ("аеёиоуыэюя".IndexOf(ch) >= 0) return 900;
 94                return 300; // some insert/delete is still plausible
 95            }
 96
 97            return 0;
 98        }
 99
100        // ===== Combined noisy-channel score for a candidate =====
101        // Tuned against the word-level battery (Tools/TestSpellCheck). Higher = better.
102        public const int DistancePenalty = 2600; // each extra edit costs this; freq+plausibility can overcome it
103        public const int FirstCharBonus = 3000;  // first char rarely mistyped
104        public const int LastCharBonus = 1800;    // Russian endings matter
105        public const int SameLenBonus = 900;
106        public const int TrustedBonus = 2500;
107        public const int FreqMaxBonus = 9000;     // most-frequent word gets ~this
108
109        /// <summary>Frequency bonus from a 0-based frequency rank (lower rank = more frequent).</summary>
110        public static int FreqBonus(int freqIdx)
111        {
112            if (freqIdx < 0) return 0;
113            // Log-shaped: rank 0 ~ FreqMaxBonus, decays; rank 80k ~ 0.
114            double f = 1.0 - Math.Log(1 + Math.Min(freqIdx, 80000)) / Math.Log(80001);
115            return (int)(f * FreqMaxBonus);
116        }
117    }
118}