using System;
using System.Collections.Generic;

namespace WindowCapture.Helpers
{
    /// <summary>
    /// CharNN: character-level error model for spelling correction.
    /// Combines keyboard geometry, character-bigram statistics, error-pattern heuristics,
    /// word frequency and previous-word context into one weighted score of how likely
    /// a given correction is the intended word.
    /// Pure C# — no ML libraries needed.
    /// </summary>
    public static class CharNN
    {
        // Keyboard coordinates: lower-case char → { row, column }.
        private static Dictionary<char, float[]> keyPosRu;
        private static Dictionary<char, float[]> keyPosEn;

        // Character bigram log-probabilities (trained from a dictionary or loaded from file).
        // Index: CharIdx(prev) * ALPHA_SIZE + CharIdx(next).
        // NOTE(review): CharIdx only produces 33 distinct indices (а-я = 0..31, ё = 32), so one
        // row/column of the table is never addressed. The size is kept at 34 because LoadWeights
        // expects exactly this layout in the weights file.
        private const int ALPHA_SIZE = 34;

        // volatile: the table is replaced as a whole by BuildModel/LoadWeights (possibly from a
        // background thread) while scoring methods read it.
        private static volatile float[] bigramLogProb;
        private static volatile bool modelReady;

        /// <summary>True once a bigram table has been built or loaded.</summary>
        public static bool IsReady { get { return modelReady; } }

        // Weights of the individual score components (they sum to 1.0).
        private const float W_KEYBOARD = 0.30f;
        private const float W_BIGRAM = 0.20f;
        private const float W_PATTERN = 0.25f;
        private const float W_FREQ = 0.15f;
        private const float W_CONTEXT = 0.10f;

        static CharNN()
        {
            InitKeyboard();
        }

        /// <summary>Fills the Russian (ЙЦУКЕН) and English (QWERTY) key-position tables.</summary>
        private static void InitKeyboard()
        {
            // ЙЦУКЕН layout
            keyPosRu = BuildLayout(new string[] {
                "йцукенгшщзхъ",
                "фывапролджэ",
                "ячсмитьбю"
            });
            // 'ё' is placed on the top row, one column to the left of 'й'.
            keyPosRu['ё'] = new float[] { 0, -1f };

            // QWERTY layout
            keyPosEn = BuildLayout(new string[] {
                "qwertyuiop",
                "asdfghjkl",
                "zxcvbnm"
            });
        }

        /// <summary>
        /// Maps every character of the given keyboard rows to { row, column }.
        /// Each row is shifted right by 0.25 per row index to model the staggered key rows.
        /// </summary>
        private static Dictionary<char, float[]> BuildLayout(string[] rows)
        {
            var layout = new Dictionary<char, float[]>();
            for (int r = 0; r < rows.Length; r++)
                for (int c = 0; c < rows[r].Length; c++)
                    layout[rows[r][c]] = new float[] { r, c + r * 0.25f };
            return layout;
        }

        /// <summary>
        /// Train the bigram model from dictionary words. Call on a background thread.
        /// Only the first 200000 entries are used. A null dictionary is logged and ignored;
        /// null or empty entries are skipped.
        /// </summary>
        /// <param name="dictionary">Words to count character bigrams from.</param>
        public static void BuildModel(string[] dictionary)
        {
            if (dictionary == null)
            {
                Logger.Log("textproc", "CharNN build skipped: dictionary is null");
                return;
            }

            var sw = System.Diagnostics.Stopwatch.StartNew();

            // Count bigrams and single characters.
            long[] counts = new long[ALPHA_SIZE * ALPHA_SIZE];
            long[] uniCounts = new long[ALPHA_SIZE];

            int maxWords = Math.Min(dictionary.Length, 200000); // first 200k entries, for speed
            for (int w = 0; w < maxWords; w++)
            {
                string word = dictionary[w];
                if (string.IsNullOrEmpty(word)) continue;

                for (int i = 0; i < word.Length; i++)
                {
                    int ci = CharIdx(word[i]);
                    if (ci < 0) continue; // not a Cyrillic letter
                    uniCounts[ci]++;
                    if (i > 0)
                    {
                        int pi = CharIdx(word[i - 1]);
                        if (pi >= 0) counts[pi * ALPHA_SIZE + ci]++;
                    }
                }
            }

            // Convert to log-probabilities with Laplace (+1) smoothing.
            // NOTE(review): the denominator is the total occurrence count of the first character
            // (including word-final occurrences), not the number of bigrams starting with it.
            // Kept as in the original so that scores stay numerically unchanged.
            // The table is filled in a local array and only published when complete, so a
            // concurrent scorer never reads a half-built table.
            float[] table = new float[ALPHA_SIZE * ALPHA_SIZE];
            for (int i = 0; i < ALPHA_SIZE; i++)
            {
                long total = uniCounts[i] + ALPHA_SIZE;
                for (int j = 0; j < ALPHA_SIZE; j++)
                {
                    long count = counts[i * ALPHA_SIZE + j] + 1;
                    table[i * ALPHA_SIZE + j] = (float)Math.Log((double)count / total);
                }
            }

            bigramLogProb = table;

            sw.Stop();
            Logger.Log("textproc", "CharNN built: bigram model from " + maxWords + " words in " + sw.ElapsedMilliseconds + "ms");
            modelReady = true;
        }

        /// <summary>
        /// Load a pre-trained bigram table from a binary file (ALPHA_SIZE * ALPHA_SIZE raw
        /// 4-byte floats; extra trailing bytes are ignored).
        /// Failures (I/O error, file too short) are logged and leave any existing model untouched.
        /// </summary>
        /// <param name="filePath">Path of the weights file.</param>
        public static void LoadWeights(string filePath)
        {
            try
            {
                byte[] raw = System.IO.File.ReadAllBytes(filePath);
                int expected = ALPHA_SIZE * ALPHA_SIZE * sizeof(float); // 34*34*4 = 4624 bytes
                if (raw.Length < expected)
                {
                    Logger.Log("textproc", "CharNN load err: weights file too short (" + raw.Length + " bytes, expected " + expected + ")");
                    return;
                }

                // Copy into a local array first so that the live table is only replaced by a
                // fully populated one.
                float[] table = new float[ALPHA_SIZE * ALPHA_SIZE];
                Buffer.BlockCopy(raw, 0, table, 0, expected);
                bigramLogProb = table;
                modelReady = true;
                Logger.Log("textproc", "CharNN loaded weights: " + raw.Length + " bytes");
            }
            catch (Exception ex) { Logger.Log("textproc", "CharNN load err: " + ex.Message); }
        }

        /// <summary>
        /// Neural score: how likely is it that 'input' was meant to be 'candidate'?
        /// Higher = more likely. Range roughly 0-1.
        /// Returns 0 when no model is ready or when input/candidate is null.
        /// </summary>
        /// <param name="input">The word as typed.</param>
        /// <param name="candidate">The proposed correction.</param>
        /// <param name="freqIdx">Frequency rank of the candidate (see FreqScore).</param>
        /// <param name="freqCutoff">Rank threshold for "common" words (see FreqScore).</param>
        /// <param name="prevWord">Word preceding the input; may be null or empty.</param>
        public static float NeuralScore(string input, string candidate, int freqIdx, int freqCutoff, string prevWord)
        {
            if (!modelReady) return 0f;
            if (input == null || candidate == null) return 0f;

            float kbScore = KeyboardScore(input, candidate);
            float bgScore = BigramScore(candidate);
            float ptScore = ErrorPatternScore(input, candidate);
            float frScore = FreqScore(freqIdx, freqCutoff);
            float ctScore = ContextScore(candidate, prevWord);

            return W_KEYBOARD * kbScore + W_BIGRAM * bgScore + W_PATTERN * ptScore + W_FREQ * frScore + W_CONTEXT * ctScore;
        }

        // ===== Component 1: Keyboard proximity for substitution errors =====

        /// <summary>
        /// Average keyboard closeness (0..1) of the characters that differ between input and
        /// candidate, compared position by position over the shorter length.
        /// </summary>
        private static float KeyboardScore(string input, string candidate)
        {
            // Simple positional alignment (no full DP, for speed).
            int minLen = Math.Min(input.Length, candidate.Length);
            float totalProx = 0f;
            int subs = 0;

            for (int i = 0; i < minLen; i++)
            {
                if (input[i] != candidate[i])
                {
                    float dist = KeyDist(input[i], candidate[i]);
                    // Close keys → high score (likely fat-finger); 3+ key widths apart → 0.
                    totalProx += Math.Max(0, 1f - dist / 3f);
                    subs++;
                }
            }

            // No differing position within the compared prefix (this includes the case where
            // one word is a prefix of the other) → perfect score.
            if (subs == 0) return 1f;
            return totalProx / subs;
        }

        /// <summary>
        /// Euclidean distance between two keys on the Russian layout, or, if either character
        /// is not on it, on the English layout. Returns 5 when neither layout has both keys.
        /// </summary>
        private static float KeyDist(char a, char b)
        {
            // Invariant lower-casing: the layout tables hold invariant lower-case letters, and
            // culture-sensitive casing (e.g. Turkish 'I' → 'ı') would miss them.
            a = char.ToLowerInvariant(a);
            b = char.ToLowerInvariant(b);
            float[] pa, pb;

            // Try Russian layout
            if (keyPosRu.TryGetValue(a, out pa) && keyPosRu.TryGetValue(b, out pb))
                return Euclid(pa, pb);

            // Try English layout
            if (keyPosEn.TryGetValue(a, out pa) && keyPosEn.TryGetValue(b, out pb))
                return Euclid(pa, pb);

            return 5f; // unknown = far
        }

        /// <summary>Euclidean distance between two { row, column } points.</summary>
        private static float Euclid(float[] p, float[] q)
        {
            float dRow = p[0] - q[0];
            float dCol = p[1] - q[1];
            return (float)Math.Sqrt(dRow * dRow + dCol * dCol);
        }

        // ===== Component 2: Bigram naturalness of the candidate word =====

        /// <summary>
        /// Maps the average bigram log-probability of the word's Cyrillic letter pairs to 0..1.
        /// Returns the neutral 0.5 when the word has no scorable pair or no table is available.
        /// </summary>
        private static float BigramScore(string word)
        {
            // Read the table reference once so the whole word is scored against one table.
            float[] table = bigramLogProb;
            if (table == null || word.Length < 2) return 0.5f;

            float sum = 0;
            int count = 0;
            for (int i = 1; i < word.Length; i++)
            {
                int pi = CharIdx(word[i - 1]);
                int ci = CharIdx(word[i]);
                if (pi >= 0 && ci >= 0)
                {
                    sum += table[pi * ALPHA_SIZE + ci];
                    count++;
                }
            }
            if (count == 0) return 0.5f;
            // Normalize: typical log-prob is -3 to -1. Map to 0-1.
            float avg = sum / count;
            return Sigmoid(avg + 2.5f); // shift so that avg=-2.5 → 0.5
        }

        // ===== Component 3: Error pattern detection =====

        /// <summary>
        /// Heuristic score (capped at 1) for how well the input → candidate difference matches
        /// common typing errors: duplicated letters, adjacent transpositions, small length changes.
        /// </summary>
        private static float ErrorPatternScore(string input, string candidate)
        {
            float score = 0.5f; // neutral

            // Doubled characters in input that are not in candidate (duplication error).
            int doublesInInput = CountDoubles(input);
            int doublesInCand = CountDoubles(candidate);
            if (doublesInInput > doublesInCand)
                score += 0.2f * (doublesInInput - doublesInCand);

            // Transposition: adjacent chars swapped.
            if (input.Length == candidate.Length)
            {
                int swaps = 0;
                for (int i = 0; i < input.Length - 1; i++)
                {
                    if (input[i] == candidate[i + 1] && input[i + 1] == candidate[i]
                        && input[i] != candidate[i])
                    {
                        swaps++;
                        i++; // skip the second char of the swapped pair
                    }
                }
                if (swaps > 0) score += 0.3f; // transposition is a very common error
            }

            // Length analysis
            int lenDiff = candidate.Length - input.Length;
            if (lenDiff == 0) score += 0.15f;  // same length = substitution (most common typo)
            if (lenDiff == 1) score += 0.1f;   // candidate 1 char longer = user missed a key
            if (lenDiff == -1) score += 0.05f; // candidate 1 char shorter = user typed extra key (less common)
            if (lenDiff < -1) score -= 0.1f;   // candidate much shorter = suspicious

            return Math.Min(1f, score);
        }

        /// <summary>Counts positions where a character equals the one right before it.</summary>
        private static int CountDoubles(string s)
        {
            int count = 0;
            for (int i = 1; i < s.Length; i++)
                if (s[i] == s[i - 1]) count++;
            return count;
        }

        // ===== Component 4: Word frequency =====

        /// <summary>
        /// Buckets a frequency rank into a score: below cutoff/10 → 1.0, below cutoff → 0.7,
        /// below 5*cutoff → 0.3, otherwise 0.1.
        /// </summary>
        private static float FreqScore(int freqIdx, int freqCutoff)
        {
            // NOTE(review): a negative freqIdx (e.g. a "not found" sentinel) falls into the first
            // bucket and scores 1.0 — confirm callers never pass one.
            if (freqIdx < freqCutoff / 10) return 1.0f; // top tenth of the cutoff = very common
            if (freqIdx < freqCutoff) return 0.7f;      // within the cutoff = common
            if (freqIdx < freqCutoff * 5) return 0.3f;  // within 5x the cutoff = known
            return 0.1f;                                // rare
        }

        // ===== Component 5: Morphological context from previous word =====
        private static readonly HashSet<string> preps = new HashSet<string> {
            "в","на","по","к","с","у","за","от","из","до","для","без","при","через","под","над","перед","про"
        };

        /// <summary>
        /// Scores (0..1, neutral 0.5) how well the candidate's ending fits the previous word:
        /// oblique case endings after prepositions, verb endings after "не"/"ни", non-verb
        /// endings after adjective-like or possessive words.
        /// </summary>
        private static float ContextScore(string candidate, string prevWord)
        {
            if (string.IsNullOrEmpty(prevWord) || candidate.Length < 3) return 0.5f;
            string prev = prevWord.ToLowerInvariant();
            float score = 0.5f;

            // After preposition → prefer oblique noun cases, penalize infinitives
            if (preps.Contains(prev))
            {
                if (EndsWith(candidate, "ом", "ем", "ой", "ам", "ях", "ую", "ым", "ей", "ов", "ах", "ие", "ию"))
                    score += 0.3f;
                if (EndsWith(candidate, "ть", "ться")) score -= 0.2f;
            }

            // After "не"/"ни" → prefer verbs
            if (prev == "не" || prev == "ни")
            {
                if (EndsWith(candidate, "ть", "ет", "ит", "ал", "ла", "ся", "ли", "ют", "ат", "ешь", "ишь"))
                    score += 0.3f;
            }

            // After adjective → prefer nouns
            if (EndsWith(prev, "ый", "ий", "ой", "ая", "яя", "ое", "ее", "ые", "ие"))
            {
                if (!EndsWith(candidate, "ть", "ет", "ит", "ся"))
                    score += 0.15f;
            }

            // After possessive → prefer nouns
            if (prev == "мой" || prev == "твой" || prev == "наш" || prev == "ваш" ||
                prev == "это" || prev == "его" || prev == "её" || prev == "их")
            {
                if (!EndsWith(candidate, "ть", "ся")) score += 0.2f;
            }

            return Math.Min(1f, Math.Max(0f, score));
        }

        // ===== Component 6: Sentence coherence =====

        /// <summary>
        /// How well does this candidate fit the sentence so far?
        /// Uses the average Jaccard similarity of character trigrams between the candidate and
        /// each sentence word (scaled by 3), plus a bonus when one of the last three words is a
        /// preposition and the candidate has a matching case ending. Capped at 1.
        /// Returns the neutral 0.5 when no model is ready, when candidate or sentenceWords is
        /// null/empty, or when nothing could be compared.
        /// </summary>
        /// <param name="candidate">The proposed correction.</param>
        /// <param name="sentenceWords">Words of the sentence so far; null entries are skipped.</param>
        public static float SentenceCoherence(string candidate, string[] sentenceWords)
        {
            if (!modelReady || candidate == null || sentenceWords == null || sentenceWords.Length == 0) return 0.5f;

            float totalScore = 0;
            int count = 0;

            var candTrigrams = GetTrigrams(candidate);
            if (candTrigrams.Count == 0) return 0.5f;

            foreach (string sw in sentenceWords)
            {
                if (sw == null || sw.Length < 3) continue;
                var swTrigrams = GetTrigrams(sw);
                if (swTrigrams.Count == 0) continue;

                // Jaccard similarity of trigram sets
                int intersection = 0;
                foreach (string t in candTrigrams)
                    if (swTrigrams.Contains(t)) intersection++;

                int union = candTrigrams.Count + swTrigrams.Count - intersection;
                if (union > 0)
                {
                    totalScore += (float)intersection / union;
                    count++;
                }
            }

            if (count == 0) return 0.5f;

            // Morphological agreement: look at (up to) the last three words, nearest first;
            // only the nearest preposition is considered.
            float morphBonus = 0;
            for (int i = sentenceWords.Length - 1; i >= Math.Max(0, sentenceWords.Length - 3); i--)
            {
                if (sentenceWords[i] == null) continue;
                string w = sentenceWords[i].ToLowerInvariant();
                // "в моменте" — preposition "в" + prepositional case "-е"/"-и"
                if (preps.Contains(w))
                {
                    if (EndsWith(candidate, "е", "и", "у", "ю", "ом", "ем", "ой", "ам", "ей", "ах", "ях"))
                        morphBonus += 0.3f;
                    break;
                }
            }

            float avgSim = totalScore / count;
            // Scale: 0 overlap = 0, some overlap = up to 1
            return Math.Min(1f, avgSim * 3f + morphBonus);
        }

        /// <summary>Returns the set of distinct lower-cased 3-character substrings of the word.</summary>
        private static HashSet<string> GetTrigrams(string word)
        {
            var result = new HashSet<string>();
            string w = word.ToLowerInvariant();
            for (int i = 0; i <= w.Length - 3; i++)
                result.Add(w.Substring(i, 3));
            return result;
        }

        // ===== Helpers =====

        /// <summary>Index of a Cyrillic letter: а-я → 0..31, ё → 32, anything else → -1.</summary>
        private static int CharIdx(char c)
        {
            c = char.ToLowerInvariant(c);
            if (c >= 'а' && c <= 'я') return c - 'а'; // 0-31
            if (c == 'ё') return 32;
            return -1;
        }

        /// <summary>Logistic function 1 / (1 + e^-x).</summary>
        private static float Sigmoid(float x)
        {
            return 1f / (1f + (float)Math.Exp(-x));
        }

        /// <summary>True if the word ends with any of the given suffixes (ordinal comparison).</summary>
        private static bool EndsWith(string word, params string[] suffixes)
        {
            foreach (var s in suffixes)
                if (word.Length >= s.Length && word.EndsWith(s, StringComparison.Ordinal))
                    return true;
            return false;
        }
    }
}