using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace WindowCapture.Helpers
{
    /// <summary>
    /// BigramLM: word-level bigram language model.
    /// Stores, for each vocabulary word, how often each other word follows it,
    /// i.e. an estimate of P(word2 | word1). Used to pick the right candidate
    /// when several candidates have the same edit distance.
    ///
    /// Examples (Russian):
    ///   "на карте" ("on the map") scores high, "на катере" ("on the boat") scores low;
    ///   "двойными буквами" ("with double letters") scores high,
    ///   "двойными буями" ("with double buoys") scores low.
    ///
    /// Lookups lowercase the query with the invariant culture and compare ordinally.
    /// NOTE(review): this presumes the vocabulary file is stored lowercased —
    /// confirm against the tool that generates the model file.
    /// </summary>
    public class BigramLM
    {
        // Upper bound of the score returned for an explicit bigram hit.
        const int MaxBigramScore = 1000;
        // Upper bound of the score returned by the unigram fallback.
        const int MaxUnigramScore = 100;
        // Unigram counts are divided by this before being capped.
        const int UnigramScoreDivisor = 100;

        Dictionary<string, int> wordIdx;
        string[] words;
        int wordCount;
        int[] unigramCounts;
        // For each word: flattened list of (followerIdx, count) pairs:
        // [wordIdx][pair_idx*2+0] = followerIdx, [wordIdx][pair_idx*2+1] = count
        int[][] bigramFollowers;

        volatile bool ready;

        /// <summary>True once a model has been loaded successfully.</summary>
        public bool IsReady { get { return ready; } }

        /// <summary>
        /// Loads the model from a binary file read with <see cref="BinaryReader"/>.
        /// Layout: int32 word count, then per word: int32 UTF-8 byte length, the
        /// UTF-8 bytes, int32 unigram count, int32 follower count, and that many
        /// (int32 follower index, int32 count) pairs.
        /// </summary>
        /// <param name="path">Path of the model file.</param>
        /// <returns>
        /// True on success. On any failure the error is logged, false is returned
        /// and a previously loaded model (if any) is left untouched.
        /// </returns>
        public bool Load(string path)
        {
            try
            {
                int loadedCount;
                string[] loadedWords;
                Dictionary<string, int> loadedIdx;
                int[] loadedUnigrams;
                int[][] loadedFollowers;

                // Read-only access so that read-only model files can be loaded;
                // the two-argument FileStream constructor would request read/write.
                using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
                using (var br = new BinaryReader(stream))
                {
                    loadedCount = br.ReadInt32();
                    if (loadedCount < 0)
                    {
                        throw new InvalidDataException("negative word count: " + loadedCount);
                    }

                    loadedWords = new string[loadedCount];
                    loadedIdx = new Dictionary<string, int>(loadedCount, StringComparer.Ordinal);
                    loadedUnigrams = new int[loadedCount];
                    loadedFollowers = new int[loadedCount][];

                    for (int i = 0; i < loadedCount; i++)
                    {
                        int len = br.ReadInt32();
                        if (len < 0)
                        {
                            throw new InvalidDataException("negative word length at entry " + i);
                        }

                        // ReadBytes returns a shorter array when the stream ends early.
                        byte[] wb = br.ReadBytes(len);
                        if (wb.Length != len)
                        {
                            throw new EndOfStreamException("truncated word at entry " + i);
                        }

                        string word = Encoding.UTF8.GetString(wb);
                        loadedWords[i] = word;
                        // If a word occurs twice in the file, the last entry wins.
                        loadedIdx[word] = i;

                        loadedUnigrams[i] = br.ReadInt32();

                        int nFollowers = br.ReadInt32();
                        if (nFollowers < 0 || nFollowers > int.MaxValue / 2)
                        {
                            throw new InvalidDataException("invalid follower count at entry " + i);
                        }

                        int[] pairs = new int[nFollowers * 2];
                        for (int j = 0; j < nFollowers; j++)
                        {
                            pairs[j * 2] = br.ReadInt32();     // follower idx
                            pairs[j * 2 + 1] = br.ReadInt32(); // count
                        }
                        loadedFollowers[i] = pairs;
                    }
                }

                // Publish only after the whole file parsed, so a failed (re)load
                // never leaves half-filled tables behind a set 'ready' flag.
                wordCount = loadedCount;
                words = loadedWords;
                wordIdx = loadedIdx;
                unigramCounts = loadedUnigrams;
                bigramFollowers = loadedFollowers;
                ready = true;

                Logger.Log("textproc", "BigramLM loaded: " + wordCount + " words");
                return true;
            }
            catch (Exception ex)
            {
                Logger.Log("textproc", "BigramLM load err: " + ex.Message);
                return false;
            }
        }

        /// <summary>
        /// Scores how likely <paramref name="candidate"/> is to follow
        /// <paramref name="prevWord"/>. Higher means a more likely pair.
        /// </summary>
        /// <param name="prevWord">The preceding word (any casing).</param>
        /// <param name="candidate">The word being evaluated (any casing).</param>
        /// <returns>
        /// 0 if the model is not loaded, an argument is null, or either word is
        /// not in the vocabulary. Otherwise the stored bigram count capped at 1000,
        /// or, when the pair has no explicit bigram entry, the candidate's unigram
        /// count divided by 100 and capped at 100.
        /// </returns>
        public int Score(string prevWord, string candidate)
        {
            if (!ready || prevWord == null || candidate == null)
            {
                return 0;
            }

            // Invariant lowercasing: the dictionary compares ordinally, so a
            // culture-sensitive ToLower() (e.g. Turkish dotless i) would miss keys.
            string pw = prevWord.ToLowerInvariant();
            string cw = candidate.ToLowerInvariant();

            int pi;
            if (!wordIdx.TryGetValue(pw, out pi))
            {
                return 0;
            }

            int ci;
            if (!wordIdx.TryGetValue(cw, out ci))
            {
                return 0;
            }

            // Linear scan over the flattened (followerIdx, count) pairs.
            int[] followers = bigramFollowers[pi];
            for (int j = 0; j < followers.Length; j += 2)
            {
                if (followers[j] == ci)
                {
                    return Math.Min(MaxBigramScore, followers[j + 1]);
                }
            }

            // No explicit bigram: fall back to how common the candidate is by itself.
            return Math.Min(MaxUnigramScore, unigramCounts[ci] / UnigramScoreDivisor);
        }

        /// <summary>Checks whether a word is in the LM vocabulary.</summary>
        /// <param name="word">Word to look up (any casing); null yields false.</param>
        /// <returns>True if a model is loaded and contains the lowercased word.</returns>
        public bool HasWord(string word)
        {
            if (word == null || wordIdx == null)
            {
                return false;
            }
            return wordIdx.ContainsKey(word.ToLowerInvariant());
        }
    }
}