windowcapture
исходный код / Helpers/BigramLM.cs

BigramLM.cs

103 строк · 3,807 байт · модуль Helpers
  1using System;
  2using System.Collections.Generic;
  3using System.IO;
  4using System.Text;
  5
  6namespace WindowCapture.Helpers
  7{
  8    /// <summary>
  9    /// BigramLM: Word-level bigram language model.
 10    /// Knows P(word2 | word1) — which words commonly follow which.
 11    /// Used to pick the RIGHT candidate when multiple have same edit distance.
 12    ///
 13    /// "на карте" → high score, "на катере" → low score
 14    /// "двойными буквами" → high, "двойными буями" → low
 15    /// </summary>
 16    public class BigramLM
 17    {
 18        Dictionary<string, int> wordIdx;
 19        string[] words;
 20        int wordCount;
 21        int[] unigramCounts;
 22        // For each word: list of (followerIdx, count) pairs
 23        int[][] bigramFollowers; // [wordIdx][pair_idx*2+0]=followerIdx, [pair_idx*2+1]=count
 24
 25        volatile bool ready;
 26        public bool IsReady { get { return ready; } }
 27
 28        public bool Load(string path)
 29        {
 30            try
 31            {
 32                using (var br = new BinaryReader(new FileStream(path, FileMode.Open)))
 33                {
 34                    wordCount = br.ReadInt32();
 35                    words = new string[wordCount];
 36                    wordIdx = new Dictionary<string, int>(wordCount, StringComparer.Ordinal);
 37                    unigramCounts = new int[wordCount];
 38                    bigramFollowers = new int[wordCount][];
 39
 40                    for (int i = 0; i < wordCount; i++)
 41                    {
 42                        int len = br.ReadInt32();
 43                        byte[] wb = br.ReadBytes(len);
 44                        words[i] = Encoding.UTF8.GetString(wb);
 45                        wordIdx[words[i]] = i;
 46
 47                        unigramCounts[i] = br.ReadInt32();
 48
 49                        int nFollowers = br.ReadInt32();
 50                        bigramFollowers[i] = new int[nFollowers * 2];
 51                        for (int j = 0; j < nFollowers; j++)
 52                        {
 53                            bigramFollowers[i][j * 2] = br.ReadInt32();     // follower idx
 54                            bigramFollowers[i][j * 2 + 1] = br.ReadInt32(); // count
 55                        }
 56                    }
 57                }
 58                ready = true;
 59                Logger.Log("textproc", "BigramLM loaded: " + wordCount + " words");
 60                return true;
 61            }
 62            catch (Exception ex)
 63            {
 64                Logger.Log("textproc", "BigramLM load err: " + ex.Message);
 65                return false;
 66            }
 67        }
 68
 69        /// <summary>
 70        /// Score how likely word2 is to follow word1.
 71        /// Returns 0-1000. Higher = more likely pair.
 72        /// </summary>
 73        public int Score(string prevWord, string candidate)
 74        {
 75            if (!ready || prevWord == null || candidate == null) return 0;
 76            string pw = prevWord.ToLower();
 77            string cw = candidate.ToLower();
 78
 79            int pi;
 80            if (!wordIdx.TryGetValue(pw, out pi)) return 0;
 81
 82            int ci;
 83            if (!wordIdx.TryGetValue(cw, out ci)) return 0;
 84
 85            // Search in bigram followers
 86            int[] followers = bigramFollowers[pi];
 87            for (int j = 0; j < followers.Length; j += 2)
 88            {
 89                if (followers[j] == ci)
 90                    return Math.Min(1000, followers[j + 1]);
 91            }
 92
 93            // Not in explicit bigrams — check if candidate is a common word (unigram fallback)
 94            if (ci < wordCount)
 95                return Math.Min(100, unigramCounts[ci] / 100);
 96
 97            return 0;
 98        }
 99
100        /// <summary>Check if word is in the LM vocabulary.</summary>
101        public bool HasWord(string word) { return wordIdx != null && wordIdx.ContainsKey(word.ToLower()); }
102    }
103}