windowcapture
исходный код / Helpers/MorphAnalyzer.cs

MorphAnalyzer.cs

186 строк · 6,923 байт · модуль Helpers
  1using System;
  2using System.Collections.Generic;
  3using System.IO;
  4using System.IO.Compression;
  5
  6namespace WindowCapture.Helpers
  7{
  8    /// <summary>
  9    /// MorphAnalyzer: Russian morphological analyzer based on OpenCorpora dictionary.
 10    /// Loads a compact binary dictionary (morph.bin) with 5M+ word forms and their POS tags.
 11    /// Provides part-of-speech tagging for any Russian word in O(1) time.
 12    /// </summary>
 13    public static class MorphAnalyzer
 14    {
 15        // POS tag constants (matching OpenCorpora tagset)
 16        public const byte NOUN = 1;   // существительное
 17        public const byte ADJF = 2;   // прилагательное (полное)
 18        public const byte ADJS = 3;   // прилагательное (краткое)
 19        public const byte COMP = 4;   // компаратив
 20        public const byte VERB = 5;   // глагол (личная форма)
 21        public const byte INFN = 6;   // глагол (инфинитив)
 22        public const byte PRTF = 7;   // причастие (полное)
 23        public const byte PRTS = 8;   // причастие (краткое)
 24        public const byte GRND = 9;   // деепричастие
 25        public const byte NUMR = 10;  // числительное
 26        public const byte ADVB = 11;  // наречие
 27        public const byte NPRO = 12;  // местоимение
 28        public const byte PRED = 13;  // предикатив
 29        public const byte PREP = 14;  // предлог
 30        public const byte CONJ = 15;  // союз
 31        public const byte PRCL = 16;  // частица
 32        public const byte INTJ = 17;  // междометие
 33
 34        private static Dictionary<string, byte> dict;
 35        private static volatile bool ready;
 36
 37        public static bool IsReady { get { return ready; } }
 38        public static int WordCount { get { return dict != null ? dict.Count : 0; } }
 39
 40        /// <summary>Load morphology dictionary from binary or gzipped file.</summary>
 41        public static void Load(string path)
 42        {
 43            try
 44            {
 45                var sw = System.Diagnostics.Stopwatch.StartNew();
 46                byte[] raw;
 47
 48                // Support .gz compressed files
 49                if (path.EndsWith(".gz"))
 50                {
 51                    using (var fs = File.OpenRead(path))
 52                    using (var gz = new GZipStream(fs, CompressionMode.Decompress))
 53                    using (var ms = new MemoryStream())
 54                    {
 55                        gz.CopyTo(ms);
 56                        raw = ms.ToArray();
 57                    }
 58                }
 59                else
 60                {
 61                    raw = File.ReadAllBytes(path);
 62                }
 63
 64                int count = BitConverter.ToInt32(raw, 0);
 65                dict = new Dictionary<string, byte>(count, StringComparer.Ordinal);
 66
 67                int offset = 4;
 68                for (int i = 0; i < count && offset < raw.Length; i++)
 69                {
 70                    int wordLen = raw[offset++];
 71                    if (offset + wordLen + 1 > raw.Length) break;
 72                    string word = System.Text.Encoding.UTF8.GetString(raw, offset, wordLen);
 73                    offset += wordLen;
 74                    byte pos = raw[offset++];
 75                    dict[word] = pos;
 76                }
 77
 78                sw.Stop();
 79                Logger.Log("textproc", "MorphAnalyzer loaded: " + dict.Count + " words in " + sw.ElapsedMilliseconds + "ms");
 80                ready = true;
 81            }
 82            catch (Exception ex)
 83            {
 84                Logger.Log("textproc", "MorphAnalyzer load error: " + ex.Message);
 85            }
 86        }
 87
 88        /// <summary>Get POS tag for a word. Returns 0 if unknown.</summary>
 89        public static byte GetPOS(string word)
 90        {
 91            if (!ready || dict == null) return 0;
 92            byte pos;
 93            return dict.TryGetValue(word.ToLower(), out pos) ? pos : (byte)0;
 94        }
 95
 96        /// <summary>Check if word is a verb (any form).</summary>
 97        public static bool IsVerb(string word)
 98        {
 99            byte pos = GetPOS(word);
100            return pos == VERB || pos == INFN || pos == GRND;
101        }
102
103        /// <summary>Check if word is a noun.</summary>
104        public static bool IsNoun(string word)
105        {
106            byte pos = GetPOS(word);
107            return pos == NOUN;
108        }
109
110        /// <summary>Check if word is an adjective.</summary>
111        public static bool IsAdjective(string word)
112        {
113            byte pos = GetPOS(word);
114            return pos == ADJF || pos == ADJS || pos == COMP;
115        }
116
117        /// <summary>Check if word is a participle (причастие).</summary>
118        public static bool IsParticiple(string word)
119        {
120            byte pos = GetPOS(word);
121            return pos == PRTF || pos == PRTS;
122        }
123
124        /// <summary>Check if word is a gerund (деепричастие).</summary>
125        public static bool IsGerund(string word)
126        {
127            return GetPOS(word) == GRND;
128        }
129
130        /// <summary>Check if word is a preposition.</summary>
131        public static bool IsPreposition(string word)
132        {
133            return GetPOS(word) == PREP;
134        }
135
136        /// <summary>Check if word is a conjunction.</summary>
137        public static bool IsConjunction(string word)
138        {
139            return GetPOS(word) == CONJ;
140        }
141
142        /// <summary>Check if word is a pronoun.</summary>
143        public static bool IsPronoun(string word)
144        {
145            return GetPOS(word) == NPRO;
146        }
147
148        /// <summary>Check if word is an adverb.</summary>
149        public static bool IsAdverb(string word)
150        {
151            return GetPOS(word) == ADVB;
152        }
153
154        /// <summary>Check if word is a particle.</summary>
155        public static bool IsParticle(string word)
156        {
157            return GetPOS(word) == PRCL;
158        }
159
160        /// <summary>Get human-readable POS name.</summary>
161        public static string GetPOSName(byte pos)
162        {
163            switch (pos)
164            {
165                case NOUN: return "NOUN";
166                case ADJF: return "ADJF";
167                case ADJS: return "ADJS";
168                case COMP: return "COMP";
169                case VERB: return "VERB";
170                case INFN: return "INFN";
171                case PRTF: return "PRTF";
172                case PRTS: return "PRTS";
173                case GRND: return "GRND";
174                case NUMR: return "NUMR";
175                case ADVB: return "ADVB";
176                case NPRO: return "NPRO";
177                case PRED: return "PRED";
178                case PREP: return "PREP";
179                case CONJ: return "CONJ";
180                case PRCL: return "PRCL";
181                case INTJ: return "INTJ";
182                default: return "?";
183            }
184        }
185    }
186}