using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;

namespace WindowCapture.Helpers
{
    /// <summary>
    /// MorphAnalyzer: Russian morphological analyzer based on the OpenCorpora dictionary.
    /// Loads a compact binary dictionary (morph.bin) with 5M+ word forms and their POS tags
    /// into an in-memory hash table and answers part-of-speech queries with a single lookup.
    /// </summary>
    /// <remarks>
    /// Binary layout consumed by <see cref="Load"/>: a 4-byte entry count (read with
    /// BitConverter, i.e. in the platform's byte order), followed by entries of the form
    /// [1-byte UTF-8 length][UTF-8 word bytes][1-byte POS tag].
    /// </remarks>
    public static class MorphAnalyzer
    {
        // POS tag constants (matching the OpenCorpora tagset). 0 is reserved for "unknown".
        public const byte NOUN = 1;   // noun
        public const byte ADJF = 2;   // adjective (full form)
        public const byte ADJS = 3;   // adjective (short form)
        public const byte COMP = 4;   // comparative
        public const byte VERB = 5;   // verb (finite form)
        public const byte INFN = 6;   // verb (infinitive)
        public const byte PRTF = 7;   // participle (full form)
        public const byte PRTS = 8;   // participle (short form)
        public const byte GRND = 9;   // gerund (adverbial participle)
        public const byte NUMR = 10;  // numeral
        public const byte ADVB = 11;  // adverb
        public const byte NPRO = 12;  // pronoun
        public const byte PRED = 13;  // predicative
        public const byte PREP = 14;  // preposition
        public const byte CONJ = 15;  // conjunction
        public const byte PRCL = 16;  // particle
        public const byte INTJ = 17;  // interjection

        // Size in bytes of the entry-count header at the start of the dictionary file.
        private const int HeaderSize = 4;

        // Smallest possible entry: a length byte (for an empty word) plus a POS byte.
        private const int MinEntrySize = 2;

        private static Dictionary<string, byte> dict;
        private static volatile bool ready;

        /// <summary>True once a dictionary has been loaded successfully.</summary>
        public static bool IsReady { get { return ready; } }

        /// <summary>Number of distinct word forms in the loaded dictionary, or 0 if none is loaded.</summary>
        public static int WordCount
        {
            get
            {
                // Snapshot the field so the null check and the Count read use the same instance.
                Dictionary<string, byte> table = dict;
                return table != null ? table.Count : 0;
            }
        }

        /// <summary>Load morphology dictionary from binary or gzipped file.</summary>
        /// <param name="path">
        /// Path to the dictionary. A path ending in ".gz" (any letter case) is decompressed first.
        /// </param>
        /// <remarks>
        /// Never throws: every failure is logged and leaves the previously loaded dictionary
        /// (if any) untouched. A file that ends before the announced number of entries is
        /// loaded as far as it goes and a warning is logged.
        /// </remarks>
        public static void Load(string path)
        {
            try
            {
                var sw = System.Diagnostics.Stopwatch.StartNew();
                byte[] raw = ReadDictionaryBytes(path);

                if (raw.Length < HeaderSize)
                {
                    throw new InvalidDataException("file is too short to contain the entry count header");
                }

                int count = BitConverter.ToInt32(raw, 0);
                if (count < 0)
                {
                    throw new InvalidDataException("negative entry count in header: " + count);
                }

                // A corrupt header must not drive a huge allocation: the file cannot hold
                // more entries than its payload size divided by the minimum entry size.
                int capacity = Math.Min(count, (raw.Length - HeaderSize) / MinEntrySize);

                // Build into a local so readers never observe a half-filled table and a
                // failed reload does not clobber the dictionary that is already in use.
                var loaded = new Dictionary<string, byte>(capacity, StringComparer.Ordinal);

                int offset = HeaderSize;
                int parsed = 0;
                while (parsed < count && offset < raw.Length)
                {
                    int wordLen = raw[offset++];

                    // The entry needs wordLen bytes of text plus one POS byte.
                    if (offset + wordLen + 1 > raw.Length)
                    {
                        break;
                    }

                    string word = System.Text.Encoding.UTF8.GetString(raw, offset, wordLen);
                    offset += wordLen;

                    // Later duplicates overwrite earlier ones.
                    loaded[word] = raw[offset++];
                    parsed++;
                }

                if (parsed < count)
                {
                    Logger.Log("textproc", "MorphAnalyzer: dictionary truncated, parsed " + parsed + " of " + count + " entries");
                }

                // Publish the finished table first, then raise the volatile flag.
                dict = loaded;

                sw.Stop();
                Logger.Log("textproc", "MorphAnalyzer loaded: " + loaded.Count + " words in " + sw.ElapsedMilliseconds + "ms");
                ready = true;
            }
            catch (Exception ex)
            {
                Logger.Log("textproc", "MorphAnalyzer load error: " + ex.Message);
            }
        }

        /// <summary>Read the whole dictionary file into memory, decompressing ".gz" files.</summary>
        private static byte[] ReadDictionaryBytes(string path)
        {
            // Ordinal comparison: a file extension is not linguistic text.
            if (!path.EndsWith(".gz", StringComparison.OrdinalIgnoreCase))
            {
                return File.ReadAllBytes(path);
            }

            using (var fs = File.OpenRead(path))
            using (var gz = new GZipStream(fs, CompressionMode.Decompress))
            using (var ms = new MemoryStream())
            {
                gz.CopyTo(ms);
                return ms.ToArray();
            }
        }

        /// <summary>Get POS tag for a word. Returns 0 if unknown.</summary>
        /// <param name="word">Word form to look up; it is lower-cased before the lookup.</param>
        /// <returns>
        /// One of the POS constants, or 0 when the dictionary is not loaded, the word is
        /// null, or the word is not in the dictionary.
        /// </returns>
        public static byte GetPOS(string word)
        {
            if (!ready || word == null)
            {
                return 0;
            }

            Dictionary<string, byte> table = dict;
            if (table == null)
            {
                return 0;
            }

            // Invariant lower-casing keeps the key independent of the current thread culture.
            byte pos;
            return table.TryGetValue(word.ToLowerInvariant(), out pos) ? pos : (byte)0;
        }

        /// <summary>
        /// Check if word is a verb: finite form, infinitive or gerund.
        /// Participles are reported by <see cref="IsParticiple"/> instead.
        /// </summary>
        public static bool IsVerb(string word)
        {
            byte pos = GetPOS(word);
            return pos == VERB || pos == INFN || pos == GRND;
        }

        /// <summary>Check if word is a noun.</summary>
        public static bool IsNoun(string word)
        {
            return GetPOS(word) == NOUN;
        }

        /// <summary>Check if word is an adjective (full form, short form or comparative).</summary>
        public static bool IsAdjective(string word)
        {
            byte pos = GetPOS(word);
            return pos == ADJF || pos == ADJS || pos == COMP;
        }

        /// <summary>Check if word is a participle (full or short form).</summary>
        public static bool IsParticiple(string word)
        {
            byte pos = GetPOS(word);
            return pos == PRTF || pos == PRTS;
        }

        /// <summary>Check if word is a gerund (adverbial participle).</summary>
        public static bool IsGerund(string word)
        {
            return GetPOS(word) == GRND;
        }

        /// <summary>Check if word is a preposition.</summary>
        public static bool IsPreposition(string word)
        {
            return GetPOS(word) == PREP;
        }

        /// <summary>Check if word is a conjunction.</summary>
        public static bool IsConjunction(string word)
        {
            return GetPOS(word) == CONJ;
        }

        /// <summary>Check if word is a pronoun.</summary>
        public static bool IsPronoun(string word)
        {
            return GetPOS(word) == NPRO;
        }

        /// <summary>Check if word is an adverb.</summary>
        public static bool IsAdverb(string word)
        {
            return GetPOS(word) == ADVB;
        }

        /// <summary>Check if word is a particle.</summary>
        public static bool IsParticle(string word)
        {
            return GetPOS(word) == PRCL;
        }

        /// <summary>Get human-readable POS name.</summary>
        /// <param name="pos">POS tag value, normally one of the constants of this class.</param>
        /// <returns>The tag name (e.g. "NOUN"), or "?" for any other value.</returns>
        public static string GetPOSName(byte pos)
        {
            switch (pos)
            {
                case NOUN: return "NOUN";
                case ADJF: return "ADJF";
                case ADJS: return "ADJS";
                case COMP: return "COMP";
                case VERB: return "VERB";
                case INFN: return "INFN";
                case PRTF: return "PRTF";
                case PRTS: return "PRTS";
                case GRND: return "GRND";
                case NUMR: return "NUMR";
                case ADVB: return "ADVB";
                case NPRO: return "NPRO";
                case PRED: return "PRED";
                case PREP: return "PREP";
                case CONJ: return "CONJ";
                case PRCL: return "PRCL";
                case INTJ: return "INTJ";
                default: return "?";
            }
        }
    }
}