// BuildData: generates .bloom files and charnn.bin from dictionary text files
// Compile: csc /out:BuildData.exe BuildData.cs ..\Helpers\BloomFilter.cs
// Run: BuildData.exe

using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
using WindowCapture.Helpers;

/// <summary>
/// Offline build tool. Reads dict_ru.txt / dict_en.txt from the Data directory,
/// writes a Bloom filter file for each dictionary that exists, and derives a
/// character-bigram log-probability table (charnn.bin) from the Russian dictionary.
/// </summary>
class BuildData
{
    // Side length of the bigram table. CharIdx only yields indices 0..32, so the
    // last row/column is never counted; the value is kept at 34 because it fixes
    // the size of charnn.bin (34 * 34 floats).
    // NOTE(review): the reader of charnn.bin is not visible here - confirm before changing.
    const int Alpha = 34;

    // Only the first 80000 dictionary lines feed the bigram statistics.
    // NOTE(review): presumably the dictionary is ordered by frequency ("top 80k") - confirm.
    const int MaxBigramWords = 80000;

    // Words shorter than this (after trimming) are not added to a Bloom filter.
    const int MinWordLength = 2;

    /// <summary>
    /// Entry point: locates the Data directory, then builds dict_ru.bloom,
    /// dict_en.bloom and charnn.bin for whichever source dictionaries exist.
    /// </summary>
    static void Main()
    {
        // Prefer ..\Data relative to the executable; fall back to .\Data under the
        // current working directory when that folder does not exist.
        string dataDir = Path.Combine(Path.GetDirectoryName(
            System.Reflection.Assembly.GetExecutingAssembly().Location), "..", "Data");
        if (!Directory.Exists(dataDir))
        {
            dataDir = Path.Combine(Environment.CurrentDirectory, "Data");
        }

        Console.WriteLine("Data dir: " + dataDir);

        // Build RU bloom. The lines are kept so the bigram step below does not
        // have to read the same file a second time.
        string ruPath = Path.Combine(dataDir, "dict_ru.txt");
        string[] ruLines = null;
        if (File.Exists(ruPath))
        {
            Console.Write("Building RU bloom... ");
            ruLines = File.ReadAllLines(ruPath);
            BloomFilter ruBloom = BuildBloom(ruLines, Path.Combine(dataDir, "dict_ru.bloom"));

            // Smoke test: two real words and one misspelling ('прввет').
            Console.WriteLine(" Test 'привет': " + ruBloom.MayContain("привет"));
            Console.WriteLine(" Test 'прввет': " + ruBloom.MayContain("прввет"));
            Console.WriteLine(" Test 'пожалуйста': " + ruBloom.MayContain("пожалуйста"));
        }

        // Build EN bloom
        string enPath = Path.Combine(dataDir, "dict_en.txt");
        if (File.Exists(enPath))
        {
            Console.Write("Building EN bloom... ");
            BuildBloom(File.ReadAllLines(enPath), Path.Combine(dataDir, "dict_en.bloom"));
        }

        // Build charnn.bin (bigram matrix from the first 80k RU lines)
        if (ruLines != null)
        {
            Console.Write("Building charnn.bin... ");
            float[] logProb = BuildBigramLogProbs(ruLines);

            // The table is written as consecutive 4-byte floats, row by row.
            string binPath = Path.Combine(dataDir, "charnn.bin");
            using (var fs = new FileStream(binPath, FileMode.Create))
            using (var bw = new BinaryWriter(fs))
            {
                for (int i = 0; i < logProb.Length; i++)
                {
                    bw.Write(logProb[i]);
                }
            }
            Console.WriteLine(logProb.Length * 4 + " bytes");
        }

        Console.WriteLine("Done!");
    }

    /// <summary>
    /// Fills a Bloom filter with the normalized words from <paramref name="lines"/>,
    /// saves it to <paramref name="outPath"/> and prints the word count and filter size.
    /// </summary>
    /// <param name="lines">Raw dictionary lines, one word per line.</param>
    /// <param name="outPath">Destination path of the .bloom file.</param>
    /// <returns>The populated filter, so the caller can run lookups against it.</returns>
    static BloomFilter BuildBloom(string[] lines, string outPath)
    {
        // NOTE(review): 16 and 3 are presumably bits-per-item and hash count -
        // confirm against the BloomFilter constructor.
        var bloom = new BloomFilter(lines.Length, 16, 3);
        int count = 0;
        foreach (string line in lines)
        {
            // Invariant lower-casing keeps the generated file identical on every
            // build machine; culture-sensitive ToLower() maps 'I' to dotless 'ı'
            // under Turkish/Azeri locales.
            string word = line.Trim().ToLowerInvariant();
            if (word.Length >= MinWordLength)
            {
                bloom.Add(word);
                count++;
            }
        }

        bloom.Save(outPath);
        Console.WriteLine(count + " words, " + (bloom.BitCount / 8 / 1024) + "KB");
        return bloom;
    }

    /// <summary>
    /// Computes an Alpha x Alpha table of smoothed log-probabilities for adjacent
    /// Russian letters, using at most the first <see cref="MaxBigramWords"/> lines.
    /// </summary>
    /// <param name="lines">Raw dictionary lines, one word per line.</param>
    /// <returns>
    /// Table where entry [prev * Alpha + cur] is
    /// ln((bigram count + 1) / (occurrences of prev + Alpha)).
    /// </returns>
    static float[] BuildBigramLogProbs(string[] lines)
    {
        long[] pairCounts = new long[Alpha * Alpha];
        long[] letterCounts = new long[Alpha];

        int wordLimit = Math.Min(lines.Length, MaxBigramWords);
        for (int w = 0; w < wordLimit; w++)
        {
            string word = lines[w].Trim().ToLowerInvariant();

            // Index of the preceding character, or -1 at word start or after a
            // character outside the alphabet (such a character breaks the pair).
            int prev = -1;
            for (int i = 0; i < word.Length; i++)
            {
                int cur = CharIdx(word[i]);
                if (cur >= 0)
                {
                    letterCounts[cur]++;
                    if (prev >= 0)
                    {
                        pairCounts[prev * Alpha + cur]++;
                    }
                }
                prev = cur;
            }
        }

        float[] logProb = new float[Alpha * Alpha];
        for (int i = 0; i < Alpha; i++)
        {
            // Add-one smoothing; the denominator uses every occurrence of the
            // first letter, including occurrences at the end of a word.
            long total = letterCounts[i] + Alpha;
            for (int j = 0; j < Alpha; j++)
            {
                logProb[i * Alpha + j] =
                    (float)Math.Log((double)(pairCounts[i * Alpha + j] + 1) / total);
            }
        }
        return logProb;
    }

    /// <summary>
    /// Maps a Cyrillic letter to its table index: U+0430..U+044F map to 0..31 and
    /// U+0451 (ё) maps to 32. Upper-case input is lower-cased first.
    /// </summary>
    /// <param name="c">Character to classify.</param>
    /// <returns>The index in 0..32, or -1 for any other character.</returns>
    static int CharIdx(char c)
    {
        c = char.ToLowerInvariant(c);
        if (c >= '\u0430' && c <= '\u044F')
        {
            return c - '\u0430';
        }
        if (c == '\u0451')
        {
            return 32; // ё
        }
        return -1;
    }
}