windowcapture
исходный код / Tools/BuildData.cs

BuildData.cs

108 строк · 4,064 байт · модуль Tools
// BuildData: generates dict_ru.bloom, dict_en.bloom and charnn.bin from the dictionary text files in the Data directory
// Compile: csc /out:BuildData.exe BuildData.cs ..\Helpers\BloomFilter.cs
// Run: BuildData.exe
  4
  5using System;
  6using System.IO;
  7using System.Text;
  8using System.Collections.Generic;
  9using WindowCapture.Helpers;
 10
 11class BuildData
 12{
 13    static void Main()
 14    {
 15        string dataDir = Path.Combine(Path.GetDirectoryName(
 16            System.Reflection.Assembly.GetExecutingAssembly().Location), "..", "Data");
 17        if (!Directory.Exists(dataDir))
 18            dataDir = Path.Combine(Environment.CurrentDirectory, "Data");
 19
 20        Console.WriteLine("Data dir: " + dataDir);
 21
 22        // Build RU bloom
 23        string ruPath = Path.Combine(dataDir, "dict_ru.txt");
 24        if (File.Exists(ruPath))
 25        {
 26            Console.Write("Building RU bloom... ");
 27            var lines = File.ReadAllLines(ruPath);
 28            var bloom = new BloomFilter(lines.Length, 16, 3);
 29            int count = 0;
 30            foreach (var l in lines)
 31            {
 32                string w = l.Trim().ToLower();
 33                if (w.Length >= 2) { bloom.Add(w); count++; }
 34            }
 35            bloom.Save(Path.Combine(dataDir, "dict_ru.bloom"));
 36            Console.WriteLine(count + " words, " + (bloom.BitCount / 8 / 1024) + "KB");
 37
 38            // Test
 39            Console.WriteLine("  Test 'привет': " + bloom.MayContain("привет"));
 40            Console.WriteLine("  Test 'прввет': " + bloom.MayContain("прввет"));
 41            Console.WriteLine("  Test 'пожалуйста': " + bloom.MayContain("пожалуйста"));
 42        }
 43
 44        // Build EN bloom
 45        string enPath = Path.Combine(dataDir, "dict_en.txt");
 46        if (File.Exists(enPath))
 47        {
 48            Console.Write("Building EN bloom... ");
 49            var lines = File.ReadAllLines(enPath);
 50            var bloom = new BloomFilter(lines.Length, 16, 3);
 51            int count = 0;
 52            foreach (var l in lines)
 53            {
 54                string w = l.Trim().ToLower();
 55                if (w.Length >= 2) { bloom.Add(w); count++; }
 56            }
 57            bloom.Save(Path.Combine(dataDir, "dict_en.bloom"));
 58            Console.WriteLine(count + " words, " + (bloom.BitCount / 8 / 1024) + "KB");
 59        }
 60
 61        // Build charnn.bin (bigram matrix from top 80k RU words)
 62        if (File.Exists(ruPath))
 63        {
 64            Console.Write("Building charnn.bin... ");
 65            int ALPHA = 34;
 66            long[] counts = new long[ALPHA * ALPHA];
 67            long[] uni = new long[ALPHA];
 68            var lines = File.ReadAllLines(ruPath);
 69            int maxW = Math.Min(lines.Length, 80000);
 70            for (int w = 0; w < maxW; w++)
 71            {
 72                string word = lines[w].Trim().ToLower();
 73                for (int i = 0; i < word.Length; i++)
 74                {
 75                    int ci = CharIdx(word[i]);
 76                    if (ci < 0) continue;
 77                    uni[ci]++;
 78                    if (i > 0) { int pi = CharIdx(word[i - 1]); if (pi >= 0) counts[pi * ALPHA + ci]++; }
 79                }
 80            }
 81            float[] logProb = new float[ALPHA * ALPHA];
 82            for (int i = 0; i < ALPHA; i++)
 83            {
 84                long total = uni[i] + ALPHA;
 85                for (int j = 0; j < ALPHA; j++)
 86                    logProb[i * ALPHA + j] = (float)Math.Log((double)(counts[i * ALPHA + j] + 1) / total);
 87            }
 88
 89            string binPath = Path.Combine(dataDir, "charnn.bin");
 90            using (var fs = new FileStream(binPath, FileMode.Create))
 91            using (var bw = new BinaryWriter(fs))
 92            {
 93                for (int i = 0; i < logProb.Length; i++) bw.Write(logProb[i]);
 94            }
 95            Console.WriteLine(logProb.Length * 4 + " bytes");
 96        }
 97
 98        Console.WriteLine("Done!");
 99    }
100
101    static int CharIdx(char c)
102    {
103        c = char.ToLower(c);
104        if (c >= '\u0430' && c <= '\u044F') return c - '\u0430';
105        if (c == '\u0451') return 32; // ё
106        return -1;
107    }
108}