windowcapture
исходный код / Tools/BuildMorph.py

BuildMorph.py

117 строк · 3,892 байт · модуль Tools
  1#!/usr/bin/env python3
"""
BuildMorph: Parse OpenCorpora XML → compact binary morphology dictionary.
Output: morph.bin — word → POS tag (1 byte per word)

POS tags encoded as single byte:
  1=NOUN, 2=ADJF, 3=ADJS, 4=COMP, 5=VERB, 6=INFN, 7=PRTF, 8=PRTS,
  9=GRND, 10=NUMR, 11=ADVB, 12=NPRO, 13=PRED, 14=PREP, 15=CONJ, 16=PRCL, 17=INTJ

Format: morph.bin
  [4 bytes] count of entries (unsigned 32-bit, little-endian)
  For each entry:
    [1 byte] word length in bytes (N)
    [N bytes] word (UTF-8)
    [1 byte] POS tag
"""
 17import xml.etree.ElementTree as ET
 18import struct
 19import sys
 20import os
 21
 22POS_MAP = {
 23    'NOUN': 1, 'ADJF': 2, 'ADJS': 3, 'COMP': 4,
 24    'VERB': 5, 'INFN': 6, 'PRTF': 7, 'PRTS': 8,
 25    'GRND': 9, 'NUMR': 10, 'ADVB': 11, 'NPRO': 12,
 26    'PRED': 13, 'PREP': 14, 'CONJ': 15, 'PRCL': 16, 'INTJ': 17,
 27}
 28
 29def parse_opencorpora(xml_path):
 30    """Parse OpenCorpora XML, extract word → POS tag mapping."""
 31    print(f"Parsing {xml_path}...")
 32    words = {}  # word_lower → pos_tag_byte
 33
 34    # Use iterparse for memory efficiency (400MB XML)
 35    context = ET.iterparse(xml_path, events=('end',))
 36    lemma_count = 0
 37    form_count = 0
 38
 39    current_pos = 0
 40
 41    for event, elem in context:
 42        if elem.tag == 'lemma':
 43            lemma_count += 1
 44            # Get POS from first <g> in first <l>
 45            l_elem = elem.find('l')
 46            if l_elem is not None:
 47                pos_tag = 0
 48                for g in l_elem.findall('g'):
 49                    v = g.get('v', '')
 50                    if v in POS_MAP:
 51                        pos_tag = POS_MAP[v]
 52                        break
 53
 54                if pos_tag > 0:
 55                    # Add lemma
 56                    word = l_elem.get('t', '').lower()
 57                    if word and len(word) >= 2 and len(word) <= 30:
 58                        if word not in words:
 59                            words[word] = pos_tag
 60                            form_count += 1
 61
 62                    # Add all forms
 63                    for f_elem in elem.findall('f'):
 64                        word = f_elem.get('t', '').lower()
 65                        if word and len(word) >= 2 and len(word) <= 30:
 66                            if word not in words:
 67                                words[word] = pos_tag
 68                                form_count += 1
 69
 70            elem.clear()  # Free memory
 71
 72            if lemma_count % 50000 == 0:
 73                print(f"  {lemma_count} lemmas, {form_count} forms...")
 74
 75    print(f"Done: {lemma_count} lemmas, {len(words)} unique forms")
 76    return words
 77
 78def save_binary(words, out_path):
 79    """Save as compact binary: count + (len + word_utf8 + pos_byte)*"""
 80    print(f"Saving to {out_path}...")
 81    with open(out_path, 'wb') as f:
 82        f.write(struct.pack('<I', len(words)))
 83        for word, pos in words.items():
 84            word_bytes = word.encode('utf-8')
 85            if len(word_bytes) > 255:
 86                continue
 87            f.write(struct.pack('B', len(word_bytes)))
 88            f.write(word_bytes)
 89            f.write(struct.pack('B', pos))
 90
 91    size = os.path.getsize(out_path)
 92    print(f"Saved: {len(words)} entries, {size / 1024 / 1024:.1f} MB")
 93
 94def main():
 95    xml_path = '/tmp/dict.opcorpora.xml'
 96    if not os.path.exists(xml_path):
 97        xml_path = os.path.join(os.path.dirname(__file__), '..', 'Data', 'dict.opcorpora.xml')
 98
 99    out_dir = os.path.join(os.path.dirname(__file__), '..', 'Data')
100    out_path = os.path.join(out_dir, 'morph.bin')
101
102    words = parse_opencorpora(xml_path)
103    save_binary(words, out_path)
104
105    # Stats
106    pos_counts = {}
107    for w, p in words.items():
108        pos_counts[p] = pos_counts.get(p, 0) + 1
109
110    pos_names = {v: k for k, v in POS_MAP.items()}
111    print("\nPOS distribution:")
112    for pos, count in sorted(pos_counts.items(), key=lambda x: -x[1]):
113        name = pos_names.get(pos, '?')
114        print(f"  {name}: {count}")
115
# Run the build only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()