#!/usr/bin/env python3
"""
BuildMorph: Parse OpenCorpora XML → compact binary morphology dictionary.
Output: morph.bin — word → POS tag (1 byte per word)

POS tags encoded as single byte:
  1=NOUN, 2=ADJF, 3=ADJS, 4=COMP, 5=VERB, 6=INFN, 7=PRTF, 8=PRTS,
  9=GRND, 10=NUMR, 11=ADVB, 12=NPRO, 13=PRED, 14=PREP, 15=CONJ, 16=PRCL, 17=INTJ

Format: morph.bin
  [4 bytes] count of entries (unsigned 32-bit, little-endian)
  For each entry:
    [1 byte]  word length in bytes
    [N bytes] word (UTF-8)
    [1 byte]  POS tag
"""
import xml.etree.ElementTree as ET
import struct
import sys
import os
from collections import Counter

POS_MAP = {
    'NOUN': 1, 'ADJF': 2, 'ADJS': 3, 'COMP': 4,
    'VERB': 5, 'INFN': 6, 'PRTF': 7, 'PRTS': 8,
    'GRND': 9, 'NUMR': 10, 'ADVB': 11, 'NPRO': 12,
    'PRED': 13, 'PREP': 14, 'CONJ': 15, 'PRCL': 16, 'INTJ': 17,
}

# Only words whose length in characters is inside this range are kept.
MIN_WORD_CHARS = 2
MAX_WORD_CHARS = 30

# The per-entry length prefix is a single byte, so longer encodings cannot
# be represented in morph.bin.
MAX_WORD_BYTES = 255

# Print a progress line every this many lemmas.
PROGRESS_EVERY = 50000

# Children of <lemma> that must stay intact until the enclosing lemma has
# been processed; everything else can be released as soon as it is closed.
_LEMMA_CHILD_TAGS = frozenset(('l', 'f', 'g'))

_COUNT_HEADER = struct.Struct('<I')
_BYTE = struct.Struct('B')


def _lemma_pos(l_elem):
    """Return the POS byte of the first <g> child with a known tag, else 0."""
    for g_elem in l_elem.findall('g'):
        pos_tag = POS_MAP.get(g_elem.get('v', ''))
        if pos_tag is not None:
            return pos_tag
    return 0


def _add_word(words, text, pos_tag):
    """Store lower-cased *text* → *pos_tag* unless filtered or already present."""
    word = text.lower()
    if MIN_WORD_CHARS <= len(word) <= MAX_WORD_CHARS:
        # First occurrence wins: a homonym seen later keeps the earlier tag.
        words.setdefault(word, pos_tag)


def parse_opencorpora(xml_path):
    """Parse OpenCorpora XML, extract word → POS tag mapping.

    For every <lemma> the POS is taken from the first <g v="..."> child of
    its <l> element whose value is a key of POS_MAP. Lemmas without such a
    tag are skipped entirely. The lemma text and the text of every <f>
    form are lower-cased and stored under that POS.

    Args:
        xml_path: Path of the OpenCorpora dictionary XML file.

    Returns:
        dict mapping lower-cased word → POS byte (a value of POS_MAP).
    """
    print(f"Parsing {xml_path}...")
    words = {}  # word_lower → pos_tag_byte
    lemma_count = 0

    # Use iterparse for memory efficiency (400MB XML)
    for _event, elem in ET.iterparse(xml_path, events=('end',)):
        tag = elem.tag
        if tag != 'lemma':
            # <l>/<f>/<g> are read when their parent <lemma> closes, so they
            # must survive until then. Anything else is never consulted and
            # can drop its attributes and children right away.
            if tag not in _LEMMA_CHILD_TAGS:
                elem.clear()
            continue

        lemma_count += 1
        l_elem = elem.find('l')
        pos_tag = _lemma_pos(l_elem) if l_elem is not None else 0
        if pos_tag > 0:
            _add_word(words, l_elem.get('t', ''), pos_tag)
            for f_elem in elem.findall('f'):
                _add_word(words, f_elem.get('t', ''), pos_tag)

        elem.clear()  # Free memory

        if lemma_count % PROGRESS_EVERY == 0:
            print(f"  {lemma_count} lemmas, {len(words)} forms...")

    print(f"Done: {lemma_count} lemmas, {len(words)} unique forms")
    return words


def save_binary(words, out_path):
    """Save as compact binary: count + (len + word_utf8 + pos_byte)*

    Words whose UTF-8 encoding is longer than MAX_WORD_BYTES cannot be
    described by the one-byte length prefix and are skipped. The count in
    the header is the number of entries actually written, so it always
    matches the records that follow it.

    Args:
        words: Mapping of word (str) → POS tag (int in 0..255).
        out_path: Destination file; created or overwritten.

    Raises:
        struct.error: If a POS tag does not fit into one unsigned byte.
    """
    print(f"Saving to {out_path}...")
    written = 0
    skipped = 0
    with open(out_path, 'wb') as f:
        # Placeholder: the real count is only known once the overlong words
        # have been filtered out, so it is patched in after the loop.
        f.write(_COUNT_HEADER.pack(0))
        for word, pos in words.items():
            word_bytes = word.encode('utf-8')
            if len(word_bytes) > MAX_WORD_BYTES:
                skipped += 1
                continue
            f.write(_BYTE.pack(len(word_bytes)) + word_bytes + _BYTE.pack(pos))
            written += 1
        f.seek(0)
        f.write(_COUNT_HEADER.pack(written))

    size = os.path.getsize(out_path)
    if skipped:
        print(f"Skipped {skipped} words longer than {MAX_WORD_BYTES} bytes")
    print(f"Saved: {written} entries, {size / 1024 / 1024:.1f} MB")


def main():
    """Build ../Data/morph.bin from the OpenCorpora dictionary.

    The dictionary is read from /tmp/dict.opcorpora.xml when that file
    exists, otherwise from ../Data/dict.opcorpora.xml relative to this
    script. After saving, a POS frequency table is printed.
    """
    data_dir = os.path.join(os.path.dirname(__file__), '..', 'Data')

    xml_path = '/tmp/dict.opcorpora.xml'
    if not os.path.exists(xml_path):
        xml_path = os.path.join(data_dir, 'dict.opcorpora.xml')
    if not os.path.exists(xml_path):
        # Fail with a readable message instead of a traceback from the parser.
        sys.exit(f"OpenCorpora dictionary not found: {xml_path}")

    # The XML may come from /tmp, in which case Data/ need not exist yet.
    os.makedirs(data_dir, exist_ok=True)
    out_path = os.path.join(data_dir, 'morph.bin')

    words = parse_opencorpora(xml_path)
    save_binary(words, out_path)

    # Stats
    pos_counts = Counter(words.values())
    pos_names = {v: k for k, v in POS_MAP.items()}
    print("\nPOS distribution:")
    for pos, count in pos_counts.most_common():
        name = pos_names.get(pos, '?')
        print(f"  {name}: {count}")


if __name__ == '__main__':
    main()