# dic2trie.py
# This script converts ./LIWC2007_updated.dic into a JSON trie file that is easy to search
import json
# Path of the dictionary file that dic2trie() reads.
corpus_filepath = './LIWC2007_updated.dic'
# Character trie built by add(): nested dicts keyed by single letters, with
# '*' and '$' keys holding the list of category names for an entry.
trie = dict()
# Category id (the digit string from the file) -> category name; filled in
# by dic2trie() while it parses the category lines.
categories = dict()
def add(key, categories, root=None):
    """Insert one dictionary entry into a character trie.

    The trie is a nest of dicts keyed by single characters. Walking ``key``
    letter by letter creates missing child nodes on the way down.

    * If a ``'*'`` is met, ``categories`` is stored under the ``'*'`` key of
      the node reached so far and the rest of ``key`` is ignored.
    * In every case ``categories`` is stored under the ``'$'`` key of the
      node where the walk stopped. For a wildcard key this means the same
      node carries both ``'*'`` and ``'$'``.

    Args:
        key: The word (optionally ending in ``'*'``) to insert.
        categories: The value to attach to the entry; stored as-is (the same
            object is referenced from ``'*'`` and ``'$'``, not copied).
        root: Trie to insert into. Defaults to the module-level ``trie`` so
            existing two-argument calls behave exactly as before.

    Returns:
        None. The trie is modified in place.
    """
    # Only fall back to the module-level trie when no explicit root is given.
    cursor = trie if root is None else root
    for letter in key:
        if letter == '*':
            cursor['*'] = categories
            break
        # Descend into the child for this letter, creating it if missing.
        cursor = cursor.setdefault(letter, {})
    # Runs after the loop whether it finished or hit the wildcard break.
    cursor['$'] = categories
def dic2trie():
    """Convert the dictionary file at ``corpus_filepath`` into a JSON trie.

    The input is read line by line:

    * lines starting with ``'%'`` are skipped;
    * blank lines are skipped;
    * a line whose first tab-separated field is all digits defines a
      category: the id -> name pair is stored in the module-level
      ``categories`` dict;
    * any other line is a word entry (``word<TAB>id<TAB>id...``); its ids are
      translated to category names and the word is inserted into the
      module-level ``trie`` via ``add``.

    The finished trie is written as JSON with sorted keys to
    ``./LIWC2007_updated.trie``.

    Raises:
        KeyError: A word entry references a category id that was not defined
            on an earlier line.
        IndexError: A category line has an id but no name field.
    """
    # Context managers make sure both files are closed (and the output is
    # flushed) even if parsing fails part-way through.
    with open(corpus_filepath) as dic_file:
        for line in dic_file:
            if line.startswith('%'):
                continue
            parts = line.strip().split('\t')
            if not parts[0]:
                # Blank line: without this guard add('', []) would attach an
                # empty '$' entry to the root of the trie.
                continue
            if parts[0].isdigit():
                # cache category names
                categories[parts[0]] = parts[1]
            else:
                add(parts[0], [categories[category_id] for category_id in parts[1:]])
    with open('./LIWC2007_updated.trie', 'w') as fp:
        # Compact output; pass indent=4 to json.dumps for a readable file.
        fp.write(json.dumps(trie, sort_keys=True))
# Script entry point: run the conversion only when this file is executed
# directly, so that importing the module does not trigger it.
if __name__ == '__main__':
    dic2trie()