-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_database.py
58 lines (45 loc) · 1.5 KB
/
create_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import sqlite3
import re
# Matches a headword that carries a parenthesised numeric suffix such as
# "WORD(2)": zero or more uppercase letters/apostrophes, then "(", one or more
# digits, then ")".  create() uses it to detect such headwords before cutting
# the suffix off at the "(".
repeat = re.compile("[A-Z']*\\([0-9]+\\)")
# Matches a run of one or more consecutive whitespace characters.
whitespace = re.compile("\\s+")
def normalize_whitespace(line):
    """Collapse every run of whitespace in *line* to a single space.

    Leading and trailing whitespace is dropped, so the result never starts
    or ends with a space.  An empty or all-whitespace input yields ``''``.

    Args:
        line: The text to normalize.

    Returns:
        The whitespace-separated tokens of *line* joined by single spaces.
    """
    # str.split() with no separator splits on whitespace runs and discards
    # empty tokens, which is what the regex split + filter(None, ...) did.
    return ' '.join(line.split())
def get_stress_pattern(p):
    """Return the stress flags encoded by the digit characters of *p*.

    Every digit character in *p* contributes one entry, in order: ``0`` for
    the digit ``0`` and ``1`` for any digit greater than zero.  Non-digit
    characters are ignored.

    Args:
        p: A pronunciation string such as ``"HH AH0 L OW1"``.

    Returns:
        A list of ints (each 0 or 1); empty if *p* contains no digits.
    """
    return [1 if int(ch) > 0 else 0 for ch in p if ch.isdigit()]
def get_last_syll(p):
    """Return the rhyme key for the space-separated pronunciation *p*.

    The phonemes are scanned left to right.  A phoneme containing a digit
    restarts the key with that phoneme minus its final character; a phoneme
    without a digit is appended to the key.  The result is therefore the
    last digit-bearing phoneme (final character removed) followed by every
    phoneme after it, e.g. ``"K AE1 T"`` -> ``"AET"``.  If no phoneme has a
    digit, all phonemes are simply concatenated.

    Args:
        p: Pronunciation with phonemes separated by single spaces.

    Returns:
        The rhyme key as a string; ``''`` for an empty pronunciation.
    """
    # NOTE(review): this follows the reading where the original `else`
    # pairs with the digit test rather than with the inner loop -- confirm.
    rhyme = ''
    for phoneme in p.split(' '):
        if any(ch.isdigit() for ch in phoneme):
            # Drops the final character; presumably that is the stress digit
            # (as in "AE1") -- the code does not check its position.
            rhyme = phoneme[:-1]
        else:
            rhyme += phoneme
    return rhyme
def create(dbname, dict_path='cmudict.0.7a.txt'):
    """Create the ``words`` table in *dbname* and fill it from a dictionary file.

    Each usable line of the dictionary is treated as a headword followed by
    its pronunciation.  For every such line one row is inserted holding:
    the word (with any ``(N)`` suffix matched by ``repeat`` removed), the
    rhyme key from ``get_last_syll``, the number of stress digits, the stress
    pattern as a string of ``0``/``1`` characters, and the placeholder part
    of speech ``"unknown"``.

    Args:
        dbname: Database path passed to ``sqlite3.connect``.
        dict_path: Pronunciation dictionary to read.  Defaults to the file
            name that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The open ``sqlite3`` connection with all inserts committed.

    Raises:
        sqlite3.OperationalError: If the ``create table`` statement fails,
            e.g. because a ``words`` table already exists in *dbname*.
        IOError / OSError: If *dict_path* cannot be opened.
    """
    conn = sqlite3.connect(dbname)
    try:
        cursor = conn.cursor()
        cursor.execute('create table words (word text, rhyme text, sylls integer,'
                       ' stress text, partOfSpeech text)')
        with open(dict_path, 'r') as f:
            for line in f:
                line = normalize_whitespace(line)
                # A blank line normalizes to '' and would make line[0] raise.
                if not line:
                    continue
                # Only lines starting with a letter or an apostrophe are
                # kept; anything else is skipped.
                if not line[0].isalpha() and line[0] != '\'':
                    continue
                tokens = line.split(' ', 1)
                # A headword with no pronunciation has nothing to index.
                if len(tokens) < 2:
                    continue
                word, pronunciation = tokens
                if repeat.match(word):
                    # Strip the "(N)" suffix so all variants share one word.
                    word = word[:word.index('(')]
                stress = ''.join(map(str, get_stress_pattern(pronunciation)))
                sylls = len(stress)
                rhyme = get_last_syll(pronunciation)
                pos = "unknown"  # part of speech
                cursor.execute('insert into words values (?,?,?,?,?)',
                               (word, rhyme, sylls, stress, pos))
        conn.commit()
    except Exception:
        # The caller never receives the connection on failure, so release it
        # here instead of leaking it, then let the error propagate.
        conn.close()
        raise
    return conn