initdb.py
from flask import current_app
import csv
from chardet.universaldetector import UniversalDetector
from collections import defaultdict
from itertools import count
from .db import get_db, normalizeDBcol
from importlib.resources import read_text
import csv_reconcile
from . import scorer


def initDataTable(db, colnames, idcol):
    cols = []
    cnts = defaultdict(count)
    for col in colnames:
        slug = normalizeDBcol(col)
        slug = f'{slug}{next(cnts[slug])}'
        if col == idcol:
            cols.append('%s TEXT PRIMARY KEY' % (slug,))
        else:
            cols.append('%s TEXT NOT NULL' % (slug,))

        db.execute('INSERT INTO datacols VALUES (?,?,?)',
                   (col, slug, 1 if col == idcol else 0))

    # create data table with the contents of the csv file
    createSQL = 'CREATE TABLE data (\n %s\n)'
    db.execute(createSQL % (',\n '.join(cols),))
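
# Illustrative note (not in the original source): the header values below are
# assumptions to show what initDataTable generates, assuming normalizeDBcol()
# maps 'id' to 'id' and 'name' to 'name'. With
# colnames == ['id', 'name', 'name'] and idcol == 'id', the per-slug counter
# disambiguates duplicates, so the executed statement is roughly:
#
#   CREATE TABLE data (
#     id0 TEXT PRIMARY KEY,
#     name0 TEXT NOT NULL,
#     name1 TEXT NOT NULL
#   )
#
# and one row per column is recorded in datacols, mapping each original header
# name to its slug and flagging the id column.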


def initReconcileTable(db, colnames):
    create = [
        'CREATE TABLE reconcile (\n id TEXT PRIMARY KEY,\n word TEXT NOT NULL'
    ]
    for col in colnames:
        create.append('%s TEXT NOT NULL' % (col,))

    # create the reconcile table holding the id, search word and normalized fields
    db.execute(',\n '.join(create) + '\n)')


def detectEncoding(filenm):
    detector = UniversalDetector()
    # Feed the file incrementally until chardet is confident enough
    with open(filenm, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    if detector.result['confidence'] > .95:
        return detector.result['encoding']

    return None
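
# Illustrative note (not in the original source): detectEncoding returns the
# name chardet reports (e.g. 'utf-8' or 'UTF-8-SIG') only when its confidence
# exceeds 0.95; otherwise init_db passes no explicit encoding to open() and the
# platform default encoding is used.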


def init_db(db,
            csvfilenm,
            idcol,
            searchcol,
            csvencoding=None,
            scoreOptions=None,
            csvkwargs=None):
    enckwarg = dict()
    csvencoding = csvencoding or detectEncoding(csvfilenm)
    if csvencoding:
        enckwarg['encoding'] = csvencoding

    schema = read_text(csv_reconcile, 'schema.sql')
    db.executescript(schema)

    csvkwargs = {} if csvkwargs is None else csvkwargs
    scoreOptions = {} if scoreOptions is None else scoreOptions

    with db:
        # Create a table with ids (as PRIMARY KEY), words and normalized fields
        with open(csvfilenm, newline='', **enckwarg) as csvfile:
            dialect = None
            try:
                dialect = csv.Sniffer().sniff(csvfile.read(1024))
            except:
                # fall back to the default dialect when sniffing fails
                pass

            csvfile.seek(0)
            reader = csv.reader(csvfile, dialect=dialect, **csvkwargs)

            header = next(reader)

            # Throws if col doesn't exist
            searchidx = header.index(searchcol)
            ididx = header.index(idcol)

            normalizedFields = scorer.getNormalizedFields()
            initDataTable(db, header, idcol)
            initReconcileTable(db, normalizedFields)
            datavals = ','.join('?' * len(header))

            for row in reader:
                if len(row) != len(header):
                    continue
                mid = row[ididx]
                word = row[searchidx]
                matchFields = scorer.normalizeRow(word, row, **scoreOptions)
                db.execute(
                    "INSERT INTO reconcile VALUES (%s)" %
                    (','.join('?' * (2 + len(normalizedFields))),),
                    (mid, word) + tuple(matchFields))

                db.execute("INSERT INTO data VALUES (%s)" % (datavals,), row)


def init_db_with_context(csvfilenm, idcol, searchcol):
    db = get_db()
    csvkwargs = current_app.config.get('CSVKWARGS', {})
    scoreOptions = current_app.config['SCOREOPTIONS']
    csvencoding = current_app.config.get('CSVENCODING', None)

    return init_db(db,
                   csvfilenm,
                   idcol,
                   searchcol,
                   csvencoding=csvencoding,
                   csvkwargs=csvkwargs,
                   scoreOptions=scoreOptions)
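

# A minimal usage sketch (not part of the original module). The database path,
# CSV file name and column names below are assumptions for illustration, and a
# scorer plugin is assumed to be installed; in normal operation
# init_db_with_context() is called inside a Flask app context and pulls these
# settings from the app config instead.
if __name__ == '__main__':
    import sqlite3

    # A plain sqlite3 connection provides the execute()/executescript() calls
    # and the transaction context manager that init_db relies on above.
    conn = sqlite3.connect('reconcile.db')
    try:
        init_db(conn,
                'example.csv',      # hypothetical CSV file
                idcol='id',         # hypothetical id column
                searchcol='name')   # hypothetical search column
    finally:
        conn.close()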