-
Notifications
You must be signed in to change notification settings - Fork 3
/
god_frame.py
58 lines (52 loc) · 1.88 KB
/
god_frame.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from __future__ import division
import pandas as pd
import os
import re
import common
# this title should be a saint
def canonize_title(title):
    """Return a normalized form of a song title.

    Two normalizations are applied:

    * every run of whitespace is collapsed to a single space;
    * a detached clitic ("ll", "s" or "t" standing alone as a token after a
      space) is re-attached to the preceding text with an apostrophe, e.g.
      "don t stop" -> "don't stop".

    :param title: the raw title string.
    :return: the canonical title string.
    """
    collapsed = re.sub(r'\s+', ' ', title)
    # The trailing \b is essential: without it every word that merely
    # *starts* with s, t or ll would be glued to its predecessor
    # ("my song" -> "my'song").
    return re.sub(r" (ll|s|t)\b", r"'\1", collapsed)
# Build the "god frame": one row per (artist, canonical title), with chart
# metadata and, when the song was scraped, its size/compression figures.
db = common.get_songdb()
rows = []
merged = 0
# .values()/.items() and single-argument print(...) behave the same on
# Python 2 and also work on Python 3.
for artist_discog in db.values():
    # Canonical title -> row. Scoped to a single artist, so only titles by
    # the same artist are ever merged.
    title_to_row = {}
    for title, song in artist_discog.items():
        try:
            raw, comp = common.get_sizes(song)
            inf_raw, inf_comp = common.get_inf_ratio(song)
        except common.NotScrapedException:
            # Reset *every* measurement so that an unscraped song can
            # neither hit a NameError nor inherit the previous song's
            # icomp/ratio values.
            scraped = False
            raw = comp = inf_comp = ratio = None
        else:
            scraped = True
            ratio = inf_raw / inf_comp
            # Sanity check: both helpers should agree on the raw size.
            assert raw == inf_raw, "{} != {}".format(raw, inf_raw)

        canon_title = canonize_title(title)
        if canon_title not in title_to_row:
            title_to_row[canon_title] = dict(
                artist=song.artist, title=canon_title, date=song.earliest,
                peak=song.peakPos, scraped=scraped,
                raw=raw, comp=comp, icomp=inf_comp, ratio=ratio,
            )
        else:
            # Got a dupe. Merge them: keep the best peak and earliest date.
            merged += 1
            extant = title_to_row[canon_title]
            extant['peak'] = min(extant['peak'], song.peakPos)
            extant['date'] = min(extant['date'], song.earliest)
            if scraped and not extant['scraped']:
                # The first sighting had no measurements; take them from
                # this one so 'scraped' never ends up True next to empty
                # raw/comp/icomp/ratio fields.
                extant.update(
                    scraped=True,
                    raw=raw, comp=comp, icomp=inf_comp, ratio=ratio,
                )
    rows.extend(title_to_row.values())

print("Merged {} duplicate rows".format(merged))
df = pd.DataFrame(rows)
df['date'] = pd.to_datetime(df['date'])
# NOTE: 'raw' and 'comp' are deliberately not cast to int. They are None for
# unscraped songs, and a column with missing values cannot be converted with
# astype(int). See http://stackoverflow.com/a/21290084/262271
print("Saving god frame with shape {}".format(df.shape))
df.to_pickle(common.OMNI_PICKLE_NAME)