-
Notifications
You must be signed in to change notification settings - Fork 0
/
xml_parser.py
93 lines (73 loc) · 2.73 KB
/
xml_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import xml.etree.cElementTree as etree
from xml_utils import (
calculate_level,
strip_tag_name,
)
# Feature flag read by parse_genres: when true, the end of an
# 'extraartists' element also switches artist parsing off, in addition
# to the end of an 'artists' element.
USE_EXTRA_ARTISTS = False
def parse_artists(pathXML):
    """Lazily yield one dict per ``artist`` element found in *pathXML*.

    The document is streamed with ``iterparse`` so it is never held in
    memory as a whole; every element is cleared once its event has been
    handled.

    Each yielded dict starts out as::

        {'id': None, 'biography': None, 'profile': None,
         'social_media_links': []}

    and is then filled from the tags seen while the artist is open:

    * ``id``, ``name`` and ``profile`` tags at level 2 are stored under a
      key of the same name (``name`` only exists if such a tag was seen);
    * ``url`` tags at level 3 are appended to ``social_media_links``.

    ``biography`` is never populated by this function. Levels are whatever
    ``calculate_level`` (from ``xml_utils``) reports for the ``'artist'``
    anchor tag.

    NOTE(review): element text is read on 'start' events; the
    ElementTree docs describe ``text`` as undefined at that point --
    confirm this is reliable for the input files.
    """
    # Tags whose text is copied into the record under the tag's own name.
    scalar_fields = ('id', 'name', 'profile')
    record = None
    depth = 0
    for event, node in etree.iterparse(pathXML, events=('start', 'end')):
        try:
            tag = strip_tag_name(node.tag)
            depth = calculate_level(event, tag, depth, 'artist')
            if event == 'end':
                # Closing an artist hands the finished record to the caller.
                if tag == 'artist':
                    yield record
            elif tag == 'artist':
                # Opening an artist starts a fresh record.
                record = {
                    'id': None,
                    'biography': None,
                    'profile': None,
                    'social_media_links': [],
                }
            elif depth == 2:
                if tag in scalar_fields:
                    record[tag] = node.text
            elif depth == 3 and tag == 'url':
                record['social_media_links'].append(node.text)
        except etree.ParseError:
            print('>>>error parsing...continue')
        finally:
            # Release the element's content to keep memory usage flat.
            node.clear()
def parse_genres(pathXML):
    """Lazily yield, per ``release`` element, its artists with their genres.

    The document is streamed with ``iterparse``; every element is cleared
    once its event has been handled.

    For each release the generator yields a list of dicts shaped like
    ``{'id': ..., 'name': ..., 'genres': set()}``:

    * a new list is started whenever an ``artists`` element opens;
    * a new dict is appended whenever an ``artist`` element opens;
    * while artist parsing is active, ``id`` and ``name`` tags at level 4
      fill the most recently opened artist;
    * the text of every ``genre`` tag is added to the ``genres`` set of
      every artist currently in the list;
    * the list is yielded when the ``release`` element closes.

    Artist parsing is switched off at the end of an ``artists`` element
    and, when ``USE_EXTRA_ARTISTS`` is true, also at the end of an
    ``extraartists`` element. The flag is read once, when iteration
    starts. Levels are whatever ``calculate_level`` (from ``xml_utils``)
    reports for the ``'release'`` anchor tag.

    NOTE(review): element text is read on 'start' events; the
    ElementTree docs describe ``text`` as undefined at that point --
    confirm this is reliable for the input files.
    """
    artists = []
    artist = {}
    parsing_artist = False
    level = 0
    # Built once, before the loop. Appending 'extraartists' on every
    # event made the list grow without bound when the flag was enabled.
    if USE_EXTRA_ARTISTS:
        artist_tags = ('artists', 'extraartists')
    else:
        artist_tags = ('artists',)
    for event, elem in etree.iterparse(pathXML, events=('start', 'end')):
        try:
            tname = strip_tag_name(elem.tag)
            level = calculate_level(event, tname, level, 'release')
            if event == 'start':
                if tname == 'artists':
                    artists = []
                elif tname == 'artist':
                    parsing_artist = True
                    artist = {'id': None, 'name': None, 'genres': set()}
                    artists.append(artist)
            if event == 'end' and tname in artist_tags:
                parsing_artist = False
            if parsing_artist and level == 4:
                if tname == 'id':
                    artist['id'] = elem.text
                if tname == 'name':
                    artist['name'] = elem.text
            # A genre applies to every artist collected for this release.
            if event != 'end' and tname == 'genre':
                # Separate loop name so the in-progress `artist` record
                # is not rebound by this loop.
                for listed_artist in artists:
                    listed_artist['genres'].add(elem.text)
            elif event == 'end' and tname == 'release':
                yield artists
        except etree.ParseError:
            print('>>>error parsing...continue')
        finally:
            # Release the element's content to keep memory usage flat.
            elem.clear()