-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
159 lines (143 loc) · 5.75 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import http
import requests
import sys
from string import Template
from os import listdir
from os.path import isfile,join
import csv
from pandas import DataFrame
from threading import Thread, Lock
import time
from rx import Observable, Observer
# Shared result table: one row per matched song, filled in by fetch_task.
song_df = DataFrame(
    columns=[
        # Identity of the row: local artist, local song title, artist genres.
        'artist_name',
        'song_name',
        'genres',
        # Fields copied from the audio-features response for the track.
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
    ]
)

# Serialises updates to song_df, which is rebound from several worker threads.
df_lock = Lock()
def fetch_task(self, song):
    """Look up one song on Spotify and append its audio features to song_df.

    Used as a thread target by ``Artist.getMatchedSongs``; ``self`` is passed
    explicitly because this is a module-level function, not a method.

    Args:
        self: Object exposing ``id``, ``name``, ``genres``,
            ``getSongids(song)`` and ``getSongStatistics(track_id)``.
        song: Local song title, used as the track search query and stored
            in the ``song_name`` column.

    Returns:
        None. On a match, one row is appended to the module-level
        ``song_df`` while holding ``df_lock``; otherwise nothing changes.

    Raises:
        KeyError: If the search or audio-features payload lacks a key that
            is indexed below.
    """
    # Local import: the module header only pulls DataFrame from pandas.
    from pandas import concat

    global song_df

    feature_fields = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'time_signature',
    )

    songlist = self.getSongids(song)['tracks']['items']

    # Take the FIRST search hit credited to this artist. The earlier nested
    # loop only broke out of the inner artist loop, so it kept scanning and
    # ended up with the last matching hit instead.
    track = next(
        (
            candidate
            for candidate in songlist
            if any(artist['id'] == self.id for artist in candidate['artists'])
        ),
        None,
    )
    if track is None:
        return

    stats = self.getSongStatistics(track['id'])
    statistic = {
        'artist_name': self.name,
        'song_name': song,
        'genres': self.genres,
    }
    statistic.update((field, stats[field]) for field in feature_fields)

    # Build the one-row frame outside the critical section.
    row = DataFrame([statistic])

    # DataFrame.append no longer exists in pandas 2.x; concat is the
    # supported replacement. The with-block releases the lock even if
    # concat raises.
    with df_lock:
        song_df = concat([song_df, row], ignore_index=True)
# Root directory that holds one sub-folder of MIDI files per artist.
midi_dir = "./clean_midi"

# Running count of artists whose search returned no hit; incremented by
# Artist.getArtist and reported at the end of the run.
not_found = 0

# Bearer token sent with every API request, taken from the first
# command-line argument (an IndexError is raised here if it is missing).
token = sys.argv[1]
class Artist():
    """One artist folder under ``midi_dir`` together with its Spotify match.

    Constructing an instance prints a progress line and immediately scans
    the artist's folder for song names. ``getArtist`` then resolves the
    Spotify id and genres, and ``getMatchedSongs`` runs the module-level
    ``fetch_task`` once per local song.

    Relies on the module globals ``midi_dir``, ``token`` and ``not_found``.
    """

    _API_ROOT = "https://api.spotify.com/v1"

    def __init__(self, name):
        """Store the folder name, reset match state and scan for songs.

        Args:
            name: Name of the artist's folder inside ``midi_dir``; also
                used verbatim as the artist search query.
        """
        self.name = name
        self.spotifyname = None
        self.id = None
        # The next attribute and the two *_song_list attributes are
        # initialised here but not touched by any method of this class.
        self.potential_ids = []
        self.genres = []
        self.local_song_names = []
        self.local_mathced_song_list = []
        self.local_unmatched_song_list = []
        print("looking for artist: " + self.name)
        self.getLocalSongNames()

    def getLocalSongNames(self):
        """Fill ``self.local_song_names`` from the files in the artist folder.

        The last dot-separated component of each file name is dropped to
        get the song name. Names that end in ``.<digit>`` (for example
        ``Song.1``) are skipped -- presumably numbered duplicates of the
        same song; confirm against the data set. Entries whose name is
        empty after dropping the last component (dotfiles, files without
        an extension) are skipped as well instead of raising IndexError.
        """
        artistpath = join(midi_dir, self.name)
        names = []
        for entry in listdir(artistpath):
            if not isfile(join(artistpath, entry)):
                continue
            # Everything before the last dot; "" when there is no dot or
            # the only dot is the leading one.
            stem = entry.rpartition(".")[0]
            if not stem:
                continue
            # Length guard: a one-character stem such as "1" has no
            # second-to-last character to inspect.
            if len(stem) >= 2 and stem[-1].isdigit() and stem[-2] == ".":
                continue
            names.append(stem)
        names.sort()
        self.local_song_names = names
        print("number of songs: ", len(self.local_song_names))

    def _spotify_get(self, url, params=None):
        """GET ``url`` with the bearer token, retrying while rate limited.

        ``params`` is handed to ``requests`` so query values are
        URL-encoded; a raw name containing ``&``, ``#`` or ``+`` would
        otherwise corrupt the query string.

        Returns:
            The first response whose status code is not 429.
        """
        headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": "Bearer " + token
        }
        while True:
            response = requests.get(url, params=params, headers=headers)
            if response.status_code != 429:
                return response
            # Wait as long as the server asks; fall back to one second if
            # the Retry-After header is absent.
            time.sleep(int(response.headers.get('Retry-After', 1)))

    def getArtist(self):
        """Search for ``self.name`` and adopt the first artist hit.

        On a hit, ``spotifyname``, ``id`` and ``genres`` are taken from
        the first item. Otherwise ``spotifyname`` is set to
        ``"unmatched artist"`` and the module-level ``not_found`` counter
        is incremented.

        Returns:
            The ``requests`` response object of the search call.

        Raises:
            KeyError: If the JSON payload has no ``artists``/``items``.
        """
        global not_found
        response = self._spotify_get(
            self._API_ROOT + "/search",
            params={"q": self.name, "type": "artist"},
        )
        fetched_artist_names = response.json()['artists']['items']
        if fetched_artist_names:
            best = fetched_artist_names[0]
            self.spotifyname = best['name']
            self.id = best['id']
            self.genres = best['genres']
        else:
            self.spotifyname = "unmatched artist"
            not_found = not_found + 1
        print("Local name ", self.name, " spot name: ", self.spotifyname)
        return response

    def getSongids(self, songname):
        """Run a track search for ``songname``.

        Returns:
            The decoded JSON body of the search response.
        """
        response = self._spotify_get(
            self._API_ROOT + "/search",
            params={"q": songname, "type": "track"},
        )
        return response.json()

    # The parameter keeps the name ``id`` (shadowing the builtin) so that
    # existing keyword callers continue to work.
    def getSongStatistics(self, id):
        """Fetch the audio-features document for the track ``id``.

        Returns:
            The decoded JSON body of the audio-features response.
        """
        response = self._spotify_get(
            "{}/audio-features/{}".format(self._API_ROOT, id)
        )
        return response.json()

    def getMatchedSongs(self):
        """Run ``fetch_task`` for every local song, one thread per song.

        Does nothing when no Spotify id has been resolved. All threads
        are started before any is joined, and the method returns only
        after every thread has finished.
        """
        if self.id is None:
            return
        threads = [
            Thread(target=fetch_task, args=(self, song))
            for song in self.local_song_names
        ]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
# Every entry directly under midi_dir that is not a regular file is treated
# as one artist folder; they are processed in sorted name order.
onlyfiles = sorted(f for f in listdir(midi_dir) if not isfile(join(midi_dir, f)))

# Construct all Artist objects first (each constructor scans its own folder)
# before any network request is made.
artistList = [Artist(filename) for filename in onlyfiles]

# Resolve each artist on Spotify, then collect audio features for its songs.
for artist in artistList:
    artist.getArtist()
    artist.getMatchedSongs()

# Persist the collected rows and report how many artists had no match.
song_df.to_csv('statistics.csv', sep=',')
print(not_found, " artists unmatched")