Skip to content

Commit

Permalink
Merge pull request #1 from AlbertSuarez/box_integration
Browse files Browse the repository at this point in the history
Box integration
  • Loading branch information
AlbertSuarez authored Aug 9, 2019
2 parents b589994 + 72165b8 commit a0c73cc
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 3 deletions.
11 changes: 11 additions & 0 deletions requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,20 @@ requests==2.22.0
beautifulsoup4==4.7.1
tqdm==4.32.2
Unidecode==1.1.1
boxsdk==2.5.0
## The following requirements were added by pip freeze:
asn1crypto==0.24.0
attrs==19.1.0
certifi==2019.6.16
cffi==1.12.3
chardet==3.0.4
cryptography==2.7
dropbox==9.4.0
idna==2.8
pycparser==2.19
PyJWT==1.7.1
requests-toolbelt==0.9.1
six==1.12.0
soupsieve==1.9.2
urllib3==1.25.3
wrapt==1.11.2
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
requests
beautifulsoup4
tqdm
Unidecode
Unidecode
boxsdk[jwt]
19 changes: 18 additions & 1 deletion src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@
':.': ':'
}

# Box integration settings (consumed by src/box_sdk.py and src/__main__.py).
# Path of the JWT settings file handed to JWTAuth.from_settings_file.
BOX_CONFIG_FILE_PATH = 'data/jwt_config.json'
# Number of attempts made for each Box API call before giving up.
BOX_RETRIES = 3
# Value passed to time.sleep (seconds) after a failed Box API call.
BOX_RTM = 3
# Folder under which create_folder creates new sub-folders.
BOX_FOLDER_ROOT_ID = '0'
# Folder the scraper searches for, and uploads, the per-letter CSV files in.
BOX_FOLDER_APP_ID = '84132126414'
# Arguments forwarded to get_shared_link as access / allow_download / allow_preview.
BOX_LINK_OPEN_ACCESS = 'open'
BOX_LINK_ALLOW_DOWNLOAD = True
BOX_LINK_ALLOW_PREVIEW = True


__all__ = [
'AZ_LYRICS_BASE_URL',
Expand All @@ -74,5 +83,13 @@
'CSV_HEADER_SONG_URL',
'CSV_HEADER_LYRICS',
'STR_CLEAN_TIMES',
'STR_CLEAN_DICT'
'STR_CLEAN_DICT',
'BOX_CONFIG_FILE_PATH',
'BOX_RETRIES',
'BOX_RTM',
'BOX_FOLDER_ROOT_ID',
'BOX_FOLDER_APP_ID',
'BOX_LINK_OPEN_ACCESS',
'BOX_LINK_ALLOW_DOWNLOAD',
'BOX_LINK_ALLOW_PREVIEW'
]
19 changes: 18 additions & 1 deletion src/__main__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
import os

from tqdm import tqdm

from src import *
from src import azlyrics, csv_parser
from src import azlyrics, csv_parser, box_sdk


def scrape():
    """
    Scrape every artist letter and keep one CSV per letter in sync with Box.

    For each letter the CSV already stored on Box (if any) is downloaded, the
    lyrics missing from it are scraped and appended, and the CSV is pushed back
    to Box after every artist. The local copy is deleted once the letter is done.
    """
    for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
        # Downloads file if it is available on Box folder.
        csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
        file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, os.path.basename(csv_file_name))
        if file_id:
            box_sdk.download_file(file_id, csv_file_name)

        # Iterates over all artists with the given letter.
        artist_url_list = azlyrics.get_artist_url_list(artist_letter)
        for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
            song_url_list = azlyrics.get_song_url_list(artist_url)
            for song_name, song_url in tqdm(song_url_list, total=len(song_url_list)):
                if not csv_parser.exists_song(artist_letter, artist_url, song_url):
                    song_lyrics = azlyrics.get_song_lyrics(song_url)
                    csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)

            # Uploads or updates the CSV on Box per every artist.
            if not os.path.exists(csv_file_name):
                # No local CSV yet (nothing downloaded, nothing appended): an upload
                # could only fail after exhausting every retry, so skip it.
                continue
            if file_id:
                # update_file returns None on failure; keep the known ID so the next
                # artist retries an update instead of uploading a duplicate name.
                file_id = box_sdk.update_file(file_id, csv_file_name) or file_id
            else:
                file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)

        # Removes the local version of the CSV for saving storage.
        if os.path.exists(csv_file_name):
            os.remove(csv_file_name)


if __name__ == '__main__':
Expand Down
90 changes: 90 additions & 0 deletions src/box_sdk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import time

from boxsdk import JWTAuth
from boxsdk import Client

from src import *


def create_folder(folder_name):
    """
    Create a sub-folder inside the Box folder identified by BOX_FOLDER_ROOT_ID.

    The call is attempted up to BOX_RETRIES times, waiting BOX_RTM seconds
    between consecutive attempts.

    :param folder_name: Name of the folder to create.
    :return: ID of the created folder, or None if every attempt failed.
    """
    box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
    for attempt in range(BOX_RETRIES):
        try:
            sub_folder = box_client.folder(BOX_FOLDER_ROOT_ID).create_subfolder(folder_name)
            return sub_folder.id
        except Exception as e:
            if attempt == BOX_RETRIES - 1:
                print(f'Error calling Box API creating the folder [{folder_name}] into folder root: {e}')
            else:
                # Wait only when another attempt follows; sleeping after the last
                # failure would just delay the error report.
                time.sleep(BOX_RTM)
    return None


def create_shared_link(folder_id):
    """
    Request a shared link for a Box folder using the BOX_LINK_* settings.

    The call is attempted up to BOX_RETRIES times, waiting BOX_RTM seconds
    between consecutive attempts.

    :param folder_id: ID of the Box folder to share.
    :return: Value returned by the SDK's get_shared_link, or None if every
        attempt failed.
    """
    box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
    for attempt in range(BOX_RETRIES):
        try:
            shared_link = box_client.folder(folder_id).get_shared_link(
                access=BOX_LINK_OPEN_ACCESS,
                allow_download=BOX_LINK_ALLOW_DOWNLOAD,
                allow_preview=BOX_LINK_ALLOW_PREVIEW
            )
            return shared_link
        except Exception as e:
            if attempt == BOX_RETRIES - 1:
                print(f'Error calling Box API creating a shared link for folder [{folder_id}]: {e}')
            else:
                # Wait only when another attempt follows; sleeping after the last
                # failure would just delay the error report.
                time.sleep(BOX_RTM)
    return None


def search_file(folder_id, file_name):
    """
    Look for an item with an exact name among the items of a Box folder.

    The listing is attempted up to BOX_RETRIES times, waiting BOX_RTM seconds
    between consecutive attempts.

    :param folder_id: ID of the Box folder whose items are inspected.
    :param file_name: Exact item name to look for.
    :return: ID of the first matching item; None if there is no match or if
        every attempt failed.
    """
    box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
    for attempt in range(BOX_RETRIES):
        try:
            for result in box_client.folder(folder_id).get_items():
                if result.name == file_name:
                    return result.id
            # The listing succeeded and nothing matched: no point in retrying.
            return None
        except Exception as e:
            if attempt == BOX_RETRIES - 1:
                print(f'Error calling Box API searching files into folder [{folder_id}] with name [{file_name}]: {e}')
            else:
                # Wait only when another attempt follows; sleeping after the last
                # failure would just delay the error report.
                time.sleep(BOX_RTM)
    return None


def upload_file(folder_id, file_path):
    """
    Upload a local file into a Box folder, named after the last path segment.

    The upload is attempted up to BOX_RETRIES times, waiting BOX_RTM seconds
    between consecutive attempts.

    :param folder_id: ID of the destination Box folder.
    :param file_path: Local path ('/'-separated) of the file to upload.
    :return: ID of the uploaded Box file, or None if every attempt failed.
    """
    box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
    # The remote name does not change between attempts, so compute it once.
    file_name = file_path.split('/')[-1]
    for attempt in range(BOX_RETRIES):
        try:
            return box_client.folder(folder_id).upload(file_path, file_name).id
        except Exception as e:
            if attempt == BOX_RETRIES - 1:
                print(f'Error calling Box API uploading the file [{file_path}] to folder with id [{folder_id}]: {e}')
            else:
                # Wait only when another attempt follows; sleeping after the last
                # failure would just delay the error report.
                time.sleep(BOX_RTM)
    return None


def update_file(file_id, file_path):
    """
    Replace the contents of an existing Box file with a local file.

    The update is attempted up to BOX_RETRIES times, waiting BOX_RTM seconds
    between consecutive attempts.

    :param file_id: ID of the Box file to overwrite.
    :param file_path: Local path of the file holding the new contents.
    :return: ID of the updated Box file, or None if every attempt failed.
    """
    box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
    for attempt in range(BOX_RETRIES):
        try:
            return box_client.file(file_id).update_contents(file_path).id
        except Exception as e:
            if attempt == BOX_RETRIES - 1:
                print(f'Error calling Box API updating the file [{file_id}] with file [{file_path}]: {e}')
            else:
                # Wait only when another attempt follows; sleeping after the last
                # failure would just delay the error report.
                time.sleep(BOX_RTM)
    return None


def download_file(file_id, file_path):
    """
    Download a Box file to a local path.

    The content is first written to a sibling '<file_path>.part' file and only
    moved over the destination once the download has completed, so a failed
    download never leaves a truncated file at file_path. The download is
    attempted up to BOX_RETRIES times, waiting BOX_RTM seconds between
    consecutive attempts.

    :param file_id: ID of the Box file to download.
    :param file_path: Local path where the file must be stored.
    :return: True on success, None if every attempt failed.
    """
    import os

    box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
    partial_path = f'{file_path}.part'
    for attempt in range(BOX_RETRIES):
        try:
            with open(partial_path, 'wb') as file:
                box_client.file(file_id).download_to(file)
            # Same-directory rename: the destination changes only on success.
            os.replace(partial_path, file_path)
            return True
        except Exception as e:
            if attempt == BOX_RETRIES - 1:
                print(f'Error calling Box API downloading the file [{file_id}] to file [{file_path}]: {e}')
            else:
                # Wait only when another attempt follows; sleeping after the last
                # failure would just delay the error report.
                time.sleep(BOX_RTM)
    # Every attempt failed: drop whatever was partially written.
    if os.path.exists(partial_path):
        os.remove(partial_path)
    return None

0 comments on commit a0c73cc

Please sign in to comment.