diff --git a/requirements.lock b/requirements.lock
index e4c3a20..e99dff3 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -2,9 +2,20 @@ requests==2.22.0
 beautifulsoup4==4.7.1
 tqdm==4.32.2
 Unidecode==1.1.1
+boxsdk==2.5.0
 ## The following requirements were added by pip freeze:
+asn1crypto==0.24.0
+attrs==19.1.0
 certifi==2019.6.16
+cffi==1.12.3
 chardet==3.0.4
+cryptography==2.7
+dropbox==9.4.0
 idna==2.8
+pycparser==2.19
+PyJWT==1.7.1
+requests-toolbelt==0.9.1
+six==1.12.0
 soupsieve==1.9.2
 urllib3==1.25.3
+wrapt==1.11.2
diff --git a/requirements.txt b/requirements.txt
index f804347..b945804 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 requests
 beautifulsoup4
 tqdm
-Unidecode
\ No newline at end of file
+Unidecode
+boxsdk[jwt]
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
index 7af11c2..8f1f46c 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -56,6 +56,15 @@
     ':.': ':'
 }
 
+BOX_CONFIG_FILE_PATH = 'data/jwt_config.json'
+BOX_RETRIES = 3
+BOX_RTM = 3
+BOX_FOLDER_ROOT_ID = '0'
+BOX_FOLDER_APP_ID = '84132126414'
+BOX_LINK_OPEN_ACCESS = 'open'
+BOX_LINK_ALLOW_DOWNLOAD = True
+BOX_LINK_ALLOW_PREVIEW = True
+
 
 __all__ = [
     'AZ_LYRICS_BASE_URL',
@@ -74,5 +83,13 @@
     'CSV_HEADER_SONG_URL',
     'CSV_HEADER_LYRICS',
     'STR_CLEAN_TIMES',
-    'STR_CLEAN_DICT'
+    'STR_CLEAN_DICT',
+    'BOX_CONFIG_FILE_PATH',
+    'BOX_RETRIES',
+    'BOX_RTM',
+    'BOX_FOLDER_ROOT_ID',
+    'BOX_FOLDER_APP_ID',
+    'BOX_LINK_OPEN_ACCESS',
+    'BOX_LINK_ALLOW_DOWNLOAD',
+    'BOX_LINK_ALLOW_PREVIEW'
 ]
diff --git a/src/__main__.py b/src/__main__.py
index 5bda890..9e8e2cc 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,11 +1,24 @@
+import os
+
 from tqdm import tqdm
 
 from src import *
-from src import azlyrics, csv_parser
+from src import azlyrics, csv_parser, box_sdk
 
 
 def scrape():
     for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
+        # Downloads the CSV if it is already available on the Box folder.
+        csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
+        file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, os.path.basename(csv_file_name))
+        if file_id and not box_sdk.download_file(file_id, csv_file_name):
+            # Scraping without the remote rows would later overwrite the Box copy with a
+            # truncated CSV, so this letter is skipped instead.
+            continue
+        # True while Box holds everything that is in the local CSV.
+        is_synced = True
+
+        # Iterates over all artists with the given letter.
         artist_url_list = azlyrics.get_artist_url_list(artist_letter)
         for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
             song_url_list = azlyrics.get_song_url_list(artist_url)
@@ -13,6 +26,21 @@ def scrape():
                 if not csv_parser.exists_song(artist_letter, artist_url, song_url):
                     song_lyrics = azlyrics.get_song_lyrics(song_url)
                     csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)
+            # Uploads or updates the CSV on Box after every artist; there is nothing to send
+            # until a local CSV exists.
+            if os.path.exists(csv_file_name):
+                if file_id:
+                    synced_id = box_sdk.update_file(file_id, csv_file_name)
+                else:
+                    synced_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)
+                # A failed call returns None: keep the known id so the next artist retries
+                # the update instead of uploading a file whose name already exists on Box.
+                is_synced = synced_id is not None
+                file_id = synced_id or file_id
+
+        # Removes the local CSV to save storage, but only once Box holds all of its rows.
+        if is_synced and os.path.exists(csv_file_name):
+            os.remove(csv_file_name)
 
 
 if __name__ == '__main__':
diff --git a/src/box_sdk.py b/src/box_sdk.py
new file mode 100644
index 0000000..cd8e7c5
--- /dev/null
+++ b/src/box_sdk.py
@@ -0,0 +1,114 @@
+import os
+import time
+
+from boxsdk import JWTAuth
+from boxsdk import Client
+
+from src import *
+
+
+def _get_client():
+    """Returns a Box client authenticated with the JWT settings file."""
+    return Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
+
+
+def _call_with_retries(action, error_message):
+    """Runs `action` up to BOX_RETRIES times, waiting BOX_RTM seconds between attempts.
+
+    Returns the value returned by `action`, or None once every attempt has failed; the
+    last exception is then printed after `error_message`.
+    """
+    for attempt in range(1, BOX_RETRIES + 1):
+        try:
+            return action()
+        except Exception as e:
+            if attempt == BOX_RETRIES:
+                print(f'{error_message}: {e}')
+            else:
+                # Sleeping is only useful when another attempt follows.
+                time.sleep(BOX_RTM)
+    return None
+
+
+def create_folder(folder_name):
+    """Creates `folder_name` under the Box root folder; returns its id, or None on failure."""
+    box_client = _get_client()
+    return _call_with_retries(
+        lambda: box_client.folder(BOX_FOLDER_ROOT_ID).create_subfolder(folder_name).id,
+        f'Error calling Box API creating the folder [{folder_name}] into folder root'
+    )
+
+
+def create_shared_link(folder_id):
+    """Returns a shared link for the folder `folder_id`, or None on failure."""
+    box_client = _get_client()
+    return _call_with_retries(
+        lambda: box_client.folder(folder_id).get_shared_link(
+            access=BOX_LINK_OPEN_ACCESS,
+            allow_download=BOX_LINK_ALLOW_DOWNLOAD,
+            allow_preview=BOX_LINK_ALLOW_PREVIEW
+        ),
+        f'Error calling Box API creating a shared link for folder [{folder_id}]'
+    )
+
+
+def search_file(folder_id, file_name):
+    """Returns the id of the item named `file_name` inside the folder `folder_id`.
+
+    Returns None when no item has that name or when every attempt has failed.
+    """
+    box_client = _get_client()
+
+    def search():
+        for item in box_client.folder(folder_id).get_items():
+            if item.name == file_name:
+                return item.id
+        return None
+
+    return _call_with_retries(
+        search,
+        f'Error calling Box API searching files into folder [{folder_id}] with name [{file_name}]'
+    )
+
+
+def upload_file(folder_id, file_path):
+    """Uploads `file_path` into the folder `folder_id`; returns the new file id, or None on failure."""
+    box_client = _get_client()
+    file_name = os.path.basename(file_path)
+    return _call_with_retries(
+        lambda: box_client.folder(folder_id).upload(file_path, file_name).id,
+        f'Error calling Box API uploading the file [{file_path}] to folder with id [{folder_id}]'
+    )
+
+
+def update_file(file_id, file_path):
+    """Replaces the contents of the Box file `file_id` with `file_path`; returns the file id or None."""
+    box_client = _get_client()
+    return _call_with_retries(
+        lambda: box_client.file(file_id).update_contents(file_path).id,
+        f'Error calling Box API updating the file [{file_id}] with file [{file_path}]'
+    )
+
+
+def download_file(file_id, file_path):
+    """Downloads the Box file `file_id` into `file_path`; returns True, or None on failure.
+
+    The data is written to a temporary `.part` file that replaces `file_path` only after
+    a complete download, so a failed attempt never leaves a truncated file behind.
+    """
+    box_client = _get_client()
+    partial_path = f'{file_path}.part'
+
+    def download():
+        with open(partial_path, 'wb') as file:
+            box_client.file(file_id).download_to(file)
+        os.replace(partial_path, file_path)
+        return True
+
+    downloaded = _call_with_retries(
+        download,
+        f'Error calling Box API downloading the file [{file_id}] to file [{file_path}]'
+    )
+    if downloaded is None and os.path.exists(partial_path):
+        os.remove(partial_path)
+    return downloaded