From 9f1831e468009110e0a700559921cc8636f549b3 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 10:51:22 +0200 Subject: [PATCH 1/8] Added boxsdk requirements --- requirements.lock | 11 +++++++++++ requirements.txt | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/requirements.lock b/requirements.lock index e4c3a20..e99dff3 100644 --- a/requirements.lock +++ b/requirements.lock @@ -2,9 +2,20 @@ requests==2.22.0 beautifulsoup4==4.7.1 tqdm==4.32.2 Unidecode==1.1.1 +boxsdk==2.5.0 ## The following requirements were added by pip freeze: +asn1crypto==0.24.0 +attrs==19.1.0 certifi==2019.6.16 +cffi==1.12.3 chardet==3.0.4 +cryptography==2.7 +dropbox==9.4.0 idna==2.8 +pycparser==2.19 +PyJWT==1.7.1 +requests-toolbelt==0.9.1 +six==1.12.0 soupsieve==1.9.2 urllib3==1.25.3 +wrapt==1.11.2 diff --git a/requirements.txt b/requirements.txt index f804347..b945804 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ requests beautifulsoup4 tqdm -Unidecode \ No newline at end of file +Unidecode +boxsdk[jwt] \ No newline at end of file From 732c6e46a0af8c85d683f815581ca1e8016059e2 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 10:51:41 +0200 Subject: [PATCH 2/8] Initialized Box client --- src/__init__.py | 5 ++++- src/box_sdk.py | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 src/box_sdk.py diff --git a/src/__init__.py b/src/__init__.py index 7af11c2..a104e16 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -56,6 +56,8 @@ ':.': ':' } +BOX_CONFIG_FILE_PATH = 'data/jwt_config.json' + __all__ = [ 'AZ_LYRICS_BASE_URL', @@ -74,5 +76,6 @@ 'CSV_HEADER_SONG_URL', 'CSV_HEADER_LYRICS', 'STR_CLEAN_TIMES', - 'STR_CLEAN_DICT' + 'STR_CLEAN_DICT', + 'BOX_CONFIG_FILE_PATH' ] diff --git a/src/box_sdk.py b/src/box_sdk.py new file mode 100644 index 0000000..7dcfae9 --- /dev/null +++ b/src/box_sdk.py @@ -0,0 +1,7 @@ +from boxsdk import JWTAuth +from boxsdk import Client + +from src import * + + +box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) From 9e871835e5b98f66b1607a67d3392e7d2858c56d Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 12:01:39 +0200 Subject: [PATCH 3/8] Implemented folder creation, sharing and file uploading functions --- src/__init__.py | 14 +++++++++++++- src/box_sdk.py | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index a104e16..583c355 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -57,6 +57,12 @@ } BOX_CONFIG_FILE_PATH = 'data/jwt_config.json' +BOX_RETRIES = 3 +BOX_FOLDER_ROOT_ID = '0' +BOX_FOLDER_APP_ID = '84132126414' +BOX_LINK_OPEN_ACCESS = 'open' +BOX_LINK_ALLOW_DOWNLOAD = True +BOX_LINK_ALLOW_PREVIEW = True __all__ = [ @@ -77,5 +83,11 @@ 'CSV_HEADER_LYRICS', 'STR_CLEAN_TIMES', 'STR_CLEAN_DICT', - 'BOX_CONFIG_FILE_PATH' + 'BOX_CONFIG_FILE_PATH', + 'BOX_RETRIES', + 'BOX_FOLDER_ROOT_ID', + 'BOX_FOLDER_APP_ID', + 'BOX_LINK_OPEN_ACCESS', + 'BOX_LINK_ALLOW_DOWNLOAD', + 'BOX_LINK_ALLOW_PREVIEW' ] diff --git a/src/box_sdk.py b/src/box_sdk.py index 7dcfae9..62517bf 100644 --- a/src/box_sdk.py +++ b/src/box_sdk.py @@ -4,4 +4,42 @@ from src import * -box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) +def create_folder(folder_name): + box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) + for i in range(0, BOX_RETRIES): + try: + sub_folder = box_client.folder(BOX_FOLDER_ROOT_ID).create_subfolder(folder_name) + return sub_folder.id + except Exception as e: + if i == BOX_RETRIES - 1: + print(f'Error calling Box API creating the folder [{folder_name}] into folder root: {e}') + return None + + +def create_shared_link(folder_id): + box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) + for i in range(0, BOX_RETRIES): + try: + shared_link = box_client.folder(folder_id).get_shared_link( + access=BOX_LINK_OPEN_ACCESS, + allow_download=BOX_LINK_ALLOW_DOWNLOAD, + allow_preview=BOX_LINK_ALLOW_PREVIEW + ) + return shared_link + except Exception as e: + if i == BOX_RETRIES - 1: + print(f'Error calling Box API creating a shared link for folder [{folder_id}]: {e}') + return None + + +def upload_file(file_path): + box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) + for i in range(0, BOX_RETRIES): + try: + folder_id = '0' + file_name = file_path.split('/')[-1] + return box_client.folder(folder_id).upload(file_path, file_name) + except Exception as e: + if i == BOX_RETRIES - 1: + print(f'Error calling Box API uploading the file [{file_path}] to folder with id [{folder_id}]: {e}') + return None From 679eac91a815548e4fc662245a48fda3fa62f3dc Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 12:34:01 +0200 Subject: [PATCH 4/8] Implemented search, update and download for files --- src/box_sdk.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/box_sdk.py b/src/box_sdk.py index 62517bf..07014a7 100644 --- a/src/box_sdk.py +++ b/src/box_sdk.py @@ -32,14 +32,51 @@ def create_shared_link(folder_id): return None -def upload_file(file_path): +def search_file(folder_id, file_name): + box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) + for i in range(0, BOX_RETRIES): + try: + for result in box_client.folder(folder_id).get_items(): + if result.name == file_name: + return result.id + return None + except Exception as e: + if i == BOX_RETRIES - 1: + print(f'Error calling Box API searching files into folder [{folder_id}] with name [{file_name}]: {e}') + return None + + +def upload_file(folder_id, file_path): box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: - folder_id = '0' file_name = file_path.split('/')[-1] return box_client.folder(folder_id).upload(file_path, file_name) except Exception as e: if i == BOX_RETRIES - 1: print(f'Error calling Box API uploading the file [{file_path}] to folder with id [{folder_id}]: {e}') return None + + +def update_file(file_id, file_path): + box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) + for i in range(0, BOX_RETRIES): + try: + return box_client.file(file_id).update_contents(file_path) + except Exception as e: + if i == BOX_RETRIES - 1: + print(f'Error calling Box API updating the file [{file_id}] with file [{file_path}]: {e}') + return None + + +def download_file(file_id, file_path): + box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) + for i in range(0, BOX_RETRIES): + try: + with open(file_path, 'wb') as file: + box_client.file(file_id).download_to(file) + return True + except Exception as e: + if i == BOX_RETRIES - 1: + print(f'Error calling Box API downloading the file [{file_id}] to file [{file_path}]: {e}') + return None From 11c5f0df91ca7489e837cb8367c87181431abd9d Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 12:50:13 +0200 Subject: [PATCH 5/8] Returned file identifier for uploading and updating --- src/box_sdk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/box_sdk.py b/src/box_sdk.py index 07014a7..993a906 100644 --- a/src/box_sdk.py +++ b/src/box_sdk.py @@ -51,7 +51,7 @@ def upload_file(folder_id, file_path): for i in range(0, BOX_RETRIES): try: file_name = file_path.split('/')[-1] - return box_client.folder(folder_id).upload(file_path, file_name) + return box_client.folder(folder_id).upload(file_path, file_name).id except Exception as e: if i == BOX_RETRIES - 1: print(f'Error calling Box API uploading the file [{file_path}] to folder with id [{folder_id}]: {e}') @@ -62,7 +62,7 @@ def update_file(file_id, file_path): box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: - return box_client.file(file_id).update_contents(file_path) + return box_client.file(file_id).update_contents(file_path).id except Exception as e: if i == BOX_RETRIES - 1: print(f'Error calling Box API updating the file [{file_id}] with file [{file_path}]: {e}') From 4ff112daa92ef6082df3b24f1151283bf5cf71d9 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 12:51:04 +0200 Subject: [PATCH 6/8] Implemented Box logic into the main function --- src/__main__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/__main__.py b/src/__main__.py index 5bda890..1310e52 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -1,11 +1,16 @@ from tqdm import tqdm from src import * -from src import azlyrics, csv_parser +from src import azlyrics, csv_parser, box_sdk def scrape(): for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)): + csv_file_name = f'{CSV_FILE}_{artist_letter}.csv' + file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1]) + if file_id: + box_sdk.download_file(file_id, csv_file_name) + artist_url_list = azlyrics.get_artist_url_list(artist_letter) for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)): song_url_list = azlyrics.get_song_url_list(artist_url) @@ -13,6 +18,10 @@ def scrape(): if not csv_parser.exists_song(artist_letter, artist_url, song_url): song_lyrics = azlyrics.get_song_lyrics(song_url) csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter) + if file_id: + file_id = box_sdk.update_file(file_id, csv_file_name) + else: + file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name) if __name__ == '__main__': From 1ddabab55f38fb7e1bbd6729b980c290c9fdc0fa Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 12:58:28 +0200 Subject: [PATCH 7/8] Removed the local version of the file and added some comments --- src/__main__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/__main__.py b/src/__main__.py index 1310e52..9e8e2cc 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -1,3 +1,5 @@ +import os + from tqdm import tqdm from src import * @@ -6,11 +8,13 @@ def scrape(): for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)): + # Downloads file if it is available on Box folder. csv_file_name = f'{CSV_FILE}_{artist_letter}.csv' file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1]) if file_id: box_sdk.download_file(file_id, csv_file_name) + # Iterates over all artists with the given letter. artist_url_list = azlyrics.get_artist_url_list(artist_letter) for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)): song_url_list = azlyrics.get_song_url_list(artist_url) @@ -18,11 +22,15 @@ def scrape(): if not csv_parser.exists_song(artist_letter, artist_url, song_url): song_lyrics = azlyrics.get_song_lyrics(song_url) csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter) + # Uploads or updates the CSV on Box per every artist. if file_id: file_id = box_sdk.update_file(file_id, csv_file_name) else: file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name) + # Removes the local version of the CSV for saving storage. + os.remove(csv_file_name) + if __name__ == '__main__': scrape() From 72165b84c7747a768267e488ea03d71a33c70371 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 13:01:21 +0200 Subject: [PATCH 8/8] Added small RTM to Box API --- src/__init__.py | 2 ++ src/box_sdk.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/src/__init__.py b/src/__init__.py index 583c355..8f1f46c 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -58,6 +58,7 @@ BOX_CONFIG_FILE_PATH = 'data/jwt_config.json' BOX_RETRIES = 3 +BOX_RTM = 3 BOX_FOLDER_ROOT_ID = '0' BOX_FOLDER_APP_ID = '84132126414' BOX_LINK_OPEN_ACCESS = 'open' @@ -85,6 +86,7 @@ 'STR_CLEAN_DICT', 'BOX_CONFIG_FILE_PATH', 'BOX_RETRIES', + 'BOX_RTM', 'BOX_FOLDER_ROOT_ID', 'BOX_FOLDER_APP_ID', 'BOX_LINK_OPEN_ACCESS', diff --git a/src/box_sdk.py b/src/box_sdk.py index 993a906..cd8e7c5 100644 --- a/src/box_sdk.py +++ b/src/box_sdk.py @@ -1,3 +1,5 @@ +import time + from boxsdk import JWTAuth from boxsdk import Client @@ -11,6 +13,7 @@ def create_folder(folder_name): sub_folder = box_client.folder(BOX_FOLDER_ROOT_ID).create_subfolder(folder_name) return sub_folder.id except Exception as e: + time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API creating the folder [{folder_name}] into folder root: {e}') return None @@ -27,6 +30,7 @@ def create_shared_link(folder_id): ) return shared_link except Exception as e: + time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API creating a shared link for folder [{folder_id}]: {e}') return None @@ -41,6 +45,7 @@ def search_file(folder_id, file_name): return result.id return None except Exception as e: + time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API searching files into folder [{folder_id}] with name [{file_name}]: {e}') return None @@ -53,6 +58,7 @@ def upload_file(folder_id, file_path): file_name = file_path.split('/')[-1] return box_client.folder(folder_id).upload(file_path, file_name).id except Exception as e: + time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API uploading the file [{file_path}] to folder with id [{folder_id}]: {e}') return None @@ -64,6 +70,7 @@ def update_file(file_id, file_path): try: return box_client.file(file_id).update_contents(file_path).id except Exception as e: + time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API updating the file [{file_id}] with file [{file_path}]: {e}') return None @@ -77,6 +84,7 @@ def download_file(file_id, file_path): box_client.file(file_id).download_to(file) return True except Exception as e: + time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API downloading the file [{file_id}] to file [{file_path}]: {e}') return None