From fb87b00dbe03b0ba51cb6d2dc96325017c964ec6 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Fri, 9 Aug 2019 14:55:26 +0200 Subject: [PATCH] Documented code --- src/__init__.py | 5 +++++ src/__main__.py | 4 ++++ src/azlyrics.py | 20 ++++++++++++++++++++ src/box_sdk.py | 36 +++++++++++++++++++++++++++++++++++- src/csv_parser.py | 17 +++++++++++++++++ src/string_cleaner.py | 15 +++++++++++++++ 6 files changed, 96 insertions(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index 8f1f46c..0720a91 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,9 +1,11 @@ +# AZLyrics website AZ_LYRICS_BASE_URL = 'https://www.azlyrics.com' AZ_LYRICS_ARTIST_LETTER_LIST = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '19' ] +# Scraping BASE = 'Mozilla/5.0' SCRAPE_RTD_MINIMUM = 4 SCRAPE_RTD_MAXIMUM = 6 @@ -24,6 +26,7 @@ SCRAPE_RETRIES_AMOUNT = 3 SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10 +# CSV CSV_FILE = 'data/azlyrics_lyrics' CSV_HEADER_ARTIST_NAME = 'ARTIST_NAME' CSV_HEADER_ARTIST_URL = 'ARTIST_URL' @@ -31,6 +34,7 @@ CSV_HEADER_SONG_URL = 'SONG_URL' CSV_HEADER_LYRICS = 'LYRICS' +# String cleaning STR_CLEAN_TIMES = 3 STR_CLEAN_DICT = { '\n\n': '\n', @@ -56,6 +60,7 @@ ':.': ':' } +# Box integration BOX_CONFIG_FILE_PATH = 'data/jwt_config.json' BOX_RETRIES = 3 BOX_RTM = 3 diff --git a/src/__main__.py b/src/__main__.py index 9e8e2cc..25d563d 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -7,6 +7,10 @@ def scrape(): + """ + Processes the main function of the scraper. + :return: All AZLyrics scraped. + """ for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)): # Downloads file if it is available on Box folder. 
csv_file_name = f'{CSV_FILE}_{artist_letter}.csv' diff --git a/src/azlyrics.py b/src/azlyrics.py index eea33e7..a475c63 100644 --- a/src/azlyrics.py +++ b/src/azlyrics.py @@ -9,6 +9,11 @@ def _get_html(url): + """ + Retrieves the HTML content given an Internet-accessible URL. + :param url: URL to retrieve. + :return: HTML content formatted as String, None if there was an error. + """ time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM)) # RTD for i in range(0, SCRAPE_RETRIES_AMOUNT): try: @@ -29,6 +34,11 @@ def _get_html(url): def get_artist_url_list(artist_letter): + """ + Retrieves the AZLyrics website URLs for all the artists given their first character. + :param artist_letter: First character of the artist name. + :return: List of pairs containing the artist name and its AZLyrics URL. + """ artist_url_list = [] try: @@ -50,6 +60,11 @@ def get_artist_url_list(artist_letter): def get_song_url_list(artist_url): + """ + Retrieves the AZLyrics website URLs for all the songs from an artist AZLyrics URL. + :param artist_url: AZLyrics URL from a given artist. + :return: List of pairs containing the song name and its AZLyrics URL. + """ song_url_list = [] try: @@ -69,6 +84,11 @@ def get_song_url_list(artist_url): def get_song_lyrics(song_url): + """ + Retrieves and cleans the lyrics of a song given its AZLyrics URL. + :param song_url: AZLyrics URL from a given song. + :return: Cleaned and formatted song lyrics. + """ song_lyrics = '' try: diff --git a/src/box_sdk.py b/src/box_sdk.py index cd8e7c5..d49cca1 100644 --- a/src/box_sdk.py +++ b/src/box_sdk.py @@ -7,6 +7,11 @@ def create_folder(folder_name): + """ + Creates a folder in the root folder given its name. + :param folder_name: Folder name to create. + :return: Folder identifier if the creation was successful, None otherwise. 
+ """ box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: @@ -20,6 +25,11 @@ def create_folder(folder_name): def create_shared_link(folder_id): + """ + Creates an Internet-accessible shared link for a folder given its identifier. + :param folder_id: Folder identifier. + :return: Shared link if the creation was successful, None otherwise. + """ box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: @@ -37,6 +47,12 @@ def create_shared_link(folder_id): def search_file(folder_id, file_name): + """ + Finds a file in a folder given the folder identifier and the file name. + :param folder_id: Folder identifier. + :param file_name: File name. + :return: File identifier if the file exists, None otherwise. + """ box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: @@ -52,6 +68,12 @@ def search_file(folder_id, file_name): def upload_file(folder_id, file_path): + """ + Uploads a file (that must not exist in the Box folder) into a folder given the file path. + :param folder_id: Folder identifier. + :param file_path: File path. + :return: File identifier if the upload was successful, None otherwise. + """ box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: @@ -65,6 +87,12 @@ def upload_file(folder_id, file_path): def update_file(file_id, file_path): + """ + Updates a file (that must exist in the Box folder) given its identifier. + :param file_id: File identifier. + :param file_path: File path. + :return: File identifier if the update was successful, None otherwise. + """ box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: @@ -77,6 +105,12 @@ def update_file(file_id, file_path): def download_file(file_id, file_path): + """ + Downloads a Box file given its identifier to a specific path. + :param file_id: File identifier. 
+ :param file_path: File path. + :return: True if the download was successful, False otherwise. + """ box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH)) for i in range(0, BOX_RETRIES): try: @@ -87,4 +121,4 @@ def download_file(file_id, file_path): time.sleep(BOX_RTM) if i == BOX_RETRIES - 1: print(f'Error calling Box API downloading the file [{file_id}] to file [{file_path}]: {e}') - return None + return False diff --git a/src/csv_parser.py b/src/csv_parser.py index e429112..6d97271 100644 --- a/src/csv_parser.py +++ b/src/csv_parser.py @@ -5,6 +5,13 @@ def exists_song(csv_letter, artist_url, song_url): + """ + Checks if a song exists in a CSV given the artist and song URLs. + :param csv_letter: Letter identifying which CSV to read. + :param artist_url: Artist AZLyrics URL. + :param song_url: Song AZLyrics URL. + :return: True if the song exists in the CSV, False otherwise. + """ csv_file_name = f'{CSV_FILE}_{csv_letter}.csv' exists_file = os.path.isfile(csv_file_name) if exists_file: @@ -17,6 +24,16 @@ def exists_song(csv_letter, artist_url, song_url): def append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, csv_letter): + """ + Appends song information to the end of a CSV, whether or not the file already exists. + :param artist_name: Artist name. + :param artist_url: Artist AZLyrics URL. + :param song_name: Song name. + :param song_url: Song AZLyrics URL. + :param song_lyrics: Song lyrics. + :param csv_letter: Letter identifying the CSV to append to. + :return: Song information appended. + """ if song_lyrics: csv_file_name = f'{CSV_FILE}_{csv_letter}.csv' exists_file = os.path.isfile(csv_file_name) diff --git a/src/string_cleaner.py b/src/string_cleaner.py index 8f00ca9..c3e9171 100644 --- a/src/string_cleaner.py +++ b/src/string_cleaner.py @@ -5,12 +5,22 @@ def clean_url(url_str): + """ + Cleans a given URL. + :param url_str: String formatted URL. + :return: Cleaned string formatted URL. 
+ """ url_str = url_str.lower() url_str = url_str.strip() return url_str def clean_name(name_str): + """ + Cleans a given name (song or artist). + :param name_str: String formatted name. + :return: Cleaned string formatted name. + """ name_str = name_str.lower() name_str = name_str.strip() name_str = unidecode.unidecode(name_str) @@ -18,6 +28,11 @@ def clean_name(name_str): def clean_lyrics(lyrics_str): + """ + Cleans a given string containing song lyrics. + :param lyrics_str: String formatted lyrics. + :return: Cleaned string formatted lyrics. + """ lyrics_str = lyrics_str.lower() lyrics_str = lyrics_str.strip() lyrics_str = unidecode.unidecode(lyrics_str)