Skip to content

Commit

Permalink
Documented code
Browse files Browse the repository at this point in the history
  • Loading branch information
AlbertSuarez committed Aug 9, 2019
1 parent a0c73cc commit fb87b00
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# AZLyrics website
AZ_LYRICS_BASE_URL = 'https://www.azlyrics.com'
# Artist first characters iterated by the scraper: 'a'..'z' followed by '19'
# (presumably AZLyrics' bucket for non-alphabetic names -- confirm).
AZ_LYRICS_ARTIST_LETTER_LIST = list('abcdefghijklmnopqrstuvwxyz') + ['19']

# Scraping
# NOTE(review): looks like a User-Agent value for outgoing requests -- usage is not visible here, confirm.
BASE = 'Mozilla/5.0'
# Bounds (in seconds) of the random delay `_get_html` sleeps before fetching a URL.
SCRAPE_RTD_MINIMUM = 4
SCRAPE_RTD_MAXIMUM = 6
Expand All @@ -24,13 +26,15 @@
# Upper bound of the retry loop in `_get_html`.
SCRAPE_RETRIES_AMOUNT = 3
# Presumably the seconds to wait between failed attempts -- usage is not visible here, confirm.
SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10

# CSV
# Path prefix of the per-letter output files; callers build f'{CSV_FILE}_{letter}.csv'.
CSV_FILE = 'data/azlyrics_lyrics'
# Column header names (presumably the header row of each CSV -- confirm against csv_parser).
CSV_HEADER_ARTIST_NAME = 'ARTIST_NAME'
CSV_HEADER_ARTIST_URL = 'ARTIST_URL'
CSV_HEADER_SONG_NAME = 'SONG_NAME'
CSV_HEADER_SONG_URL = 'SONG_URL'
CSV_HEADER_LYRICS = 'LYRICS'

# String cleaning
STR_CLEAN_TIMES = 3
STR_CLEAN_DICT = {
'\n\n': '\n',
Expand All @@ -56,6 +60,7 @@
':.': ':'
}

# Box integration
# JWT settings file passed to JWTAuth.from_settings_file when building the Box client.
BOX_CONFIG_FILE_PATH = 'data/jwt_config.json'
# Upper bound of the retry loop in the Box helper functions.
BOX_RETRIES = 3
# Seconds passed to time.sleep in the Box helpers' retry handling.
BOX_RTM = 3
Expand Down
4 changes: 4 additions & 0 deletions src/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@


def scrape():
"""
Processes the main function of the scraper.
:return: All AZLyrics scraped.
"""
for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
# Downloads file if it is available on Box folder.
csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
Expand Down
20 changes: 20 additions & 0 deletions src/azlyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@


def _get_html(url):
"""
Retrieves the HTML content of an Internet-accessible URL.
:param url: URL to retrieve.
:return: HTML content formatted as String, None if there was an error.
"""
time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM)) # RTD
for i in range(0, SCRAPE_RETRIES_AMOUNT):
try:
Expand All @@ -29,6 +34,11 @@ def _get_html(url):


def get_artist_url_list(artist_letter):
"""
Retrieves the AZLyrics website URLs for all the artists given its first character.
:param artist_letter: First character of an artist.
:return: List of pairs containing the artist name and its AZLyrics URL.
"""
artist_url_list = []

try:
Expand All @@ -50,6 +60,11 @@ def get_artist_url_list(artist_letter):


def get_song_url_list(artist_url):
"""
Retrieves the AZLyrics website URLs for all the songs from an artist AZLyrics URL.
:param artist_url: AZLyrics URL from a given artist.
:return: List of pairs containing the song name and its AZLyrics URL.
"""
song_url_list = []

try:
Expand All @@ -69,6 +84,11 @@ def get_song_url_list(artist_url):


def get_song_lyrics(song_url):
"""
Retrieves and cleans the lyrics of a song given its AZLyrics URL.
:param song_url: AZLyrics URL from a given song.
:return: Cleaned and formatted song lyrics.
"""
song_lyrics = ''

try:
Expand Down
36 changes: 35 additions & 1 deletion src/box_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@


def create_folder(folder_name):
"""
Creates a folder in the root folder given its name.
:param folder_name: Folder name to create.
:return: Folder identifier if the creation was successful, None otherwise.
"""
box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
for i in range(0, BOX_RETRIES):
try:
Expand All @@ -20,6 +25,11 @@ def create_folder(folder_name):


def create_shared_link(folder_id):
"""
Creates an Internet-accessible shared link for a folder given its identifier.
:param folder_id: Folder identifier.
:return: Shared link if the creation was successful, None otherwise.
"""
box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
for i in range(0, BOX_RETRIES):
try:
Expand All @@ -37,6 +47,12 @@ def create_shared_link(folder_id):


def search_file(folder_id, file_name):
"""
Finds a file in a folder given the folder identifier and a file name used as the query string.
:param folder_id: Folder identifier.
:param file_name: File name.
:return: File identifier if the file exists, None otherwise.
"""
box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
for i in range(0, BOX_RETRIES):
try:
Expand All @@ -52,6 +68,12 @@ def search_file(folder_id, file_name):


def upload_file(folder_id, file_path):
"""
Uploads a file (that must not exist in Box folder) into a folder given its path.
:param folder_id: Folder identifier.
:param file_path: File path.
:return: File identifier if the upload was successful, None otherwise.
"""
box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
for i in range(0, BOX_RETRIES):
try:
Expand All @@ -65,6 +87,12 @@ def upload_file(folder_id, file_path):


def update_file(file_id, file_path):
"""
Updates a file (that must exist in Box folder) given its identifier.
:param file_id: File identifier.
:param file_path: File path.
:return: File identifier if the update was successful, None otherwise.
"""
box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
for i in range(0, BOX_RETRIES):
try:
Expand All @@ -77,6 +105,12 @@ def update_file(file_id, file_path):


def download_file(file_id, file_path):
"""
Downloads a Box file given its identifier to a specific path.
:param file_id: File identifier.
:param file_path: File path.
:return: True if the download was successful, False otherwise.
"""
box_client = Client(JWTAuth.from_settings_file(BOX_CONFIG_FILE_PATH))
for i in range(0, BOX_RETRIES):
try:
Expand All @@ -87,4 +121,4 @@ def download_file(file_id, file_path):
time.sleep(BOX_RTM)
if i == BOX_RETRIES - 1:
print(f'Error calling Box API downloading the file [{file_id}] to file [{file_path}]: {e}')
return None
return False
17 changes: 17 additions & 0 deletions src/csv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@


def exists_song(csv_letter, artist_url, song_url):
"""
Checks if a song exists in a given CSV given the artist and song url.
:param csv_letter: CSV letter in order to identify which CSV to get.
:param artist_url: Artist AZLyrics URL.
:param song_url: Song AZLyrics URL.
:return: True if the song exists in the CSV, False otherwise.
"""
csv_file_name = f'{CSV_FILE}_{csv_letter}.csv'
exists_file = os.path.isfile(csv_file_name)
if exists_file:
Expand All @@ -17,6 +24,16 @@ def exists_song(csv_letter, artist_url, song_url):


def append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, csv_letter):
"""
Appends song information to the end of a CSV file, whether or not that file already exists.
:param artist_name: Artist name.
:param artist_url: Artist AZLyrics URL.
:param song_name: Song name.
:param song_url: Song AZLyrics URL.
:param song_lyrics: Song lyrics.
:param csv_letter: CSV letter for getting the CSV where to append.
:return: Song information appended.
"""
if song_lyrics:
csv_file_name = f'{CSV_FILE}_{csv_letter}.csv'
exists_file = os.path.isfile(csv_file_name)
Expand Down
15 changes: 15 additions & 0 deletions src/string_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,34 @@


def clean_url(url_str):
    """
    Normalizes a URL string.
    :param url_str: URL as a string.
    :return: The URL lower-cased, with leading and trailing whitespace removed.
    """
    return url_str.lower().strip()


def clean_name(name_str):
    """
    Normalizes a name (song or artist).
    :param name_str: Name as a string.
    :return: The name lower-cased, stripped of surrounding whitespace and passed through unidecode.
    """
    normalized = name_str.lower().strip()
    return unidecode.unidecode(normalized)


def clean_lyrics(lyrics_str):
"""
Cleans a given string where song lyrics are.
:param lyrics_str: String formatted lyrics.
:return: Cleaned string formatted lyrics.
"""
lyrics_str = lyrics_str.lower()
lyrics_str = lyrics_str.strip()
lyrics_str = unidecode.unidecode(lyrics_str)
Expand Down

0 comments on commit fb87b00

Please sign in to comment.