From 4ab5dcb57b7144ae820b3067843b74d1bc141c6e Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Fri, 9 Aug 2019 15:17:48 +0000
Subject: [PATCH 1/7] Added proxy configuration to requests for TOR

---
 requirements.lock | 3 +++
 requirements.txt  | 4 ++--
 src/__init__.py   | 4 +++-
 src/azlyrics.py   | 3 ++-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index e99dff3..95d978e 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -12,8 +12,11 @@ chardet==3.0.4
 cryptography==2.7
 dropbox==9.4.0
 idna==2.8
+pkg-resources==0.0.0
 pycparser==2.19
 PyJWT==1.7.1
+pyOpenSSL==19.0.0
+PySocks==1.7.0
 requests-toolbelt==0.9.1
 six==1.12.0
 soupsieve==1.9.2
diff --git a/requirements.txt b/requirements.txt
index b945804..c306342 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-requests
+requests[socks,security]
 beautifulsoup4
 tqdm
 Unidecode
-boxsdk[jwt]
\ No newline at end of file
+boxsdk[jwt]
diff --git a/src/__init__.py b/src/__init__.py
index 8f1f46c..72e2d6f 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -5,9 +5,10 @@
 ]
 
 BASE = 'Mozilla/5.0'
+SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
 SCRAPE_RTD_MINIMUM = 4
 SCRAPE_RTD_MAXIMUM = 6
-SCRAPE_USER_AGENT_USE_RANDOM = False
+SCRAPE_USER_AGENT_USE_RANDOM = True
 SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     f'Chrome/75.0.3770.100 Safari/537.36'
 SCRAPE_USER_AGENT_LIST = [
@@ -69,6 +70,7 @@
 __all__ = [
     'AZ_LYRICS_BASE_URL',
     'AZ_LYRICS_ARTIST_LETTER_LIST',
+    'SCRAPE_PROXY',
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
     'SCRAPE_USER_AGENT_USE_RANDOM',
diff --git a/src/azlyrics.py b/src/azlyrics.py
index eea33e7..0323e3f 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -16,7 +16,8 @@ def _get_html(url):
         headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
     else:
         headers = {'User-Agent': SCRAPE_USER_AGENT}
-    response = requests.get(url, headers=headers)
+    proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
+    response = requests.get(url, proxies=proxies, headers=headers)
     assert response.ok
     html_content = response.content
     return html_content
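
A side note for reviewers of this first patch (an observation, not part of the patch itself): with requests plus PySocks, a `socks5://` proxy URL still resolves DNS on the local machine, while `socks5h://` hands the hostname to the proxy so name resolution also happens inside Tor. A minimal sketch of the difference, assuming a local Tor daemon on 127.0.0.1:9050 and `requests[socks]` installed; the target URL is only illustrative:

```python
import requests

URL = 'https://example.org/'  # illustrative target

# DNS is resolved locally; only the TCP connection is tunnelled through Tor.
local_dns = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}

# The hostname is handed to the proxy, so DNS resolution also goes through Tor.
proxy_dns = {'http': 'socks5h://127.0.0.1:9050', 'https': 'socks5h://127.0.0.1:9050'}

for proxies in (local_dns, proxy_dns):
    print(requests.get(URL, proxies=proxies, timeout=30).status_code)
```

The patches below keep the `socks5://` form, which works, but `socks5h://` is the variant that also avoids DNS leaks.
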
From 0e9a48e60048e43d9a2e237d29f07a876bb95dc6 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 16:41:32 +0000
Subject: [PATCH 2/7] Added stem and fake_useragent strategy

---
 requirements.lock |  2 ++
 requirements.txt  |  2 ++
 src/__init__.py   | 18 ------------------
 src/azlyrics.py   | 12 ++++++++----
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index 95d978e..96874b8 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -3,6 +3,8 @@ beautifulsoup4==4.7.1
 tqdm==4.32.2
 Unidecode==1.1.1
 boxsdk==2.5.0
+stem==1.7.1
+fake-useragent==0.1.11
 ## The following requirements were added by pip freeze:
 asn1crypto==0.24.0
 attrs==19.1.0
diff --git a/requirements.txt b/requirements.txt
index c306342..e554279 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ beautifulsoup4
 tqdm
 Unidecode
 boxsdk[jwt]
+stem
+fake_useragent
diff --git a/src/__init__.py b/src/__init__.py
index e9b1506..5587734 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -6,24 +6,9 @@
 ]
 
 # Scrapping
-BASE = 'Mozilla/5.0'
 SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
 SCRAPE_RTD_MINIMUM = 4
 SCRAPE_RTD_MAXIMUM = 6
-SCRAPE_USER_AGENT_USE_RANDOM = True
-SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
-                    f'Chrome/75.0.3770.100 Safari/537.36'
-SCRAPE_USER_AGENT_LIST = [
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
-    f'{BASE} (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
-    f'{BASE} (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
-    f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
-]
 SCRAPE_RETRIES_AMOUNT = 3
 SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
 
@@ -78,9 +63,6 @@
     'SCRAPE_PROXY',
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
-    'SCRAPE_USER_AGENT_USE_RANDOM',
-    'SCRAPE_USER_AGENT',
-    'SCRAPE_USER_AGENT_LIST',
     'SCRAPE_RETRIES_AMOUNT',
     'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
     'CSV_FILE',
diff --git a/src/azlyrics.py b/src/azlyrics.py
index a6af038..19fbe23 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -3,6 +3,9 @@
 import requests
 from bs4 import BeautifulSoup
+from stem import Signal
+from stem.control import Controller
+from fake_useragent import UserAgent
 
 from src import *
 from src import string_cleaner
@@ -17,11 +20,12 @@ def _get_html(url):
     time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))  # RTD
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
-            if SCRAPE_USER_AGENT_USE_RANDOM:
-                headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
-            else:
-                headers = {'User-Agent': SCRAPE_USER_AGENT}
+            with Controller.from_port(port = 9051) as c:
+                c.authenticate()
+                c.signal(Signal.NEWNYM)
+
+            proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
+            headers = {'User-Agent': UserAgent().random}
             response = requests.get(url, proxies=proxies, headers=headers)
             assert response.ok
             html_content = response.content

From 902264fd6890ed4c598d814b410169895ae4b946 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:43:36 +0200
Subject: [PATCH 3/7] Removed useless (which makes an error) requirement

---
 requirements.lock | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.lock b/requirements.lock
index 96874b8..cc87b0f 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -14,7 +14,6 @@ chardet==3.0.4
 cryptography==2.7
 dropbox==9.4.0
 idna==2.8
-pkg-resources==0.0.0
 pycparser==2.19
 PyJWT==1.7.1
 pyOpenSSL==19.0.0

From 426b3d819966c51e4cfa8463cc49854412e777a1 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:43:52 +0200
Subject: [PATCH 4/7] Refactored scraping Python file

---
 src/azlyrics.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/azlyrics.py b/src/azlyrics.py
index 19fbe23..0319f7c 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -20,10 +20,9 @@ def _get_html(url):
     time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))  # RTD
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
-            with Controller.from_port(port = 9051) as c:
+            with Controller.from_port(port=9051) as c:
                 c.authenticate()
                 c.signal(Signal.NEWNYM)
-
             proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
             headers = {'User-Agent': UserAgent().random}
             response = requests.get(url, proxies=proxies, headers=headers)
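
Taken together, patches 2 through 4 converge on a single fetch path: rotate the Tor circuit via the control port, pick a random User-Agent, and send the request through the SOCKS proxy. The following is a condensed, self-contained sketch of that flow for reviewers; the real implementation lives in `src/azlyrics.py`, and the function name and the plain `raise_for_status` error handling here are illustrative. It assumes Tor is listening on ports 9050/9051 as configured in the README patch at the end of this series:

```python
import requests
from fake_useragent import UserAgent
from stem import Signal
from stem.control import Controller

SCRAPE_PROXY = 'socks5://127.0.0.1:9050'  # Tor SOCKS port; requires requests[socks] / PySocks


def fetch_through_tor(url):
    # Ask the Tor control port (9051) for a fresh circuit before the request.
    with Controller.from_port(port=9051) as controller:
        controller.authenticate()         # works with CookieAuthentication 1 in torrc
        controller.signal(Signal.NEWNYM)  # request a new identity / exit node

    proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
    headers = {'User-Agent': UserAgent().random}  # rotate the User-Agent per request
    response = requests.get(url, proxies=proxies, headers=headers)
    response.raise_for_status()
    return response.content
```

One behaviour worth keeping in mind: Tor rate-limits NEWNYM (roughly one accepted signal every ten seconds), so sending it before every request does not guarantee a different exit IP each time; the longer randomized delays introduced in the next patch effectively space the signals out anyway.
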
From 1f6665bd4e2c7e0538c5e22fa013e4b208edd79b Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:47:34 +0200
Subject: [PATCH 5/7] Increased scraping RTM

---
 src/__init__.py | 10 ++++++----
 src/azlyrics.py |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/__init__.py b/src/__init__.py
index 5587734..b425bae 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -7,10 +7,11 @@
 
 # Scrapping
 SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
-SCRAPE_RTD_MINIMUM = 4
-SCRAPE_RTD_MAXIMUM = 6
+SCRAPE_RTD_MINIMUM = 15
+SCRAPE_RTD_MAXIMUM = 60
 SCRAPE_RETRIES_AMOUNT = 3
-SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
+SCRAPE_RTD_ERROR_MINIMUM = 150
+SCRAPE_RTD_ERROR_MAXIMUM = 300
 
 # CSV
 CSV_FILE = 'data/azlyrics_lyrics'
@@ -64,7 +65,8 @@
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
     'SCRAPE_RETRIES_AMOUNT',
-    'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
+    'SCRAPE_RTD_ERROR_MINIMUM',
+    'SCRAPE_RTD_ERROR_MAXIMUM',
     'CSV_FILE',
     'CSV_HEADER_ARTIST_NAME',
     'CSV_HEADER_ARTIST_URL',
diff --git a/src/azlyrics.py b/src/azlyrics.py
index 0319f7c..32ed32c 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -33,7 +33,7 @@ def _get_html(url):
             if i == SCRAPE_RETRIES_AMOUNT - 1:
                 print(f'Unable to retrieve HTML from {url}: {e}')
             else:
-                time.sleep(SCRAPE_SLEEP_TIME_BETWEEN_RETRIES)
+                time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM))
 
     return None
 

From 75b2e6af03a4559ad07ddd7f7376a8c8b9c960e3 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:59:22 +0200
Subject: [PATCH 6/7] Removed tqdm logging and used prints

---
 requirements.lock |  6 ++----
 requirements.txt  |  3 +--
 src/__main__.py   | 21 +++++++++++++++------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index cc87b0f..b7e1ed1 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -1,6 +1,5 @@
 requests==2.22.0
-beautifulsoup4==4.7.1
-tqdm==4.32.2
+beautifulsoup4==4.8.0
 Unidecode==1.1.1
 boxsdk==2.5.0
 stem==1.7.1
@@ -12,7 +11,6 @@ certifi==2019.6.16
 cffi==1.12.3
 chardet==3.0.4
 cryptography==2.7
-dropbox==9.4.0
 idna==2.8
 pycparser==2.19
 PyJWT==1.7.1
@@ -20,6 +18,6 @@ pyOpenSSL==19.0.0
 PySocks==1.7.0
 requests-toolbelt==0.9.1
 six==1.12.0
-soupsieve==1.9.2
+soupsieve==1.9.3
 urllib3==1.25.3
 wrapt==1.11.2
diff --git a/requirements.txt b/requirements.txt
index e554279..4a8e43c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 requests[socks,security]
 beautifulsoup4
-tqdm
 Unidecode
 boxsdk[jwt]
 stem
-fake_useragent
+fake_useragent
\ No newline at end of file
diff --git a/src/__main__.py b/src/__main__.py
index 25d563d..6774915 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,7 +1,5 @@
 import os
 
-from tqdm import tqdm
-
 from src import *
 from src import azlyrics, csv_parser, box_sdk
 
@@ -11,18 +9,28 @@ def scrape():
     Processes the main function of the scraper.
     :return: All AZLyrics scraped.
     """
-    for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
+    for artist_letter in AZ_LYRICS_ARTIST_LETTER_LIST:
+        # Logging stuff
+        print(f'[1] Processing [{artist_letter}] letter...')
+
         # Downloads file if it is available on Box folder.
         csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
+        print(f'[1] Searching for {csv_file_name} in Box folder...')
         file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1])
         if file_id:
+            print(f'[1] ---> File found with id [{file_id}]!')
             box_sdk.download_file(file_id, csv_file_name)
 
         # Iterates over all artists with the given letter.
+        print('[1] Scraping artists URLs...')
         artist_url_list = azlyrics.get_artist_url_list(artist_letter)
-        for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
+        print(f'[1] ---> {len(artist_url_list)} artists found with letter [{artist_letter}]')
+        for artist_name, artist_url in artist_url_list:
+            print(f'[2] Scraping song URLs for {artist_name}...')
             song_url_list = azlyrics.get_song_url_list(artist_url)
-            for song_name, song_url in tqdm(song_url_list, total=len(song_url_list)):
+            print(f'[2] ---> {len(song_url_list)} songs found for {artist_name}')
+            for song_name, song_url in song_url_list:
+                print(f'[3] Scraping lyrics for song: [{song_name}]')
                 if not csv_parser.exists_song(artist_letter, artist_url, song_url):
                     song_lyrics = azlyrics.get_song_lyrics(song_url)
                     csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)
@@ -33,7 +41,8 @@
         file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)
 
         # Removes the local version of the CSV for saving storage.
-        os.remove(csv_file_name)
+        if os.path.isfile(csv_file_name):
+            os.remove(csv_file_name)
 
 
 if __name__ == '__main__':

From e4b0b76717aa609adb09e2ce501a596f280f9267 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 19:06:50 +0200
Subject: [PATCH 7/7] Improved README

---
 README.md | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f300aad..d6bdb21 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,11 @@ This project is using Python3. All these requirements have been specified in the
 
 1. [Requests](https://2.python-requests.org/en/master/): used for retrieving the HTML content of a website.
 2. [BeautifulSoup](https://pypi.org/project/beautifulsoup4/): used for scraping an HTML content.
-3. [Tqdm](https://tqdm.github.io/): used for having cool and beautiful progessbars.
-4. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
-5. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
+3. [Tor](https://2019.www.torproject.org/docs/debian.html.en): used for making requests anonymous by routing them through other IPs.
+4. [Stem](https://stem.torproject.org/): used for authenticating every request with a different IP.
+5. [Fake User-Agent](https://pypi.org/project/fake-useragent/): used for sending a random User-Agent on every request.
+6. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
+7. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
 
 ## Recommendations
 
@@ -39,7 +41,22 @@ To run this script, please execute the following from the root directory:
 
 3. Move [JWT configuration](#jwt-configuration) file from Box API
 
-4. Run the script
+4. Install [Tor](https://2019.www.torproject.org/docs/debian.html.en)
+
+5. Configure Tor IP renewal by editing the `/etc/tor/torrc` file
+
+    ```
+    ControlPort 9051
+    CookieAuthentication 1
+    ```
+
+6. Restart the Tor service
+
+    ```bash
+    sudo service tor restart
+    ```
+
+7. Run the script
 
     ```bash
    python3 -m src