diff --git a/README.md b/README.md
index f300aad..d6bdb21 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,11 @@ This project is using Python3.
 All these requirements have been specified in the
 1. [Requests](https://2.python-requests.org/en/master/): used for retrieving the HTML content of a website.
 2. [BeautifulSoup](https://pypi.org/project/beautifulsoup4/): used for scraping an HTML content.
-3. [Tqdm](https://tqdm.github.io/): used for having cool and beautiful progessbars.
-4. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
-5. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
+3. [Tor](https://2019.www.torproject.org/docs/debian.html.en): used for making requests anonymously through a different IP each time.
+4. [Stem](https://stem.torproject.org/): used for authenticating against the Tor controller and renewing the IP for every request.
+5. [Fake User-Agent](https://pypi.org/project/fake-useragent/): used for sending a random User-Agent header with every request.
+6. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
+7. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
 
 ## Recommendations
 
@@ -39,7 +41,22 @@ To run this script, please execute the following from the root directory:
 
 3. Move [JWT configuration](#jwt-configuration) file from Box API
 
-4. Run the script
+4. Install [Tor](https://2019.www.torproject.org/docs/debian.html.en)
+
+5. Configure Tor IP renewal by editing the `/etc/tor/torrc` file
+
+   ```
+   ControlPort 9051
+   CookieAuthentication 1
+   ```
+
+6. Restart the Tor service
+
+   ```bash
+   sudo service tor restart
+   ```
+
+7. Run the script
 
    ```bash
    python3 -m src
diff --git a/requirements.lock b/requirements.lock
index e99dff3..b7e1ed1 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -1,8 +1,9 @@
 requests==2.22.0
-beautifulsoup4==4.7.1
-tqdm==4.32.2
+beautifulsoup4==4.8.0
 Unidecode==1.1.1
 boxsdk==2.5.0
+stem==1.7.1
+fake-useragent==0.1.11
 ## The following requirements were added by pip freeze:
 asn1crypto==0.24.0
 attrs==19.1.0
@@ -10,12 +11,13 @@ certifi==2019.6.16
 cffi==1.12.3
 chardet==3.0.4
 cryptography==2.7
-dropbox==9.4.0
 idna==2.8
 pycparser==2.19
 PyJWT==1.7.1
+pyOpenSSL==19.0.0
+PySocks==1.7.0
 requests-toolbelt==0.9.1
 six==1.12.0
-soupsieve==1.9.2
+soupsieve==1.9.3
 urllib3==1.25.3
 wrapt==1.11.2
diff --git a/requirements.txt b/requirements.txt
index b945804..4a8e43c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
-requests
+requests[socks,security]
 beautifulsoup4
-tqdm
 Unidecode
-boxsdk[jwt]
\ No newline at end of file
+boxsdk[jwt]
+stem
+fake_useragent
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
index 0720a91..b425bae 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -6,25 +6,12 @@
 ]
 
 # Scrapping
-BASE = 'Mozilla/5.0'
-SCRAPE_RTD_MINIMUM = 4
-SCRAPE_RTD_MAXIMUM = 6
-SCRAPE_USER_AGENT_USE_RANDOM = False
-SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
-                    f'Chrome/75.0.3770.100 Safari/537.36'
-SCRAPE_USER_AGENT_LIST = [
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
-    f'{BASE} (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
-    f'{BASE} (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
-    f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
-]
+SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
+SCRAPE_RTD_MINIMUM = 15
+SCRAPE_RTD_MAXIMUM = 60
 SCRAPE_RETRIES_AMOUNT = 3
-SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
+SCRAPE_RTD_ERROR_MINIMUM = 150
+SCRAPE_RTD_ERROR_MAXIMUM = 300
 
 # CSV
 CSV_FILE = 'data/azlyrics_lyrics'
@@ -74,13 +61,12 @@
 __all__ = [
     'AZ_LYRICS_BASE_URL',
     'AZ_LYRICS_ARTIST_LETTER_LIST',
+    'SCRAPE_PROXY',
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
-    'SCRAPE_USER_AGENT_USE_RANDOM',
-    'SCRAPE_USER_AGENT',
-    'SCRAPE_USER_AGENT_LIST',
     'SCRAPE_RETRIES_AMOUNT',
-    'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
+    'SCRAPE_RTD_ERROR_MINIMUM',
+    'SCRAPE_RTD_ERROR_MAXIMUM',
     'CSV_FILE',
     'CSV_HEADER_ARTIST_NAME',
     'CSV_HEADER_ARTIST_URL',
diff --git a/src/__main__.py b/src/__main__.py
index 25d563d..6774915 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,7 +1,5 @@
 import os
 
-from tqdm import tqdm
-
 from src import *
 from src import azlyrics, csv_parser, box_sdk
 
@@ -11,18 +9,28 @@ def scrape():
     Processes the main function of the scraper.
     :return: All AZLyrics scraped.
     """
-    for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
+    for artist_letter in AZ_LYRICS_ARTIST_LETTER_LIST:
+        # Logging stuff
+        print(f'[1] Processing [{artist_letter}] letter...')
+
         # Downloads file if it is available on Box folder.
         csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
+        print(f'[1] Searching for {csv_file_name} in Box folder...')
         file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1])
         if file_id:
+            print(f'[1] ---> File found with id [{file_id}]!')
             box_sdk.download_file(file_id, csv_file_name)
 
         # Iterates over all artists with the given letter.
+        print('[1] Scraping artists URLs...')
         artist_url_list = azlyrics.get_artist_url_list(artist_letter)
-        for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
+        print(f'[1] ---> {len(artist_url_list)} artists found with letter [{artist_letter}]')
+        for artist_name, artist_url in artist_url_list:
+            print(f'[2] Scraping song URLs for {artist_name}...')
             song_url_list = azlyrics.get_song_url_list(artist_url)
-            for song_name, song_url in tqdm(song_url_list, total=len(song_url_list)):
+            print(f'[2] ---> {len(song_url_list)} songs found for [{artist_name}]')
+            for song_name, song_url in song_url_list:
+                print(f'[3] Scraping lyrics for song: [{song_name}]')
                 if not csv_parser.exists_song(artist_letter, artist_url, song_url):
                     song_lyrics = azlyrics.get_song_lyrics(song_url)
                     csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)
@@ -33,7 +41,8 @@ def scrape():
         file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)
 
         # Removes the local version of the CSV for saving storage.
-        os.remove(csv_file_name)
+        if os.path.isfile(csv_file_name):
+            os.remove(csv_file_name)
 
 
 if __name__ == '__main__':
diff --git a/src/azlyrics.py b/src/azlyrics.py
index a475c63..32ed32c 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -3,6 +3,9 @@
 
 import requests
 from bs4 import BeautifulSoup
+from stem import Signal
+from stem.control import Controller
+from fake_useragent import UserAgent
 
 from src import *
 from src import string_cleaner
@@ -17,11 +20,12 @@ def _get_html(url):
     time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))  # RTD
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
-            if SCRAPE_USER_AGENT_USE_RANDOM:
-                headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
-            else:
-                headers = {'User-Agent': SCRAPE_USER_AGENT}
-            response = requests.get(url, headers=headers)
+            with Controller.from_port(port=9051) as c:
+                c.authenticate()
+                c.signal(Signal.NEWNYM)
+            proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
+            headers = {'User-Agent': UserAgent().random}
+            response = requests.get(url, proxies=proxies, headers=headers)
             assert response.ok
             html_content = response.content
             return html_content
@@ -29,7 +33,7 @@ def _get_html(url):
             if i == SCRAPE_RETRIES_AMOUNT - 1:
                 print(f'Unable to retrieve HTML from {url}: {e}')
             else:
-                time.sleep(SCRAPE_SLEEP_TIME_BETWEEN_RETRIES)
+                time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM))
     return None
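For reference, here is a minimal sketch of the request flow this change introduces: renew the Tor identity through Stem's control port, then send the request through the local SOCKS proxy with a random User-Agent. It assumes Tor is running with the `torrc` settings from the README (SOCKS proxy on port 9050, control port 9051 with cookie authentication); the `https://httpbin.org/ip` URL is only an illustrative way to confirm that the exit IP changes and is not used by the scraper itself.

```python
import requests
from fake_useragent import UserAgent
from stem import Signal
from stem.control import Controller

# Assumed from the README's torrc settings: SOCKS proxy on 9050, ControlPort 9051.
PROXIES = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}


def renew_tor_ip():
    # Asks the Tor controller for a new circuit, which usually means a new exit IP.
    with Controller.from_port(port=9051) as controller:
        controller.authenticate()  # Cookie authentication, per CookieAuthentication 1.
        controller.signal(Signal.NEWNYM)


def fetch(url):
    # Routes the request through Tor with a random User-Agent, mirroring _get_html().
    headers = {'User-Agent': UserAgent().random}
    return requests.get(url, proxies=PROXIES, headers=headers, timeout=30)


# Illustrative check: the reported origin IP should change between renewals.
print(fetch('https://httpbin.org/ip').json())
renew_tor_ip()
print(fetch('https://httpbin.org/ip').json())
```

Note that Tor rate-limits `NEWNYM` signals (roughly one every ten seconds), so renewing the identity on every request, as `_get_html()` now does, can occasionally reuse the previous circuit; the longer RTD sleeps added in `src/__init__.py` keep the request rate below that limit in practice.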