
Commit

Merge pull request #3 from AlbertSuarez/tor_integration
Tor integration
AlbertSuarez authored Aug 20, 2019
2 parents 9be29bf + e4b0b76 commit 64341a3
Showing 6 changed files with 64 additions and 45 deletions.
25 changes: 21 additions & 4 deletions README.md
@@ -17,9 +17,11 @@ This project is using Python3. All these requirements have been specified in the

1. [Requests](https://2.python-requests.org/en/master/): used for retrieving the HTML content of a website.
2. [BeautifulSoup](https://pypi.org/project/beautifulsoup4/): used for scraping an HTML content.
3. [Tqdm](https://tqdm.github.io/): used for having cool and beautiful progress bars.
4. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
5. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
3. [Tor](https://2019.www.torproject.org/docs/debian.html.en): used for anonymizing requests by routing them through different IPs.
4. [Stem](https://stem.torproject.org/): used for authenticating with the Tor control port and requesting a new IP for every request.
5. [Fake User-Agent](https://pypi.org/project/fake-useragent/): used for sending a random User-Agent header with every request.
6. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
7. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.

## Recommendations

@@ -39,7 +41,22 @@ To run this script, please execute the following from the root directory:

3. Move [JWT configuration](#jwt-configuration) file from Box API

4. Run the script
4. Install [Tor](https://2019.www.torproject.org/docs/debian.html.en)

5. Configure Tor IP renewal by editing the `/etc/tor/torrc` file

```
ControlPort 9051
CookieAuthentication 1
```

6. Restart the Tor service

```bash
sudo service tor restart
```

7. Run the script

```bash
python3 -m src
```
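
Once steps 4–6 are done, the setup can be sanity-checked from Python before running the scraper. This is a minimal sketch, not part of the repository: it assumes Tor's defaults (SOCKS on 127.0.0.1:9050, ControlPort 9051 as configured above) and uses httpbin.org/ip only as an arbitrary "what is my IP" endpoint.

```python
# Minimal sketch (not in the repo): confirm Tor answers on 9050/9051 and that
# a NEWNYM signal yields a new exit IP. Assumes the torrc settings from step 5.
import time

import requests
from stem import Signal
from stem.control import Controller

PROXIES = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}
CHECK_URL = 'https://httpbin.org/ip'  # any IP-echo endpoint works here


def current_ip():
    # Route the request through Tor's SOCKS port and read back the exit IP.
    return requests.get(CHECK_URL, proxies=PROXIES, timeout=60).json()['origin']


ip_before = current_ip()
with Controller.from_port(port=9051) as controller:
    controller.authenticate()         # cookie auth, enabled by CookieAuthentication 1
    controller.signal(Signal.NEWNYM)  # ask Tor to build a fresh circuit
time.sleep(10)                        # Tor rate-limits NEWNYM; give it a moment
ip_after = current_ip()
print(f'{ip_before} -> {ip_after}')
```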
10 changes: 6 additions & 4 deletions requirements.lock
@@ -1,21 +1,23 @@
requests==2.22.0
beautifulsoup4==4.7.1
tqdm==4.32.2
beautifulsoup4==4.8.0
Unidecode==1.1.1
boxsdk==2.5.0
stem==1.7.1
fake-useragent==0.1.11
## The following requirements were added by pip freeze:
asn1crypto==0.24.0
attrs==19.1.0
certifi==2019.6.16
cffi==1.12.3
chardet==3.0.4
cryptography==2.7
dropbox==9.4.0
idna==2.8
pycparser==2.19
PyJWT==1.7.1
pyOpenSSL==19.0.0
PySocks==1.7.0
requests-toolbelt==0.9.1
six==1.12.0
soupsieve==1.9.2
soupsieve==1.9.3
urllib3==1.25.3
wrapt==1.11.2
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,5 +1,6 @@
requests
requests[socks,security]
beautifulsoup4
tqdm
Unidecode
boxsdk[jwt]
boxsdk[jwt]
stem
fake_useragent
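
A note on the `requests[socks,security]` line: the `socks` extra is what pulls PySocks into the lock file above, and without it `requests` cannot talk to Tor's SOCKS port. A quick hedged check (the URL is just an example, and Tor is assumed to be listening on its default port):

```python
# Sketch: verify SOCKS support is installed. Assumes Tor is listening on 127.0.0.1:9050.
import requests

proxies = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}
try:
    print(requests.get('https://check.torproject.org/', proxies=proxies, timeout=60).status_code)
except requests.exceptions.InvalidSchema:
    # requests raises InvalidSchema when PySocks is missing,
    # i.e. requests was installed without the [socks] extra.
    print('SOCKS support missing: reinstall with requests[socks]')
```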
30 changes: 8 additions & 22 deletions src/__init__.py
@@ -6,25 +6,12 @@
]

# Scrapping
BASE = 'Mozilla/5.0'
SCRAPE_RTD_MINIMUM = 4
SCRAPE_RTD_MAXIMUM = 6
SCRAPE_USER_AGENT_USE_RANDOM = False
SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
f'Chrome/75.0.3770.100 Safari/537.36'
SCRAPE_USER_AGENT_LIST = [
f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
f'{BASE} (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
f'{BASE} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
f'{BASE} (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
f'{BASE} (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
]
SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
SCRAPE_RTD_MINIMUM = 15
SCRAPE_RTD_MAXIMUM = 60
SCRAPE_RETRIES_AMOUNT = 3
SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
SCRAPE_RTD_ERROR_MINIMUM = 150
SCRAPE_RTD_ERROR_MAXIMUM = 300

# CSV
CSV_FILE = 'data/azlyrics_lyrics'
@@ -74,13 +61,12 @@
__all__ = [
'AZ_LYRICS_BASE_URL',
'AZ_LYRICS_ARTIST_LETTER_LIST',
'SCRAPE_PROXY',
'SCRAPE_RTD_MINIMUM',
'SCRAPE_RTD_MAXIMUM',
'SCRAPE_USER_AGENT_USE_RANDOM',
'SCRAPE_USER_AGENT',
'SCRAPE_USER_AGENT_LIST',
'SCRAPE_RETRIES_AMOUNT',
'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
'SCRAPE_RTD_ERROR_MINIMUM',
'SCRAPE_RTD_ERROR_MAXIMUM',
'CSV_FILE',
'CSV_HEADER_ARTIST_NAME',
'CSV_HEADER_ARTIST_URL',
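
For context, the new timing constants replace the fixed `SCRAPE_SLEEP_TIME_BETWEEN_RETRIES` with two random delay windows: 15–60 s between normal requests and 150–300 s after a failed attempt. A small sketch of how they are meant to be consumed (the `random_delay` helper is illustrative only; the actual code inlines these calls in `src/azlyrics.py`):

```python
# Illustrative helper, not part of the repo: shows how the two delay windows are used.
import random
import time

from src import (SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM,
                 SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM)


def random_delay(after_error=False):
    # Normal requests wait a random 15-60 s; after a failed attempt the scraper
    # backs off much harder, 150-300 s, before retrying.
    low, high = ((SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM) if after_error
                 else (SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))
    time.sleep(random.uniform(low, high))
```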
21 changes: 15 additions & 6 deletions src/__main__.py
@@ -1,7 +1,5 @@
import os

from tqdm import tqdm

from src import *
from src import azlyrics, csv_parser, box_sdk

@@ -11,18 +9,28 @@ def scrape():
Processes the main function of the scraper.
:return: All AZLyrics scraped.
"""
for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
for artist_letter in AZ_LYRICS_ARTIST_LETTER_LIST:
# Logging stuff
print(f'[1] Processing [{artist_letter}] letter...')

# Downloads file if it is available on Box folder.
csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
print(f'[1] Searching for {csv_file_name} in Box folder...')
file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1])
if file_id:
print(f'[1] ---> File found with id [{file_id}]!')
box_sdk.download_file(file_id, csv_file_name)

# Iterates over all artists with the given letter.
print('[1] Scraping artists URLs...')
artist_url_list = azlyrics.get_artist_url_list(artist_letter)
for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
print(f'[1] ---> {len(artist_url_list)} artists found with letter [{artist_letter}]')
for artist_name, artist_url in artist_url_list:
print(f'[2] Scraping song URLs for {artist_name}...')
song_url_list = azlyrics.get_song_url_list(artist_url)
for song_name, song_url in tqdm(song_url_list, total=len(song_url_list)):
print(f'[2] ---> {len(song_url_list)} songs found for [{artist_name}]')
for song_name, song_url in song_url_list:
print(f'[3] Scraping lyrics for song: [{song_name}]')
if not csv_parser.exists_song(artist_letter, artist_url, song_url):
song_lyrics = azlyrics.get_song_lyrics(song_url)
csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)
@@ -33,7 +41,8 @@ def scrape():
file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)

# Removes the local version of the CSV for saving storage.
os.remove(csv_file_name)
if os.path.isfile(csv_file_name):
os.remove(csv_file_name)


if __name__ == '__main__':
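
The diff above turns each artist letter into a resumable unit of work. Condensed into a sketch (using the repo's own `box_sdk` helper names exactly as they appear in the diff; the artist/song scraping loop is elided):

```python
# Condensed sketch of the per-letter checkpoint flow from the diff above.
# box_sdk is the repo's Box helper module; the scraping loop is elided.
import os

from src import AZ_LYRICS_ARTIST_LETTER_LIST, BOX_FOLDER_APP_ID, CSV_FILE
from src import box_sdk

for letter in AZ_LYRICS_ARTIST_LETTER_LIST:
    csv_path = f'{CSV_FILE}_{letter}.csv'

    # Resume support: pull this letter's partial CSV from Box if one exists,
    # so csv_parser.exists_song() can later skip songs scraped in a previous run.
    file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_path.split('/')[-1])
    if file_id:
        box_sdk.download_file(file_id, csv_path)

    # ... scrape artists and songs for this letter, appending rows to csv_path ...

    # Push the updated CSV back to Box and drop the local copy to save disk space.
    box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_path)
    if os.path.isfile(csv_path):
        os.remove(csv_path)
```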
16 changes: 10 additions & 6 deletions src/azlyrics.py
@@ -3,6 +3,9 @@
import requests

from bs4 import BeautifulSoup
from stem import Signal
from stem.control import Controller
from fake_useragent import UserAgent

from src import *
from src import string_cleaner
@@ -17,19 +20,20 @@ def _get_html(url):
time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM)) # RTD
for i in range(0, SCRAPE_RETRIES_AMOUNT):
try:
if SCRAPE_USER_AGENT_USE_RANDOM:
headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
else:
headers = {'User-Agent': SCRAPE_USER_AGENT}
response = requests.get(url, headers=headers)
with Controller.from_port(port=9051) as c:
c.authenticate()
c.signal(Signal.NEWNYM)
proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
headers = {'User-Agent': UserAgent().random}
response = requests.get(url, proxies=proxies, headers=headers)
assert response.ok
html_content = response.content
return html_content
except Exception as e:
if i == SCRAPE_RETRIES_AMOUNT - 1:
print(f'Unable to retrieve HTML from {url}: {e}')
else:
time.sleep(SCRAPE_SLEEP_TIME_BETWEEN_RETRIES)
time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM))
return None


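
One design note on `SCRAPE_PROXY`: with the `socks5://` scheme, hostname resolution still happens on the local machine and only the TCP connection is tunnelled through Tor. If DNS lookups should also go through Tor, PySocks supports the `socks5h://` scheme. A hedged variant, not what `src/__init__.py` currently defines:

```python
# Suggestion/assumption, not the repo's current setting: socks5h:// resolves
# hostnames through the Tor proxy instead of the local resolver.
import requests

SCRAPE_PROXY = 'socks5h://127.0.0.1:9050'
proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
print(requests.get('https://check.torproject.org/', proxies=proxies, timeout=60).ok)
```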
