From 4ab5dcb57b7144ae820b3067843b74d1bc141c6e Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Fri, 9 Aug 2019 15:17:48 +0000
Subject: [PATCH 1/7] Added proxy configuration to requests for TOR

---
 requirements.lock | 3 +++
 requirements.txt  | 4 ++--
 src/__init__.py   | 4 +++-
 src/azlyrics.py   | 3 ++-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index e99dff3..95d978e 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -12,8 +12,11 @@ chardet==3.0.4
 cryptography==2.7
 dropbox==9.4.0
 idna==2.8
+pkg-resources==0.0.0
 pycparser==2.19
 PyJWT==1.7.1
+pyOpenSSL==19.0.0
+PySocks==1.7.0
 requests-toolbelt==0.9.1
 six==1.12.0
 soupsieve==1.9.2
diff --git a/requirements.txt b/requirements.txt
index b945804..c306342 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-requests
+requests[socks,security]
 beautifulsoup4
 tqdm
 Unidecode
-boxsdk[jwt]
\ No newline at end of file
+boxsdk[jwt]
diff --git a/src/__init__.py b/src/__init__.py
index 8f1f46c..72e2d6f 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -5,9 +5,10 @@
 ]
 
 BASE = 'Mozilla/5.0'
+SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
 SCRAPE_RTD_MINIMUM = 4
 SCRAPE_RTD_MAXIMUM = 6
-SCRAPE_USER_AGENT_USE_RANDOM = False
+SCRAPE_USER_AGENT_USE_RANDOM = True
 SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                     f'Chrome/75.0.3770.100 Safari/537.36'
 SCRAPE_USER_AGENT_LIST = [
@@ -69,6 +70,7 @@
 __all__ = [
     'AZ_LYRICS_BASE_URL',
     'AZ_LYRICS_ARTIST_LETTER_LIST',
+    'SCRAPE_PROXY',
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
     'SCRAPE_USER_AGENT_USE_RANDOM',
diff --git a/src/azlyrics.py b/src/azlyrics.py
index eea33e7..0323e3f 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -16,7 +16,8 @@ def _get_html(url):
         headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
     else:
         headers = {'User-Agent': SCRAPE_USER_AGENT}
-    response = requests.get(url, headers=headers)
+    proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
+    response = requests.get(url, proxies=proxies, headers=headers)
     assert response.ok
     html_content = response.content
     return html_content
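
A side note for reviewers of this first patch (an observation, not part of the patch itself): with requests plus PySocks, a `socks5://` proxy URL still resolves DNS on the local machine, while `socks5h://` hands the hostname to the proxy so name resolution also happens inside Tor. A minimal sketch of the difference, assuming a local Tor daemon on 127.0.0.1:9050 and `requests[socks]` installed; the target URL is only illustrative:

```python
import requests

URL = 'https://example.org/'  # illustrative target

# DNS is resolved locally; only the TCP connection is tunnelled through Tor.
local_dns = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}

# The hostname is handed to the proxy, so DNS resolution also goes through Tor.
proxy_dns = {'http': 'socks5h://127.0.0.1:9050', 'https': 'socks5h://127.0.0.1:9050'}

for proxies in (local_dns, proxy_dns):
    print(requests.get(URL, proxies=proxies, timeout=30).status_code)
```

The patches below keep the `socks5://` form, which works, but `socks5h://` is the variant that also avoids DNS leaks.
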
From 0e9a48e60048e43d9a2e237d29f07a876bb95dc6 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 16:41:32 +0000
Subject: [PATCH 2/7] Added stem and fake_useragent strategy

---
 requirements.lock |  2 ++
 requirements.txt  |  2 ++
 src/__init__.py   | 18 ------------------
 src/azlyrics.py   | 12 ++++++++----
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index 95d978e..96874b8 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -3,6 +3,8 @@ beautifulsoup4==4.7.1
 tqdm==4.32.2
 Unidecode==1.1.1
 boxsdk==2.5.0
+stem==1.7.1
+fake-useragent==0.1.11
 ## The following requirements were added by pip freeze:
 asn1crypto==0.24.0
 attrs==19.1.0
diff --git a/requirements.txt b/requirements.txt
index c306342..e554279 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ beautifulsoup4
 tqdm
 Unidecode
 boxsdk[jwt]
+stem
+fake_useragent
diff --git a/src/__init__.py b/src/__init__.py
index e9b1506..5587734 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -6,24 +6,9 @@
 ]
 
 # Scrapping
-BASE = 'Mozilla/5.0'
 SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
 SCRAPE_RTD_MINIMUM = 4
 SCRAPE_RTD_MAXIMUM = 6
-SCRAPE_USER_AGENT_USE_RANDOM = True
-SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
-                    f'Chrome/75.0.3770.100 Safari/537.36'
-SCRAPE_USER_AGENT_LIST = [
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
-    f'{BASE} (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
-    f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
-    f'{BASE} (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
-    f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
-    f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
-]
 SCRAPE_RETRIES_AMOUNT = 3
 SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
 
@@ -78,9 +63,6 @@
     'SCRAPE_PROXY',
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
-    'SCRAPE_USER_AGENT_USE_RANDOM',
-    'SCRAPE_USER_AGENT',
-    'SCRAPE_USER_AGENT_LIST',
     'SCRAPE_RETRIES_AMOUNT',
     'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
     'CSV_FILE',
diff --git a/src/azlyrics.py b/src/azlyrics.py
index a6af038..19fbe23 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -3,6 +3,9 @@
 import requests
 from bs4 import BeautifulSoup
+from stem import Signal
+from stem.control import Controller
+from fake_useragent import UserAgent
 
 from src import *
 from src import string_cleaner
@@ -17,11 +20,12 @@ def _get_html(url):
     time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))  # RTD
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
-            if SCRAPE_USER_AGENT_USE_RANDOM:
-                headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
-            else:
-                headers = {'User-Agent': SCRAPE_USER_AGENT}
+            with Controller.from_port(port = 9051) as c:
+                c.authenticate()
+                c.signal(Signal.NEWNYM)
+
+            proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
+            headers = {'User-Agent': UserAgent().random}
             response = requests.get(url, proxies=proxies, headers=headers)
             assert response.ok
             html_content = response.content

From 902264fd6890ed4c598d814b410169895ae4b946 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:43:36 +0200
Subject: [PATCH 3/7] Removed useless (which makes an error) requirement

---
 requirements.lock | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.lock b/requirements.lock
index 96874b8..cc87b0f 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -14,7 +14,6 @@ chardet==3.0.4
 cryptography==2.7
 dropbox==9.4.0
 idna==2.8
-pkg-resources==0.0.0
 pycparser==2.19
 PyJWT==1.7.1
 pyOpenSSL==19.0.0

From 426b3d819966c51e4cfa8463cc49854412e777a1 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:43:52 +0200
Subject: [PATCH 4/7] Refactored scraping Python file

---
 src/azlyrics.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/azlyrics.py b/src/azlyrics.py
index 19fbe23..0319f7c 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -20,10 +20,9 @@ def _get_html(url):
     time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))  # RTD
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
-            with Controller.from_port(port = 9051) as c:
+            with Controller.from_port(port=9051) as c:
                 c.authenticate()
                 c.signal(Signal.NEWNYM)
-
             proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
             headers = {'User-Agent': UserAgent().random}
             response = requests.get(url, proxies=proxies, headers=headers)
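
Taken together, patches 2 through 4 converge on a single fetch path: rotate the Tor circuit via the control port, pick a random User-Agent, and send the request through the SOCKS proxy. The following is a condensed, self-contained sketch of that flow for reviewers; the real implementation lives in `src/azlyrics.py`, and the function name and the plain `raise_for_status` error handling here are illustrative. It assumes Tor is listening on ports 9050/9051 as configured in the README patch at the end of this series:

```python
import requests
from fake_useragent import UserAgent
from stem import Signal
from stem.control import Controller

SCRAPE_PROXY = 'socks5://127.0.0.1:9050'  # Tor SOCKS port; requires requests[socks] / PySocks


def fetch_through_tor(url):
    # Ask the Tor control port (9051) for a fresh circuit before the request.
    with Controller.from_port(port=9051) as controller:
        controller.authenticate()         # works with CookieAuthentication 1 in torrc
        controller.signal(Signal.NEWNYM)  # request a new identity / exit node

    proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
    headers = {'User-Agent': UserAgent().random}  # rotate the User-Agent per request
    response = requests.get(url, proxies=proxies, headers=headers)
    response.raise_for_status()
    return response.content
```

One behaviour worth keeping in mind: Tor rate-limits NEWNYM (roughly one accepted signal every ten seconds), so sending it before every request does not guarantee a different exit IP each time; the longer randomized delays introduced in the next patch effectively space the signals out anyway.
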
From 1f6665bd4e2c7e0538c5e22fa013e4b208edd79b Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:47:34 +0200
Subject: [PATCH 5/7] Increased scraping RTM

---
 src/__init__.py | 10 ++++++----
 src/azlyrics.py |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/__init__.py b/src/__init__.py
index 5587734..b425bae 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -7,10 +7,11 @@
 
 # Scrapping
 SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
-SCRAPE_RTD_MINIMUM = 4
-SCRAPE_RTD_MAXIMUM = 6
+SCRAPE_RTD_MINIMUM = 15
+SCRAPE_RTD_MAXIMUM = 60
 SCRAPE_RETRIES_AMOUNT = 3
-SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
+SCRAPE_RTD_ERROR_MINIMUM = 150
+SCRAPE_RTD_ERROR_MAXIMUM = 300
 
 # CSV
 CSV_FILE = 'data/azlyrics_lyrics'
@@ -64,7 +65,8 @@
     'SCRAPE_RTD_MINIMUM',
     'SCRAPE_RTD_MAXIMUM',
     'SCRAPE_RETRIES_AMOUNT',
-    'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
+    'SCRAPE_RTD_ERROR_MINIMUM',
+    'SCRAPE_RTD_ERROR_MAXIMUM',
     'CSV_FILE',
     'CSV_HEADER_ARTIST_NAME',
     'CSV_HEADER_ARTIST_URL',
diff --git a/src/azlyrics.py b/src/azlyrics.py
index 0319f7c..32ed32c 100644
--- a/src/azlyrics.py
+++ b/src/azlyrics.py
@@ -33,7 +33,7 @@ def _get_html(url):
             if i == SCRAPE_RETRIES_AMOUNT - 1:
                 print(f'Unable to retrieve HTML from {url}: {e}')
             else:
-                time.sleep(SCRAPE_SLEEP_TIME_BETWEEN_RETRIES)
+                time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM))
 
     return None
 

From 75b2e6af03a4559ad07ddd7f7376a8c8b9c960e3 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 18:59:22 +0200
Subject: [PATCH 6/7] Removed tqdm logging and used prints

---
 requirements.lock |  6 ++----
 requirements.txt  |  3 +--
 src/__main__.py   | 21 +++++++++++++++------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index cc87b0f..b7e1ed1 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -1,6 +1,5 @@
 requests==2.22.0
-beautifulsoup4==4.7.1
-tqdm==4.32.2
+beautifulsoup4==4.8.0
 Unidecode==1.1.1
 boxsdk==2.5.0
 stem==1.7.1
@@ -12,7 +11,6 @@ certifi==2019.6.16
 cffi==1.12.3
 chardet==3.0.4
 cryptography==2.7
-dropbox==9.4.0
 idna==2.8
 pycparser==2.19
 PyJWT==1.7.1
@@ -20,6 +18,6 @@ pyOpenSSL==19.0.0
 PySocks==1.7.0
 requests-toolbelt==0.9.1
 six==1.12.0
-soupsieve==1.9.2
+soupsieve==1.9.3
 urllib3==1.25.3
 wrapt==1.11.2
diff --git a/requirements.txt b/requirements.txt
index e554279..4a8e43c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 requests[socks,security]
 beautifulsoup4
-tqdm
 Unidecode
 boxsdk[jwt]
 stem
-fake_useragent
+fake_useragent
\ No newline at end of file
diff --git a/src/__main__.py b/src/__main__.py
index 25d563d..6774915 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,7 +1,5 @@
 import os
 
-from tqdm import tqdm
-
 from src import *
 from src import azlyrics, csv_parser, box_sdk
 
@@ -11,18 +9,28 @@ def scrape():
     Processes the main function of the scraper.
     :return: All AZLyrics scraped.
     """
-    for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
+    for artist_letter in AZ_LYRICS_ARTIST_LETTER_LIST:
+        # Logging stuff
+        print(f'[1] Processing [{artist_letter}] letter...')
+
         # Downloads file if it is available on Box folder.
         csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
+        print(f'[1] Searching for {csv_file_name} in Box folder...')
         file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1])
         if file_id:
+            print(f'[1] ---> File found with id [{file_id}]!')
             box_sdk.download_file(file_id, csv_file_name)
 
         # Iterates over all artists with the given letter.
+        print('[1] Scraping artists URLs...')
         artist_url_list = azlyrics.get_artist_url_list(artist_letter)
-        for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
+        print(f'[1] ---> {len(artist_url_list)} artists found with letter [{artist_letter}]')
+        for artist_name, artist_url in artist_url_list:
+            print(f'[2] Scraping song URLs for {artist_name}...')
             song_url_list = azlyrics.get_song_url_list(artist_url)
-            for song_name, song_url in tqdm(song_url_list, total=len(song_url_list)):
+            print(f'[2] ---> {len(song_url_list)} songs found for {artist_name}')
+            for song_name, song_url in song_url_list:
+                print(f'[3] Scraping lyrics for song: [{song_name}]')
                 if not csv_parser.exists_song(artist_letter, artist_url, song_url):
                     song_lyrics = azlyrics.get_song_lyrics(song_url)
                     csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)
@@ -33,7 +41,8 @@
         file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)
 
         # Removes the local version of the CSV for saving storage.
-        os.remove(csv_file_name)
+        if os.path.isfile(csv_file_name):
+            os.remove(csv_file_name)
 
 
 if __name__ == '__main__':

From e4b0b76717aa609adb09e2ce501a596f280f9267 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Tue, 20 Aug 2019 19:06:50 +0200
Subject: [PATCH 7/7] Improved README

---
 README.md | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f300aad..d6bdb21 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,11 @@ This project is using Python3. All these requirements have been specified in the
 
 1. [Requests](https://2.python-requests.org/en/master/): used for retrieving the HTML content of a website.
 2. [BeautifulSoup](https://pypi.org/project/beautifulsoup4/): used for scraping an HTML content.
-3. [Tqdm](https://tqdm.github.io/): used for having cool and beautiful progessbars.
-4. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
-5. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
+3. [Tor](https://2019.www.torproject.org/docs/debian.html.en): used for making requests anonymous by routing them through other IPs.
+4. [Stem](https://stem.torproject.org/): used for authenticating every request with a different IP.
+5. [Fake User-Agent](https://pypi.org/project/fake-useragent/): used for sending a random User-Agent on every request.
+6. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
+7. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
 
 ## Recommendations
 
@@ -39,7 +41,22 @@ To run this script, please execute the following from the root directory:
 
 3. Move [JWT configuration](#jwt-configuration) file from Box API
 
-4. Run the script
+4. Install [Tor](https://2019.www.torproject.org/docs/debian.html.en)
+
+5. Configure Tor IP renewal by editing the `/etc/tor/torrc` file
+
+    ```
+    ControlPort 9051
+    CookieAuthentication 1
+    ```
+
+6. Restart the Tor service
+
+    ```bash
+    sudo service tor restart
+    ```
+
+7. Run the script
 
     ```bash
    python3 -m src