
Commit

Merge pull request #3 from AlbertSuarez/tor_integration
Tor integration
AlbertSuarez authored Aug 20, 2019
2 parents 9be29bf + e4b0b76 commit 64341a3
Showing 6 changed files with 64 additions and 45 deletions.
25 changes: 21 additions & 4 deletions README.md
@@ -17,9 +17,11 @@ This project is using Python3. All these requirements have been specified in the

1. [Requests](https://2.python-requests.org/en/master/): used for retrieving the HTML content of a website.
2. [BeautifulSoup](https://pypi.org/project/beautifulsoup4/): used for scraping an HTML content.
3. [Tqdm](https://tqdm.github.io/): used for having cool and beautiful progress bars.
4. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
5. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.
3. [Tor](https://2019.www.torproject.org/docs/debian.html.en): used for anonymizing requests by routing them through different IPs.
4. [Stem](https://stem.torproject.org/): used for authenticating with the Tor control port and requesting a new IP for every request.
5. [Fake User-Agent](https://pypi.org/project/fake-useragent/): used for sending a random User-Agent header with every request.
6. [Unidecode](https://pypi.org/project/Unidecode/): used for cleaning strings from weird characters.
7. [Box SDK](https://github.com/box/box-python-sdk): used for uploading/downloading files to/from Box Cloud Storage.

## Recommendations

@@ -39,7 +41,22 @@ To run this script, please execute the following from the root directory:

3. Move [JWT configuration](#jwt-configuration) file from Box API

4. Run the script
4. Install [Tor](https://2019.www.torproject.org/docs/debian.html.en)

5. Configure Tor IP renewal by editing the `/etc/tor/torrc` file

```
ControlPort 9051
CookieAuthentication 1
```

6. Restart the Tor service

```bash
sudo service tor restart
```

7. Run the script

```bash
python3 -m src
```
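
Once steps 4–6 are done, the setup can be sanity-checked from Python before running the scraper. This is a minimal sketch, not part of the repository: it assumes Tor's defaults (SOCKS on 127.0.0.1:9050, ControlPort 9051 as configured above) and uses httpbin.org/ip only as an arbitrary "what is my IP" endpoint.

```python
# Minimal sketch (not in the repo): confirm Tor answers on 9050/9051 and that
# a NEWNYM signal yields a new exit IP. Assumes the torrc settings from step 5.
import time

import requests
from stem import Signal
from stem.control import Controller

PROXIES = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}
CHECK_URL = 'https://httpbin.org/ip'  # any IP-echo endpoint works here


def current_ip():
    # Route the request through Tor's SOCKS port and read back the exit IP.
    return requests.get(CHECK_URL, proxies=PROXIES, timeout=60).json()['origin']


ip_before = current_ip()
with Controller.from_port(port=9051) as controller:
    controller.authenticate()         # cookie auth, enabled by CookieAuthentication 1
    controller.signal(Signal.NEWNYM)  # ask Tor to build a fresh circuit
time.sleep(10)                        # Tor rate-limits NEWNYM; give it a moment
ip_after = current_ip()
print(f'{ip_before} -> {ip_after}')
```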
10 changes: 6 additions & 4 deletions requirements.lock
@@ -1,21 +1,23 @@
requests==2.22.0
beautifulsoup4==4.7.1
tqdm==4.32.2
beautifulsoup4==4.8.0
Unidecode==1.1.1
boxsdk==2.5.0
stem==1.7.1
fake-useragent==0.1.11
## The following requirements were added by pip freeze:
asn1crypto==0.24.0
attrs==19.1.0
certifi==2019.6.16
cffi==1.12.3
chardet==3.0.4
cryptography==2.7
dropbox==9.4.0
idna==2.8
pycparser==2.19
PyJWT==1.7.1
pyOpenSSL==19.0.0
PySocks==1.7.0
requests-toolbelt==0.9.1
six==1.12.0
soupsieve==1.9.2
soupsieve==1.9.3
urllib3==1.25.3
wrapt==1.11.2
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,5 +1,6 @@
requests
requests[socks,security]
beautifulsoup4
tqdm
Unidecode
boxsdk[jwt]
boxsdk[jwt]
stem
fake_useragent
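
A note on the `requests[socks,security]` line: the `socks` extra is what pulls PySocks into the lock file above, and without it `requests` cannot talk to Tor's SOCKS port. A quick hedged check (the URL is just an example, and Tor is assumed to be listening on its default port):

```python
# Sketch: verify SOCKS support is installed. Assumes Tor is listening on 127.0.0.1:9050.
import requests

proxies = {'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050'}
try:
    print(requests.get('https://check.torproject.org/', proxies=proxies, timeout=60).status_code)
except requests.exceptions.InvalidSchema:
    # requests raises InvalidSchema when PySocks is missing,
    # i.e. requests was installed without the [socks] extra.
    print('SOCKS support missing: reinstall with requests[socks]')
```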
30 changes: 8 additions & 22 deletions src/__init__.py
@@ -6,25 +6,12 @@
]

# Scrapping
BASE = 'Mozilla/5.0'
SCRAPE_RTD_MINIMUM = 4
SCRAPE_RTD_MAXIMUM = 6
SCRAPE_USER_AGENT_USE_RANDOM = False
SCRAPE_USER_AGENT = f'{BASE} (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
f'Chrome/75.0.3770.100 Safari/537.36'
SCRAPE_USER_AGENT_LIST = [
f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
f'{BASE} (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
f'{BASE} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
f'{BASE} (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
f'{BASE} (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
f'{BASE} (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
f'{BASE} (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
]
SCRAPE_PROXY = 'socks5://127.0.0.1:9050'
SCRAPE_RTD_MINIMUM = 15
SCRAPE_RTD_MAXIMUM = 60
SCRAPE_RETRIES_AMOUNT = 3
SCRAPE_SLEEP_TIME_BETWEEN_RETRIES = 10
SCRAPE_RTD_ERROR_MINIMUM = 150
SCRAPE_RTD_ERROR_MAXIMUM = 300

# CSV
CSV_FILE = 'data/azlyrics_lyrics'
@@ -74,13 +61,12 @@
__all__ = [
'AZ_LYRICS_BASE_URL',
'AZ_LYRICS_ARTIST_LETTER_LIST',
'SCRAPE_PROXY',
'SCRAPE_RTD_MINIMUM',
'SCRAPE_RTD_MAXIMUM',
'SCRAPE_USER_AGENT_USE_RANDOM',
'SCRAPE_USER_AGENT',
'SCRAPE_USER_AGENT_LIST',
'SCRAPE_RETRIES_AMOUNT',
'SCRAPE_SLEEP_TIME_BETWEEN_RETRIES',
'SCRAPE_RTD_ERROR_MINIMUM',
'SCRAPE_RTD_ERROR_MAXIMUM',
'CSV_FILE',
'CSV_HEADER_ARTIST_NAME',
'CSV_HEADER_ARTIST_URL',
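
For context, the new timing constants replace the fixed `SCRAPE_SLEEP_TIME_BETWEEN_RETRIES` with two random delay windows: 15–60 s between normal requests and 150–300 s after a failed attempt. A small sketch of how they are meant to be consumed (the `random_delay` helper is illustrative only; the actual code inlines these calls in `src/azlyrics.py`):

```python
# Illustrative helper, not part of the repo: shows how the two delay windows are used.
import random
import time

from src import (SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM,
                 SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM)


def random_delay(after_error=False):
    # Normal requests wait a random 15-60 s; after a failed attempt the scraper
    # backs off much harder, 150-300 s, before retrying.
    low, high = ((SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM) if after_error
                 else (SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM))
    time.sleep(random.uniform(low, high))
```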
21 changes: 15 additions & 6 deletions src/__main__.py
@@ -1,7 +1,5 @@
import os

from tqdm import tqdm

from src import *
from src import azlyrics, csv_parser, box_sdk

@@ -11,18 +9,28 @@ def scrape():
Processes the main function of the scraper.
:return: All AZLyrics scraped.
"""
for artist_letter in tqdm(AZ_LYRICS_ARTIST_LETTER_LIST, total=len(AZ_LYRICS_ARTIST_LETTER_LIST)):
for artist_letter in AZ_LYRICS_ARTIST_LETTER_LIST:
# Logging stuff
print(f'[1] Processing [{artist_letter}] letter...')

# Downloads file if it is available on Box folder.
csv_file_name = f'{CSV_FILE}_{artist_letter}.csv'
print(f'[1] Searching for {csv_file_name} in Box folder...')
file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_file_name.split('/')[-1])
if file_id:
print(f'[1] ---> File found with id [{file_id}]!')
box_sdk.download_file(file_id, csv_file_name)

# Iterates over all artists with the given letter.
print('[1] Scraping artists URLs...')
artist_url_list = azlyrics.get_artist_url_list(artist_letter)
for artist_name, artist_url in tqdm(artist_url_list, total=len(artist_url_list)):
print(f'[1] ---> {len(artist_url_list)} artists found with letter [{artist_letter}]')
for artist_name, artist_url in artist_url_list:
print(f'[2] Scraping song URLs for {artist_name}...')
song_url_list = azlyrics.get_song_url_list(artist_url)
for song_name, song_url in tqdm(song_url_list, total=len(song_url_list)):
print(f'[2] ---> {len(song_url_list)} songs found for [{artist_name}]')
for song_name, song_url in song_url_list:
print(f'[3] Scraping lyrics for song: [{song_name}]')
if not csv_parser.exists_song(artist_letter, artist_url, song_url):
song_lyrics = azlyrics.get_song_lyrics(song_url)
csv_parser.append_to_csv(artist_name, artist_url, song_name, song_url, song_lyrics, artist_letter)
@@ -33,7 +41,8 @@ def scrape():
file_id = box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_file_name)

# Removes the local version of the CSV for saving storage.
os.remove(csv_file_name)
if os.path.isfile(csv_file_name):
os.remove(csv_file_name)


if __name__ == '__main__':
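
The diff above turns each artist letter into a resumable unit of work. Condensed into a sketch (using the repo's own `box_sdk` helper names exactly as they appear in the diff; the artist/song scraping loop is elided):

```python
# Condensed sketch of the per-letter checkpoint flow from the diff above.
# box_sdk is the repo's Box helper module; the scraping loop is elided.
import os

from src import AZ_LYRICS_ARTIST_LETTER_LIST, BOX_FOLDER_APP_ID, CSV_FILE
from src import box_sdk

for letter in AZ_LYRICS_ARTIST_LETTER_LIST:
    csv_path = f'{CSV_FILE}_{letter}.csv'

    # Resume support: pull this letter's partial CSV from Box if one exists,
    # so csv_parser.exists_song() can later skip songs scraped in a previous run.
    file_id = box_sdk.search_file(BOX_FOLDER_APP_ID, csv_path.split('/')[-1])
    if file_id:
        box_sdk.download_file(file_id, csv_path)

    # ... scrape artists and songs for this letter, appending rows to csv_path ...

    # Push the updated CSV back to Box and drop the local copy to save disk space.
    box_sdk.upload_file(BOX_FOLDER_APP_ID, csv_path)
    if os.path.isfile(csv_path):
        os.remove(csv_path)
```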
16 changes: 10 additions & 6 deletions src/azlyrics.py
@@ -3,6 +3,9 @@
import requests

from bs4 import BeautifulSoup
from stem import Signal
from stem.control import Controller
from fake_useragent import UserAgent

from src import *
from src import string_cleaner
@@ -17,19 +20,20 @@ def _get_html(url):
time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM)) # RTD
for i in range(0, SCRAPE_RETRIES_AMOUNT):
try:
if SCRAPE_USER_AGENT_USE_RANDOM:
headers = {'User-Agent': random.choice(SCRAPE_USER_AGENT_LIST)}
else:
headers = {'User-Agent': SCRAPE_USER_AGENT}
response = requests.get(url, headers=headers)
with Controller.from_port(port=9051) as c:
c.authenticate()
c.signal(Signal.NEWNYM)
proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
headers = {'User-Agent': UserAgent().random}
response = requests.get(url, proxies=proxies, headers=headers)
assert response.ok
html_content = response.content
return html_content
except Exception as e:
if i == SCRAPE_RETRIES_AMOUNT - 1:
print(f'Unable to retrieve HTML from {url}: {e}')
else:
time.sleep(SCRAPE_SLEEP_TIME_BETWEEN_RETRIES)
time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM))
return None


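
One design note on `SCRAPE_PROXY`: with the `socks5://` scheme, hostname resolution still happens on the local machine and only the TCP connection is tunnelled through Tor. If DNS lookups should also go through Tor, PySocks supports the `socks5h://` scheme. A hedged variant, not what `src/__init__.py` currently defines:

```python
# Suggestion/assumption, not the repo's current setting: socks5h:// resolves
# hostnames through the Tor proxy instead of the local resolver.
import requests

SCRAPE_PROXY = 'socks5h://127.0.0.1:9050'
proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
print(requests.get('https://check.torproject.org/', proxies=proxies, timeout=60).ok)
```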
