From a3c8f81d3c6ea32c0b89b45c3a100496f30dc516 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 13:24:59 +0100 Subject: [PATCH 01/20] Installed needed requirements for a secure scraping --- requirements.lock | 9 +++++++++ requirements.txt | 7 +++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/requirements.lock b/requirements.lock index 3fb9ba2..ffb10a1 100644 --- a/requirements.lock +++ b/requirements.lock @@ -2,14 +2,23 @@ requests==2.23.0 APScheduler==3.6.3 python-twitter==3.5 dictdiffer==0.8.1 +beautifulsoup4==4.8.2 +stem==1.8.0 +fake-useragent==0.1.11 ## The following requirements were added by pip freeze: certifi==2019.11.28 +cffi==1.14.0 chardet==3.0.4 +cryptography==2.8 future==0.18.2 idna==2.9 oauthlib==3.1.0 +pycparser==2.20 +pyOpenSSL==19.1.0 +PySocks==1.7.1 pytz==2019.3 requests-oauthlib==1.3.0 six==1.14.0 +soupsieve==2.0 tzlocal==2.0.0 urllib3==1.25.8 diff --git a/requirements.txt b/requirements.txt index 2b4d0c6..2f48c8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ -requests +requests[socks,security] APScheduler python-twitter -dictdiffer \ No newline at end of file +dictdiffer +beautifulsoup4 +stem +fake_useragent \ No newline at end of file From da35efa04b9ddb2cfae3d7bb86414509c8d44d42 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 14:29:34 +0100 Subject: [PATCH 02/20] Implemented data retrieving from Worldometers --- src/config.py | 13 +++++- src/worldometers/__init__.py | 0 src/worldometers/retriever.py | 86 +++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 src/worldometers/__init__.py create mode 100644 src/worldometers/retriever.py diff --git a/src/config.py b/src/config.py index d0efdba..e9d1e7a 100644 --- a/src/config.py +++ b/src/config.py @@ -19,7 +19,15 @@ TIME_BETWEEN_RESOURCES = 15 TIME_BETWEEN_TWEETS = 10 * 60 # 10 minutes -# Resources +# Scrapping +SCRAPE_PROXY = 'socks5://127.0.0.1:9050' +SCRAPE_RTD_MINIMUM = 0.1 +SCRAPE_RTD_MAXIMUM = 0.5 +SCRAPE_RETRIES_AMOUNT = 10 +SCRAPE_RTD_ERROR_MINIMUM = 0.5 +SCRAPE_RTD_ERROR_MAXIMUM = 1 + +# Resources - Johns Hopkins DATA_FOLDER = 'data' DATA_CONFIRMED = 'Confirmed' DATA_DEATHS = 'Deaths' @@ -45,6 +53,9 @@ } ] +# Resources - Worldometers +WORLDOMETERS_URL = 'https://www.worldometers.info/coronavirus/' + # Twitter HASHTAG_LIST = '#coronavirus #covid19' diff --git a/src/worldometers/__init__.py b/src/worldometers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/worldometers/retriever.py b/src/worldometers/retriever.py new file mode 100644 index 0000000..5e25598 --- /dev/null +++ b/src/worldometers/retriever.py @@ -0,0 +1,86 @@ +import random +import time +import requests +from bs4 import BeautifulSoup + +from fake_useragent import UserAgent +from stem import Signal +from stem.control import Controller + +from src.config import SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM, SCRAPE_RETRIES_AMOUNT, SCRAPE_PROXY, \ + SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM, WORLDOMETERS_URL +from src.helper import log + + +def _get_html(url): + """ + Retrieves the HTML content given a Internet accessible URL. + :param url: URL to retrieve. + :return: HTML content formatted as String, None if there was an error. 
+ """ + time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM)) # RTD + for i in range(0, SCRAPE_RETRIES_AMOUNT): + try: + with Controller.from_port(port=9051) as c: + c.authenticate() + c.signal(Signal.NEWNYM) + proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY} + headers = {'User-Agent': UserAgent().random} + response = requests.get(url, proxies=proxies, headers=headers) + assert response.ok + html_content = response.content + return html_content + except Exception as e: + if i == SCRAPE_RETRIES_AMOUNT - 1: + print(f'Unable to retrieve HTML from {url}: {e}') + else: + time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM)) + return None + + +def get_last_update(): + """ + Retrieve data from Worldometers. + :return: 3 Dictionaries (confirmed, deaths & recovered) with the last update grouped by location. + """ + confirmed_dict = dict() + death_dict = dict() + recovered_dict = dict() + html_content = _get_html(WORLDOMETERS_URL) + if html_content: + soup = BeautifulSoup(html_content, 'html.parser') + table_countries = soup.findChildren('table', id='main_table_countries') + # Table + if table_countries: + body_list = table_countries[0].findChildren('tbody') + # Body + if body_list: + row_list = body_list[0].findChildren('tr') + # Rows + if row_list: + for row in row_list: + # Every row + try: + cell_list = row.findChildren('td') + if cell_list and len(cell_list) >= 6: + # Extract country + country_cell = cell_list[0] + if country_cell.find('a'): + country = cell_list[0].find('a').text.strip() + else: + country = cell_list[0].text.strip() + # Extract string + confirmed_n = '' if not cell_list[1] else cell_list[1].text.strip().replace(',', '') + deaths_n = '' if not cell_list[3] else cell_list[3].text.strip().replace(',', '') + recovered_n = '' if not cell_list[5] else cell_list[5].text.strip().replace(',', '') + # Parse to integer + confirmed_n = 0 if not confirmed_n else int(confirmed_n) + deaths_n = 0 if not deaths_n else int(deaths_n) + recovered_n = 0 if not recovered_n else int(recovered_n) + # Add to dictionary + confirmed_dict[country] = confirmed_n + death_dict[country] = deaths_n + recovered_dict[country] = recovered_n + except Exception as e: + log.warn(f'There was an error processing one row - [{e}]. 
Ignoring...') + return confirmed_dict, death_dict, recovered_dict From 9f13f8cea7013e33b24927cea79cdc62cf026962 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 15:05:12 +0100 Subject: [PATCH 03/20] Reorganized configuration variables --- src/config.py | 254 ++++++++++++++++++++++----------- src/cron/runner.py | 6 +- src/johns_hopkins/retriever.py | 10 +- 3 files changed, 178 insertions(+), 92 deletions(-) diff --git a/src/config.py b/src/config.py index e9d1e7a..681d197 100644 --- a/src/config.py +++ b/src/config.py @@ -8,16 +8,9 @@ MAX_INSTANCES = 1 GRACE_TIME = 1 * 3600 # 1 hour -# Data URLs -DATA_ATTEMPTS = 3 -DATA_TIMEOUT = 30 -DATA_RTD = 15 -USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.6.01001)' -URL_BASE = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data' - # Time TIME_BETWEEN_RESOURCES = 15 -TIME_BETWEEN_TWEETS = 10 * 60 # 10 minutes +TIME_BETWEEN_TWEETS = 5 * 60 # Scrapping SCRAPE_PROXY = 'socks5://127.0.0.1:9050' @@ -27,29 +20,50 @@ SCRAPE_RTD_ERROR_MINIMUM = 0.5 SCRAPE_RTD_ERROR_MAXIMUM = 1 -# Resources - Johns Hopkins +# Resources DATA_FOLDER = 'data' DATA_CONFIRMED = 'Confirmed' DATA_DEATHS = 'Deaths' DATA_RECOVERED = 'Recovered' -RESOURCES = [ +DATA_PATH_DICT = { + DATA_CONFIRMED: f'{DATA_FOLDER}/{DATA_CONFIRMED.lower()}.json', + DATA_DEATHS: f'{DATA_FOLDER}/{DATA_DEATHS.lower()}.json', + DATA_RECOVERED: f'{DATA_FOLDER}/{DATA_RECOVERED.lower()}.json', +} + +# Icons +ICON_UP = 'โฌ†๏ธ' +ICON_DOWN = 'โฌ‡๏ธ' +ICON_DICT = { + DATA_CONFIRMED: '๐ŸŸก', + DATA_DEATHS: '๐Ÿ”ด', + DATA_RECOVERED: '๐ŸŸข' +} + +# Resources - Johns Hopkins +JOHNS_HOPKINS_DATA_ATTEMPTS = 3 +JOHNS_HOPKINS_DATA_TIMEOUT = 30 +JOHNS_HOPKINS_DATA_RTD = 15 +JOHNS_HOPKINS_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.6.01001)' +JOHNS_HOPKINS_URL_BASE = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data' +JOHNS_HOPKINS_RESOURCES = [ { 'name': DATA_CONFIRMED, - 'data_url': f'{URL_BASE}/csse_covid_19_time_series/time_series_19-covid-{DATA_CONFIRMED}.csv', - 'data_path': f'{DATA_FOLDER}/{DATA_CONFIRMED.lower()}.json', - 'icon': '๐ŸŸก' + 'data_url': f'{JOHNS_HOPKINS_URL_BASE}/csse_covid_19_time_series/time_series_19-covid-{DATA_CONFIRMED}.csv', + 'data_path': DATA_PATH_DICT[DATA_CONFIRMED], + 'icon': ICON_DICT[DATA_CONFIRMED] }, { 'name': DATA_DEATHS, - 'data_url': f'{URL_BASE}/csse_covid_19_time_series/time_series_19-covid-{DATA_DEATHS}.csv', - 'data_path': f'{DATA_FOLDER}/{DATA_DEATHS.lower()}.json', - 'icon': '๐Ÿ”ด' + 'data_url': f'{JOHNS_HOPKINS_URL_BASE}/csse_covid_19_time_series/time_series_19-covid-{DATA_DEATHS}.csv', + 'data_path': DATA_PATH_DICT[DATA_DEATHS], + 'icon': ICON_DICT[DATA_DEATHS] }, { 'name': DATA_RECOVERED, - 'data_url': f'{URL_BASE}/csse_covid_19_time_series/time_series_19-covid-{DATA_RECOVERED}.csv', - 'data_path': f'{DATA_FOLDER}/{DATA_RECOVERED.lower()}.json', - 'icon': '๐ŸŸข' + 'data_url': f'{JOHNS_HOPKINS_URL_BASE}/csse_covid_19_time_series/time_series_19-covid-{DATA_RECOVERED}.csv', + 'data_path': DATA_PATH_DICT[DATA_RECOVERED], + 'icon': ICON_DICT[DATA_RECOVERED] } ] @@ -59,85 +73,157 @@ # Twitter HASHTAG_LIST = '#coronavirus #covid19' -# Icons -ICON_UP = 'โฌ†๏ธ' -ICON_DOWN = 'โฌ‡๏ธ' - # Flags FLAG_DEFAULT = random.choice(['๐ŸŒ', '๐ŸŒŽ', '๐ŸŒŽ']) FLAGS = { - 'Armenia': '๐Ÿ‡ฆ๐Ÿ‡ฒ', - 'Russia': '๐Ÿ‡ท๐Ÿ‡บ', - 'Monaco': '๐Ÿ‡ฒ๐Ÿ‡จ', - 'South Korea': '๐Ÿ‡ฐ๐Ÿ‡ท', - 'Ecuador': '๐Ÿ‡ช๐Ÿ‡จ', - 'Lebanon': '๐Ÿ‡ฑ๐Ÿ‡ง', + 'China': '๐Ÿ‡จ๐Ÿ‡ณ', + 'Italy': 
'๐Ÿ‡ฎ๐Ÿ‡น', + 'Iran': '๐Ÿ‡ฎ๐Ÿ‡ท', + 'S. Korea': '๐Ÿ‡ฐ๐Ÿ‡ท', 'Spain': '๐Ÿ‡ช๐Ÿ‡ธ', - 'US': '๐Ÿ‡บ๐Ÿ‡ธ', + 'Germany': '๐Ÿ‡ฉ๐Ÿ‡ช', + 'France': '๐Ÿ‡ซ๐Ÿ‡ท', + 'USA': '๐Ÿ‡บ๐Ÿ‡ธ', 'Switzerland': '๐Ÿ‡จ๐Ÿ‡ญ', - 'Saudi Arabia': '๐Ÿ‡ธ๐Ÿ‡ฆ', - 'Israel': '๐Ÿ‡ฎ๐Ÿ‡ฑ', - 'Italy': '๐Ÿ‡ฎ๐Ÿ‡น', - 'Canada': '๐Ÿ‡จ๐Ÿ‡ฆ', - 'Singapore': '๐Ÿ‡ธ๐Ÿ‡ฌ', - 'Afghanistan': '๐Ÿ‡ฆ๐Ÿ‡ซ', - 'India': '๐Ÿ‡ฎ๐Ÿ‡ณ', - 'Croatia': '๐Ÿ‡ญ๐Ÿ‡ท', 'Norway': '๐Ÿ‡ณ๐Ÿ‡ด', + 'Sweden': '๐Ÿ‡ธ๐Ÿ‡ช', 'Denmark': '๐Ÿ‡ฉ๐Ÿ‡ฐ', - 'Senegal': '๐Ÿ‡ธ๐Ÿ‡ณ', - 'Macau': '๐Ÿ‡ฒ๐Ÿ‡ด', - 'Latvia': '๐Ÿ‡ฑ๐Ÿ‡ป', - 'Belarus': '๐Ÿ‡ง๐Ÿ‡พ', - 'North Macedonia': '๐Ÿ‡ฒ๐Ÿ‡ฐ', - 'Sri Lanka': '๐Ÿ‡ฑ๐Ÿ‡ฐ', + 'Netherlands': '๐Ÿ‡ณ๐Ÿ‡ฑ', 'UK': '๐Ÿ‡ฌ๐Ÿ‡ง', - 'Romania': '๐Ÿ‡ท๐Ÿ‡ด', - 'Estonia': '๐Ÿ‡ช๐Ÿ‡ช', - 'Dominican Republic': '๐Ÿ‡ฉ๐Ÿ‡ด', - 'Azerbaijan': '๐Ÿ‡ฆ๐Ÿ‡ฟ', - 'Indonesia': '๐Ÿ‡ฎ๐Ÿ‡ฉ', - 'Brazil': '๐Ÿ‡ง๐Ÿ‡ท', - 'Ireland': '๐Ÿ‡ฎ๐Ÿ‡ช', - 'Georgia': '๐Ÿ‡ฌ๐Ÿ‡ช', 'Japan': '๐Ÿ‡ฏ๐Ÿ‡ต', - 'Pakistan': '๐Ÿ‡ต๐Ÿ‡ฐ', - 'Cambodia': '๐Ÿ‡ฐ๐Ÿ‡ญ', - 'Iceland': '๐Ÿ‡ฎ๐Ÿ‡ธ', - 'France': '๐Ÿ‡ซ๐Ÿ‡ท', - 'Malaysia': '๐Ÿ‡ฒ๐Ÿ‡พ', + 'Diamond Princess': '๐Ÿ’Ž', + 'Belgium': '๐Ÿ‡ง๐Ÿ‡ช', 'Austria': '๐Ÿ‡ฆ๐Ÿ‡น', - 'Nigeria': '๐Ÿ‡ณ๐Ÿ‡ฌ', - 'Germany': '๐Ÿ‡ฉ๐Ÿ‡ช', - 'Bahrain': '๐Ÿ‡ง๐Ÿ‡ญ', - 'San Marino': '๐Ÿ‡ธ๐Ÿ‡ฒ', 'Qatar': '๐Ÿ‡ถ๐Ÿ‡ฆ', - 'Lithuania': '๐Ÿ‡ฑ๐Ÿ‡น', - 'Mainland China': '๐Ÿ‡จ๐Ÿ‡ณ', - 'Philippines': '๐Ÿ‡ต๐Ÿ‡ญ', - 'Oman': '๐Ÿ‡ด๐Ÿ‡ฒ', - 'Algeria': '๐Ÿ‡ฉ๐Ÿ‡ฟ', - 'United Arab Emirates': '๐Ÿ‡ฆ๐Ÿ‡ช', - 'Vietnam': '๐Ÿ‡ป๐Ÿ‡ณ', - 'Morocco': '๐Ÿ‡ฒ๐Ÿ‡ฆ', - 'Iraq': '๐Ÿ‡ฎ๐Ÿ‡ถ', - 'Kuwait': '๐Ÿ‡ฐ๐Ÿ‡ผ', - 'Belgium': '๐Ÿ‡ง๐Ÿ‡ช', - 'Hong Kong': '๐Ÿ‡ญ๐Ÿ‡ฐ', - 'Andorra': '๐Ÿ‡ฆ๐Ÿ‡ฉ', + 'Australia': '๐Ÿ‡ฆ๐Ÿ‡บ', + 'Malaysia': '๐Ÿ‡ฒ๐Ÿ‡พ', 'Finland': '๐Ÿ‡ซ๐Ÿ‡ฎ', - 'Netherlands': '๐Ÿ‡ณ๐Ÿ‡ฑ', - 'Luxembourg': '๐Ÿ‡ฑ๐Ÿ‡บ', - 'Czech Republic': '๐Ÿ‡จ๐Ÿ‡ฟ', - 'Thailand': '๐Ÿ‡น๐Ÿ‡ญ', + 'Bahrain': '๐Ÿ‡ง๐Ÿ‡ญ', + 'Canada': '๐Ÿ‡จ๐Ÿ‡ฆ', + 'Singapore': '๐Ÿ‡ธ๐Ÿ‡ฌ', + 'Greece': '๐Ÿ‡ฌ๐Ÿ‡ท', 'Portugal': '๐Ÿ‡ต๐Ÿ‡น', - 'Iran': '๐Ÿ‡ฎ๐Ÿ‡ท', + 'Israel': '๐Ÿ‡ฎ๐Ÿ‡ฑ', + 'Brazil': '๐Ÿ‡ง๐Ÿ‡ท', + 'Czechia': '๐Ÿ‡จ๐Ÿ‡ฟ', + 'Slovenia': '๐Ÿ‡ธ๐Ÿ‡ฎ', + 'Hong Kong': '๐Ÿ‡ญ๐Ÿ‡ฐ', + 'Iceland': '๐Ÿ‡ฎ๐Ÿ‡ธ', + 'Estonia': '๐Ÿ‡ช๐Ÿ‡ช', + 'Kuwait': '๐Ÿ‡ฐ๐Ÿ‡ผ', + 'Iraq': '๐Ÿ‡ฎ๐Ÿ‡ถ', + 'Romania': '๐Ÿ‡ท๐Ÿ‡ด', + 'Philippines': '๐Ÿ‡ต๐Ÿ‡ญ', + 'Indonesia': '๐Ÿ‡ฎ๐Ÿ‡ฉ', + 'Lebanon': '๐Ÿ‡ฑ๐Ÿ‡ง', 'Egypt': '๐Ÿ‡ช๐Ÿ‡ฌ', - 'Sweden': '๐Ÿ‡ธ๐Ÿ‡ช', - 'New Zealand': '๐Ÿ‡ณ๐Ÿ‡ฟ', + 'Poland': '๐Ÿ‡ต๐Ÿ‡ฑ', + 'Ireland': '๐Ÿ‡ฎ๐Ÿ‡ช', + 'Saudi Arabia': '๐Ÿ‡ธ๐Ÿ‡ฆ', + 'UAE': '๐Ÿ‡ฆ๐Ÿ‡ช', + 'India': '๐Ÿ‡ฎ๐Ÿ‡ณ', + 'Thailand': '๐Ÿ‡น๐Ÿ‡ญ', + 'San Marino': '๐Ÿ‡ธ๐Ÿ‡ฒ', 'Taiwan': '๐Ÿ‡น๐Ÿ‡ผ', - 'Australia': '๐Ÿ‡ฆ๐Ÿ‡บ', - 'Greece': '๐Ÿ‡ฌ๐Ÿ‡ท', + 'Vietnam': '๐Ÿ‡ป๐Ÿ‡ณ', + 'Luxembourg': '๐Ÿ‡ฑ๐Ÿ‡บ', + 'Russia': '๐Ÿ‡ท๐Ÿ‡บ', + 'Chile': '๐Ÿ‡จ๐Ÿ‡ฑ', + 'Serbia': '๐Ÿ‡ท๐Ÿ‡ธ', + 'Albania': '๐Ÿ‡ฆ๐Ÿ‡ฑ', + 'Peru': '๐Ÿ‡ต๐Ÿ‡ช', + 'Algeria': '๐Ÿ‡ฉ๐Ÿ‡ฟ', + 'Croatia': '๐Ÿ‡ญ๐Ÿ‡ท', + 'Brunei': '๐Ÿ‡ง๐Ÿ‡ณ', + 'Panama': '๐Ÿ‡ต๐Ÿ‡ฆ', + 'Palestine': '๐Ÿ‡ต๐Ÿ‡ธ', + 'Argentina': '๐Ÿ‡ฆ๐Ÿ‡ท', + 'Slovakia': '๐Ÿ‡ธ๐Ÿ‡ฐ', + 'Bulgaria': '๐Ÿ‡ง๐Ÿ‡ฌ', + 'Georgia': '๐Ÿ‡ฌ๐Ÿ‡ช', + 'Pakistan': '๐Ÿ‡ต๐Ÿ‡ฐ', + 'Belarus': '๐Ÿ‡ง๐Ÿ‡พ', + 'Ecuador': '๐Ÿ‡ช๐Ÿ‡จ', + 'Latvia': '๐Ÿ‡ฑ๐Ÿ‡ป', + 'Costa Rica': '๐Ÿ‡จ๐Ÿ‡ท', + 'Hungary': '๐Ÿ‡ญ๐Ÿ‡บ', + 'South Africa': '๐Ÿ‡ฟ๐Ÿ‡ฆ', + 'Senegal': '๐Ÿ‡ธ๐Ÿ‡ณ', + 'Cyprus': '๐Ÿ‡จ๐Ÿ‡พ', + 'Oman': '๐Ÿ‡ด๐Ÿ‡ฒ', + 'Bosnia and Herzegovina': '๐Ÿ‡ง๐Ÿ‡ฆ', + 'Malta': '๐Ÿ‡ฒ๐Ÿ‡น', + 'Morocco': '๐Ÿ‡ฒ๐Ÿ‡ฆ', + 'Tunisia': '๐Ÿ‡น๐Ÿ‡ณ', + 'Colombia': '๐Ÿ‡จ๐Ÿ‡ด', + 'Azerbaijan': '๐Ÿ‡ฆ๐Ÿ‡ฟ', + 'Armenia': '๐Ÿ‡ฆ๐Ÿ‡ฒ', 'Mexico': '๐Ÿ‡ฒ๐Ÿ‡ฝ', - 'Nepal': '๐Ÿ‡ณ๐Ÿ‡ต' + 'North Macedonia': '๐Ÿ‡ฒ๐Ÿ‡ฐ', + 'Dominican Republic': '๐Ÿ‡ฉ๐Ÿ‡ด', + 'Afghanistan': 
'๐Ÿ‡ฆ๐Ÿ‡ซ', + 'Macao': '๐Ÿ‡ฒ๐Ÿ‡ด', + 'Bolivia': '๐Ÿ‡ง๐Ÿ‡ด', + 'Maldives': '๐Ÿ‡ฒ๐Ÿ‡ป', + 'Sri Lanka': '๐Ÿ‡ฑ๐Ÿ‡ฐ', + 'Faeroe Islands': '๐Ÿ‡ซ๐Ÿ‡ด', + 'Lithuania': '๐Ÿ‡ฑ๐Ÿ‡น', + 'Jamaica': '๐Ÿ‡ฏ๐Ÿ‡ฒ', + 'Cambodia': '๐Ÿ‡ฐ๐Ÿ‡ญ', + 'New Zealand': '๐Ÿ‡ณ๐Ÿ‡ฟ', + 'French Guiana': '๐Ÿ‡ฌ๐Ÿ‡ซ', + 'Kazakhstan': '๐Ÿ‡ฐ๐Ÿ‡ฟ', + 'Martinique': '๐Ÿ‡ฒ๐Ÿ‡ถ', + 'Moldova': '๐Ÿ‡ฒ๐Ÿ‡ฉ', + 'Paraguay': '๐Ÿ‡ต๐Ÿ‡พ', + 'Rรฉunion': '๐Ÿ‡ท๐Ÿ‡ช', + 'Turkey': '๐Ÿ‡น๐Ÿ‡ท', + 'Cuba': '๐Ÿ‡จ๐Ÿ‡บ', + 'Liechtenstein': '๐Ÿ‡ฑ๐Ÿ‡ฎ', + 'Uruguay': '๐Ÿ‡บ๐Ÿ‡พ', + 'Ukraine': '๐Ÿ‡บ๐Ÿ‡ฆ', + 'Bangladesh': '๐Ÿ‡ง๐Ÿ‡ฉ', + 'Channel Islands': '๐Ÿ‡ฏ๐Ÿ‡ช', + 'French Polynesia': '๐Ÿ‡ต๐Ÿ‡ซ', + 'Puerto Rico': '๐Ÿ‡ต๐Ÿ‡ท', + 'Monaco': '๐Ÿ‡ฒ๐Ÿ‡จ', + 'Nigeria': '๐Ÿ‡ณ๐Ÿ‡ฌ', + 'Aruba': '๐Ÿ‡ฆ๐Ÿ‡ผ', + 'Burkina Faso': '๐Ÿ‡ง๐Ÿ‡ซ', + 'Cameroon': '๐Ÿ‡จ๐Ÿ‡ฒ', + 'DRC': '๐Ÿ‡จ๐Ÿ‡ฉ', + 'Ghana': '๐Ÿ‡ฌ๐Ÿ‡ญ', + 'Honduras': '๐Ÿ‡ญ๐Ÿ‡ณ', + 'Namibia': '๐Ÿ‡ณ๐Ÿ‡ฆ', + 'Saint Martin': '๐Ÿ‡ฒ๐Ÿ‡ถ', + 'Trinidad and Tobago': '๐Ÿ‡น๐Ÿ‡น', + 'Venezuela': '๐Ÿ‡ป๐Ÿ‡ช', + 'Guyana': '๐Ÿ‡ฌ๐Ÿ‡พ', + 'Sudan': '๐Ÿ‡ธ๐Ÿ‡ฉ', + 'Andorra': '๐Ÿ‡ฆ๐Ÿ‡ฉ', + 'Jordan': '๐Ÿ‡ฏ๐Ÿ‡ด', + 'Nepal': '๐Ÿ‡ณ๐Ÿ‡ต', + 'Antigua and Barbuda': '๐Ÿ‡ฆ๐Ÿ‡ฌ', + 'Bhutan': '๐Ÿ‡ง๐Ÿ‡น', + 'Cayman Islands': '๐Ÿ‡ฐ๐Ÿ‡พ', + 'Ivory Coast': '๐Ÿ‡จ๐Ÿ‡ฎ', + 'Curaรงao': '๐Ÿ‡จ๐Ÿ‡ผ', + 'Ethiopia': '๐Ÿ‡ช๐Ÿ‡น', + 'Gabon': '๐Ÿ‡ฌ๐Ÿ‡ฆ', + 'Gibraltar': '๐Ÿ‡ฌ๐Ÿ‡ฎ', + 'Guadeloupe': '๐Ÿ‡ฌ๐Ÿ‡ต', + 'Guatemala': '๐Ÿ‡ฌ๐Ÿ‡น', + 'Guinea': '๐Ÿ‡ฌ๐Ÿ‡ณ', + 'Vatican City': '๐Ÿ‡ป๐Ÿ‡ฆ', + 'Kenya': '๐Ÿ‡ฐ๐Ÿ‡ช', + 'Mauritania': '๐Ÿ‡ฒ๐Ÿ‡ท', + 'Mongolia': '๐Ÿ‡ฒ๐Ÿ‡ณ', + 'Rwanda': '๐Ÿ‡ท๐Ÿ‡ผ', + 'St. Barth': '๐Ÿ‡ง๐Ÿ‡ฑ', + 'Saint Lucia': '๐Ÿ‡ฑ๐Ÿ‡จ', + 'St. Vincent Grenadines': '๐Ÿ‡ป๐Ÿ‡จ', + 'Suriname': '๐Ÿ‡ธ๐Ÿ‡ท', + 'Eswatini': '๐Ÿ‡ธ๐Ÿ‡ฟ', + 'Togo': '๐Ÿ‡น๐Ÿ‡ฌ', + 'U.S. Virgin Islands': '๐Ÿ‡ป๐Ÿ‡ฌ' } diff --git a/src/cron/runner.py b/src/cron/runner.py index ece01f7..fb23ee0 100644 --- a/src/cron/runner.py +++ b/src/cron/runner.py @@ -4,7 +4,7 @@ import dictdiffer -from src.config import RESOURCES, TIME_BETWEEN_RESOURCES, DATA_CONFIRMED, DATA_DEATHS, DATA_RECOVERED, FLAGS, \ +from src.config import JOHNS_HOPKINS_RESOURCES, TIME_BETWEEN_RESOURCES, DATA_CONFIRMED, DATA_DEATHS, DATA_RECOVERED, FLAGS, \ FLAG_DEFAULT, HASHTAG_LIST, ICON_UP, ICON_DOWN from src.helper import log from src.johns_hopkins import retriever @@ -89,9 +89,9 @@ def run(): :return: Iteration done. """ try: - for i, item in enumerate(RESOURCES): + for i, item in enumerate(JOHNS_HOPKINS_RESOURCES): item_name = item['name'] - log.info(f'{i + 1}/{len(RESOURCES)} Processing {item_name}...') + log.info(f'{i + 1}/{len(JOHNS_HOPKINS_RESOURCES)} Processing {item_name}...') results = retriever.get_last_update(item['data_url']) if results is not None: item_data_path = item['data_path'] diff --git a/src/johns_hopkins/retriever.py b/src/johns_hopkins/retriever.py index 835e60f..8a81a9b 100644 --- a/src/johns_hopkins/retriever.py +++ b/src/johns_hopkins/retriever.py @@ -4,7 +4,7 @@ from io import StringIO -from src.config import USER_AGENT, DATA_ATTEMPTS, DATA_TIMEOUT, DATA_RTD +from src.config import JOHNS_HOPKINS_USER_AGENT, JOHNS_HOPKINS_DATA_ATTEMPTS, JOHNS_HOPKINS_DATA_TIMEOUT, JOHNS_HOPKINS_DATA_RTD from src.helper import log @@ -15,9 +15,9 @@ def get_last_update(data_url): :return: Dictionary with the last update grouped by location. 
""" result_dict = dict() - for attempt in range(DATA_ATTEMPTS): + for attempt in range(JOHNS_HOPKINS_DATA_ATTEMPTS): try: - response = requests.get(data_url, headers={'User-Agent': USER_AGENT}, timeout=DATA_TIMEOUT) + response = requests.get(data_url, headers={'User-Agent': JOHNS_HOPKINS_USER_AGENT}, timeout=JOHNS_HOPKINS_DATA_TIMEOUT) rows = [row for row in csv.reader(StringIO(response.text), delimiter=',')][1:] for row in rows: dict_key = row[1] if not row[0] else f'{row[0]}, {row[1]}' @@ -25,7 +25,7 @@ def get_last_update(data_url): return result_dict except Exception as e: log.error(f'Cannot retrieve information from {data_url} - Attempt [{attempt + 1}] - [{e}]') - time.sleep(DATA_RTD) - if attempt == DATA_ATTEMPTS - 1: + time.sleep(JOHNS_HOPKINS_DATA_RTD) + if attempt == JOHNS_HOPKINS_DATA_ATTEMPTS - 1: log.exception(e) return None From ae390f44441fd89af9c254c16b8ce105244e28e6 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 15:07:37 +0100 Subject: [PATCH 04/20] Changed cron to run every 10 minutes --- src/config.py | 2 +- src/scheduler.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/config.py b/src/config.py index 681d197..3f5a2ec 100644 --- a/src/config.py +++ b/src/config.py @@ -6,7 +6,7 @@ # Scheduler MAX_INSTANCES = 1 -GRACE_TIME = 1 * 3600 # 1 hour +GRACE_TIME = 9 * 60 # 9 minutes # Time TIME_BETWEEN_RESOURCES = 15 diff --git a/src/scheduler.py b/src/scheduler.py index a020c1a..812f6a1 100644 --- a/src/scheduler.py +++ b/src/scheduler.py @@ -8,7 +8,8 @@ scheduler = BlockingScheduler() -@scheduler.scheduled_job('cron', hour='*', minute=0, max_instances=MAX_INSTANCES, misfire_grace_time=GRACE_TIME) +@scheduler.scheduled_job('cron', hour='*', minute='0,10,20,30,40,50', + max_instances=MAX_INSTANCES, misfire_grace_time=GRACE_TIME) def run(): """ Main function for running the bot. From 9a890d574a52e33e22052e379b835bbdcd2c306c Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 15:28:15 +0100 Subject: [PATCH 05/20] Adapted runner for working wit the new data source --- src/cron/runner.py | 55 +++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/cron/runner.py b/src/cron/runner.py index fb23ee0..c51d464 100644 --- a/src/cron/runner.py +++ b/src/cron/runner.py @@ -4,10 +4,10 @@ import dictdiffer -from src.config import JOHNS_HOPKINS_RESOURCES, TIME_BETWEEN_RESOURCES, DATA_CONFIRMED, DATA_DEATHS, DATA_RECOVERED, FLAGS, \ - FLAG_DEFAULT, HASHTAG_LIST, ICON_UP, ICON_DOWN +from src.config import TIME_BETWEEN_RESOURCES, DATA_CONFIRMED, DATA_DEATHS, DATA_RECOVERED, FLAGS, \ + FLAG_DEFAULT, HASHTAG_LIST, ICON_UP, ICON_DOWN, DATA_PATH_DICT, ICON_DICT from src.helper import log -from src.johns_hopkins import retriever +from src.worldometers import retriever from src.twitter import api @@ -25,59 +25,63 @@ def _notify_changes(diff_tuple, resource_type, icon, results, total_worldwide): if diff_tuple[0] == 'change': number = diff_tuple[2][0] - diff_tuple[2][1] place = diff_tuple[1] + place = place if type(place) is not list else place[0] total = results[place] - flag = FLAGS.get(place.split(', ')[-1], FLAG_DEFAULT) + flag = FLAGS.get(place, FLAG_DEFAULT) + place_h = '#{}'.format(''.join([c.lower().capitalize() for c in place.strip().split()])) total_worldwide += number if number > 0: if resource_type == DATA_CONFIRMED: message_list.append( f'{icon} {ICON_UP} {abs(number):,} new confirmed case(s) in {place} {flag} totaling {total:,} ' - f'in this place. 
Already {total_worldwide:,} worldwide. {HASHTAG_LIST}') + f'in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}') elif resource_type == DATA_DEATHS: message_list.append( f'{icon} {ICON_UP} {abs(number):,} new death(s) confirmed in {place} {flag} totaling {total:,} ' - f'in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif resource_type == DATA_RECOVERED: message_list.append( f'{icon} {ICON_UP} {abs(number):,} new recoveries in {place} {flag} totaling {total:,} ' - f'in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif number < 0: if resource_type == DATA_CONFIRMED: message_list.append( f'{icon} {ICON_DOWN} Confirmed cases have decreased in {abs(number):,} in {place} {flag} ' - f'totaling {total:,} in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'totaling {total:,} in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif resource_type == DATA_DEATHS: message_list.append( f'{icon} {ICON_DOWN} Deaths have decreased in {abs(number):,} in {place} {flag} ' - f'totaling {total:,} in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'totaling {total:,} in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif resource_type == DATA_RECOVERED: message_list.append( f'{icon} {ICON_DOWN} Recovered cases have decreased in {abs(number):,} in {place} {flag} ' - f'totaling {total:,} in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'totaling {total:,} in this place. Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif diff_tuple[0] == 'add': for place, number in diff_tuple[2]: if number > 0: total_worldwide += number - flag = FLAGS.get(place.split(', ')[-1], FLAG_DEFAULT) + place = place if type(place) is not list else place[0] + flag = FLAGS.get(place, FLAG_DEFAULT) + place_h = '#{}'.format(''.join([c.lower().capitalize() for c in place.strip().split()])) if resource_type == DATA_CONFIRMED: message_list.append( f'{icon} {ICON_UP} First {number:,} confirmed case(s) in {place} {flag}. ' - f'Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif resource_type == DATA_DEATHS: message_list.append( f'{icon} {ICON_UP} First {number:,} death(s) in {place} {flag}. ' - f'Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) elif resource_type == DATA_RECOVERED: message_list.append( f'{icon} {ICON_UP} First {number:,} recovered case(s) in {place} {flag}. ' - f'Already {total_worldwide:,} worldwide. {HASHTAG_LIST}' + f'Already {total_worldwide:,} worldwide. {HASHTAG_LIST} {place_h}' ) api.tweet(message_list) return total_worldwide @@ -89,12 +93,21 @@ def run(): :return: Iteration done. 
""" try: - for i, item in enumerate(JOHNS_HOPKINS_RESOURCES): - item_name = item['name'] - log.info(f'{i + 1}/{len(JOHNS_HOPKINS_RESOURCES)} Processing {item_name}...') - results = retriever.get_last_update(item['data_url']) + resources_list = [DATA_CONFIRMED, DATA_DEATHS, DATA_RECOVERED] + confirmed_results, deaths_results, recovered_results = retriever.get_last_update() + for i, item_name in enumerate(resources_list): + log.info(f'{i + 1}/{len(resources_list)} Processing {item_name}...') + # Get results + results = None + if item_name == DATA_CONFIRMED: + results = confirmed_results + elif item_name == DATA_DEATHS: + results = deaths_results + elif item_name == DATA_RECOVERED: + results = recovered_results + # Process if results is not None: - item_data_path = item['data_path'] + item_data_path = DATA_PATH_DICT[item_name] item_data_path_exists = os.path.isfile(item_data_path) # Get old results if they exist old_results = dict() @@ -110,7 +123,9 @@ def run(): diff_results = list(dictdiffer.diff(results, old_results)) for j, diff_tuple in enumerate(diff_results): log.info(f'{j + 1}/{len(diff_results)} New changes found: [{diff_tuple}]') - total_worldwide = _notify_changes(diff_tuple, item_name, item['icon'], results, total_worldwide) + total_worldwide = _notify_changes( + diff_tuple, item_name, ICON_DICT[item_name], results, total_worldwide + ) time.sleep(TIME_BETWEEN_RESOURCES) except Exception as e: log.error(f'Unexpected error: [{e}]') From 67157bbbfab381e42d89c636437ddf33f33d006d Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 18:27:13 +0100 Subject: [PATCH 06/20] Added configuration for Tor --- tor/torrc | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 tor/torrc diff --git a/tor/torrc b/tor/torrc new file mode 100644 index 0000000..ee36560 --- /dev/null +++ b/tor/torrc @@ -0,0 +1,192 @@ +## Configuration file for a typical Tor user +## Last updated 9 October 2013 for Tor 0.2.5.2-alpha. +## (may or may not work for much older or much newer versions of Tor.) +## +## Lines that begin with "## " try to explain what's going on. Lines +## that begin with just "#" are disabled commands: you can enable them +## by removing the "#" symbol. +## +## See 'man tor', or https://www.torproject.org/docs/tor-manual.html, +## for more options you can use in this file. +## +## Tor will look for this file in various places based on your platform: +## https://www.torproject.org/docs/faq#torrc + +## Tor opens a socks proxy on port 9050 by default -- even if you don't +## configure one below. Set "SocksPort 0" if you plan to run Tor only +## as a relay, and not make any local application connections yourself. +#SocksPort 9050 # Default: Bind to localhost:9050 for local connections. +#SocksPort 192.168.0.1:9100 # Bind to this address:port too. + +## Entry policies to allow/deny SOCKS requests based on IP address. +## First entry that matches wins. If no SocksPolicy is set, we accept +## all (and only) requests that reach a SocksPort. Untrusted users who +## can access your SocksPort may be able to learn about the connections +## you make. +#SocksPolicy accept 192.168.0.0/16 +#SocksPolicy reject * + +## Logs go to stdout at level "notice" unless redirected by something +## else, like one of the below lines. You can have as many Log lines as +## you want. +## +## We advise using "notice" in most cases, since anything more verbose +## may provide sensitive information to an attacker who obtains the logs. 
+## +## Send all messages of level 'notice' or higher to /var/log/tor/notices.log +#Log notice file /var/log/tor/notices.log +## Send every possible message to /var/log/tor/debug.log +#Log debug file /var/log/tor/debug.log +## Use the system log instead of Tor's logfiles +#Log notice syslog +## To send all messages to stderr: +#Log debug stderr + +## Uncomment this to start the process in the background... or use +## --runasdaemon 1 on the command line. This is ignored on Windows; +## see the FAQ entry if you want Tor to run as an NT service. +#RunAsDaemon 1 + +## The directory for keeping all the keys/etc. By default, we store +## things in $HOME/.tor on Unix, and in Application Data\tor on Windows. +#DataDirectory /var/lib/tor + +## The port on which Tor will listen for local connections from Tor +## controller applications, as documented in control-spec.txt. +ControlPort 9051 +## If you enable the controlport, be sure to enable one of these +## authentication methods, to prevent attackers from accessing it. +#HashedControlPassword 16:872860B76453A77D60CA2BB8C1A7042072093276A3D701AD684053EC4C +CookieAuthentication 1 + +############### This section is just for location-hidden services ### + +## Once you have configured a hidden service, you can look at the +## contents of the file ".../hidden_service/hostname" for the address +## to tell people. +## +## HiddenServicePort x y:z says to redirect requests on port x to the +## address y:z. + +#HiddenServiceDir /var/lib/tor/hidden_service/ +#HiddenServicePort 80 127.0.0.1:80 + +#HiddenServiceDir /var/lib/tor/other_hidden_service/ +#HiddenServicePort 80 127.0.0.1:80 +#HiddenServicePort 22 127.0.0.1:22 + +################ This section is just for relays ##################### +# +## See https://www.torproject.org/docs/tor-doc-relay for details. + +## Required: what port to advertise for incoming Tor connections. +#ORPort 9001 +## If you want to listen on a port other than the one advertised in +## ORPort (e.g. to advertise 443 but bind to 9090), you can do it as +## follows. You'll need to do ipchains or other port forwarding +## yourself to make this work. +#ORPort 443 NoListen +#ORPort 127.0.0.1:9090 NoAdvertise + +## The IP address or full DNS name for incoming connections to your +## relay. Leave commented out and Tor will guess. +#Address noname.example.com + +## If you have multiple network interfaces, you can specify one for +## outgoing traffic to use. +# OutboundBindAddress 10.0.0.5 + +## A handle for your relay, so people don't have to refer to it by key. +#Nickname ididnteditheconfig + +## Define these to limit how much relayed traffic you will allow. Your +## own traffic is still unthrottled. Note that RelayBandwidthRate must +## be at least 20 KB. +## Note that units for these config options are bytes per second, not bits +## per second, and that prefixes are binary prefixes, i.e. 2^10, 2^20, etc. +#RelayBandwidthRate 100 KB # Throttle traffic to 100KB/s (800Kbps) +#RelayBandwidthBurst 200 KB # But allow bursts up to 200KB/s (1600Kbps) + +## Use these to restrict the maximum traffic per day, week, or month. +## Note that this threshold applies separately to sent and received bytes, +## not to their sum: setting "4 GB" may allow up to 8 GB total before +## hibernating. +## +## Set a maximum of 4 gigabytes each way per period. 
+#AccountingMax 4 GB +## Each period starts daily at midnight (AccountingMax is per day) +#AccountingStart day 00:00 +## Each period starts on the 3rd of the month at 15:00 (AccountingMax +## is per month) +#AccountingStart month 3 15:00 + +## Administrative contact information for this relay or bridge. This line +## can be used to contact you if your relay or bridge is misconfigured or +## something else goes wrong. Note that we archive and publish all +## descriptors containing these lines and that Google indexes them, so +## spammers might also collect them. You may want to obscure the fact that +## it's an email address and/or generate a new address for this purpose. +#ContactInfo Random Person +## You might also include your PGP or GPG fingerprint if you have one: +#ContactInfo 0xFFFFFFFF Random Person + +## Uncomment this to mirror directory information for others. Please do +## if you have enough bandwidth. +#DirPort 9030 # what port to advertise for directory connections +## If you want to listen on a port other than the one advertised in +## DirPort (e.g. to advertise 80 but bind to 9091), you can do it as +## follows. below too. You'll need to do ipchains or other port +## forwarding yourself to make this work. +#DirPort 80 NoListen +#DirPort 127.0.0.1:9091 NoAdvertise +## Uncomment to return an arbitrary blob of html on your DirPort. Now you +## can explain what Tor is if anybody wonders why your IP address is +## contacting them. See contrib/tor-exit-notice.html in Tor's source +## distribution for a sample. +#DirPortFrontPage /etc/tor/tor-exit-notice.html + +## Uncomment this if you run more than one Tor relay, and add the identity +## key fingerprint of each Tor relay you control, even if they're on +## different networks. You declare it here so Tor clients can avoid +## using more than one of your relays in a single circuit. See +## https://www.torproject.org/docs/faq#MultipleRelays +## However, you should never include a bridge's fingerprint here, as it would +## break its concealability and potentionally reveal its IP/TCP address. +#MyFamily $keyid,$keyid,... + +## A comma-separated list of exit policies. They're considered first +## to last, and the first match wins. If you want to _replace_ +## the default exit policy, end this with either a reject *:* or an +## accept *:*. Otherwise, you're _augmenting_ (prepending to) the +## default exit policy. Leave commented to just use the default, which is +## described in the man page or at +## https://www.torproject.org/documentation.html +## +## Look at https://www.torproject.org/faq-abuse.html#TypicalAbuses +## for issues you might encounter if you use the default exit policy. +## +## If certain IPs and ports are blocked externally, e.g. by your firewall, +## you should update your exit policy to reflect this -- otherwise Tor +## users will be told that those destinations are down. +## +## For security, by default Tor rejects connections to private (local) +## networks, including to your public IP address. See the man page entry +## for ExitPolicyRejectPrivate if you want to allow "exit enclaving". +## +#ExitPolicy accept *:6660-6667,reject *:* # allow irc ports but no more +#ExitPolicy accept *:119 # accept nntp as well as default exit policy +#ExitPolicy reject *:* # no exits allowed + +## Bridge relays (or "bridges") are Tor relays that aren't listed in the +## main directory. 
Since there is no complete public list of them, even an +## ISP that filters connections to all the known Tor relays probably +## won't be able to block all the bridges. Also, websites won't treat you +## differently because they won't know you're running Tor. If you can +## be a real relay, please do; but if not, be a bridge! +#BridgeRelay 1 +## By default, Tor will advertise your bridge to users through various +## mechanisms like https://bridges.torproject.org/. If you want to run +## a private bridge, for example because you'll give out your bridge +## address manually to your friends, uncomment this line: +#PublishServerDescriptor 0 + From 4d51cc589b18eb07664927d4126500b61a6ad29d Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 18:29:09 +0100 Subject: [PATCH 07/20] Improved Docker configuration for running Tor --- Dockerfile | 4 +++ docker-compose.yml | 1 + src/config.py | 6 ++-- src/cron/runner.py | 65 ++++++++++++++++++----------------- src/worldometers/retriever.py | 8 ++--- 5 files changed, 44 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index b8f8fbe..fd27a29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,10 @@ FROM python:3.7 ADD . /srv/covid19-bot WORKDIR /srv/covid19-bot +RUN apt update +RUN apt install tor -y +COPY tor/torrc /etc/tor/torrc +RUN service tor restart RUN pip install --upgrade pip RUN pip3 install -r requirements.lock CMD python3 -m src \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c7d79a9..3107431 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,3 +11,4 @@ services: restart: always volumes: - './data:/srv/covid19-bot/data' + - './tor:/srv/covid19-bot/tor' diff --git a/src/config.py b/src/config.py index 3f5a2ec..09db526 100644 --- a/src/config.py +++ b/src/config.py @@ -14,11 +14,9 @@ # Scrapping SCRAPE_PROXY = 'socks5://127.0.0.1:9050' -SCRAPE_RTD_MINIMUM = 0.1 -SCRAPE_RTD_MAXIMUM = 0.5 SCRAPE_RETRIES_AMOUNT = 10 -SCRAPE_RTD_ERROR_MINIMUM = 0.5 -SCRAPE_RTD_ERROR_MAXIMUM = 1 +SCRAPE_RTD_ERROR_MINIMUM = 3 +SCRAPE_RTD_ERROR_MAXIMUM = 5 # Resources DATA_FOLDER = 'data' diff --git a/src/cron/runner.py b/src/cron/runner.py index c51d464..a2e184c 100644 --- a/src/cron/runner.py +++ b/src/cron/runner.py @@ -95,38 +95,39 @@ def run(): try: resources_list = [DATA_CONFIRMED, DATA_DEATHS, DATA_RECOVERED] confirmed_results, deaths_results, recovered_results = retriever.get_last_update() - for i, item_name in enumerate(resources_list): - log.info(f'{i + 1}/{len(resources_list)} Processing {item_name}...') - # Get results - results = None - if item_name == DATA_CONFIRMED: - results = confirmed_results - elif item_name == DATA_DEATHS: - results = deaths_results - elif item_name == DATA_RECOVERED: - results = recovered_results - # Process - if results is not None: - item_data_path = DATA_PATH_DICT[item_name] - item_data_path_exists = os.path.isfile(item_data_path) - # Get old results if they exist - old_results = dict() - if item_data_path_exists: - with open(item_data_path, 'r') as item_data_file: - old_results = json.load(item_data_file) - # Save latest results - with open(item_data_path, 'w') as item_data_file: - json.dump(results, item_data_file) - # Check for differences if it did not exist before - if item_data_path_exists: - total_worldwide = sum(old_results.values()) - diff_results = list(dictdiffer.diff(results, old_results)) - for j, diff_tuple in enumerate(diff_results): - log.info(f'{j + 1}/{len(diff_results)} New changes found: [{diff_tuple}]') - total_worldwide = 
_notify_changes( - diff_tuple, item_name, ICON_DICT[item_name], results, total_worldwide - ) - time.sleep(TIME_BETWEEN_RESOURCES) + if all([confirmed_results, deaths_results, recovered_results]): + for i, item_name in enumerate(resources_list): + log.info(f'{i + 1}/{len(resources_list)} Processing {item_name}...') + # Get results + results = None + if item_name == DATA_CONFIRMED: + results = confirmed_results + elif item_name == DATA_DEATHS: + results = deaths_results + elif item_name == DATA_RECOVERED: + results = recovered_results + # Process + if results is not None: + item_data_path = DATA_PATH_DICT[item_name] + item_data_path_exists = os.path.isfile(item_data_path) + # Get old results if they exist + old_results = dict() + if item_data_path_exists: + with open(item_data_path, 'r') as item_data_file: + old_results = json.load(item_data_file) + # Save latest results + with open(item_data_path, 'w') as item_data_file: + json.dump(results, item_data_file) + # Check for differences if it did not exist before + if item_data_path_exists: + total_worldwide = sum(old_results.values()) + diff_results = list(dictdiffer.diff(results, old_results)) + for j, diff_tuple in enumerate(diff_results): + log.info(f'{j + 1}/{len(diff_results)} New changes found: [{diff_tuple}]') + total_worldwide = _notify_changes( + diff_tuple, item_name, ICON_DICT[item_name], results, total_worldwide + ) + time.sleep(TIME_BETWEEN_RESOURCES) except Exception as e: log.error(f'Unexpected error: [{e}]') log.exception(e) diff --git a/src/worldometers/retriever.py b/src/worldometers/retriever.py index 5e25598..9ded933 100644 --- a/src/worldometers/retriever.py +++ b/src/worldometers/retriever.py @@ -7,8 +7,8 @@ from stem import Signal from stem.control import Controller -from src.config import SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM, SCRAPE_RETRIES_AMOUNT, SCRAPE_PROXY, \ - SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM, WORLDOMETERS_URL +from src.config import SCRAPE_RETRIES_AMOUNT, SCRAPE_PROXY, SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM, \ + WORLDOMETERS_URL from src.helper import log @@ -18,7 +18,6 @@ def _get_html(url): :param url: URL to retrieve. :return: HTML content formatted as String, None if there was an error. 
""" - time.sleep(random.uniform(SCRAPE_RTD_MINIMUM, SCRAPE_RTD_MAXIMUM)) # RTD for i in range(0, SCRAPE_RETRIES_AMOUNT): try: with Controller.from_port(port=9051) as c: @@ -32,8 +31,9 @@ def _get_html(url): return html_content except Exception as e: if i == SCRAPE_RETRIES_AMOUNT - 1: - print(f'Unable to retrieve HTML from {url}: {e}') + log.error(f'Unable to retrieve HTML from {url}: {e}') else: + log.warn(f'Unable to retrieve HTML from {url} - Retry {i}: {e}') time.sleep(random.uniform(SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM)) return None From 5846e5bda2e446687cc8e16e47b1e8dc47401031 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 18:29:21 +0100 Subject: [PATCH 08/20] Change main function for DEBUG mode --- src/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/__main__.py b/src/__main__.py index 4e90b24..1e13668 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -12,10 +12,10 @@ scheduler.scheduler.start() else: log.info('Running jobs manually since DEBUG is enabled.') - time.sleep(5) - scheduler.run() while True: - time.sleep(10000) + time.sleep(5) + scheduler.run() + time.sleep(30) except Exception as e: log.error(f'Unexpected error {e} in Covid-19') From 58e88eddb696487857db7a402a12d51755d5807d Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 18:33:25 +0100 Subject: [PATCH 09/20] Added flag for enabling/disabling Tor --- src/config.py | 1 + src/worldometers/retriever.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/config.py b/src/config.py index 09db526..979799a 100644 --- a/src/config.py +++ b/src/config.py @@ -13,6 +13,7 @@ TIME_BETWEEN_TWEETS = 5 * 60 # Scrapping +TOR_ENABLE = False SCRAPE_PROXY = 'socks5://127.0.0.1:9050' SCRAPE_RETRIES_AMOUNT = 10 SCRAPE_RTD_ERROR_MINIMUM = 3 diff --git a/src/worldometers/retriever.py b/src/worldometers/retriever.py index 9ded933..706bf2c 100644 --- a/src/worldometers/retriever.py +++ b/src/worldometers/retriever.py @@ -8,7 +8,7 @@ from stem.control import Controller from src.config import SCRAPE_RETRIES_AMOUNT, SCRAPE_PROXY, SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM, \ - WORLDOMETERS_URL + WORLDOMETERS_URL, TOR_ENABLE from src.helper import log @@ -20,10 +20,12 @@ def _get_html(url): """ for i in range(0, SCRAPE_RETRIES_AMOUNT): try: - with Controller.from_port(port=9051) as c: - c.authenticate() - c.signal(Signal.NEWNYM) - proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY} + proxies = {} + if TOR_ENABLE: + with Controller.from_port(port=9051) as c: + c.authenticate() + c.signal(Signal.NEWNYM) + proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY} headers = {'User-Agent': UserAgent().random} response = requests.get(url, proxies=proxies, headers=headers) assert response.ok From bc0e2e0f6d0051cc45bc1b768a407d7a59d6b9a1 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 20:52:00 +0100 Subject: [PATCH 10/20] Removed tor configuration --- tor/torrc | 192 ------------------------------------------------------ 1 file changed, 192 deletions(-) delete mode 100644 tor/torrc diff --git a/tor/torrc b/tor/torrc deleted file mode 100644 index ee36560..0000000 --- a/tor/torrc +++ /dev/null @@ -1,192 +0,0 @@ -## Configuration file for a typical Tor user -## Last updated 9 October 2013 for Tor 0.2.5.2-alpha. -## (may or may not work for much older or much newer versions of Tor.) -## -## Lines that begin with "## " try to explain what's going on. 
Lines -## that begin with just "#" are disabled commands: you can enable them -## by removing the "#" symbol. -## -## See 'man tor', or https://www.torproject.org/docs/tor-manual.html, -## for more options you can use in this file. -## -## Tor will look for this file in various places based on your platform: -## https://www.torproject.org/docs/faq#torrc - -## Tor opens a socks proxy on port 9050 by default -- even if you don't -## configure one below. Set "SocksPort 0" if you plan to run Tor only -## as a relay, and not make any local application connections yourself. -#SocksPort 9050 # Default: Bind to localhost:9050 for local connections. -#SocksPort 192.168.0.1:9100 # Bind to this address:port too. - -## Entry policies to allow/deny SOCKS requests based on IP address. -## First entry that matches wins. If no SocksPolicy is set, we accept -## all (and only) requests that reach a SocksPort. Untrusted users who -## can access your SocksPort may be able to learn about the connections -## you make. -#SocksPolicy accept 192.168.0.0/16 -#SocksPolicy reject * - -## Logs go to stdout at level "notice" unless redirected by something -## else, like one of the below lines. You can have as many Log lines as -## you want. -## -## We advise using "notice" in most cases, since anything more verbose -## may provide sensitive information to an attacker who obtains the logs. -## -## Send all messages of level 'notice' or higher to /var/log/tor/notices.log -#Log notice file /var/log/tor/notices.log -## Send every possible message to /var/log/tor/debug.log -#Log debug file /var/log/tor/debug.log -## Use the system log instead of Tor's logfiles -#Log notice syslog -## To send all messages to stderr: -#Log debug stderr - -## Uncomment this to start the process in the background... or use -## --runasdaemon 1 on the command line. This is ignored on Windows; -## see the FAQ entry if you want Tor to run as an NT service. -#RunAsDaemon 1 - -## The directory for keeping all the keys/etc. By default, we store -## things in $HOME/.tor on Unix, and in Application Data\tor on Windows. -#DataDirectory /var/lib/tor - -## The port on which Tor will listen for local connections from Tor -## controller applications, as documented in control-spec.txt. -ControlPort 9051 -## If you enable the controlport, be sure to enable one of these -## authentication methods, to prevent attackers from accessing it. -#HashedControlPassword 16:872860B76453A77D60CA2BB8C1A7042072093276A3D701AD684053EC4C -CookieAuthentication 1 - -############### This section is just for location-hidden services ### - -## Once you have configured a hidden service, you can look at the -## contents of the file ".../hidden_service/hostname" for the address -## to tell people. -## -## HiddenServicePort x y:z says to redirect requests on port x to the -## address y:z. - -#HiddenServiceDir /var/lib/tor/hidden_service/ -#HiddenServicePort 80 127.0.0.1:80 - -#HiddenServiceDir /var/lib/tor/other_hidden_service/ -#HiddenServicePort 80 127.0.0.1:80 -#HiddenServicePort 22 127.0.0.1:22 - -################ This section is just for relays ##################### -# -## See https://www.torproject.org/docs/tor-doc-relay for details. - -## Required: what port to advertise for incoming Tor connections. -#ORPort 9001 -## If you want to listen on a port other than the one advertised in -## ORPort (e.g. to advertise 443 but bind to 9090), you can do it as -## follows. You'll need to do ipchains or other port forwarding -## yourself to make this work. 
-#ORPort 443 NoListen -#ORPort 127.0.0.1:9090 NoAdvertise - -## The IP address or full DNS name for incoming connections to your -## relay. Leave commented out and Tor will guess. -#Address noname.example.com - -## If you have multiple network interfaces, you can specify one for -## outgoing traffic to use. -# OutboundBindAddress 10.0.0.5 - -## A handle for your relay, so people don't have to refer to it by key. -#Nickname ididnteditheconfig - -## Define these to limit how much relayed traffic you will allow. Your -## own traffic is still unthrottled. Note that RelayBandwidthRate must -## be at least 20 KB. -## Note that units for these config options are bytes per second, not bits -## per second, and that prefixes are binary prefixes, i.e. 2^10, 2^20, etc. -#RelayBandwidthRate 100 KB # Throttle traffic to 100KB/s (800Kbps) -#RelayBandwidthBurst 200 KB # But allow bursts up to 200KB/s (1600Kbps) - -## Use these to restrict the maximum traffic per day, week, or month. -## Note that this threshold applies separately to sent and received bytes, -## not to their sum: setting "4 GB" may allow up to 8 GB total before -## hibernating. -## -## Set a maximum of 4 gigabytes each way per period. -#AccountingMax 4 GB -## Each period starts daily at midnight (AccountingMax is per day) -#AccountingStart day 00:00 -## Each period starts on the 3rd of the month at 15:00 (AccountingMax -## is per month) -#AccountingStart month 3 15:00 - -## Administrative contact information for this relay or bridge. This line -## can be used to contact you if your relay or bridge is misconfigured or -## something else goes wrong. Note that we archive and publish all -## descriptors containing these lines and that Google indexes them, so -## spammers might also collect them. You may want to obscure the fact that -## it's an email address and/or generate a new address for this purpose. -#ContactInfo Random Person -## You might also include your PGP or GPG fingerprint if you have one: -#ContactInfo 0xFFFFFFFF Random Person - -## Uncomment this to mirror directory information for others. Please do -## if you have enough bandwidth. -#DirPort 9030 # what port to advertise for directory connections -## If you want to listen on a port other than the one advertised in -## DirPort (e.g. to advertise 80 but bind to 9091), you can do it as -## follows. below too. You'll need to do ipchains or other port -## forwarding yourself to make this work. -#DirPort 80 NoListen -#DirPort 127.0.0.1:9091 NoAdvertise -## Uncomment to return an arbitrary blob of html on your DirPort. Now you -## can explain what Tor is if anybody wonders why your IP address is -## contacting them. See contrib/tor-exit-notice.html in Tor's source -## distribution for a sample. -#DirPortFrontPage /etc/tor/tor-exit-notice.html - -## Uncomment this if you run more than one Tor relay, and add the identity -## key fingerprint of each Tor relay you control, even if they're on -## different networks. You declare it here so Tor clients can avoid -## using more than one of your relays in a single circuit. See -## https://www.torproject.org/docs/faq#MultipleRelays -## However, you should never include a bridge's fingerprint here, as it would -## break its concealability and potentionally reveal its IP/TCP address. -#MyFamily $keyid,$keyid,... - -## A comma-separated list of exit policies. They're considered first -## to last, and the first match wins. If you want to _replace_ -## the default exit policy, end this with either a reject *:* or an -## accept *:*. 
Otherwise, you're _augmenting_ (prepending to) the -## default exit policy. Leave commented to just use the default, which is -## described in the man page or at -## https://www.torproject.org/documentation.html -## -## Look at https://www.torproject.org/faq-abuse.html#TypicalAbuses -## for issues you might encounter if you use the default exit policy. -## -## If certain IPs and ports are blocked externally, e.g. by your firewall, -## you should update your exit policy to reflect this -- otherwise Tor -## users will be told that those destinations are down. -## -## For security, by default Tor rejects connections to private (local) -## networks, including to your public IP address. See the man page entry -## for ExitPolicyRejectPrivate if you want to allow "exit enclaving". -## -#ExitPolicy accept *:6660-6667,reject *:* # allow irc ports but no more -#ExitPolicy accept *:119 # accept nntp as well as default exit policy -#ExitPolicy reject *:* # no exits allowed - -## Bridge relays (or "bridges") are Tor relays that aren't listed in the -## main directory. Since there is no complete public list of them, even an -## ISP that filters connections to all the known Tor relays probably -## won't be able to block all the bridges. Also, websites won't treat you -## differently because they won't know you're running Tor. If you can -## be a real relay, please do; but if not, be a bridge! -#BridgeRelay 1 -## By default, Tor will advertise your bridge to users through various -## mechanisms like https://bridges.torproject.org/. If you want to run -## a private bridge, for example because you'll give out your bridge -## address manually to your friends, uncomment this line: -#PublishServerDescriptor 0 - From 1ea23e24db480a5f330273dc61b81877d30f5ab5 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 20:53:13 +0100 Subject: [PATCH 11/20] Removed some unneeded requirements --- requirements.lock | 6 ------ requirements.txt | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/requirements.lock b/requirements.lock index ffb10a1..4792d19 100644 --- a/requirements.lock +++ b/requirements.lock @@ -3,19 +3,13 @@ APScheduler==3.6.3 python-twitter==3.5 dictdiffer==0.8.1 beautifulsoup4==4.8.2 -stem==1.8.0 fake-useragent==0.1.11 ## The following requirements were added by pip freeze: certifi==2019.11.28 -cffi==1.14.0 chardet==3.0.4 -cryptography==2.8 future==0.18.2 idna==2.9 oauthlib==3.1.0 -pycparser==2.20 -pyOpenSSL==19.1.0 -PySocks==1.7.1 pytz==2019.3 requests-oauthlib==1.3.0 six==1.14.0 diff --git a/requirements.txt b/requirements.txt index 2f48c8a..a661888 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -requests[socks,security] +requests APScheduler python-twitter dictdiffer beautifulsoup4 -stem fake_useragent \ No newline at end of file From 44437a95ca0b60230ace0d63bc7804cc9ff92022 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 20:53:48 +0100 Subject: [PATCH 12/20] Tweaked a bit the main function for debug mode --- src/__main__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/__main__.py b/src/__main__.py index 1e13668..7797e6a 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -13,9 +13,8 @@ else: log.info('Running jobs manually since DEBUG is enabled.') while True: - time.sleep(5) - scheduler.run() time.sleep(30) + scheduler.run() except Exception as e: log.error(f'Unexpected error {e} in Covid-19') From 26a16abe2424f5c3413ba55bead63940f9d99d6a Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 
Mar 2020 21:02:13 +0100 Subject: [PATCH 13/20] Added sockets protocol for requests --- requirements.lock | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.lock b/requirements.lock index 4792d19..264e9fd 100644 --- a/requirements.lock +++ b/requirements.lock @@ -10,6 +10,7 @@ chardet==3.0.4 future==0.18.2 idna==2.9 oauthlib==3.1.0 +PySocks==1.7.1 pytz==2019.3 requests-oauthlib==1.3.0 six==1.14.0 diff --git a/requirements.txt b/requirements.txt index a661888..a5e6278 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -requests +requests[socks] APScheduler python-twitter dictdiffer From 8570a27adf43de705f1ccc9c8da2629a4a8d432f Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 21:02:44 +0100 Subject: [PATCH 14/20] Removed tor installation on Python Dockerfile --- Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index fd27a29..b8f8fbe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,6 @@ FROM python:3.7 ADD . /srv/covid19-bot WORKDIR /srv/covid19-bot -RUN apt update -RUN apt install tor -y -COPY tor/torrc /etc/tor/torrc -RUN service tor restart RUN pip install --upgrade pip RUN pip3 install -r requirements.lock CMD python3 -m src \ No newline at end of file From f2d41d11879d86556213ddce1db5772585c4832f Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 21:05:32 +0100 Subject: [PATCH 15/20] Added dperson/torproxy Docker container --- docker-compose.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 3107431..8fed8ba 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ -version: '3.5' +version: '3' services: + covid19-bot: image: covid19-bot container_name: covid19-bot @@ -11,4 +12,13 @@ services: restart: always volumes: - './data:/srv/covid19-bot/data' - - './tor:/srv/covid19-bot/tor' + links: + - covid19-bot-tor + + covid19-bot-tor: + image: dperson/torproxy + container_name: covid19-bot + ports: + - 8118:8118 + - 9050:9050 + - 9051:9051 \ No newline at end of file From da41f1c5c6d2b712ff6575416cb0e049347900f6 Mon Sep 17 00:00:00 2001 From: Albert Suarez Date: Sat, 14 Mar 2020 21:06:25 +0100 Subject: [PATCH 16/20] Implemented Tor interaction --- src/config.py | 5 ++-- src/worldometers/retriever.py | 46 ++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/config.py b/src/config.py index 979799a..b4372eb 100644 --- a/src/config.py +++ b/src/config.py @@ -13,8 +13,9 @@ TIME_BETWEEN_TWEETS = 5 * 60 # Scrapping -TOR_ENABLE = False -SCRAPE_PROXY = 'socks5://127.0.0.1:9050' +TOR_ENABLE = True +IP_ECHO_ENDPOINT = 'http://ipecho.net/plain' +HTTP_PROXY = 'socks5://covid-bot-tor:9050' SCRAPE_RETRIES_AMOUNT = 10 SCRAPE_RTD_ERROR_MINIMUM = 3 SCRAPE_RTD_ERROR_MAXIMUM = 5 diff --git a/src/worldometers/retriever.py b/src/worldometers/retriever.py index 706bf2c..4baf9db 100644 --- a/src/worldometers/retriever.py +++ b/src/worldometers/retriever.py @@ -1,31 +1,55 @@ import random import time import requests -from bs4 import BeautifulSoup +import telnetlib +from bs4 import BeautifulSoup from fake_useragent import UserAgent -from stem import Signal -from stem.control import Controller -from src.config import SCRAPE_RETRIES_AMOUNT, SCRAPE_PROXY, SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM, \ - WORLDOMETERS_URL, TOR_ENABLE +from src.config import SCRAPE_RETRIES_AMOUNT, SCRAPE_RTD_ERROR_MINIMUM, 
SCRAPE_RTD_ERROR_MAXIMUM, \
+    WORLDOMETERS_URL, TOR_ENABLE, IP_ECHO_ENDPOINT, HTTP_PROXY
 from src.helper import log


+def __get_ip():
+    return requests.get(IP_ECHO_ENDPOINT, proxies={'http': HTTP_PROXY}).text
+
+
+def __request_ip_change():
+    tn = telnetlib.Telnet('covid19-bot-tor', 9051)
+    tn.read_until("Escape character is '^]'.", 2)
+    tn.write('AUTHENTICATE ""\r\n')
+    tn.read_until('250 OK', 2)
+    tn.write('signal NEWNYM\r\n')
+    tn.read_until('250 OK', 2)
+    tn.write('quit\r\n')
+    tn.close()
+
+
+def __wait_for_ip_confirmation(ip_address):
+    while True:
+        new_ip_address = __get_ip()
+        if new_ip_address == ip_address:
+            time.sleep(1)
+        else:
+            log.info(f'New IP address allocated: [{new_ip_address}]')
+            break
+
+
 def _get_html(url):
     """
     Retrieves the HTML content given a Internet accessible URL.
     :param url: URL to retrieve.
     :return: HTML content formatted as String, None if there was an error.
     """
+    if TOR_ENABLE:
+        ip_address = __get_ip()
+        log.info(f'Current IP address: [{ip_address}]')
+        __request_ip_change()
+        __wait_for_ip_confirmation(ip_address)
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
-            proxies = {}
-            if TOR_ENABLE:
-                with Controller.from_port(port=9051) as c:
-                    c.authenticate()
-                    c.signal(Signal.NEWNYM)
-                proxies = {'http': SCRAPE_PROXY, 'https': SCRAPE_PROXY}
+            proxies = {'http': HTTP_PROXY} if TOR_ENABLE else {}
             headers = {'User-Agent': UserAgent().random}
             response = requests.get(url, proxies=proxies, headers=headers)
             assert response.ok

From 7db341b56593946c6e1b43cda339629769bfced8 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Sat, 14 Mar 2020 21:24:54 +0100
Subject: [PATCH 17/20] Inspected code

---
 src/johns_hopkins/retriever.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/johns_hopkins/retriever.py b/src/johns_hopkins/retriever.py
index 8a81a9b..b2faa11 100644
--- a/src/johns_hopkins/retriever.py
+++ b/src/johns_hopkins/retriever.py
@@ -4,7 +4,8 @@

 from io import StringIO

-from src.config import JOHNS_HOPKINS_USER_AGENT, JOHNS_HOPKINS_DATA_ATTEMPTS, JOHNS_HOPKINS_DATA_TIMEOUT, JOHNS_HOPKINS_DATA_RTD
+from src.config import JOHNS_HOPKINS_USER_AGENT, JOHNS_HOPKINS_DATA_ATTEMPTS, JOHNS_HOPKINS_DATA_TIMEOUT, \
+    JOHNS_HOPKINS_DATA_RTD
 from src.helper import log


@@ -17,7 +18,9 @@ def get_last_update(data_url):
     result_dict = dict()
     for attempt in range(JOHNS_HOPKINS_DATA_ATTEMPTS):
         try:
-            response = requests.get(data_url, headers={'User-Agent': JOHNS_HOPKINS_USER_AGENT}, timeout=JOHNS_HOPKINS_DATA_TIMEOUT)
+            response = requests.get(
+                data_url, headers={'User-Agent': JOHNS_HOPKINS_USER_AGENT}, timeout=JOHNS_HOPKINS_DATA_TIMEOUT
+            )
             rows = [row for row in csv.reader(StringIO(response.text), delimiter=',')][1:]
             for row in rows:
                 dict_key = row[1] if not row[0] else f'{row[0]}, {row[1]}'
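PATCH 16 above rotates Tor circuits by scripting the control-port protocol over telnet (`AUTHENTICATE`, then `signal NEWNYM`), and PATCH 18 below drops that again in favour of a plain SOCKS proxy. For reference, the `stem` controller that PATCH 02 originally imported wraps the same exchange; this is a minimal sketch, assuming a Tor ControlPort on 9051 with cookie authentication enabled as in the torrc from PATCH 06 — illustrative only, not code from this series:

```python
import time

from stem import Signal
from stem.control import Controller


def rotate_circuit(control_port=9051):
    # The same exchange PATCH 16 scripts over telnet: authenticate, then ask
    # Tor for a fresh circuit (and therefore, usually, a fresh exit IP).
    with Controller.from_port(port=control_port) as controller:
        controller.authenticate()  # cookie auth, per `CookieAuthentication 1`
        controller.signal(Signal.NEWNYM)
    # Tor rate-limits NEWNYM; give it a moment before reusing the proxy.
    time.sleep(10)


if __name__ == '__main__':
    rotate_circuit()
```

`signal NEWNYM` is a request rather than a guarantee of a new exit IP, which is why PATCH 16 also polls an IP-echo endpoint until the visible address actually changes.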
From 48dca5f924f15ec3a973ac859a40635ee03553ae Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Sat, 14 Mar 2020 21:25:19 +0100
Subject: [PATCH 18/20] Got Tor working

---
 docker-compose.yml            |  5 ++---
 src/config.py                 |  1 -
 src/worldometers/retriever.py | 33 +--------------------------------
 3 files changed, 3 insertions(+), 36 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 8fed8ba..a6085fa 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,8 +17,7 @@

   covid19-bot-tor:
     image: dperson/torproxy
-    container_name: covid19-bot
+    container_name: covid19-bot-tor
     ports:
       - 8118:8118
-      - 9050:9050
-      - 9051:9051
\ No newline at end of file
+      - 9050:9050
\ No newline at end of file
diff --git a/src/config.py b/src/config.py
index b4372eb..884f871 100644
--- a/src/config.py
+++ b/src/config.py
@@ -14,7 +14,6 @@

 # Scrapping
 TOR_ENABLE = True
-IP_ECHO_ENDPOINT = 'http://ipecho.net/plain'
 HTTP_PROXY = 'socks5://covid-bot-tor:9050'
 SCRAPE_RETRIES_AMOUNT = 10
 SCRAPE_RTD_ERROR_MINIMUM = 3
 SCRAPE_RTD_ERROR_MAXIMUM = 5
diff --git a/src/worldometers/retriever.py b/src/worldometers/retriever.py
index 4baf9db..6166c57 100644
--- a/src/worldometers/retriever.py
+++ b/src/worldometers/retriever.py
@@ -1,52 +1,21 @@
 import random
 import time
 import requests
-import telnetlib

 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent

 from src.config import SCRAPE_RETRIES_AMOUNT, SCRAPE_RTD_ERROR_MINIMUM, SCRAPE_RTD_ERROR_MAXIMUM, \
-    WORLDOMETERS_URL, TOR_ENABLE, IP_ECHO_ENDPOINT, HTTP_PROXY
+    WORLDOMETERS_URL, TOR_ENABLE, HTTP_PROXY
 from src.helper import log


-def __get_ip():
-    return requests.get(IP_ECHO_ENDPOINT, proxies={'http': HTTP_PROXY}).text
-
-
-def __request_ip_change():
-    tn = telnetlib.Telnet('covid19-bot-tor', 9051)
-    tn.read_until("Escape character is '^]'.", 2)
-    tn.write('AUTHENTICATE ""\r\n')
-    tn.read_until('250 OK', 2)
-    tn.write('signal NEWNYM\r\n')
-    tn.read_until('250 OK', 2)
-    tn.write('quit\r\n')
-    tn.close()
-
-
-def __wait_for_ip_confirmation(ip_address):
-    while True:
-        new_ip_address = __get_ip()
-        if new_ip_address == ip_address:
-            time.sleep(1)
-        else:
-            log.info(f'New IP address allocated: [{new_ip_address}]')
-            break
-
-
 def _get_html(url):
     """
     Retrieves the HTML content given a Internet accessible URL.
     :param url: URL to retrieve.
     :return: HTML content formatted as String, None if there was an error.
     """
-    if TOR_ENABLE:
-        ip_address = __get_ip()
-        log.info(f'Current IP address: [{ip_address}]')
-        __request_ip_change()
-        __wait_for_ip_confirmation(ip_address)
     for i in range(0, SCRAPE_RETRIES_AMOUNT):
         try:
             proxies = {'http': HTTP_PROXY} if TOR_ENABLE else {}
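With the torproxy container wired up as above, the proxied path can be sanity-checked from the bot container. A hypothetical check script — not part of the repo — assuming the SOCKS endpoint in `HTTP_PROXY` resolves from where it runs (substitute `localhost:9050` outside Docker) and using `api.ipify.org` as an illustrative IP-echo service. Note that `requests` applies the `'http'` proxy key only to `http://` URLs, so an `https://` target needs the `'https'` key as well:

```python
import requests

# Same value src/config.py uses after the patches above.
HTTP_PROXY = 'socks5://covid-bot-tor:9050'


def exit_ip(proxies=None):
    # api.ipify.org echoes back the caller's public IP address.
    return requests.get('https://api.ipify.org', proxies=proxies, timeout=30).text


if __name__ == '__main__':
    direct = exit_ip()
    proxied = exit_ip({'http': HTTP_PROXY, 'https': HTTP_PROXY})
    print(f'direct={direct} proxied={proxied} tor_in_use={direct != proxied}')
```

Running this needs the `requests[socks]` extra (PySocks) that PATCH 13 added to the requirements.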
From 3d592fce8b24aeb1f2deb768d74293a7c18e5eeb Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Sat, 14 Mar 2020 21:35:47 +0100
Subject: [PATCH 19/20] Improved README

---
 README.md | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index fbe243e..99a1807 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@

 ## Data source

-Real time data is being retrieved from the [Novel Coronavirus (COVID-19) Cases repository](https://github.com/CSSEGISandData/COVID-19), by [JHU CSSE](https://systems.jhu.edu/research/public-health/ncov/).
+Real-time data is retrieved from the [Worldometer website](https://www.worldometers.info/coronavirus/).

 ## Python requirements

@@ -23,6 +23,9 @@ This project is using Python3.7. All these requirements have been specified in t
 2. [Dictdiffer](https://dictdiffer.readthedocs.io/en/latest/): used for checking data differences easier.
 3. [APScheduler](https://apscheduler.readthedocs.io/en/stable/): used for scheduling jobs in a certain time.
 4. [Twitter](https://python-twitter.readthedocs.io/en/latest/): used for posting tweets.
+5. [BeautifulSoup](https://pypi.org/project/beautifulsoup4/): used for scraping HTML content.
+6. [Tor](https://2019.www.torproject.org/docs/debian.html.en): used for anonymizing requests behind rotating exit IPs.
+7. [Fake User-Agent](https://pypi.org/project/fake-useragent/): used for sending a random User-Agent with every request.

 ## Recommendations

@@ -49,13 +52,7 @@ To run this script, please execute the following from the root directory:
    TWITTER_ACCESS_TOKEN_SECRET={TWITTER_ACCESS_TOKEN_SECRET}
    ```

-4. Run the script as a Python module
-
-   ```bash
-   python3 -m src
-   ```
-
-   or as a Docker container
+4. Run the script as a Docker container

    ```bash
    docker-compose up -d --build
    ```

From 4fffef769d181ec0a9e4e34535763d91a91155e6 Mon Sep 17 00:00:00 2001
From: Albert Suarez
Date: Sat, 14 Mar 2020 21:42:28 +0100
Subject: [PATCH 20/20] Reduced time between tweets from 5 to 1 minute

---
 src/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/config.py b/src/config.py
index 884f871..93e76bb 100644
--- a/src/config.py
+++ b/src/config.py
@@ -10,7 +10,7 @@

 # Time
 TIME_BETWEEN_RESOURCES = 15
-TIME_BETWEEN_TWEETS = 5 * 60
+TIME_BETWEEN_TWEETS = 1 * 60  # 1 minute

 # Scrapping
 TOR_ENABLE = True
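One data-flow detail worth pinning down for anyone reading the runner changes in PATCHes 05 and 07: the whole notification path rests on the tuple shapes `dictdiffer.diff` emits. A standalone illustration — the country names and counts here are placeholders, not data from the bot:

```python
import dictdiffer

before = {'Spain': 120, 'Italy': 1000}
after = {'Spain': 200, 'Italy': 1000, 'France': 10}

# diff(first, second) describes how to turn `first` into `second`.
for diff_tuple in dictdiffer.diff(before, after):
    print(diff_tuple)
# ('change', 'Spain', (120, 200))
# ('add', '', [('France', 10)])
```

A `'change'` tuple carries `(first_value, second_value)` in `diff_tuple[2]`; since runner.py calls `dictdiffer.diff(results, old_results)` with the fresh results first, `diff_tuple[2][0] - diff_tuple[2][1]` is exactly the new-minus-old delta that `_notify_changes` turns into a tweet.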