Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #2015 - Update topsites.py for IAM user credential #2021

Merged
merged 1 commit into from
Jan 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions tests/test_topsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@


TEST_XML = '''<?xml version="1.0"?><aws:TopSitesResponse xmlns:aws="http://alexa.amazonaws.com/doc/2005-10-05/"><aws:Response><aws:OperationRequest><aws:RequestId>9ffc5e13-175e-4c7e-b33b-0efe3501d1f3</aws:RequestId></aws:OperationRequest><aws:TopSitesResult><aws:Alexa><aws:TopSites><aws:List><aws:CountryName>China</aws:CountryName><aws:CountryCode>CN</aws:CountryCode><aws:TotalSites>671496</aws:TotalSites><aws:Sites><aws:Site><aws:DataUrl>baidu.com</aws:DataUrl><aws:Country><aws:Rank>1</aws:Rank><aws:Reach><aws:PerMillion>358000</aws:PerMillion></aws:Reach><aws:PageViews><aws:PerMillion>77410</aws:PerMillion><aws:PerUser>11.5</aws:PerUser></aws:PageViews></aws:Country><aws:Global><aws:Rank>4</aws:Rank></aws:Global></aws:Site></aws:Sites></aws:List></aws:TopSites></aws:Alexa></aws:TopSitesResult><aws:ResponseStatus><aws:StatusCode>Success</aws:StatusCode></aws:ResponseStatus></aws:Response></aws:TopSitesResponse>''' # nopep8
# Expected values for a 'CN' request starting at rank 1, built with the
# access key configured in setUp() and the clock patched to 2006-01-01 UTC.
# The signature travels in the Authorization header, not in the URI.
TEST_QUERY_STRING = 'Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # nopep8
TEST_QUERY_URI = 'https://ats.amazonaws.com/api?Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # nopep8
TEST_QUERY_AUTH = 'AWS4-HMAC-SHA256 Credential=1234567890ABCDEFGHIJ/20060101/us-west-1/AlexaTopSites/aws4_request, SignedHeaders=host;x-amz-date, Signature=55b760bcae9a2ae93b0d08a85c3e613ec43c7f39f69ef2345896cf7660234f49' # nopep8
TEST_QUERY_TIMESTAMP = '20060101T000000Z'


class TestTopsites(unittest.TestCase):
Expand All @@ -26,11 +28,14 @@ def setUp(self):
topsites.ats_access_key = '1234567890ABCDEFGHIJ'
topsites.ats_secret_key = 'JIHGFEDCBA0987654321'

def test_build_uri(self):
def test_build_request(self):
testdt = datetime.datetime(2006, 1, 1, 0, 0, 0, 0)
with patch('datetime.datetime') as dt_mock:
dt_mock.utcnow.return_value = testdt
self.assertEqual(topsites.build_uri('CN', 1), TEST_QUERY_URI)
uri, authorization, timestamp = topsites.build_request('CN', 1)
self.assertEqual(uri, TEST_QUERY_URI)
self.assertEqual(authorization, TEST_QUERY_AUTH)
self.assertEqual(timestamp, TEST_QUERY_TIMESTAMP)

def test_build_query_string(self):
testdt = datetime.datetime(2006, 1, 1, 0, 0, 0, 0)
Expand Down
124 changes: 87 additions & 37 deletions tools/topsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,19 @@

# Constants for the Alexa Top Sites (ATS) API.
ATS_ACTION_NAME = 'TopSites'
ATS_ALGORITHM = 'AWS4-HMAC-SHA256'
# Number of sites requested per API call.
ATS_COUNT = 100
# Format of the timestamp sent in the 'x-amz-date' header.
ATS_DATEFORMAT_AWS = '%Y%m%dT%H%M%SZ'
# Format of the date used in the credential scope and the signing key.
ATS_DATEFORMAT_CREDENTIAL = '%Y%m%d'
ATS_HASH_ALGORITHM = 'HmacSHA256'
ATS_RESPONSE_GROUP_NAME = 'Country'
# Host the request URL points at.
ATS_SERVICE_HOST = 'ats.amazonaws.com'
# NOTE(review): timestamp format of the previous query-string signing
# scheme; kept in case other code still refers to it -- confirm and drop.
ATS_DATEFORMAT = '%Y-%m-%dT%H:%M:%S.%f'
# Host name written into the signed canonical 'host' header.
ATS_SERVICE_ENDPOINT = 'ats.us-west-1.amazonaws.com'
ATS_SERVICE_URI = '/api'
ATS_SERVICE_REGION = 'us-west-1'
ATS_SERVICE_NAME = 'AlexaTopSites'
ATS_SIGNED_HEADERS = 'host;x-amz-date'
# Base URL without the '?'; the query string is appended by the caller.
ATS_AWS_BASE_URL = 'https://' + ATS_SERVICE_HOST + ATS_SERVICE_URI

# Location of the DB and its backup.
DB_PATH = app.config['DATA_PATH']
Expand Down Expand Up @@ -82,29 +89,29 @@ def __init__(self, url, priority, country_code, ranking):
def query_topsites(country_code, count=1000):
    """Query top sites with given country code and count.

    Sends one signed request per page of 100 rankings (start ranking 1,
    101, 201, ... below ``count``), parses every ``aws:Site`` element of a
    successful response and commits the session after each page.

    Args:
        country_code: Country code passed to the API and to parse_site().
        count: Exclusive upper bound for the start ranking of a page.

    Errors reported by the service and connection failures are printed;
    the loop then continues with the next page.
    """
    for index in range(1, count, 100):
        uri, authorization, timestamp = build_request(country_code, index)
        try:
            # The signature is sent in headers, not in the query string.
            headers = {'x-amz-date': timestamp, 'Authorization': authorization}
            response = requests.get(uri, headers=headers)
            dom = parseString(response.content)

            if response.status_code == 200:
                # See https://docs.aws.amazon.com/AlexaTopSites/latest/QUERY_QueryRequests.html # nopep8
                sites = dom.getElementsByTagName('aws:Site')
                for site in sites:
                    parse_site(site, country_code)
                session.commit()
            else:
                # Get error code and message from response
                # See https://docs.aws.amazon.com/AlexaTopSites/latest/Authentication.html # nopep8
                message = node_text(dom, 'aws:ErrorCode')
                print('Send request to {} get error message: \n{}'.format(
                    uri, message))
        except ConnectionError:
            print('Unable send request to {}'.format(uri))


def parse_site(site):
def parse_site(site, country_code):
"""Parse given dom object."""
url = node_text(site, 'aws:DataUrl')
rank = int(node_text(site, 'aws:Rank'))
Expand Down Expand Up @@ -135,47 +142,90 @@ def parse_site(site):
site_row.ranking = rank


def build_request(country_code, start_ranking):
    """Build a signed Alexa top site request for a country and start ranking.

    Signs a GET request with AWS Signature Version 4 using the module-level
    ``ats_access_key`` / ``ats_secret_key`` and the current UTC time.

    Args:
        country_code: Country code placed in the query string.
        start_ranking: First ranking of the requested page.

    Returns:
        A ``(uri, authorization, timestamp)`` tuple: the request URI, the
        value for the ``Authorization`` header and the value for the
        ``x-amz-date`` header.
    """
    # Prepare timestamp & datestamp
    nowtime = datetime.datetime.utcnow()
    timestamp = nowtime.strftime(ATS_DATEFORMAT_AWS)
    datestamp = nowtime.strftime(ATS_DATEFORMAT_CREDENTIAL)

    # Build canonical request
    canonical_query = build_query_string(country_code, start_ranking)
    canonical_headers = 'host:{host}\nx-amz-date:{amzdate}\n'.format(
        host=ATS_SERVICE_ENDPOINT,
        amzdate=timestamp)
    # The GET request has no body, so the payload hash is that of b''.
    # Bytes are passed because hashlib rejects text strings on Python 3.
    payload_hash = get_sha256_hex(b'')

    # canonical_headers already ends with '\n'; the extra '\n' before the
    # signed headers yields the blank line that terminates the header list.
    canonical_request = (
        'GET\n{service_uri}\n{query}\n{headers}\n{signed_headers}\n'
        '{payload_hash}').format(
            service_uri=ATS_SERVICE_URI,
            query=canonical_query,
            headers=canonical_headers,
            signed_headers=ATS_SIGNED_HEADERS,
            payload_hash=payload_hash)

    # Create string to sign from request
    credential_scope = (
        '{datestamp}/{service_region}/{service_name}/aws4_request').format(
            datestamp=datestamp,
            service_region=ATS_SERVICE_REGION,
            service_name=ATS_SERVICE_NAME)
    to_sign = '{algorithm}\n{timestamp}\n{scope}\n{sha_request}'.format(
        algorithm=ATS_ALGORITHM,
        timestamp=timestamp,
        scope=credential_scope,
        sha_request=get_sha256_hex(canonical_request.encode('utf-8')))

    # Calculate signature
    key = get_sign_key(ats_secret_key, datestamp,
                       ATS_SERVICE_REGION, ATS_SERVICE_NAME)
    signature = gen_sign_hex(key, to_sign)

    uri = '{base}?{query}'.format(
        base=ATS_AWS_BASE_URL,
        query=canonical_query)

    authorization = (
        '{algorithm} Credential={access_key}/{scope}, '
        'SignedHeaders={signed_headers}, Signature={signature}').format(
            algorithm=ATS_ALGORITHM,
            access_key=ats_access_key,
            scope=credential_scope,
            signed_headers=ATS_SIGNED_HEADERS,
            signature=signature)

    return uri, authorization, timestamp


def build_query_string(country_code, start_ranking):
    """Build query string for request with start ranking and count.

    Args:
        country_code: Value of the ``CountryCode`` parameter.
        start_ranking: Value of the ``Start`` parameter.

    Returns:
        The URL-encoded query string, without a leading '?'.
    """
    # Alexa top site only accept request with ordered query parameters
    # Keep the order! The same string is used as the canonical query of
    # the signed request, so the parameters stay sorted by name.
    query_params = [
        ('Action', ATS_ACTION_NAME),
        ('Count', ATS_COUNT),
        ('CountryCode', country_code),
        ('ResponseGroup', ATS_RESPONSE_GROUP_NAME),
        ('Start', start_ranking)]
    return urlencode(query_params)


def get_sign_key(key, datestamp, region_name, service_name):
    """AWS sign key from key, datestamp, region and service.

    Derives the signing key by chaining HMAC-SHA256 over the datestamp,
    the region, the service name and the literal 'aws4_request', starting
    from the secret key prefixed with 'AWS4'.

    Args:
        key: Secret access key as text.
        datestamp: Date string used in the credential scope.
        region_name: Region name used in the credential scope.
        service_name: Service name used in the credential scope.

    Returns:
        The raw digest produced by the last gen_sign() call.
    """
    # Each step keys the next HMAC with the previous step's raw digest.
    date_key = gen_sign(('AWS4' + key).encode('utf-8'), datestamp)
    region_key = gen_sign(date_key, region_name)
    service_key = gen_sign(region_key, service_name)
    return gen_sign(service_key, 'aws4_request')


def get_sha256_hex(data):
    """Return the SHA-256 digest of ``data`` as a hexadecimal string.

    Args:
        data: Bytes or text to hash. Text is encoded as UTF-8 first,
            because hashlib only accepts bytes on Python 3.

    Returns:
        The 64-character lowercase hex digest.
    """
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    return hashlib.sha256(data).hexdigest()


def gen_sign(key, data):
    """Compute RFC 2104-compliant HMAC signature.

    Args:
        key: HMAC key, as bytes or text (text is encoded as UTF-8).
        data: Message to sign, as bytes or text (text is encoded as UTF-8).

    Returns:
        The raw HMAC-SHA256 digest.
    """
    if not isinstance(key, bytes):
        key = key.encode('utf-8')
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    return hmac.new(key, data, hashlib.sha256).digest()


def gen_sign_hex(key, data):
    """Compute RFC 2104-compliant HMAC signature as a hex string.

    Args:
        key: HMAC key, as bytes or text (text is encoded as UTF-8).
        data: Message to sign, as bytes or text (text is encoded as UTF-8).

    Returns:
        The HMAC-SHA256 digest as a lowercase hexadecimal string.
    """
    if not isinstance(key, bytes):
        key = key.encode('utf-8')
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    return hmac.new(key, data, hashlib.sha256).hexdigest()


def node_text(tree, tag_name):
Expand Down