Skip to content

Commit

Permalink
Fix #2015 - Update topsites.py for IAM user credential
Browse files Browse the repository at this point in the history
Use HTTP GET and a different signature than the one used with root credentials
Fix unittest cases
  • Loading branch information
MDTsai committed Jan 12, 2018
1 parent 42cfdc2 commit 683a3bf
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 42 deletions.
14 changes: 9 additions & 5 deletions tests/test_topsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,25 @@


TEST_XML = '''<?xml version="1.0"?><aws:TopSitesResponse xmlns:aws="http://alexa.amazonaws.com/doc/2005-10-05/"><aws:Response><aws:OperationRequest><aws:RequestId>9ffc5e13-175e-4c7e-b33b-0efe3501d1f3</aws:RequestId></aws:OperationRequest><aws:TopSitesResult><aws:Alexa><aws:TopSites><aws:List><aws:CountryName>China</aws:CountryName><aws:CountryCode>CN</aws:CountryCode><aws:TotalSites>671496</aws:TotalSites><aws:Sites><aws:Site><aws:DataUrl>baidu.com</aws:DataUrl><aws:Country><aws:Rank>1</aws:Rank><aws:Reach><aws:PerMillion>358000</aws:PerMillion></aws:Reach><aws:PageViews><aws:PerMillion>77410</aws:PerMillion><aws:PerUser>11.5</aws:PerUser></aws:PageViews></aws:Country><aws:Global><aws:Rank>4</aws:Rank></aws:Global></aws:Site></aws:Sites></aws:List></aws:TopSites></aws:Alexa></aws:TopSitesResult><aws:ResponseStatus><aws:StatusCode>Success</aws:StatusCode></aws:ResponseStatus></aws:Response></aws:TopSitesResponse>''' # nopep8
# Expected values for a request for the CN top sites starting at rank 1,
# signed with the fake credentials installed in setUp at
# 2006-01-01T00:00:00Z.
# The query no longer carries any signing parameter: the signature travels in
# the Authorization header (TEST_QUERY_AUTH) and the timestamp in x-amz-date.
TEST_QUERY_STRING = 'Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # nopep8
TEST_QUERY_URI = 'https://ats.amazonaws.com/api?Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # nopep8
TEST_QUERY_AUTH = 'AWS4-HMAC-SHA256 Credential=1234567890ABCDEFGHIJ/20060101/us-west-1/AlexaTopSites/aws4_request, SignedHeaders=host;x-amz-date, Signature=55b760bcae9a2ae93b0d08a85c3e613ec43c7f39f69ef2345896cf7660234f49' # nopep8
TEST_QUERY_TIMESTAMP = '20060101T000000Z'

class TestTopsites(unittest.TestCase):
def setUp(self):
    """Install fake ATS credentials and parse the canned XML response."""
    # Fixed credentials keep the computed signature reproducible.
    topsites.ats_access_key = '1234567890ABCDEFGHIJ'
    topsites.ats_secret_key = 'JIHGFEDCBA0987654321'
    self.dom = parseString(TEST_XML)

def test_build_request(self):
    """build_request returns the expected URI, auth header and timestamp."""
    testdt = datetime.datetime(2006, 1, 1, 0, 0, 0, 0)
    # Freeze the clock so the timestamp and signature are deterministic.
    with patch('datetime.datetime') as dt_mock:
        dt_mock.utcnow.return_value = testdt
        uri, authorization, timestamp = topsites.build_request('CN', 1)
        self.assertEqual(uri, TEST_QUERY_URI)
        self.assertEqual(authorization, TEST_QUERY_AUTH)
        self.assertEqual(timestamp, TEST_QUERY_TIMESTAMP)

def test_build_query_string(self):
testdt = datetime.datetime(2006, 1, 1, 0, 0, 0, 0)
Expand Down
124 changes: 87 additions & 37 deletions tools/topsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,19 @@

# Constants for the Alexa Top Sites (ATS) API.

# Query parameters sent with every request.
ATS_ACTION_NAME = 'TopSites'
ATS_COUNT = 100
ATS_RESPONSE_GROUP_NAME = 'Country'

# Names from the previous signing scheme; kept so that any remaining
# reference keeps resolving.
ATS_DATEFORMAT = '%Y-%m-%dT%H:%M:%S.%f'
ATS_HASH_ALGORITHM = 'HmacSHA256'

# Signature Version 4 parameters.
ATS_ALGORITHM = 'AWS4-HMAC-SHA256'
ATS_DATEFORMAT_AWS = '%Y%m%dT%H%M%SZ'
ATS_DATEFORMAT_CREDENTIAL = '%Y%m%d'
ATS_SERVICE_REGION = 'us-west-1'
ATS_SERVICE_NAME = 'AlexaTopSites'
ATS_SIGNED_HEADERS = 'host;x-amz-date'

# ATS_SERVICE_HOST is the host the request URL points at, while
# ATS_SERVICE_ENDPOINT is the host value written into the signed canonical
# headers.
ATS_SERVICE_HOST = 'ats.amazonaws.com'
ATS_SERVICE_ENDPOINT = 'ats.us-west-1.amazonaws.com'
ATS_SERVICE_URI = '/api'
ATS_AWS_BASE_URL = 'https://' + ATS_SERVICE_HOST + ATS_SERVICE_URI

# Location of the DB and its backup.
DB_PATH = app.config['DATA_PATH']
Expand Down Expand Up @@ -82,29 +89,29 @@ def __init__(self, url, priority, country_code, ranking):
def query_topsites(country_code, count=1000):
    """Query top sites with given country code and count.

    Requests the ranking page by page, starting at rank 1 and advancing by
    ATS_COUNT until ``count`` is reached. Each successful page is parsed
    site by site and committed; a non-200 response or a connection failure
    is reported on stdout and the loop moves on to the next page.

    :param country_code: country code passed to the API (e.g. ``'CN'``).
    :param count: upper bound (exclusive) of the start rankings requested.
    """
    # Step by the page size that build_query_string asks for, so consecutive
    # requests neither overlap nor leave gaps.
    for index in range(1, count, ATS_COUNT):
        uri, authorization, timestamp = build_request(country_code, index)
        try:
            # The signature is carried in headers, not in the query string.
            headers = {'x-amz-date': timestamp, 'Authorization': authorization}
            response = requests.get(uri, headers=headers)
            dom = parseString(response.content)

            if response.status_code == 200:
                # See https://docs.aws.amazon.com/AlexaTopSites/latest/QUERY_QueryRequests.html # nopep8
                sites = dom.getElementsByTagName('aws:Site')
                for site in sites:
                    parse_site(site, country_code)
                session.commit()
            else:
                # Get error message from response
                # See https://docs.aws.amazon.com/AlexaTopSites/latest/Authentication.html # nopep8
                message = node_text(dom, 'aws:ErrorCode')
                print('Send request to {} get error message: \n{}'.format(
                    uri, message))
        except ConnectionError:
            print('Unable send request to {}'.format(uri))


def parse_site(site):
def parse_site(site, country_code):
"""Parse given dom object."""
url = node_text(site, 'aws:DataUrl')
rank = int(node_text(site, 'aws:Rank'))
Expand Down Expand Up @@ -135,47 +142,90 @@ def parse_site(site):
site_row.ranking = rank


def build_request(country_code, start_ranking):
    """Build a signed Alexa top site request.

    Signs a GET request for the given country code and start ranking with
    AWS Signature Version 4, using the module-level ``ats_access_key`` and
    ``ats_secret_key``.

    :param country_code: country code placed in the query string.
    :param start_ranking: rank of the first site to request.
    :returns: tuple ``(uri, authorization, timestamp)`` where ``uri`` is the
        full request URL, ``authorization`` is the value for the
        ``Authorization`` header and ``timestamp`` is the value for the
        ``x-amz-date`` header.
    """
    # Prepare timestamp & datestamp from a single clock reading so both
    # always refer to the same instant.
    nowtime = datetime.datetime.utcnow()
    timestamp = nowtime.strftime(ATS_DATEFORMAT_AWS)
    datestamp = nowtime.strftime(ATS_DATEFORMAT_CREDENTIAL)

    # Build canonical request
    canonical_query = build_query_string(country_code, start_ranking)
    canonical_headers = 'host:{host}\nx-amz-date:{amzdate}\n'.format(
        host=ATS_SERVICE_ENDPOINT,
        amzdate=timestamp)
    # A GET request has no body, so the payload hash is that of "".
    payload_hash = get_sha256_hex("")

    canonical_request = (
        'GET\n{service_uri}\n{query}\n{headers}\n'
        '{signed_headers}\n{payload_hash}').format(
            service_uri=ATS_SERVICE_URI,
            query=canonical_query,
            headers=canonical_headers,
            signed_headers=ATS_SIGNED_HEADERS,
            payload_hash=payload_hash)

    # Create string to sign from request
    credential_scope = (
        '{datestamp}/{service_region}/{service_name}/aws4_request').format(
            datestamp=datestamp,
            service_region=ATS_SERVICE_REGION,
            service_name=ATS_SERVICE_NAME)
    to_sign = '{algorithm}\n{timestamp}\n{scope}\n{sha_request}'.format(
        algorithm=ATS_ALGORITHM,
        timestamp=timestamp,
        scope=credential_scope,
        sha_request=get_sha256_hex(canonical_request))

    # Calculate signature
    key = get_sign_key(
        ats_secret_key, datestamp, ATS_SERVICE_REGION, ATS_SERVICE_NAME)
    signature = gen_sign_hex(key, to_sign)

    uri = '{base}?{query}'.format(
        base=ATS_AWS_BASE_URL,
        query=canonical_query)

    authorization = (
        '{algorithm} Credential={access_key}/{scope}, '
        'SignedHeaders={signed_headers}, Signature={signature}').format(
            algorithm=ATS_ALGORITHM,
            access_key=ats_access_key,
            scope=credential_scope,
            signed_headers=ATS_SIGNED_HEADERS,
            signature=signature)

    return uri, authorization, timestamp


def build_query_string(country_code, start_ranking):
    """Build query string for request with start ranking and count.

    Only the API parameters are included; nothing related to signing is
    part of the query string.

    :param country_code: value of the ``CountryCode`` parameter.
    :param start_ranking: value of the ``Start`` parameter.
    :returns: the URL-encoded query string, without a leading ``?``.
    """
    # Alexa top site only accept request with ordered query parameters
    # Keep the order!
    query_params = [
        ('Action', ATS_ACTION_NAME),
        ('Count', ATS_COUNT),
        ('CountryCode', country_code),
        ('ResponseGroup', ATS_RESPONSE_GROUP_NAME),
        ('Start', start_ranking)]
    return urlencode(query_params)


def get_sign_key(key, datestamp, region_name, service_name):
    """AWS sign key from key, datestamp, region and service.

    Starting from ``'AWS4' + key``, each step signs the next scope element
    with the previous step's digest: datestamp, region, service and finally
    the literal ``'aws4_request'``.

    :param key: secret access key (text).
    :param datestamp: date string of the credential scope.
    :param region_name: region of the credential scope.
    :param service_name: service of the credential scope.
    :returns: the digest returned by the last ``gen_sign`` call.
    """
    sign_key = ('AWS4' + key).encode('utf-8')
    for scope_part in (datestamp, region_name, service_name, 'aws4_request'):
        sign_key = gen_sign(sign_key, scope_part)
    return sign_key


def get_sha256_hex(data):
    """Return the hexadecimal SHA-256 digest of ``data``.

    :param data: bytes, or text which is encoded as UTF-8 before hashing.
    :returns: the digest as a lowercase hexadecimal string.
    """
    # hashlib only accepts bytes on Python 3, but callers pass text (the
    # empty payload and the canonical request), so encode when needed.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    return hashlib.sha256(data).hexdigest()


def gen_sign(key, data):
    """Return the raw HMAC-SHA256 digest of ``data`` keyed with ``key``.

    ``key`` is passed to HMAC unchanged; ``data`` is text and is encoded as
    UTF-8 before being signed.
    """
    mac = hmac.new(key, digestmod=hashlib.sha256)
    mac.update(data.encode('utf-8'))
    return mac.digest()


def gen_sign_hex(key, data):
    """Compute RFC 2104-compliant HMAC signature as a hexadecimal string.

    :param key: HMAC key as bytes.
    :param data: text to sign; encoded as UTF-8 before signing.
    :returns: the HMAC-SHA256 digest as a lowercase hexadecimal string.
    """
    # Sign with the key passed in rather than a module-level secret, and
    # return hex rather than base64.
    return hmac.new(key, data.encode('utf-8'), hashlib.sha256).hexdigest()


def node_text(tree, tag_name):
Expand Down

0 comments on commit 683a3bf

Please sign in to comment.