diff --git a/.gitignore b/.gitignore index 76d0f357..95de70d4 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ un_sdg/metadata/ *.Rproj .Rproj.user venv +credentials/ diff --git a/requirements.txt b/requirements.txt index e3d11fa1..57b17ae5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,6 @@ black==21.8b0 flake8==3.9.2 xlrd==2.0.1 pytest==6.2.5 -mypy==0.910 \ No newline at end of file +mypy==0.910 +google-auth-oauthlib==0.4.6 +google-api-python-client==2.19.1 diff --git a/site_analytics/README.md b/site_analytics/README.md new file mode 100644 index 00000000..6fd8ab87 --- /dev/null +++ b/site_analytics/README.md @@ -0,0 +1,49 @@ +# Site analytics + +Exposes simple site analytics, such as page views by slug URL on the [ourworldindata.org](https://ourworldindata.org) website. For internal use by OWID staff only. + +This service uses the [Google Analytics Reporting API v4](https://developers.google.com/analytics/devguides/reporting/core/v4). + +## Setup + +In order to use this service, you must: + +1. Have a client secrets file located in `site_analytics/config/credentials/owid-analytics-client-secrets.json`. +2. Add the following variables to `.env` at the root of this repository: + +```bash +GA_VIEW_ID = "{GA_VIEW_ID}" +GA_ACCOUNT_ID = "{GA_ACCOUNT_ID}" +GA_PROPERTY_ID = "{GA_PROPERTY_ID}" +``` + +Ask an OWID developer for these files/variables (available to OWID staff only). 
+ + +## Examples + +Execute a single simple [Google Analytics report request](https://developers.google.com/analytics/devguides/reporting/core/v4/rest/v4/reports/batchGet) with one metric and one dimension over one time range: + +```python +from site_analytics.request_report import execute_report_request + +df = execute_report_request( + metric="pageviews", + dimension="pagePath", + start_date="7daysAgo", + end_date="yesterday", + filters_expression="ga:dimension1==0" # excludes views of embedded pages +) +print(df.tail(5)) +# start_date end_date pagePath pageviews +# 41932 7daysAgo yesterday /covid-cases 90053.0 +# 41933 7daysAgo yesterday /covid-vaccinations 123601.0 +# 41934 7daysAgo yesterday /coronavirus 167887.0 +# 41935 7daysAgo yesterday / 176392.0 +# 41936 7daysAgo yesterday /covid-vaccinations 711477.0 + +``` + +For acceptable values of the `metric`, `dimension`, etc parameters, check out the [Google Analytics UA query explorer](https://ga-dev-tools.web.app/query-explorer/). + +> Note: set `filters_expression="ga:dimension1==0"` to exclude visits to embedded OWID pages. 
# --- site_analytics/__init__.py ---------------------------------------------
# Absolute directory of this package. Anchoring on the absolute path (rather
# than on the last component of a "/"-split relative path) makes the
# credential locations below resolve from any working directory and with any
# OS path separator.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIGPATH = os.path.join(CURRENT_DIR, "config")
CREDSPATH = os.path.join(CONFIGPATH, "credentials")
CLIENT_SECRETS_PATH = os.path.join(CREDSPATH, "owid-analytics-client-secrets.json")


# --- site_analytics/request_report.py ----------------------------------------
def execute_report_request(
    metric: str,
    dimension: str,
    start_date: str,
    end_date: str,
    filters_expression: Optional[str] = None,
    credentials: Optional[Credentials] = None,
) -> pd.DataFrame:
    """Request a single simple Google Analytics report with one metric and one
    dimension in a single time range.

    Uses the Google Analytics Reporting API v4. See
    https://developers.google.com/analytics/devguides/reporting/core/v4/rest/v4/reports/batchGet
    for valid values of the `metric`, `dimension`, `start_date`, etc.
    parameters.

    Args:
        metric: Metric name without the ``ga:`` prefix (e.g. ``"pageviews"``).
        dimension: Dimension name without the ``ga:`` prefix
            (e.g. ``"pagePath"``).
        start_date: Start of the date range, passed to the API unchanged.
        end_date: End of the date range, passed to the API unchanged.
        filters_expression: Optional filter expression, passed to the API
            unchanged (e.g. ``"ga:dimension1==0"``).
        credentials: Credentials to use. When omitted,
            ``google_analytics_authenticate()`` is called to obtain them.

    Returns:
        A dataframe with the columns ``start_date``, ``end_date``,
        ``<dimension>`` and ``<metric>``, sorted ascending by the metric.
        The frame is empty when the report contains no rows.

    Raises:
        AssertionError: If any returned row contains the ``"(other)"``
            dimension value.
    """
    if not credentials:
        credentials = google_analytics_authenticate()
    analytics = build("analyticsreporting", "v4", credentials=credentials)

    report_request = {
        "viewId": f"ga:{GA_VIEW_ID}",
        "dateRanges": [{"startDate": start_date, "endDate": end_date}],
        "metrics": [{"expression": f"ga:{metric}"}],
        "dimensions": [
            {"name": f"ga:{dimension}"},
        ],
        "pageSize": "100000",
        "orderBys": [
            {
                "fieldName": f"ga:{metric}",
                "sortOrder": "DESCENDING",
            }
        ],
    }
    if filters_expression:
        report_request["filtersExpression"] = filters_expression

    ga_rows = []
    # One loop handles the first page and every follow-up page: keep
    # requesting until the report no longer carries a ``nextPageToken``.
    while True:
        response = (
            analytics.reports()
            .batchGet(body={"reportRequests": [report_request]})
            .execute()
        )
        report = response["reports"][0]
        # Tolerate a report without "data"/"rows" (no matching data) instead
        # of failing with a KeyError.
        for row in report.get("data", {}).get("rows", []):
            dims = row["dimensions"]
            if "(other)" in dims:
                # Raised explicitly (not via ``assert``) so that the check
                # is still performed when Python runs with -O.
                raise AssertionError(
                    'Report contains the "(other)" dimension value, so '
                    "per-dimension values are incomplete."
                )
            val = row["metrics"][0]["values"][0]
            ga_rows.append([start_date, end_date] + dims + [val])

        next_page_token = report.get("nextPageToken")
        if not next_page_token:
            break
        report_request["pageToken"] = next_page_token

    df = pd.DataFrame(
        ga_rows,
        columns=["start_date", "end_date", dimension, metric],
    )
    try:
        df[metric] = df[metric].astype(float)
    except (TypeError, ValueError):
        # Best effort: metrics that are not numeric keep the raw values
        # returned by the API.
        pass
    return df.sort_values(by=metric).reset_index(drop=True)
# --- site_analytics/test_request_report.py -----------------------------------
@pytest.fixture(scope="module")
def credentials():
    """Authenticate once per test module and share the credentials."""
    # No teardown is needed, so the value is returned rather than yielded.
    return google_analytics_authenticate()


def test_dimension1_name(credentials):
    """Custom dimension ``ga:dimension1`` must be named "Page is embedded".

    The ``ga:dimension1==0`` filter used elsewhere to exclude embedded page
    views relies on this dimension keeping that meaning.
    """
    management = build("analytics", "v3", credentials=credentials).management()
    custom_dimension = (
        management.customDimensions()
        .get(
            accountId=GA_ACCOUNT_ID,
            webPropertyId=GA_PROPERTY_ID,
            customDimensionId="ga:dimension1",
        )
        .execute()
    )
    assert custom_dimension["name"] == "Page is embedded"


@pytest.fixture(scope="module")
def df_simple_request(credentials):
    """Run one unfiltered page-views-by-path report for yesterday."""
    return execute_report_request(
        metric="pageviews",
        dimension="pagePath",
        start_date="yesterday",
        end_date="yesterday",
        filters_expression=None,
        credentials=credentials,
    )


def test_is_frame(df_simple_request):
    """The report is returned as a pandas dataframe."""
    # isinstance (not an exact ``type(...) ==`` comparison) is the idiomatic
    # check and also accepts DataFrame subclasses.
    assert isinstance(df_simple_request, pd.DataFrame)


def test_gt_zero_rows(df_simple_request):
    """The report contains at least one row."""
    assert df_simple_request.shape[0] > 0


# --- site_analytics/utils.py --------------------------------------------------
def google_analytics_authenticate():
    """Run the installed-app OAuth flow and return the resulting credentials.

    Reads the client secrets file at ``CLIENT_SECRETS_PATH`` and requests the
    ``openid``, ``userinfo.email`` and read-only Google Analytics scopes.
    ``flow.run_local_server()`` performs the authorization step, so calling
    this function is interactive.

    Returns:
        The credentials held by the flow once authorization has completed.
    """
    flow = InstalledAppFlow.from_client_secrets_file(
        CLIENT_SECRETS_PATH,
        scopes=[
            "openid",
            "https://www.googleapis.com/auth/userinfo.email",
            "https://www.googleapis.com/auth/analytics.readonly",
        ],
    )
    flow.run_local_server()
    return flow.credentials
# --- standard_importer/posts_to_update.py -------------------------------------
def main(
    dataset_id: int, since: str, include_google_analytics: bool = False
) -> pd.DataFrame:
    """Constructs a dataframe that counts the number of charts from a dataset
    per OWID page that have been updated.

    Args:
        dataset_id: Id of the dataset whose charts are inspected.
        since: Only charts updated on or after this date (YYYY-MM-DD) count.
        include_google_analytics: When true, a page-views column named
            ``<metric>_from_<start>_to_<end>`` (dates without dashes, taken
            from ``GA_CONFIG``) is added and used as the primary sort key.

    Returns:
        One row per OWID page with the columns ``post_title``, ``post_slug``
        (as a full https://ourworldindata.org/ URL) and
        ``num_charts_updated``, plus the page-views column when requested.
    """
    df_ga = get_ga_data() if include_google_analytics else None

    df = get_charts_updated_data(dataset_id, since)

    if include_google_analytics:
        metric = GA_CONFIG["metric"]
        dimension = GA_CONFIG["dimension"]
        start_date = GA_CONFIG["start_date"]
        # Previously read from "start_date", which labelled the column with
        # the start date twice.
        end_date = GA_CONFIG["end_date"]
        df = df.merge(
            df_ga[[dimension, metric]],
            left_on="post_slug",
            right_on=dimension,
            how="left",
            validate="1:1",
        ).drop(columns=[dimension])
        df = df.sort_values(by=[metric, "num_charts_updated"], ascending=False)
        period = f"{start_date.replace('-', '')}_to_{end_date.replace('-', '')}"
        df = df.rename(columns={metric: f"{metric}_from_{period}"})
    else:
        df = df.sort_values(by=["num_charts_updated"], ascending=False)

    df["post_slug"] = "https://ourworldindata.org/" + df["post_slug"]
    return df


def get_ga_data() -> pd.DataFrame:
    """Fetch the report described by ``GA_CONFIG`` and total it per page.

    When the report has a ``pagePath`` column, each path is normalised by
    removing the query string, a trailing slash and/or dot, and the leading
    slash; the metric is then summed over rows that share the same
    normalised path.

    Returns:
        The report sorted by the metric in descending order.

    Raises:
        AssertionError: If the report contains the ``"(other)"`` dimension
            value.
    """
    dimension = GA_CONFIG["dimension"]
    metric = GA_CONFIG["metric"]

    df_ga = execute_report_request(**GA_CONFIG)
    if (df_ga[dimension] == "(other)").any():
        # Raised explicitly (not via ``assert``) so that the check is still
        # performed when Python runs with -O.
        raise AssertionError(
            'Google Analytics report contains the "(other)" dimension value.'
        )

    if "pagePath" in df_ga.columns:
        df_ga["pagePath"] = (
            df_ga["pagePath"]
            .str.replace(r"\?.*", "", regex=True)
            .str.replace(r"/?\.?$", "", regex=True)
            .str.replace(r"^/", "", regex=True)
        )
        df_ga = (
            df_ga.groupby(["start_date", "end_date", dimension])[metric]
            .sum()
            .reset_index()
        )

    return df_ga.sort_values(metric, ascending=False)


def get_charts_updated_data(dataset_id: int, since: str) -> pd.DataFrame:
    """Returns a dataframe of the number of charts updated since YYYY-MM-DD,
    by page.

    Args:
        dataset_id: Id of the dataset; charts using any of its variables are
            considered.
        since: Earliest ``charts.updatedAt`` value to include. Any date or
            datetime string that pandas can parse is accepted.

    Returns:
        A dataframe with the columns ``post_title``, ``post_slug`` and
        ``num_charts_updated``. It is empty when no page references any
        updated chart.

    Raises:
        AssertionError: If the admin API cannot be reached.
        ValueError: If ``dataset_id`` is not an integer or ``since`` is not a
            parseable date.
    """
    assert_admin_api_connection()

    # Both values are interpolated into the SQL text below, so normalise them
    # to a plain integer and a canonical timestamp first.
    dataset_id = int(dataset_id)
    since = pd.to_datetime(since).strftime("%Y-%m-%d %H:%M:%S")

    # Retrieve all charts that use a variable from the dataset and have been
    # updated.
    logger.info("Retrieving updated charts...")
    df = pd.read_sql(
        f"""
        SELECT
            charts.id, charts.updatedAt, charts.createdAt, charts.lastEditedAt, charts.publishedAt
        FROM charts
        INNER JOIN chart_dimensions
            ON charts.id = chart_dimensions.chartId
        WHERE variableId IN (
            SELECT id
            FROM variables
            WHERE datasetId = {dataset_id}
        )
        AND charts.updatedAt >= "{since}"
        ORDER BY updatedAt DESC
        """,
        get_connection(),
    )
    # The join yields one row per matching chart dimension; keep each chart
    # once so it is neither requested nor counted several times.
    df = df.drop_duplicates(subset="id")
    if DEBUG:
        logger.warning(
            "DEBUG mode is on. Only retrieving post references for the first "
            "10 updated charts."
        )
        df = df.iloc[:10]

    logger.info("Retrieving chart references to OWID pages...")
    wait = 2  # seconds to pause after each batch of requests
    batch_size = 50
    n_batches = int(np.ceil(df.shape[0] / batch_size))

    # chart_ids[i] is the chart that responses[i] belongs to; this relies on
    # grequests.map returning responses in request order.
    chart_ids = []
    responses = []
    for batch in tqdm(batchify(df, batch_size=batch_size), total=n_batches):
        batch_ids = batch["id"].tolist()
        pending = [
            grequests.get(
                f"{SITE_HOST}/admin/api/charts/{chart_id}.references.json",
                cookies={"sessionid": SITE_SESSION_ID},
            )
            for chart_id in batch_ids
        ]
        chart_ids += batch_ids
        responses += grequests.map(pending)
        time.sleep(wait)

    references = []
    for chart_id, resp in zip(chart_ids, responses):
        if resp is None:
            # grequests.map yields None for a request that failed.
            logger.warning(
                "No response for chart %s; its references are skipped.", chart_id
            )
            continue
        refs = json.loads(resp.content)["references"]
        for ref in refs:
            # Tag each reference with the chart it was fetched for (the
            # previous code reused the last chart id of the last batch).
            ref["chartId"] = chart_id
        references += refs

    if not references:
        return pd.DataFrame(columns=["post_title", "post_slug", "num_charts_updated"])

    df_refs = pd.DataFrame(references).rename(columns={"id": "refId"})
    # Name the count series explicitly: the column that reset_index() would
    # otherwise produce differs between pandas versions.
    return (
        df_refs.value_counts(["title", "slug"])
        .rename("num_charts_updated")
        .reset_index()
        .rename(columns={"title": "post_title", "slug": "post_slug"})
    )
# --- utils.py -----------------------------------------------------------------
def assert_admin_api_connection() -> None:
    """Raise an AssertionError if unable to successfully connect to the admin API.

    Requests a single chart from ``{SITE_HOST}/admin/api/charts.json`` using
    the ``SITE_SESSION_ID`` session cookie. The connection counts as working
    only if the response parses as JSON and lists at least one chart.

    Raises:
        AssertionError: If the request fails, the response cannot be parsed,
            or no chart is returned. When an underlying error caused the
            failure it is chained as ``__cause__``.
    """
    message = (
        "Failed to connect to admin API, have you set SITE_HOST and "
        "SITE_SESSION_ID correctly in .env?"
    )
    try:
        response = requests.get(
            f"{SITE_HOST}/admin/api/charts.json?limit=1",
            cookies={"sessionid": SITE_SESSION_ID},
            timeout=30,  # do not hang forever on an unreachable host
        )
        charts = json.loads(response.content)
        connected = len(charts["charts"]) > 0
    except Exception as err:
        # Any failure means "not connected"; keep the cause for debugging.
        raise AssertionError(message) from err
    if not connected:
        # Raised explicitly (not via ``assert``) so that the check is still
        # performed when Python runs with -O.
        raise AssertionError(message)