feat: add percent change to analytics package (#4335)

* feat: added percent change column and formatting (#4331) * chore: updated ga package version (#4331) * fix: made argument names consistent (#4331)
DataBiosphere · Jan 17, 2025 · eef1cee · eef1cee
1 parent d2ac666
commit eef1cee
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 29 deletions.
diff --git a/analytics/analytics_package/analytics/sheets_api.py b/analytics/analytics_package/analytics/sheets_api.py
@@ -2,6 +2,14 @@
 import gspread_formatting
 from enum import Enum
 from googleapiclient.discovery import build
+import numpy as np
+
+FONT_SIZE_PTS = 10  # font size multiplied by the character count when computing column widths
+PTS_PIXELS_RATIO = 4/3  # the character-count * font-size product is divided by this ratio for column widths
+DEFAULT_BUFFER_CHARS = 2  # extra characters of padding added to each column when none is configured
+GREEN_COLOR = "#00FF00"  # NOTE(review): not referenced in the visible hunks; the green rule builds Color(0,1,0) directly
+RED_COLOR = "#FF0000"  # NOTE(review): not referenced in the visible hunks; the red rule builds Color(1,0,0) directly
+
 
 class FILE_OVERRIDE_BEHAVIORS(Enum):
     OVERRIDE_IF_IN_SAME_PLACE = 1
@@ -12,9 +20,17 @@ class WORKSHEET_OVERRIDE_BEHAVIORS(Enum):
     OVERRIDE = 1
     EXIT = 2
 
-FONT_SIZE_PTS = 10
-PTS_PIXELS_RATIO = 4/3
-DEFAULT_BUFFER_CHARS = 2
+class COLUMN_FORMAT_OPTIONS(Enum):  # per-column formatting choices read by fill_worksheet_with_df
+    DEFAULT = 1  # column is skipped: no column-specific formatting is applied
+    PERCENT_UNCOLORED = 2  # column gets the '0.0%' percent number format only
+    PERCENT_COLORED = 3  # percent number format plus green/red conditional text color rules
+
+DEFAULT_SHEET_FORMATTING_OPTIONS = {  # used when a caller supplies no sheet formatting options
+    "bold_header": True,  # header row text is bold
+    "center_header": True,  # header row is horizontally centered
+    "freeze_header": True,  # first row is frozen
+    "column_widths": {"justify": True, "buffer_chars": DEFAULT_BUFFER_CHARS}  # widths fitted to content plus padding
+}
 
 def extract_credentials(authentication_response):
     """Extracts the credentials from the tuple from api.authenticate"""
@@ -141,17 +157,14 @@ def create_sheet_in_folder(authentication_response, sheet_name, parent_folder_na
     # Open new file
     return gc.open_by_key(spread_id)
 
+
 def fill_worksheet_with_df(
         sheet,
         df,
         worksheet_name,
         overlapBehavior,
-        options={
-            "bold_header": True,
-            "center_header": True,
-            "freeze_header": True,
-            "column_widths": {"justify": True, "buffer_chars": DEFAULT_BUFFER_CHARS}
-        }
+        sheet_formatting_options=DEFAULT_SHEET_FORMATTING_OPTIONS,
+        column_formatting_options={}
     ):
     """
     Fill a worksheet with the contents of a DataFrame.
@@ -162,8 +175,10 @@ def fill_worksheet_with_df(
     :param df: the DataFrame to fill the worksheet with
     :param worksheet_name: the name of the worksheet to fill. Cannot be "Sheet1"
     :param overlapBehavior: the behavior to take if the worksheet already exists.
-    :param options: the formatting options for the worksheet. 
+    :param sheet_formatting_options: the formatting options for the worksheet. 
         Should be a dictionary with optional elements "bold_header", "center_header", "freeze_header", and "column_widths", optional
+    :param column_formatting_options: the column formatting options for the worksheet.
+        Should be a dictionary with dataframe columns as keys and instances of COLUMN_FORMAT_OPTIONS as values, optional
     """
     # Sheet1 is special since it's created by default, so it's not allowed
     assert worksheet_name != "Sheet1"
@@ -179,19 +194,19 @@ def fill_worksheet_with_df(
         )
 
     # Add data to worksheet
-    worksheet.update([df.columns.values.tolist()] + df.values.tolist())
+    worksheet.update([df.columns.values.tolist()] + df.fillna("NA").values.tolist())
 
     # Format worksheet
     # Justify Column Widths
-    if "column_widths" not in options or options["column_widths"]["justify"]:
+    if "column_widths" not in sheet_formatting_options or sheet_formatting_options["column_widths"]["justify"]:
         text_widths = df.astype(str).columns.map(
             lambda column_name: df[column_name].astype(str).str.len().max()
         )
         header_widths = df.columns.str.len()
         buffer_chars = (
             DEFAULT_BUFFER_CHARS 
-            if ("column_widths" not in options or "buffer_chars" not in options["column_widths"]) 
-            else options["column_widths"]["buffer_chars"]
+            if ("column_widths" not in sheet_formatting_options or "buffer_chars" not in sheet_formatting_options["column_widths"]) 
+            else sheet_formatting_options["column_widths"]["buffer_chars"]
         )
         column_widths = [
             round((max(len_tuple) + buffer_chars) * FONT_SIZE_PTS * 1/PTS_PIXELS_RATIO)
@@ -202,26 +217,71 @@ def fill_worksheet_with_df(
         ]
         gspread_formatting.set_column_widths(worksheet, zip(column_positions, column_widths))
     # Freeze Header
-    if "freeze_header" not in options or options["freeze_header"]:
+    if "freeze_header" not in sheet_formatting_options or sheet_formatting_options["freeze_header"]:
         gspread_formatting.set_frozen(worksheet, rows=1)
-    format_options = gspread_formatting.CellFormat()
+    base_format_options = gspread_formatting.CellFormat()
     # Bold Header
-    if "bold_header" not in options or options["bold_header"]:
-        format_options += gspread_formatting.CellFormat(textFormat=gspread_formatting.TextFormat(bold=True))
+    if "bold_header" not in sheet_formatting_options or sheet_formatting_options["bold_header"]:
+        base_format_options += gspread_formatting.CellFormat(textFormat=gspread_formatting.TextFormat(bold=True))
     # Center Header
-    if "center_header" not in options or options["center_header"]:
-        format_options += gspread_formatting.CellFormat(horizontalAlignment="CENTER")
+    if "center_header" not in sheet_formatting_options or sheet_formatting_options["center_header"]:
+        base_format_options += gspread_formatting.CellFormat(horizontalAlignment="CENTER")
+    # Handle column specific formatting
+    for column in column_formatting_options:
+        if column not in df.columns:
+            raise KeyError("Formatting column is not in the dataframe")
+        # Skip if the column is set to default
+        if column_formatting_options[column] == COLUMN_FORMAT_OPTIONS.DEFAULT:
+            continue
+        # Get the column position
+        column_position_numeric = df.columns.get_loc(column) + 1
+        column_range_top = gspread.utils.rowcol_to_a1(1, column_position_numeric)
+        column_range_bottom = gspread.utils.rowcol_to_a1(df.index.size + 1, column_position_numeric)
+        column_range = f"{column_range_top}:{column_range_bottom}"
+        column_worksheet_range = gspread_formatting.GridRange.from_a1_range(column_range, worksheet)
+        # Get conditional formatting rules
+        if column_formatting_options[column] == COLUMN_FORMAT_OPTIONS.PERCENT_COLORED:
+            green_rule = gspread_formatting.ConditionalFormatRule(
+                ranges=[column_worksheet_range],
+                booleanRule=gspread_formatting.BooleanRule(
+                condition=gspread_formatting.BooleanCondition('NUMBER_GREATER_THAN_EQ', ['0']),
+                format=gspread_formatting.CellFormat(
+                    textFormat=gspread_formatting.TextFormat(foregroundColor=gspread_formatting.Color(0,1,0)))
+                )
+            )
+            red_rule = gspread_formatting.ConditionalFormatRule(
+                ranges=[column_worksheet_range],
+                booleanRule=gspread_formatting.BooleanRule(
+                condition=gspread_formatting.BooleanCondition('NUMBER_LESS_THAN_EQ', ['0']),
+                format=gspread_formatting.CellFormat(
+                    textFormat=gspread_formatting.TextFormat(foregroundColor=gspread_formatting.Color(1,0,0)))
+                )
+            )
+            # Apply conditional formatting rules
+            conditional_formatting_rules = gspread_formatting.get_conditional_format_rules(worksheet)
+            conditional_formatting_rules.append(green_rule)
+            conditional_formatting_rules.append(red_rule)
+            conditional_formatting_rules.save()
+        if column_formatting_options[column] in (COLUMN_FORMAT_OPTIONS.PERCENT_COLORED, COLUMN_FORMAT_OPTIONS.PERCENT_UNCOLORED):
+            # Apply percent format rule
+            gspread_formatting.format_cell_range(
+                worksheet, 
+                column_range, 
+                gspread_formatting.CellFormat(numberFormat=gspread_formatting.NumberFormat(type='PERCENT', pattern='0.0%'))
+            )
+
+    # Apply base formatting options
     gspread_formatting.format_cell_range(
         worksheet,
         f"A1:{gspread.utils.rowcol_to_a1(1, len(df.columns))}",
-        format_options
+        base_format_options
     )
 
     # Delete Sheet1 if it has been created by default
     if "Sheet1" in [i.title for i in sheet.worksheets()]:
         sheet.del_worksheet(sheet.worksheet("Sheet1"))
 
-def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior, options={}):
+def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior, sheet_formatting_options={}, column_formatting_options={}):
     """
     Fill a sheet with the contents of a dictionary of DataFrames.
     The keys of the dictionary are the names of the worksheets, and the values contain the data to be placed in the sheet.
@@ -230,8 +290,12 @@ def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior, options={}):
     :param sheet: the gspread.Spreadsheet object
     :param df_dict: the dictionary of DataFrames to fill the worksheets with
     :param overlapBehavior: the behavior to take if any of the worksheets already exist
-    :param options: the formatting options for the worksheets.
-        Should be a dictionary with optional elements "bold_header", "center_header", "freeze_header", and "column_widths", optional
+    :param sheet_formatting_options: the formatting options for the worksheets.
+        Should be a 2 level dictionary with outer keys being names of worksheets and inner keys being some of
+        "bold_header", "center_header", "freeze_header", and "column_widths", optional
+    :param column_formatting_options: the column formatting options for the worksheets.
+        Should be a 2 level dictionary with outer keys being names of worksheets and inner keys being column names.
+        The inner values should be instances of COLUMN_FORMAT_OPTIONS, optional
     """
     if overlapBehavior == WORKSHEET_OVERRIDE_BEHAVIORS.EXIT:
         for worksheet_name in df_dict.keys():
@@ -241,5 +305,8 @@ def fill_spreadsheet_with_df_dict(sheet, df_dict, overlapBehavior, options={}):
             except gspread.exceptions.WorksheetNotFound:
                 pass
     for worksheet_name, df in df_dict.items():
-        fill_worksheet_with_df(sheet, df, worksheet_name, overlapBehavior, options=options)
-
+        fill_worksheet_with_df(
+            sheet, df, worksheet_name, overlapBehavior, 
+            sheet_formatting_options=sheet_formatting_options.get(worksheet_name, DEFAULT_SHEET_FORMATTING_OPTIONS), 
+            column_formatting_options=column_formatting_options.get(worksheet_name, {})
+        )
diff --git a/analytics/analytics_package/analytics/sheets_elements.py b/analytics/analytics_package/analytics/sheets_elements.py
@@ -36,7 +36,7 @@ def get_rename_dict(dimensions):
         zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions])
     )
 
-def get_outbound_sheets_df(analytics_params):
+def get_outbound_links_df(analytics_params):
     """
     Get a DF with outbound links from the Analytics API. Merges the builtin and custom events for outbound links.
 
@@ -95,4 +95,39 @@ def get_outbound_sheets_df(analytics_params):
             "hostname": "Hostname",
         } 
     )[["Page Path", "Hostname", "Outbound Link", "Total Clicks", "Total Users"]]
-    return df_all_links.copy().reset_index(drop=True)
+
+    return df_all_links.copy().reset_index(drop=True)
+
+def get_outbound_links_change(analytics_params, start_current, end_current, start_previous, end_previous):
+    """
+    Get a DF with outbound links from the Analytics API and a comparison against a previous period.
+    Links seen only in the previous period get current totals of 0 (a change of -1, i.e. -100%).
+    Links seen only in the current period get NaN in both percent change columns.
+    :param analytics_params: the parameters for the Analytics API, including authentication and property ids
+    :param start_current: the start date for the current period in the format "YYYY-MM-DD"
+    :param end_current: the end date for the current period
+    :param start_previous: the start date for the previous period
+    :param end_previous: the end date for the previous period
+    :return: a DataFrame sorted by "Total Clicks" then "Total Users" (descending), with the columns
+        "Total Clicks Percent Change" and "Total Users Percent Change" added as fractions (0.5 is +50%)
+    """
+    analytics_params_current = {
+        **analytics_params,
+        "start_date": start_current,
+        "end_date": end_current,
+    }
+    analytics_params_previous = {
+        **analytics_params,
+        "start_date": start_previous,
+        "end_date": end_previous,
+    }
+    index_columns = ["Page Path", "Outbound Link", "Hostname"]
+    df_current = get_outbound_links_df(analytics_params_current).set_index(index_columns)
+    df_previous = get_outbound_links_df(analytics_params_previous).set_index(index_columns)
+    combined_index = df_current.index.union(df_previous.index)
+    df_current_reindexed = df_current.reindex(combined_index).fillna(0)
+    df_previous_reindexed = df_previous.reindex(combined_index)
+    df_current_reindexed["Total Clicks Percent Change"] = (df_current_reindexed["Total Clicks"] / df_previous_reindexed["Total Clicks"]) - 1
+    df_current_reindexed["Total Users Percent Change"] = (df_current_reindexed["Total Users"] / df_previous_reindexed["Total Users"]) - 1
+    return df_current_reindexed.sort_values(["Total Clicks", "Total Users"], ascending=False, kind="stable").reset_index()
+
diff --git a/analytics/analytics_package/setup.py b/analytics/analytics_package/setup.py
@@ -2,7 +2,7 @@
 
 setup(
 	name="analytics",
-	version="3.1.0",
+	version="3.2.0",
 	packages=["analytics"],
 	install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client", "gspread", "gspread-formatting"],
 )