feat: add dataset titles to anvil analytics report (anvilproject/anvil-portal#3909) (#4746)

hunterckx · web-flow · commit aed92565ffcc · 2026-04-07T23:07:52.000-07:00
feat: add dataset titles to anvil analytics report (#3909)
diff --git a/analytics/anvil-explorer-sheets/generate_sheets_report.ipynb b/analytics/anvil-explorer-sheets/generate_sheets_report.ipynb
@@ -2,22 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import analytics.api as ga\n",
-    "import analytics.sheets_api as sheets\n",
-    "import analytics.sheets_elements as elements\n",
-    "import analytics.entities as e\n",
-    "import pandas as pd\n",
-    "import gspread\n",
-    "from constants import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -34,14 +19,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import analytics.api as ga\n",
+    "import analytics.sheets_api as sheets\n",
+    "import analytics.sheets_elements as elements\n",
+    "import analytics.entities as e\n",
+    "import pandas as pd\n",
+    "import gspread\n",
+    "from constants import *\n",
+    "import utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=274560362763-p5netdrssq6r02lcfan6s157m6d65rqe.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8082%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fanalytics.readonly+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=3TOrvrS8EuGJvNWqEnjqNIlknGfZW7&access_type=offline\n"
+      "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=274560362763-p5netdrssq6r02lcfan6s157m6d65rqe.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8082%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fanalytics.readonly+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=P3M0s2lsdDE4jDwfNdfekOCqsMsZpJ&access_type=offline\n"
      ]
     }
    ],
@@ -82,59 +83,61 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "df_monthly_pageviews = elements.get_page_views_over_time_df(anvil_catalog_params_all_time)\n",
-    "df_pageviews = elements.get_page_views_change(anvil_catalog_params, START_DATE_CURRENT, END_DATE_CURRENT, START_DATE_PRIOR, END_DATE_PRIOR)\n",
+    "df_pageviews = utils.add_dataset_titles(\n",
+    "  elements.get_page_views_change(anvil_catalog_params, START_DATE_CURRENT, END_DATE_CURRENT, START_DATE_PRIOR, END_DATE_PRIOR)\n",
+    ")\n",
     "df_outbound = elements.get_outbound_links_change(anvil_catalog_params, START_DATE_CURRENT, END_DATE_CURRENT, START_DATE_PRIOR, END_DATE_PRIOR)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'spreadsheetId': '1ueSQdU9pTa1qPptkNc4vk4xoHuJgP6gNcQ7exRxN9y0',\n",
-       " 'replies': [{'addChart': {'chart': {'chartId': 113619751,\n",
+       "{'spreadsheetId': '1yg5dDlhe7PX02nin9FWbDLMimY1UhwI4yszpcfCiRSQ',\n",
+       " 'replies': [{'addChart': {'chart': {'chartId': 1605255347,\n",
        "     'spec': {'title': 'Pageviews and Users Over Time',\n",
        "      'basicChart': {'chartType': 'LINE',\n",
        "       'axis': [{'position': 'BOTTOM_AXIS', 'viewWindowOptions': {}},\n",
        "        {'position': 'LEFT_AXIS', 'viewWindowOptions': {}}],\n",
-       "       'domains': [{'domain': {'sourceRange': {'sources': [{'sheetId': 388329788,\n",
+       "       'domains': [{'domain': {'sourceRange': {'sources': [{'sheetId': 1535415780,\n",
        "             'startRowIndex': 0,\n",
-       "             'endRowIndex': 13,\n",
+       "             'endRowIndex': 24,\n",
        "             'startColumnIndex': 0,\n",
        "             'endColumnIndex': 1}]}}}],\n",
-       "       'series': [{'series': {'sourceRange': {'sources': [{'sheetId': 388329788,\n",
+       "       'series': [{'series': {'sourceRange': {'sources': [{'sheetId': 1535415780,\n",
        "             'startRowIndex': 0,\n",
-       "             'endRowIndex': 13,\n",
+       "             'endRowIndex': 24,\n",
        "             'startColumnIndex': 1,\n",
        "             'endColumnIndex': 2}]}},\n",
        "         'targetAxis': 'LEFT_AXIS'},\n",
-       "        {'series': {'sourceRange': {'sources': [{'sheetId': 388329788,\n",
+       "        {'series': {'sourceRange': {'sources': [{'sheetId': 1535415780,\n",
        "             'startRowIndex': 0,\n",
-       "             'endRowIndex': 13,\n",
+       "             'endRowIndex': 24,\n",
        "             'startColumnIndex': 2,\n",
        "             'endColumnIndex': 3}]}},\n",
        "         'targetAxis': 'LEFT_AXIS'}],\n",
        "       'headerCount': 1},\n",
        "      'hiddenDimensionStrategy': 'SKIP_HIDDEN_ROWS_AND_COLUMNS',\n",
        "      'titleTextFormat': {'fontFamily': 'Roboto'},\n",
        "      'fontName': 'Roboto'},\n",
-       "     'position': {'overlayPosition': {'anchorCell': {'sheetId': 388329788,\n",
+       "     'position': {'overlayPosition': {'anchorCell': {'sheetId': 1535415780,\n",
        "        'columnIndex': 5},\n",
        "       'offsetXPixels': 75,\n",
        "       'offsetYPixels': 25,\n",
        "       'widthPixels': 600,\n",
        "       'heightPixels': 371}}}}}]}"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/analytics/anvil-explorer-sheets/utils.py b/analytics/anvil-explorer-sheets/utils.py
@@ -0,0 +1,73 @@
+import re
+
+import pandas as pd
+import requests
+
+from analytics.entities import DIMENSION_PAGE_PATH
+
+ANVIL_DATASETS_API_URL = "https://service.explore.anvilproject.org/index/datasets"  # first page requested by fetch_dataset_title_map
+DATASETS_PATH_PATTERN = re.compile(r"^/datasets/([^/]+)")  # group 1 is the path segment right after /datasets/
+INSERT_AFTER_COLUMN = DIMENSION_PAGE_PATH["alias"]  # the title column is placed immediately after this column
+PAGE_PATH_COLUMN = DIMENSION_PAGE_PATH["alias"]  # column whose values are matched against DATASETS_PATH_PATTERN
+DATASET_TITLE_COLUMN = "Dataset Title"  # name of the column added by add_dataset_titles
+
+
+def fetch_dataset_title_map(timeout: float = 30.0) -> dict[str, str]:
+  """Fetch all datasets from the AnVIL API and return a mapping of entryId to title.
+
+  Paginates through the full catalog using the API's ``pagination.next`` URL.
+  ``timeout`` (seconds) bounds every request so a stalled server raises
+  ``requests.Timeout`` instead of hanging; HTTP error statuses also raise.
+  """
+  title_map: dict[str, str] = {}
+  url: str | None = ANVIL_DATASETS_API_URL
+  params: dict[str, int] | None = {"size": 1000}
+  while url is not None:
+    response = requests.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    data = response.json()
+    for hit in data["hits"]:
+      entry_id, datasets = hit.get("entryId"), hit.get("datasets", [])
+      title = datasets[0].get("title", "") if entry_id and datasets else ""  # first dataset only
+      if title:
+        title_map[entry_id] = title
+    url = data.get("pagination", {}).get("next")
+    params = None  # subsequent URLs already include query params
+  return title_map
+
+
+def add_dataset_titles(df: pd.DataFrame, title_map: dict[str, str] | None = None) -> pd.DataFrame:
+  """Add a 'Dataset Title' column to a pageviews dataframe.
+
+  For rows where the page path matches /datasets/[id], the title is looked up
+  in title_map. All other rows, including non-string paths such as NaN, get "N/A".
+
+  Args:
+    df: A dataframe containing the PAGE_PATH_COLUMN column; it is not modified.
+    title_map: Optional pre-fetched ID-to-title mapping; fetched from the
+      AnVIL API via fetch_dataset_title_map() when None.
+
+  Returns:
+    A copy of the dataframe with a "Dataset Title" column inserted
+    after the column specified by the INSERT_AFTER_COLUMN global variable.
+  """
+  if title_map is None:
+    title_map = fetch_dataset_title_map()
+  df = df.copy()
+
+  def get_title(path: object) -> str:
+    # Non-string values (e.g. NaN for a missing path) would make re.match raise
+    # TypeError, so they are treated as "not a dataset page".
+    if not isinstance(path, str):
+      return "N/A"
+    match = DATASETS_PATH_PATTERN.match(path)
+    if match is None:
+      return "N/A"
+    return title_map.get(match.group(1), "N/A")
+
+  df[DATASET_TITLE_COLUMN] = df[PAGE_PATH_COLUMN].map(get_title)
+
+  # Locate the anchor only after dropping the title column, so the position is right even if a title column already existed
+  cols = [col for col in df.columns if col != DATASET_TITLE_COLUMN]
+  cols.insert(cols.index(INSERT_AFTER_COLUMN) + 1, DATASET_TITLE_COLUMN)
+  return df[cols]