Skip to content

Commit aed9256

Browse files
authored
feat: add dataset titles to anvil analytics report (anvilproject/anvil-portal#3909) (#4746)
feat: add dataset titles to anvil analytics report (#3909)
1 parent ab7ebaa commit aed9256

File tree

2 files changed

+107
-31
lines changed

2 files changed

+107
-31
lines changed

analytics/anvil-explorer-sheets/generate_sheets_report.ipynb

Lines changed: 34 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 3,
6-
"metadata": {},
7-
"outputs": [],
8-
"source": [
9-
"import analytics.api as ga\n",
10-
"import analytics.sheets_api as sheets\n",
11-
"import analytics.sheets_elements as elements\n",
12-
"import analytics.entities as e\n",
13-
"import pandas as pd\n",
14-
"import gspread\n",
15-
"from constants import *"
16-
]
17-
},
18-
{
19-
"cell_type": "code",
20-
"execution_count": 4,
5+
"execution_count": 2,
216
"metadata": {},
227
"outputs": [
238
{
@@ -34,14 +19,30 @@
3419
},
3520
{
3621
"cell_type": "code",
37-
"execution_count": null,
22+
"execution_count": 3,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"import analytics.api as ga\n",
27+
"import analytics.sheets_api as sheets\n",
28+
"import analytics.sheets_elements as elements\n",
29+
"import analytics.entities as e\n",
30+
"import pandas as pd\n",
31+
"import gspread\n",
32+
"from constants import *\n",
33+
"import utils"
34+
]
35+
},
36+
{
37+
"cell_type": "code",
38+
"execution_count": 4,
3839
"metadata": {},
3940
"outputs": [
4041
{
4142
"name": "stdout",
4243
"output_type": "stream",
4344
"text": [
44-
"Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=274560362763-p5netdrssq6r02lcfan6s157m6d65rqe.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8082%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fanalytics.readonly+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=3TOrvrS8EuGJvNWqEnjqNIlknGfZW7&access_type=offline\n"
45+
"Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=274560362763-p5netdrssq6r02lcfan6s157m6d65rqe.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8082%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fanalytics.readonly+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=P3M0s2lsdDE4jDwfNdfekOCqsMsZpJ&access_type=offline\n"
4546
]
4647
}
4748
],
@@ -82,59 +83,61 @@
8283
},
8384
{
8485
"cell_type": "code",
85-
"execution_count": 6,
86+
"execution_count": 5,
8687
"metadata": {},
8788
"outputs": [],
8889
"source": [
8990
"df_monthly_pageviews = elements.get_page_views_over_time_df(anvil_catalog_params_all_time)\n",
90-
"df_pageviews = elements.get_page_views_change(anvil_catalog_params, START_DATE_CURRENT, END_DATE_CURRENT, START_DATE_PRIOR, END_DATE_PRIOR)\n",
91+
"df_pageviews = utils.add_dataset_titles(\n",
92+
" elements.get_page_views_change(anvil_catalog_params, START_DATE_CURRENT, END_DATE_CURRENT, START_DATE_PRIOR, END_DATE_PRIOR)\n",
93+
")\n",
9194
"df_outbound = elements.get_outbound_links_change(anvil_catalog_params, START_DATE_CURRENT, END_DATE_CURRENT, START_DATE_PRIOR, END_DATE_PRIOR)"
9295
]
9396
},
9497
{
9598
"cell_type": "code",
96-
"execution_count": 7,
99+
"execution_count": 6,
97100
"metadata": {},
98101
"outputs": [
99102
{
100103
"data": {
101104
"text/plain": [
102-
"{'spreadsheetId': '1ueSQdU9pTa1qPptkNc4vk4xoHuJgP6gNcQ7exRxN9y0',\n",
103-
" 'replies': [{'addChart': {'chart': {'chartId': 113619751,\n",
105+
"{'spreadsheetId': '1yg5dDlhe7PX02nin9FWbDLMimY1UhwI4yszpcfCiRSQ',\n",
106+
" 'replies': [{'addChart': {'chart': {'chartId': 1605255347,\n",
104107
" 'spec': {'title': 'Pageviews and Users Over Time',\n",
105108
" 'basicChart': {'chartType': 'LINE',\n",
106109
" 'axis': [{'position': 'BOTTOM_AXIS', 'viewWindowOptions': {}},\n",
107110
" {'position': 'LEFT_AXIS', 'viewWindowOptions': {}}],\n",
108-
" 'domains': [{'domain': {'sourceRange': {'sources': [{'sheetId': 388329788,\n",
111+
" 'domains': [{'domain': {'sourceRange': {'sources': [{'sheetId': 1535415780,\n",
109112
" 'startRowIndex': 0,\n",
110-
" 'endRowIndex': 13,\n",
113+
" 'endRowIndex': 24,\n",
111114
" 'startColumnIndex': 0,\n",
112115
" 'endColumnIndex': 1}]}}}],\n",
113-
" 'series': [{'series': {'sourceRange': {'sources': [{'sheetId': 388329788,\n",
116+
" 'series': [{'series': {'sourceRange': {'sources': [{'sheetId': 1535415780,\n",
114117
" 'startRowIndex': 0,\n",
115-
" 'endRowIndex': 13,\n",
118+
" 'endRowIndex': 24,\n",
116119
" 'startColumnIndex': 1,\n",
117120
" 'endColumnIndex': 2}]}},\n",
118121
" 'targetAxis': 'LEFT_AXIS'},\n",
119-
" {'series': {'sourceRange': {'sources': [{'sheetId': 388329788,\n",
122+
" {'series': {'sourceRange': {'sources': [{'sheetId': 1535415780,\n",
120123
" 'startRowIndex': 0,\n",
121-
" 'endRowIndex': 13,\n",
124+
" 'endRowIndex': 24,\n",
122125
" 'startColumnIndex': 2,\n",
123126
" 'endColumnIndex': 3}]}},\n",
124127
" 'targetAxis': 'LEFT_AXIS'}],\n",
125128
" 'headerCount': 1},\n",
126129
" 'hiddenDimensionStrategy': 'SKIP_HIDDEN_ROWS_AND_COLUMNS',\n",
127130
" 'titleTextFormat': {'fontFamily': 'Roboto'},\n",
128131
" 'fontName': 'Roboto'},\n",
129-
" 'position': {'overlayPosition': {'anchorCell': {'sheetId': 388329788,\n",
132+
" 'position': {'overlayPosition': {'anchorCell': {'sheetId': 1535415780,\n",
130133
" 'columnIndex': 5},\n",
131134
" 'offsetXPixels': 75,\n",
132135
" 'offsetYPixels': 25,\n",
133136
" 'widthPixels': 600,\n",
134137
" 'heightPixels': 371}}}}}]}"
135138
]
136139
},
137-
"execution_count": 7,
140+
"execution_count": 6,
138141
"metadata": {},
139142
"output_type": "execute_result"
140143
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import re
2+
3+
import pandas as pd
4+
import requests
5+
6+
from analytics.entities import DIMENSION_PAGE_PATH
7+
8+
ANVIL_DATASETS_API_URL = "https://service.explore.anvilproject.org/index/datasets"
9+
DATASETS_PATH_PATTERN = re.compile(r"^/datasets/([^/]+)")
10+
INSERT_AFTER_COLUMN = DIMENSION_PAGE_PATH["alias"]
11+
PAGE_PATH_COLUMN = DIMENSION_PAGE_PATH["alias"]
12+
DATASET_TITLE_COLUMN = "Dataset Title"
13+
14+
15+
def fetch_dataset_title_map() -> dict[str, str]:
16+
"""Fetch all datasets from the AnVIL API and return a mapping of entryId to title.
17+
18+
Paginates through the full catalog using the API's ``pagination.next`` URL.
19+
"""
20+
title_map: dict[str, str] = {}
21+
url: str | None = ANVIL_DATASETS_API_URL
22+
params: dict[str, int] | None = {"size": 1000}
23+
while url is not None:
24+
response = requests.get(url, params=params)
25+
response.raise_for_status()
26+
data = response.json()
27+
for hit in data["hits"]:
28+
entry_id = hit.get("entryId")
29+
datasets = hit.get("datasets", [])
30+
if entry_id and datasets:
31+
title = datasets[0].get("title", "")
32+
if title:
33+
title_map[entry_id] = title
34+
url = data.get("pagination", {}).get("next")
35+
params = None # subsequent URLs already include query params
36+
return title_map
37+
38+
39+
def add_dataset_titles(df: pd.DataFrame, title_map: dict[str, str] | None = None) -> pd.DataFrame:
    """Add a 'Dataset Title' column to a pageviews dataframe.

    For rows where the page path matches /datasets/[id], the title is looked up
    in ``title_map``. All other rows (including unknown IDs and non-string
    paths such as NaN) get "N/A".

    Args:
        df: A dataframe containing the page-path column (``PAGE_PATH_COLUMN``).
            It is not modified.
        title_map: Optional pre-fetched ID-to-title mapping. When omitted, the
            mapping is fetched with ``fetch_dataset_title_map()``.

    Returns:
        A new dataframe with a "Dataset Title" column inserted immediately
        after the column specified by the INSERT_AFTER_COLUMN global variable.
        If the input already has a "Dataset Title" column, it is replaced.
    """
    if title_map is None:
        title_map = fetch_dataset_title_map()

    def get_title(path: object) -> str:
        # Missing values arrive as NaN/None; re.match would raise on them.
        if not isinstance(path, str):
            return "N/A"
        match = DATASETS_PATH_PATTERN.match(path)
        if match is None:
            return "N/A"
        return title_map.get(match.group(1), "N/A")

    titles = df[PAGE_PATH_COLUMN].map(get_title)

    # Drop any pre-existing title column BEFORE locating the anchor column, so
    # the insert position stays correct even when the old title column sat to
    # the left of the anchor. drop() returns a new frame, leaving `df` intact.
    result = df.drop(columns=[DATASET_TITLE_COLUMN], errors="ignore")
    insert_at = list(result.columns).index(INSERT_AFTER_COLUMN) + 1
    result.insert(insert_at, DATASET_TITLE_COLUMN, titles)
    return result

0 commit comments

Comments
 (0)