Skip to content

Commit 22f1c7f

Browse files
committed
Search all 1,500+ publications via auto-pagination, remove limit param
1 parent e092361 commit 22f1c7f

3 files changed

Lines changed: 33 additions & 20 deletions

File tree

dapi/client.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -368,20 +368,19 @@ def search(
368368
pi: Optional[str] = None,
369369
keyword: Optional[str] = None,
370370
publication_type: Optional[str] = None,
371-
limit: int = 100,
372371
output: str = "df",
373372
):
374-
"""Search published datasets with optional filters.
373+
"""Search all published datasets with optional filters.
375374
376-
All filters are AND-combined and case-insensitive.
375+
Searches across all ~1,500 published datasets. Filters are AND-combined
376+
and case-insensitive.
377377
378378
Args:
379379
query (str, optional): General search across title, description, keywords, PI.
380380
pi (str, optional): Filter by PI name.
381381
keyword (str, optional): Filter by keyword.
382382
publication_type (str, optional): Filter by type: "simulation",
383383
"experimental", "field_recon", "other", "hybrid_simulation".
384-
limit (int, optional): Max publications to fetch. Defaults to 100.
385384
output (str, optional): "df" for DataFrame (default), "list" for dicts.
386385
387386
Returns:
@@ -400,7 +399,6 @@ def search(
400399
pi=pi,
401400
keyword=keyword,
402401
publication_type=publication_type,
403-
limit=limit,
404402
output=output,
405403
)
406404

dapi/publications.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
_DS_PUBLICATIONS_API = "https://designsafe-ci.org/api/publications/v2/"
1111
_PUBLISHED_SYSTEM_ID = "designsafe.storage.published"
12+
_PAGE_SIZE = 100
1213

1314

1415
def _get_auth_headers(t: Tapis) -> Dict[str, str]:
@@ -17,6 +18,28 @@ def _get_auth_headers(t: Tapis) -> Dict[str, str]:
1718
return {"X-Tapis-Token": token, "Authorization": f"Bearer {token}"}
1819

1920

21+
def _fetch_all_publications(headers: Dict[str, str]) -> List[Dict]:
    """Fetch every publication from the publications API, page by page.

    Requests pages of ``_PAGE_SIZE`` records from ``_DS_PUBLICATIONS_API``
    at increasing offsets and concatenates each response's ``"result"`` list.

    Args:
        headers: HTTP headers sent with every page request.

    Returns:
        All publication records collected across pages, in the order the
        API returned them.

    Raises:
        requests.HTTPError: If any page responds with an error status
            (raised by ``resp.raise_for_status()``).
    """
    all_pubs: List[Dict] = []
    offset = 0
    while True:
        resp = requests.get(
            _DS_PUBLICATIONS_API,
            headers=headers,
            params={"limit": _PAGE_SIZE, "offset": offset},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()

        # Tolerate a missing or null "result" so an empty page ends the loop
        # instead of raising.
        batch = data.get("result") or []
        all_pubs.extend(batch)

        # A short (or empty) page means the listing is exhausted.
        if len(batch) < _PAGE_SIZE:
            break

        # Use "total" as a stop signal only when the API actually reports it.
        # Defaulting a missing total to 0 would make this check always true
        # and silently truncate the result to the first page.
        total = data.get("total")
        if isinstance(total, int) and len(all_pubs) >= total:
            break

        # Advance by what was actually received so no records are skipped.
        offset += len(batch)
    return all_pubs
41+
42+
2043
def _pi_display(pi: Optional[Dict]) -> str:
2144
"""Format PI dict as display name."""
2245
if not pi:
@@ -93,11 +116,11 @@ def search_publications(
93116
pi: Optional[str] = None,
94117
keyword: Optional[str] = None,
95118
publication_type: Optional[str] = None,
96-
limit: int = 100,
97119
output: str = "df",
98120
) -> Union[pd.DataFrame, List[Dict]]:
99-
"""Search published datasets with optional filters.
121+
"""Search all published datasets with optional filters.
100122
123+
Fetches all publications (~1,500) and filters client-side.
101124
All filters are case-insensitive and combined with AND logic.
102125
At least one filter (query, pi, keyword, or publication_type) must be provided.
103126
@@ -108,7 +131,6 @@ def search_publications(
108131
keyword (str, optional): Filter by keyword (partial match against keywords list).
109132
publication_type (str, optional): Filter by type: "simulation", "experimental",
110133
"field_recon", "other", "hybrid_simulation".
111-
limit (int, optional): Max publications to fetch before filtering. Defaults to 100.
112134
output (str, optional): "df" for DataFrame (default), "list" for list of dicts.
113135
114136
Returns:
@@ -128,19 +150,12 @@ def search_publications(
128150

129151
headers = _get_auth_headers(t)
130152
try:
131-
resp = requests.get(
132-
_DS_PUBLICATIONS_API,
133-
headers=headers,
134-
params={"limit": limit, "offset": 0},
135-
timeout=30,
136-
)
137-
resp.raise_for_status()
153+
all_pubs = _fetch_all_publications(headers)
138154
except requests.RequestException as e:
139155
raise FileOperationError(f"Failed to search publications: {e}") from e
140156

141-
data = resp.json()
142157
matches = []
143-
for p in data.get("result", []):
158+
for p in all_pubs:
144159
# Apply filters with AND logic
145160
if query:
146161
searchable = " ".join(

docs/publications.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ ds.publications.search(publication_type="simulation")
4141
# Combine filters (AND logic)
4242
ds.publications.search(keyword="storm surge", publication_type="simulation")
4343

44-
# Increase search pool (API returns 100 by default)
45-
ds.publications.search(keyword="earthquake", limit=500)
44+
# Combine filters: experimental datasets matching "earthquake"
45+
ds.publications.search(keyword="earthquake", publication_type="experimental")
4646
```
4747

4848
## Get publication details
@@ -79,7 +79,7 @@ DataFrame columns: `name`, `type`, `size`, `lastModified`, `path`.
7979

8080
## How it works
8181

82-
1. **Publication listing and search**: dapi queries the DesignSafe portal API (`/api/publications/v2/`). The API returns all publications; search filtering is done client-side.
82+
1. **Publication listing and search**: dapi queries the DesignSafe portal API (`/api/publications/v2/`). For search, all ~1,500 publications are fetched via pagination (~3 seconds) and filtered client-side.
8383

8484
2. **Publication detail**: dapi queries `/api/publications/v2/PRJ-XXXX/` which returns metadata in the `tree.children[0].value` structure, including DOIs, authors, and data types.
8585

0 commit comments

Comments
 (0)