ucbepic · Ruiying-Ma · Jun 19, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -1,14 +1,7 @@
 # Auto detect text files and perform LF normalization
 * text=auto
-*.sqlite filter=lfs diff=lfs merge=lfs -text
-*.bson filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.db filter=lfs diff=lfs merge=lfs -text
-*.sql filter=lfs diff=lfs merge=lfs -text
-query_crmarenapro/query_dataset/*.db filter=lfs diff=lfs merge=lfs -text
-query_crmarenapro/query_dataset/*.duckdb filter=lfs diff=lfs merge=lfs -text
-query_crmarenapro/query_dataset/*.sql filter=lfs diff=lfs merge=lfs -text
-query_crmarenapro/query_dataset/hidden/*.db filter=lfs diff=lfs merge=lfs -text
-query_crmarenapro/query_dataset/hidden/*.duckdb filter=lfs diff=lfs merge=lfs -text
-query_crmarenapro/query_dataset/hidden/*.sql filter=lfs diff=lfs merge=lfs -text
-query_krama/query_dataset/misc_files/* filter=lfs diff=lfs merge=lfs -text
+
+# NOTE: Large dataset files (*.sql, *.db, *.sqlite, *.duckdb, *.bson, ...) are
+# NOT stored in Git/Git LFS. They are mirrored on the Hugging Face Hub and
+# fetched via `bash download.sh` (see dataset_manifest.tsv). Do not re-add LFS
+# filters for these paths.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,11 @@ __pycache__/
 # runtime
 claude_projects/
 
+# failed traces collection
+failed_traces_claude_code_sonnet_4_6/
+failed_traces_claude_code_sonnet_4_6.zip
+collect_failed_traces.py
+
 # results
 results/
 
@@ -51,7 +56,8 @@ query_krama/query_dataset/misc_files/
 query_krama/query_dataset/geo.db
 query_krama/query*/ground_truth.py
 query_krama/scripts/
-query_civic_unstructured/
+query_civic_unstructured_old/
+query_civic_unstructured/query*/compute_ground_truth.py
 query_paper_unstructured/
 query_notice_unstructured/
 potentialdb_for_query_snowflake
@@ -79,6 +85,7 @@ query_googlelocal/query*/ground_truth.py
 query_music_brainz_20k/create_databases.py
 query_music_brainz_20k/create_queries.py
 query_music_brainz_20k/create_track_repre.py
+query_music_brainz_20k/ground_truth_dataset/
 query_PANCANCER_ATLAS/query*/ground_truth.py
 query_PANCANCER_ATLAS/detail_hint_ifneeded.txt
 # large db: 5.1G
@@ -99,4 +106,43 @@ query_yelp/query*/ground_truth.py
 # manifest are the answer key — keep local-only. Construction code in
 # manual_querycode/ IS shipped (see PROVENANCE.md) for full reproducibility.
 query_cve/clean/
-query_usaspending/clean/
+query_cve/query*/logs
+query_usaspending/clean/
+query_usaspending/query*/logs
+# === Dataset files mirrored on Hugging Face (fetched by download.sh) ===
+# These are NOT committed to git. See dataset_manifest.tsv.
+query_DEPS_DEV_V1/query_dataset/package_query.db
+query_DEPS_DEV_V1/query_dataset/project_query.db
+query_GITHUB_REPOS/query_dataset/repo_artifacts.db
+query_GITHUB_REPOS/query_dataset/repo_metadata.db
+query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql
+query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db
+query_PATENTS/query_dataset/patent_CPCDefinition.sql
+query_agnews/query_dataset/metadata.db
+query_bookreview/query_dataset/books_info.sql
+query_bookreview/query_dataset/review_query.db
+query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson
+query_civic_unstructured/query_dataset/funding.db
+query_crmarenapro/query_dataset/activities.duckdb
+query_crmarenapro/query_dataset/core_crm.db
+query_crmarenapro/query_dataset/products_orders.db
+query_crmarenapro/query_dataset/sales_pipeline.duckdb
+query_crmarenapro/query_dataset/support.sql
+query_crmarenapro/query_dataset/territory.db
+query_cve/query_dataset/kev.sql
+query_cve/query_dataset/vulns.db
+query_googlelocal/query_dataset/business_description.sql
+query_googlelocal/query_dataset/review_query.db
+query_imdb/query_dataset/movies.sql
+query_imdb/query_dataset/people.sqlite
+query_krama/query_dataset/domain_assets.db
+query_krama/query_dataset/domain_docs/domain_docs_db/files.bson
+query_krama/query_dataset/us_geo.db
+query_music_brainz_20k/query_dataset/tracks.db
+query_stockindex/query_dataset/indexInfo_query.db
+query_stockindex/query_dataset/indextrade_query.db
+query_stockmarket/query_dataset/stockinfo_query.db
+query_stockmarket/query_dataset/stocktrade_query.db
+query_usaspending/query_dataset/contracts.sql
+query_usaspending/query_dataset/recipients.db
+query_yelp/query_dataset/yelp_user.db
diff --git a/README.md b/README.md
@@ -137,25 +137,16 @@ Before running DAB, please complete the following setup steps.
 
 ### Clone the Repository
 
-Some datasets in DAB contain large database files exceeding 50MB and are thus stored in Git LFS. To automatically get the full datasets, you need to ensure you have Git LFS enabled:
-```bash
-git lfs install
-```
-Then you can run:
 ```bash
 git clone https://github.com/ucbepic/DataAgentBench.git
 cd DataAgentBench
 ```
-One database file of `PATENTS` dataset, `patent_publication.db`, exceeds Git LFS file-size limits (5GB). It is on [Google Drive](https://drive.google.com/file/d/1pALQ1UH-OwaEUeGYAx47uCyzClfK94XC/view?usp=sharing).
-
-**Option 1:**
-Manually download the database to `query_PATENTS/query_dataset/patent_publication.db`
 
-**Option 2:**
-Run the following script to automatically download the database:
+The large dataset files (PostgreSQL dumps, SQLite/DuckDB databases, MongoDB BSON, etc.) are **not** stored in the Git repository. They are mirrored on the [Hugging Face Hub](https://huggingface.co/datasets/ruiyingm/DataAgentBench-data) (~13.4GB total). After cloning, download them with:
 ```bash
 bash download.sh
 ```
+This reads [`dataset_manifest.tsv`](./dataset_manifest.tsv), downloads every missing dataset file into its correct location, and verifies each downloaded file against its recorded sha256 checksum. Re-running is safe — files already present with the expected size are skipped. To also re-hash the files already on disk (slow), run `VERIFY_ALL=1 bash download.sh`.
 
 
 ### Install Dependencies

diff --git a/dataset_manifest.tsv b/dataset_manifest.tsv
@@ -0,0 +1,36 @@
+query_DEPS_DEV_V1/query_dataset/package_query.db	6676259f784aa89aec838a3e4a74bc68793085e49520c819b2356205a6eaf68a	538877952
+query_DEPS_DEV_V1/query_dataset/project_query.db	e6b171594cbb8dbb5298b54206b235a396b0eee9dd8c2c68deefab8d76c72e6a	11022336
+query_GITHUB_REPOS/query_dataset/repo_artifacts.db	d514c66eb07403f79b8da2ff6a79a6c82f2d862cadaf71d536ea58644cd07bbc	361246720
+query_GITHUB_REPOS/query_dataset/repo_metadata.db	e9b71beb5d7e2b344f996baf19fb34231c24ab354019d65bd14a1cbfc62c67a1	567222272
+query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql	ce57356f7a0f72e1c43a1fc2083d46293991645cbce2f6f237c731bf4035fdd6	7555304
+query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db	7e1131c386014ea8bdd528d2fd040a85f6891dd9e46e8dc18947f7a712cf8610	293875712
+query_PATENTS/query_dataset/patent_CPCDefinition.sql	f888382228bf87538744194ca2ea1be126c316423d6d10e35e7d4ec226da0144	135238448
+query_PATENTS/query_dataset/patent_publication.db	1e22b92b9849b3f7a07dcf89075f37787c84feacb0cdbc7b4bd96ae4fa88071c	5421027328
+query_agnews/query_dataset/metadata.db	30cfb96d29b4e49c5f6ae6e5cb2c754d98e118a183ddd57ae44ee3564aeb1955	4046848
+query_bookreview/query_dataset/books_info.sql	80acb3ece574f26b634107c9c553dc5ab0c95ca11f6b226c9602dff9465393d9	649048
+query_bookreview/query_dataset/review_query.db	01f07a5894f058613374694a8f46d19341bf1fa316367494d019191e772b3e2c	1093632
+query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson	57fd5ff32b81478b696b6af7583dfd64744c01b3ed6bf112b3f8ee00c2e45bce	237654
+query_civic_unstructured/query_dataset/funding.db	7e2e209f30c79917882db69ef379e0d4f58be23e7117e6caf14c31ebdbbfefc3	106496
+query_crmarenapro/query_dataset/activities.duckdb	f3299b9214885f00c71c9af77d89b62ad73f3e545e3cf5baa4861d1265151c60	21245952
+query_crmarenapro/query_dataset/core_crm.db	aa595797e41db401136a0a68bf14bac44707f9f86ed235f61d9f051728e50506	188416
+query_crmarenapro/query_dataset/products_orders.db	92ad52f2a624ee5e0cdf7cd4cc64ac60b1e3f7af2c672be27f651f80af4a6bdb	143360
+query_crmarenapro/query_dataset/sales_pipeline.duckdb	5b44cc3754820bff20c3f25e50cf09c6be401f86e69df6b9d3ee7a4cf64ac5af	2371584
+query_crmarenapro/query_dataset/support.sql	5248cd64cddf4936027859c5d6f74b2d32fbeb4e1939d784b2ff26d45d83950c	8860941
+query_crmarenapro/query_dataset/territory.db	1a9650d4ba3c0a3690b936d294feb31a3227bf6f2ebe886f74cf12e14936366a	24576
+query_cve/query_dataset/kev.sql	76f15a345a8dd1006a5b21c198e52952846efa7dc5581727d5052137218a8bfe	1030796
+query_cve/query_dataset/vulns.db	e76efe1a177354afcc0929610eabbcf088f29af7bc1638d1def78085f4ca601b	20504576
+query_googlelocal/query_dataset/business_description.sql	cac36db1c60ab1fcbc661cc7df46c9980db1c2fcdaedb65b57f7a2618339ddd4	38204
+query_googlelocal/query_dataset/review_query.db	1e9ef561d4edf62ff49bf874253935caf8d404a79c60d21b9b0fc40ee041692c	585728
+query_imdb/query_dataset/movies.sql	9ef5452628ec8d07fbdc283fdb420168bd97b718dda25648087f2fe178bb1daa	1622189538
+query_imdb/query_dataset/people.sqlite	ba583402740f2f5ef42b5be7ac1dd4c47a0cac990ddf7332575e203b976c1b10	2639507456
+query_krama/query_dataset/domain_assets.db	d657f2253337b19694e7785b7d7ccf4398df95fe78012fc3b9e803715d15a3d1	160571392
+query_krama/query_dataset/domain_docs/domain_docs_db/files.bson	3bab163bfef5761e4e1407943eed4f9b2b1af0f6c14dd08931980f5df3dbd376	528766506
+query_krama/query_dataset/us_geo.db	bf442d08de0ffc396d8b750597dba7d0df6659fac44af98ec53b73ae7a31514c	28504064
+query_music_brainz_20k/query_dataset/tracks.db	0c349f70e5bafcba1deb605b49e224abfa95c5a83a44688faa511f9e123a02cc	2035712
+query_stockindex/query_dataset/indexInfo_query.db	976511f3eb213269c81b286dd4517e4948b3d3bb776e221559943f0439916b55	8192
+query_stockindex/query_dataset/indextrade_query.db	12e9cf350be2a6396f6be9f0c2bb057013859956bc8269952dc7a85008973bb1	4468736
+query_stockmarket/query_dataset/stockinfo_query.db	b8cf6b5627c92e17c10e27779afa8aa77c659e44bd4ccc14b10f1116053bc6a2	737280
+query_stockmarket/query_dataset/stocktrade_query.db	e9cbf3c4d05d2eeb919c5b207374aefd19aa02e40dca49f2688618beb180350d	964964352
+query_usaspending/query_dataset/contracts.sql	7fdde155670e23568458570bdb17171b36f4789b49c95dde5461b03ca951a861	4840083
+query_usaspending/query_dataset/recipients.db	008916d8764ba11e759be1499a78c0c090da2fa8b72ff409ed69623c2f1733b5	225280
+query_yelp/query_dataset/yelp_user.db	155ecf989e62e0d087ed9d4586893e6825ab56504bc5e62ad46d454067f9b730	3420160
diff --git a/download.sh b/download.sh
@@ -1,39 +1,109 @@
 #!/usr/bin/env bash
-set -e
-
-# db link: https://drive.google.com/file/d/1pALQ1UH-OwaEUeGYAx47uCyzClfK94XC/view?usp=sharing
-FILE_ID="1pALQ1UH-OwaEUeGYAx47uCyzClfK94XC"
-OUTPUT_PATH="query_PATENTS/query_dataset/patent_publication.db"
-
-# check if file already exists and has size > 5GB
-if [ -f "$OUTPUT_PATH" ]; then
-    FILE_SIZE=$(stat -c%s "$OUTPUT_PATH")
-    if [ "$FILE_SIZE" -gt 5368709120 ]; then
-        echo "File already exists and is larger than 5GB. Skipping download."
-        exit 0
-    else
-        echo "File exists but is smaller than 5GB. Re-downloading..."
-        rm "$OUTPUT_PATH"
-    fi
-fi
+set -euo pipefail
+
+# DataAgentBench dataset downloader.
+#
+# All large dataset files (PostgreSQL dumps, SQLite/DuckDB databases, MongoDB
+# BSON, etc.) are mirrored on the Hugging Face Hub instead of Git LFS. This
+# script downloads every file listed in dataset_manifest.tsv into its correct
+# location and verifies each one against the recorded sha256 checksum.
+#
+# Usage:
+#   bash download.sh              # download any missing / wrong-sized files
+#   VERIFY_ALL=1 bash download.sh # re-hash every existing file (slow, ~13.4GB)
+#
+# Re-running is safe: files already present with the expected size are skipped.
 
-echo "Downloading database (~5GB)..."
+REPO_ID="${DAB_HF_REPO:-ruiyingm/DataAgentBench-data}"
+MANIFEST="${DAB_MANIFEST:-dataset_manifest.tsv}"
 
-# Create directory if needed
-mkdir -p "$(dirname "$OUTPUT_PATH")"
+cd "$(dirname "$0")"
+
+if [ ! -f "$MANIFEST" ]; then
+    echo "ERROR: manifest '$MANIFEST' not found (run from the repo root)." >&2
+    exit 1
+fi
 
-# Download using gdown
-if ! command -v gdown &> /dev/null; then
-    echo "gdown not found. Installing..."
-    pip install gdown
+# huggingface_hub powers the download. Auto-install it if absent, via
+# `python -m pip` so it lands in the interpreter that runs the import below.
+if ! python -c "import huggingface_hub" >/dev/null 2>&1; then
+    echo "huggingface_hub not found. Installing..."
+    python -m pip install -q "huggingface_hub>=0.23"
 fi
 
-gdown --id "$FILE_ID" -O "$OUTPUT_PATH"
+echo "Downloading datasets from https://huggingface.co/datasets/${REPO_ID}"
+
+REPO_ID="$REPO_ID" MANIFEST="$MANIFEST" VERIFY_ALL="${VERIFY_ALL:-0}" python - <<'PY'
+import hashlib
+import os
+import sys
+
+from huggingface_hub import hf_hub_download
+
+repo_id = os.environ["REPO_ID"]
+manifest = os.environ["MANIFEST"]
+verify_all = os.environ.get("VERIFY_ALL", "0") == "1"
+
+
+def sha256(path):
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        for chunk in iter(lambda: f.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+entries = []
+with open(manifest) as f:
+    for line in f:
+        line = line.rstrip("\n")
+        if not line or line.startswith("#"):
+            continue
+        path, digest, size = line.split("\t")
+        entries.append((path, digest, int(size)))
+
+downloaded = skipped = 0
+failures = []
+
+for path, digest, size in entries:
+    # Skip if already present and the right size (and, when VERIFY_ALL, the
+    # right hash). This avoids re-hashing ~19GB on every run.
+    if os.path.isfile(path) and os.path.getsize(path) == size:
+        if not verify_all or sha256(path) == digest:
+            print(f"  [skip] {path}")
+            skipped += 1
+            continue
+
+    print(f"  [get ] {path} ({size / 1e6:.1f} MB)")
+    try:
+        hf_hub_download(
+            repo_id=repo_id,
+            repo_type="dataset",
+            filename=path,
+            local_dir=".",
+        )
+    except Exception as exc:  # noqa: BLE001
+        print(f"  [FAIL] download error for {path}: {exc}")
+        failures.append(path)
+        continue
 
-echo "Download complete."
+    actual = sha256(path)
+    if actual != digest:
+        print(f"  [FAIL] checksum mismatch for {path}")
+        print(f"         expected {digest}")
+        print(f"         actual   {actual}")
+        failures.append(path)
+        continue
+    downloaded += 1
 
-# Optional: verify checksum
-echo "Verifying file size..."
-du -sh "$OUTPUT_PATH"
+print()
+print(f"Downloaded: {downloaded}, skipped: {skipped}, failed: {len(failures)}")
+if failures:
+    print("Failed files:")
+    for p in failures:
+        print(f"  - {p}")
+    sys.exit(1)
+print("All datasets present and verified.")
+PY
 
-echo "Done."
+echo "Done."
diff --git a/query_DEPS_DEV_V1/query_dataset/package_query.db b/query_DEPS_DEV_V1/query_dataset/package_query.db
diff --git a/query_DEPS_DEV_V1/query_dataset/project_query.db b/query_DEPS_DEV_V1/query_dataset/project_query.db
diff --git a/query_GITHUB_REPOS/query_dataset/repo_artifacts.db b/query_GITHUB_REPOS/query_dataset/repo_artifacts.db
diff --git a/query_GITHUB_REPOS/query_dataset/repo_metadata.db b/query_GITHUB_REPOS/query_dataset/repo_metadata.db
diff --git a/query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql b/query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql
diff --git a/query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db b/query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db
diff --git a/query_PATENTS/query_dataset/patent_CPCDefinition.sql b/query_PATENTS/query_dataset/patent_CPCDefinition.sql
diff --git a/query_agnews/query_dataset/metadata.db b/query_agnews/query_dataset/metadata.db
diff --git a/query_bookreview/query_dataset/books_info.sql b/query_bookreview/query_dataset/books_info.sql
diff --git a/query_bookreview/query_dataset/review_query.db b/query_bookreview/query_dataset/review_query.db
diff --git a/query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson b/query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson
diff --git a/query_civic_unstructured/query_dataset/funding.db b/query_civic_unstructured/query_dataset/funding.db
diff --git a/query_crmarenapro/query_dataset/activities.duckdb b/query_crmarenapro/query_dataset/activities.duckdb
diff --git a/query_crmarenapro/query_dataset/core_crm.db b/query_crmarenapro/query_dataset/core_crm.db
diff --git a/query_crmarenapro/query_dataset/products_orders.db b/query_crmarenapro/query_dataset/products_orders.db
diff --git a/query_crmarenapro/query_dataset/sales_pipeline.duckdb b/query_crmarenapro/query_dataset/sales_pipeline.duckdb
diff --git a/query_crmarenapro/query_dataset/support.sql b/query_crmarenapro/query_dataset/support.sql
diff --git a/query_crmarenapro/query_dataset/territory.db b/query_crmarenapro/query_dataset/territory.db
diff --git a/query_cve/query_dataset/kev.sql b/query_cve/query_dataset/kev.sql
diff --git a/query_cve/query_dataset/vulns.db b/query_cve/query_dataset/vulns.db
diff --git a/query_googlelocal/query_dataset/business_description.sql b/query_googlelocal/query_dataset/business_description.sql
diff --git a/query_googlelocal/query_dataset/review_query.db b/query_googlelocal/query_dataset/review_query.db
diff --git a/query_imdb/query_dataset/movies.sql b/query_imdb/query_dataset/movies.sql
diff --git a/query_imdb/query_dataset/people.sqlite b/query_imdb/query_dataset/people.sqlite