diff --git a/.gitattributes b/.gitattributes index 69bbb1a727..6ba8c91266 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,14 +1,7 @@ # Auto detect text files and perform LF normalization * text=auto -*.sqlite filter=lfs diff=lfs merge=lfs -text -*.bson filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.db filter=lfs diff=lfs merge=lfs -text -*.sql filter=lfs diff=lfs merge=lfs -text -query_crmarenapro/query_dataset/*.db filter=lfs diff=lfs merge=lfs -text -query_crmarenapro/query_dataset/*.duckdb filter=lfs diff=lfs merge=lfs -text -query_crmarenapro/query_dataset/*.sql filter=lfs diff=lfs merge=lfs -text -query_crmarenapro/query_dataset/hidden/*.db filter=lfs diff=lfs merge=lfs -text -query_crmarenapro/query_dataset/hidden/*.duckdb filter=lfs diff=lfs merge=lfs -text -query_crmarenapro/query_dataset/hidden/*.sql filter=lfs diff=lfs merge=lfs -text -query_krama/query_dataset/misc_files/* filter=lfs diff=lfs merge=lfs -text + +# NOTE: Large dataset files (*.sql, *.db, *.sqlite, *.duckdb, *.bson, ...) are +# NOT stored in Git/Git LFS. They are mirrored on the Hugging Face Hub and +# fetched via `bash download.sh` (see dataset_manifest.tsv). Do not re-add LFS +# filters for these paths. 
diff --git a/.gitignore b/.gitignore index ecf571a926..9a156c5902 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,11 @@ __pycache__/ # runtime claude_projects/ +# failed traces collection +failed_traces_claude_code_sonnet_4_6/ +failed_traces_claude_code_sonnet_4_6.zip +collect_failed_traces.py + # results results/ @@ -51,7 +56,8 @@ query_krama/query_dataset/misc_files/ query_krama/query_dataset/geo.db query_krama/query*/ground_truth.py query_krama/scripts/ -query_civic_unstructured/ +query_civic_unstructured_old/ +query_civic_unstructured/query*/compute_ground_truth.py query_paper_unstructured/ query_notice_unstructured/ potentialdb_for_query_snowflake @@ -79,6 +85,7 @@ query_googlelocal/query*/ground_truth.py query_music_brainz_20k/create_databases.py query_music_brainz_20k/create_queries.py query_music_brainz_20k/create_track_repre.py +query_music_brainz_20k/ground_truth_dataset/ query_PANCANCER_ATLAS/query*/ground_truth.py query_PANCANCER_ATLAS/detail_hint_ifneeded.txt # large db: 5.1G @@ -99,4 +106,43 @@ query_yelp/query*/ground_truth.py # manifest are the answer key — keep local-only. Construction code in # manual_querycode/ IS shipped (see PROVENANCE.md) for full reproducibility. query_cve/clean/ -query_usaspending/clean/ \ No newline at end of file +query_cve/query*/logs +query_usaspending/clean/ +query_usaspending/query*/logs +# === Dataset files mirrored on Hugging Face (fetched by download.sh) === +# These are NOT committed to git. See dataset_manifest.tsv. 
+query_DEPS_DEV_V1/query_dataset/package_query.db +query_DEPS_DEV_V1/query_dataset/project_query.db +query_GITHUB_REPOS/query_dataset/repo_artifacts.db +query_GITHUB_REPOS/query_dataset/repo_metadata.db +query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql +query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db +query_PATENTS/query_dataset/patent_CPCDefinition.sql +query_agnews/query_dataset/metadata.db +query_bookreview/query_dataset/books_info.sql +query_bookreview/query_dataset/review_query.db +query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson +query_civic_unstructured/query_dataset/funding.db +query_crmarenapro/query_dataset/activities.duckdb +query_crmarenapro/query_dataset/core_crm.db +query_crmarenapro/query_dataset/products_orders.db +query_crmarenapro/query_dataset/sales_pipeline.duckdb +query_crmarenapro/query_dataset/support.sql +query_crmarenapro/query_dataset/territory.db +query_cve/query_dataset/kev.sql +query_cve/query_dataset/vulns.db +query_googlelocal/query_dataset/business_description.sql +query_googlelocal/query_dataset/review_query.db +query_imdb/query_dataset/movies.sql +query_imdb/query_dataset/people.sqlite +query_krama/query_dataset/domain_assets.db +query_krama/query_dataset/domain_docs/domain_docs_db/files.bson +query_krama/query_dataset/us_geo.db +query_music_brainz_20k/query_dataset/tracks.db +query_stockindex/query_dataset/indexInfo_query.db +query_stockindex/query_dataset/indextrade_query.db +query_stockmarket/query_dataset/stockinfo_query.db +query_stockmarket/query_dataset/stocktrade_query.db +query_usaspending/query_dataset/contracts.sql +query_usaspending/query_dataset/recipients.db +query_yelp/query_dataset/yelp_user.db diff --git a/README.md b/README.md index e14bedba3a..58bde2ce2d 100644 --- a/README.md +++ b/README.md @@ -137,25 +137,16 @@ Before running DAB, please complete the following setup steps. 
### Clone the Repository -Some datasets in DAB contain large database files exceeding 50MB and are thus stored in Git LFS. To automatically get the full datasets, you need to ensure you have Git LFS enabled: -```bash -git lfs install -``` -Then you can run: ```bash git clone https://github.com/ucbepic/DataAgentBench.git cd DataAgentBench ``` -One database file of `PATENTS` dataset, `patent_publication.db`, exceeds Git LFS file-size limits (5GB). It is on [Google Drive](https://drive.google.com/file/d/1pALQ1UH-OwaEUeGYAx47uCyzClfK94XC/view?usp=sharing). - -**Option 1:** -Manually download the database to `query_PATENTS/query_dataset/patent_publication.db` -**Option 2:** -Run the following script to automatically download the database: +The large dataset files (PostgreSQL dumps, SQLite/DuckDB databases, MongoDB BSON, etc.) are **not** stored in the Git repository. They are mirrored on the [Hugging Face Hub](https://huggingface.co/datasets/ruiyingm/DataAgentBench-data) (~13.4GB total). After cloning, download them with: ```bash bash download.sh ``` +This reads [`dataset_manifest.tsv`](./dataset_manifest.tsv), downloads every missing dataset file into its correct location, and verifies each download against its recorded sha256 checksum. Re-running is safe — files already present with the expected size are skipped without being re-hashed. To re-hash every existing file as well (slow), run `VERIFY_ALL=1 bash download.sh`. 
### Install Dependencies diff --git a/dataset_manifest.tsv b/dataset_manifest.tsv new file mode 100644 index 0000000000..6c860dc18b --- /dev/null +++ b/dataset_manifest.tsv @@ -0,0 +1,36 @@ +query_DEPS_DEV_V1/query_dataset/package_query.db 6676259f784aa89aec838a3e4a74bc68793085e49520c819b2356205a6eaf68a 538877952 +query_DEPS_DEV_V1/query_dataset/project_query.db e6b171594cbb8dbb5298b54206b235a396b0eee9dd8c2c68deefab8d76c72e6a 11022336 +query_GITHUB_REPOS/query_dataset/repo_artifacts.db d514c66eb07403f79b8da2ff6a79a6c82f2d862cadaf71d536ea58644cd07bbc 361246720 +query_GITHUB_REPOS/query_dataset/repo_metadata.db e9b71beb5d7e2b344f996baf19fb34231c24ab354019d65bd14a1cbfc62c67a1 567222272 +query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql ce57356f7a0f72e1c43a1fc2083d46293991645cbce2f6f237c731bf4035fdd6 7555304 +query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db 7e1131c386014ea8bdd528d2fd040a85f6891dd9e46e8dc18947f7a712cf8610 293875712 +query_PATENTS/query_dataset/patent_CPCDefinition.sql f888382228bf87538744194ca2ea1be126c316423d6d10e35e7d4ec226da0144 135238448 +query_PATENTS/query_dataset/patent_publication.db 1e22b92b9849b3f7a07dcf89075f37787c84feacb0cdbc7b4bd96ae4fa88071c 5421027328 +query_agnews/query_dataset/metadata.db 30cfb96d29b4e49c5f6ae6e5cb2c754d98e118a183ddd57ae44ee3564aeb1955 4046848 +query_bookreview/query_dataset/books_info.sql 80acb3ece574f26b634107c9c553dc5ab0c95ca11f6b226c9602dff9465393d9 649048 +query_bookreview/query_dataset/review_query.db 01f07a5894f058613374694a8f46d19341bf1fa316367494d019191e772b3e2c 1093632 +query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson 57fd5ff32b81478b696b6af7583dfd64744c01b3ed6bf112b3f8ee00c2e45bce 237654 +query_civic_unstructured/query_dataset/funding.db 7e2e209f30c79917882db69ef379e0d4f58be23e7117e6caf14c31ebdbbfefc3 106496 +query_crmarenapro/query_dataset/activities.duckdb f3299b9214885f00c71c9af77d89b62ad73f3e545e3cf5baa4861d1265151c60 21245952 
+query_crmarenapro/query_dataset/core_crm.db aa595797e41db401136a0a68bf14bac44707f9f86ed235f61d9f051728e50506 188416 +query_crmarenapro/query_dataset/products_orders.db 92ad52f2a624ee5e0cdf7cd4cc64ac60b1e3f7af2c672be27f651f80af4a6bdb 143360 +query_crmarenapro/query_dataset/sales_pipeline.duckdb 5b44cc3754820bff20c3f25e50cf09c6be401f86e69df6b9d3ee7a4cf64ac5af 2371584 +query_crmarenapro/query_dataset/support.sql 5248cd64cddf4936027859c5d6f74b2d32fbeb4e1939d784b2ff26d45d83950c 8860941 +query_crmarenapro/query_dataset/territory.db 1a9650d4ba3c0a3690b936d294feb31a3227bf6f2ebe886f74cf12e14936366a 24576 +query_cve/query_dataset/kev.sql 76f15a345a8dd1006a5b21c198e52952846efa7dc5581727d5052137218a8bfe 1030796 +query_cve/query_dataset/vulns.db e76efe1a177354afcc0929610eabbcf088f29af7bc1638d1def78085f4ca601b 20504576 +query_googlelocal/query_dataset/business_description.sql cac36db1c60ab1fcbc661cc7df46c9980db1c2fcdaedb65b57f7a2618339ddd4 38204 +query_googlelocal/query_dataset/review_query.db 1e9ef561d4edf62ff49bf874253935caf8d404a79c60d21b9b0fc40ee041692c 585728 +query_imdb/query_dataset/movies.sql 9ef5452628ec8d07fbdc283fdb420168bd97b718dda25648087f2fe178bb1daa 1622189538 +query_imdb/query_dataset/people.sqlite ba583402740f2f5ef42b5be7ac1dd4c47a0cac990ddf7332575e203b976c1b10 2639507456 +query_krama/query_dataset/domain_assets.db d657f2253337b19694e7785b7d7ccf4398df95fe78012fc3b9e803715d15a3d1 160571392 +query_krama/query_dataset/domain_docs/domain_docs_db/files.bson 3bab163bfef5761e4e1407943eed4f9b2b1af0f6c14dd08931980f5df3dbd376 528766506 +query_krama/query_dataset/us_geo.db bf442d08de0ffc396d8b750597dba7d0df6659fac44af98ec53b73ae7a31514c 28504064 +query_music_brainz_20k/query_dataset/tracks.db 0c349f70e5bafcba1deb605b49e224abfa95c5a83a44688faa511f9e123a02cc 2035712 +query_stockindex/query_dataset/indexInfo_query.db 976511f3eb213269c81b286dd4517e4948b3d3bb776e221559943f0439916b55 8192 +query_stockindex/query_dataset/indextrade_query.db 
12e9cf350be2a6396f6be9f0c2bb057013859956bc8269952dc7a85008973bb1 4468736 +query_stockmarket/query_dataset/stockinfo_query.db b8cf6b5627c92e17c10e27779afa8aa77c659e44bd4ccc14b10f1116053bc6a2 737280 +query_stockmarket/query_dataset/stocktrade_query.db e9cbf3c4d05d2eeb919c5b207374aefd19aa02e40dca49f2688618beb180350d 964964352 +query_usaspending/query_dataset/contracts.sql 7fdde155670e23568458570bdb17171b36f4789b49c95dde5461b03ca951a861 4840083 +query_usaspending/query_dataset/recipients.db 008916d8764ba11e759be1499a78c0c090da2fa8b72ff409ed69623c2f1733b5 225280 +query_yelp/query_dataset/yelp_user.db 155ecf989e62e0d087ed9d4586893e6825ab56504bc5e62ad46d454067f9b730 3420160 diff --git a/download.sh b/download.sh index 1ea1b3bef6..2a9b74d5b7 100755 --- a/download.sh +++ b/download.sh @@ -1,39 +1,109 @@ #!/usr/bin/env bash -set -e - -# db link: https://drive.google.com/file/d/1pALQ1UH-OwaEUeGYAx47uCyzClfK94XC/view?usp=sharing -FILE_ID="1pALQ1UH-OwaEUeGYAx47uCyzClfK94XC" -OUTPUT_PATH="query_PATENTS/query_dataset/patent_publication.db" - -# check if file already exists and has size > 5GB -if [ -f "$OUTPUT_PATH" ]; then - FILE_SIZE=$(stat -c%s "$OUTPUT_PATH") - if [ "$FILE_SIZE" -gt 5368709120 ]; then - echo "File already exists and is larger than 5GB. Skipping download." - exit 0 - else - echo "File exists but is smaller than 5GB. Re-downloading..." - rm "$OUTPUT_PATH" - fi -fi +set -euo pipefail + +# DataAgentBench dataset downloader. +# +# All large dataset files (PostgreSQL dumps, SQLite/DuckDB databases, MongoDB +# BSON, etc.) are mirrored on the Hugging Face Hub instead of Git LFS. This +# script downloads every file listed in dataset_manifest.tsv into its correct +# location and verifies each one against the recorded sha256 checksum. +# +# Usage: +# bash download.sh # download any missing / wrong-sized files +# VERIFY_ALL=1 bash download.sh # re-hash every existing file (slow, ~13.4GB) +# +# Re-running is safe: files already present with the expected size are skipped. 
REPO_ID="${DAB_HF_REPO:-ruiyingm/DataAgentBench-data}"
MANIFEST="${DAB_MANIFEST:-dataset_manifest.tsv}"

# Manifest paths are relative to the repo root, which is where this script
# lives, so always work from there regardless of the caller's cwd.
cd "$(dirname "$0")"

if [ ! -f "$MANIFEST" ]; then
    echo "ERROR: manifest '$MANIFEST' not found (run from the repo root)." >&2
    exit 1
fi

# Resolve the interpreter once and use it for BOTH the dependency check/install
# and the download itself. A bare `pip` can belong to a different Python than
# `python`, which would install huggingface_hub where the script cannot see it.
# Override with DAB_PYTHON=/path/to/python if needed.
PYTHON_BIN="${DAB_PYTHON:-}"
if [ -z "$PYTHON_BIN" ]; then
    if command -v python >/dev/null 2>&1; then
        PYTHON_BIN=python
    elif command -v python3 >/dev/null 2>&1; then
        PYTHON_BIN=python3
    else
        echo "ERROR: no Python interpreter found (tried 'python' and 'python3')." >&2
        exit 1
    fi
fi

# huggingface_hub powers the download. Auto-install if absent, mirroring the
# previous gdown bootstrap.
if ! "$PYTHON_BIN" -c "import huggingface_hub" >/dev/null 2>&1; then
    echo "huggingface_hub not found. Installing..."
    "$PYTHON_BIN" -m pip install -q "huggingface_hub>=0.23"
fi

echo "Downloading datasets from https://huggingface.co/datasets/${REPO_ID}"

REPO_ID="$REPO_ID" MANIFEST="$MANIFEST" VERIFY_ALL="${VERIFY_ALL:-0}" "$PYTHON_BIN" - <<'PY'
import hashlib
import os
import sys

from huggingface_hub import hf_hub_download

repo_id = os.environ["REPO_ID"]
manifest = os.environ["MANIFEST"]
verify_all = os.environ.get("VERIFY_ALL", "0") == "1"


def sha256(path):
    """Return the hex sha256 digest of the file at *path* (read in 1 MiB chunks)."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def load_manifest(manifest_path):
    """Parse the TSV manifest into a list of (path, sha256, size) tuples.

    Blank lines and lines starting with '#' are ignored. A malformed line
    aborts the run with its file/line position instead of a bare traceback.
    """
    parsed = []
    with open(manifest_path, encoding="utf-8") as f:
        for lineno, raw in enumerate(f, 1):
            line = raw.rstrip("\n")
            if not line or line.startswith("#"):
                continue
            try:
                path, digest, size = line.split("\t")
                parsed.append((path, digest, int(size)))
            except ValueError:
                sys.exit(
                    f"ERROR: {manifest_path}:{lineno}: expected "
                    "'<path><TAB><sha256><TAB><size>'"
                )
    return parsed


def discard(path):
    """Delete a file that failed verification so the next run re-fetches it.

    Without this, a corrupt file of the right size would be reported as
    [skip] on the next (size-only) run and silently accepted.
    """
    try:
        os.remove(path)
    except OSError as exc:
        print(f"  [warn] could not remove corrupt file {path}: {exc}")


entries = load_manifest(manifest)

downloaded = skipped = 0
failures = []

for path, digest, size in entries:
    # Skip if already present and the right size (and, when VERIFY_ALL, the
    # right hash). This avoids re-hashing ~13.4GB on every run.
    on_disk = os.path.isfile(path)
    if on_disk and os.path.getsize(path) == size:
        if not verify_all or sha256(path) == digest:
            print(f"  [skip] {path}")
            skipped += 1
            continue

    print(f"  [get ] {path} ({size / 1e6:.1f} MB)")
    try:
        hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=path,
            local_dir=".",
            # Anything still on disk at this point has the wrong size or hash,
            # so never let the hub client reuse it as "already up to date".
            force_download=on_disk,
        )
    except Exception as exc:  # noqa: BLE001
        print(f"  [FAIL] download error for {path}: {exc}")
        failures.append(path)
        continue

    actual = sha256(path)
    if actual != digest:
        print(f"  [FAIL] checksum mismatch for {path}")
        print(f"         expected {digest}")
        print(f"         actual   {actual}")
        discard(path)
        failures.append(path)
        continue
    downloaded += 1

print()
print(f"Downloaded: {downloaded}, skipped: {skipped}, failed: {len(failures)}")
if failures:
    print("Failed files:")
    for p in failures:
        print(f"  - {p}")
    sys.exit(1)
print("All datasets present and verified.")
PY

echo "Done."
diff --git a/query_DEPS_DEV_V1/query_dataset/package_query.db b/query_DEPS_DEV_V1/query_dataset/package_query.db deleted file mode 100644 index f9e9687f01..0000000000 --- a/query_DEPS_DEV_V1/query_dataset/package_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6676259f784aa89aec838a3e4a74bc68793085e49520c819b2356205a6eaf68a -size 538877952 diff --git a/query_DEPS_DEV_V1/query_dataset/project_query.db b/query_DEPS_DEV_V1/query_dataset/project_query.db deleted file mode 100644 index 3c84d37dbc..0000000000 --- a/query_DEPS_DEV_V1/query_dataset/project_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e6b171594cbb8dbb5298b54206b235a396b0eee9dd8c2c68deefab8d76c72e6a -size 11022336 diff --git a/query_GITHUB_REPOS/query_dataset/repo_artifacts.db b/query_GITHUB_REPOS/query_dataset/repo_artifacts.db deleted file mode 100644 index 4aae5f3669..0000000000 --- a/query_GITHUB_REPOS/query_dataset/repo_artifacts.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d514c66eb07403f79b8da2ff6a79a6c82f2d862cadaf71d536ea58644cd07bbc -size 361246720 diff --git a/query_GITHUB_REPOS/query_dataset/repo_metadata.db b/query_GITHUB_REPOS/query_dataset/repo_metadata.db deleted file mode 100644 index ed90a5d148..0000000000 --- a/query_GITHUB_REPOS/query_dataset/repo_metadata.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e9b71beb5d7e2b344f996baf19fb34231c24ab354019d65bd14a1cbfc62c67a1 -size 567222272 diff --git a/query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql b/query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql deleted file mode 100644 index 22d76edc93..0000000000 --- a/query_PANCANCER_ATLAS/query_dataset/pancancer_clinical.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce57356f7a0f72e1c43a1fc2083d46293991645cbce2f6f237c731bf4035fdd6 -size 7555304 diff --git 
a/query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db b/query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db deleted file mode 100644 index 52566a51f3..0000000000 --- a/query_PANCANCER_ATLAS/query_dataset/pancancer_molecular.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e1131c386014ea8bdd528d2fd040a85f6891dd9e46e8dc18947f7a712cf8610 -size 293875712 diff --git a/query_PATENTS/query_dataset/patent_CPCDefinition.sql b/query_PATENTS/query_dataset/patent_CPCDefinition.sql deleted file mode 100644 index 783f4ed10c..0000000000 --- a/query_PATENTS/query_dataset/patent_CPCDefinition.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f888382228bf87538744194ca2ea1be126c316423d6d10e35e7d4ec226da0144 -size 135238448 diff --git a/query_agnews/query_dataset/metadata.db b/query_agnews/query_dataset/metadata.db deleted file mode 100644 index 6fd2c7ee41..0000000000 --- a/query_agnews/query_dataset/metadata.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:30cfb96d29b4e49c5f6ae6e5cb2c754d98e118a183ddd57ae44ee3564aeb1955 -size 4046848 diff --git a/query_bookreview/query_dataset/books_info.sql b/query_bookreview/query_dataset/books_info.sql deleted file mode 100644 index e23108fb9f..0000000000 --- a/query_bookreview/query_dataset/books_info.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80acb3ece574f26b634107c9c553dc5ab0c95ca11f6b226c9602dff9465393d9 -size 649048 diff --git a/query_bookreview/query_dataset/review_query.db b/query_bookreview/query_dataset/review_query.db deleted file mode 100644 index eb08127223..0000000000 --- a/query_bookreview/query_dataset/review_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01f07a5894f058613374694a8f46d19341bf1fa316367494d019191e772b3e2c -size 1093632 diff --git 
a/query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson b/query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson deleted file mode 100644 index 8fa2c86af8..0000000000 --- a/query_civic_unstructured/query_dataset/civic_docs_dump/civic_db/civic_docs.bson +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:57fd5ff32b81478b696b6af7583dfd64744c01b3ed6bf112b3f8ee00c2e45bce -size 237654 diff --git a/query_civic_unstructured/query_dataset/funding.db b/query_civic_unstructured/query_dataset/funding.db deleted file mode 100644 index 376302afc8..0000000000 --- a/query_civic_unstructured/query_dataset/funding.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e2e209f30c79917882db69ef379e0d4f58be23e7117e6caf14c31ebdbbfefc3 -size 106496 diff --git a/query_crmarenapro/query_dataset/activities.duckdb b/query_crmarenapro/query_dataset/activities.duckdb deleted file mode 100644 index bf41ecd1c8..0000000000 --- a/query_crmarenapro/query_dataset/activities.duckdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3299b9214885f00c71c9af77d89b62ad73f3e545e3cf5baa4861d1265151c60 -size 21245952 diff --git a/query_crmarenapro/query_dataset/core_crm.db b/query_crmarenapro/query_dataset/core_crm.db deleted file mode 100644 index 41f2e706ea..0000000000 --- a/query_crmarenapro/query_dataset/core_crm.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa595797e41db401136a0a68bf14bac44707f9f86ed235f61d9f051728e50506 -size 188416 diff --git a/query_crmarenapro/query_dataset/products_orders.db b/query_crmarenapro/query_dataset/products_orders.db deleted file mode 100644 index 12ee710ea6..0000000000 --- a/query_crmarenapro/query_dataset/products_orders.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92ad52f2a624ee5e0cdf7cd4cc64ac60b1e3f7af2c672be27f651f80af4a6bdb 
-size 143360 diff --git a/query_crmarenapro/query_dataset/sales_pipeline.duckdb b/query_crmarenapro/query_dataset/sales_pipeline.duckdb deleted file mode 100644 index 37e440d988..0000000000 --- a/query_crmarenapro/query_dataset/sales_pipeline.duckdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b44cc3754820bff20c3f25e50cf09c6be401f86e69df6b9d3ee7a4cf64ac5af -size 2371584 diff --git a/query_crmarenapro/query_dataset/support.sql b/query_crmarenapro/query_dataset/support.sql deleted file mode 100644 index 985eb90c1b..0000000000 --- a/query_crmarenapro/query_dataset/support.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5248cd64cddf4936027859c5d6f74b2d32fbeb4e1939d784b2ff26d45d83950c -size 8860941 diff --git a/query_crmarenapro/query_dataset/territory.db b/query_crmarenapro/query_dataset/territory.db deleted file mode 100644 index e918dedbf7..0000000000 --- a/query_crmarenapro/query_dataset/territory.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a9650d4ba3c0a3690b936d294feb31a3227bf6f2ebe886f74cf12e14936366a -size 24576 diff --git a/query_cve/query_dataset/kev.sql b/query_cve/query_dataset/kev.sql deleted file mode 100644 index bbb918d7f8..0000000000 --- a/query_cve/query_dataset/kev.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76f15a345a8dd1006a5b21c198e52952846efa7dc5581727d5052137218a8bfe -size 1030796 diff --git a/query_cve/query_dataset/vulns.db b/query_cve/query_dataset/vulns.db deleted file mode 100644 index 0722132dff..0000000000 --- a/query_cve/query_dataset/vulns.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e76efe1a177354afcc0929610eabbcf088f29af7bc1638d1def78085f4ca601b -size 20504576 diff --git a/query_googlelocal/query_dataset/business_description.sql b/query_googlelocal/query_dataset/business_description.sql deleted file mode 100644 index 
da7c6764ed..0000000000 --- a/query_googlelocal/query_dataset/business_description.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cac36db1c60ab1fcbc661cc7df46c9980db1c2fcdaedb65b57f7a2618339ddd4 -size 38204 diff --git a/query_googlelocal/query_dataset/review_query.db b/query_googlelocal/query_dataset/review_query.db deleted file mode 100644 index d269486c27..0000000000 --- a/query_googlelocal/query_dataset/review_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e9ef561d4edf62ff49bf874253935caf8d404a79c60d21b9b0fc40ee041692c -size 585728 diff --git a/query_imdb/query_dataset/movies.sql b/query_imdb/query_dataset/movies.sql deleted file mode 100644 index 7be2bed72f..0000000000 --- a/query_imdb/query_dataset/movies.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9ef5452628ec8d07fbdc283fdb420168bd97b718dda25648087f2fe178bb1daa -size 1622189538 diff --git a/query_imdb/query_dataset/people.sqlite b/query_imdb/query_dataset/people.sqlite deleted file mode 100644 index 4d1eff0c01..0000000000 --- a/query_imdb/query_dataset/people.sqlite +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba583402740f2f5ef42b5be7ac1dd4c47a0cac990ddf7332575e203b976c1b10 -size 2639507456 diff --git a/query_krama/query_dataset/domain_assets.db b/query_krama/query_dataset/domain_assets.db deleted file mode 100644 index d5695973db..0000000000 --- a/query_krama/query_dataset/domain_assets.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d657f2253337b19694e7785b7d7ccf4398df95fe78012fc3b9e803715d15a3d1 -size 160571392 diff --git a/query_krama/query_dataset/domain_docs/domain_docs_db/files.bson b/query_krama/query_dataset/domain_docs/domain_docs_db/files.bson deleted file mode 100644 index dd7edfc7b9..0000000000 --- a/query_krama/query_dataset/domain_docs/domain_docs_db/files.bson +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3bab163bfef5761e4e1407943eed4f9b2b1af0f6c14dd08931980f5df3dbd376 -size 528766506 diff --git a/query_krama/query_dataset/us_geo.db b/query_krama/query_dataset/us_geo.db deleted file mode 100644 index d55f05c848..0000000000 --- a/query_krama/query_dataset/us_geo.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf442d08de0ffc396d8b750597dba7d0df6659fac44af98ec53b73ae7a31514c -size 28504064 diff --git a/query_music_brainz_20k/query_dataset/tracks.db b/query_music_brainz_20k/query_dataset/tracks.db deleted file mode 100644 index 6951bf4f93..0000000000 --- a/query_music_brainz_20k/query_dataset/tracks.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c349f70e5bafcba1deb605b49e224abfa95c5a83a44688faa511f9e123a02cc -size 2035712 diff --git a/query_stockindex/query_dataset/indexInfo_query.db b/query_stockindex/query_dataset/indexInfo_query.db deleted file mode 100644 index 6ee5a19f66..0000000000 --- a/query_stockindex/query_dataset/indexInfo_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:976511f3eb213269c81b286dd4517e4948b3d3bb776e221559943f0439916b55 -size 8192 diff --git a/query_stockindex/query_dataset/indextrade_query.db b/query_stockindex/query_dataset/indextrade_query.db deleted file mode 100644 index d02869481c..0000000000 --- a/query_stockindex/query_dataset/indextrade_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12e9cf350be2a6396f6be9f0c2bb057013859956bc8269952dc7a85008973bb1 -size 4468736 diff --git a/query_stockmarket/query_dataset/stockinfo_query.db b/query_stockmarket/query_dataset/stockinfo_query.db deleted file mode 100644 index 6d10fbfc98..0000000000 --- a/query_stockmarket/query_dataset/stockinfo_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:b8cf6b5627c92e17c10e27779afa8aa77c659e44bd4ccc14b10f1116053bc6a2 -size 737280 diff --git a/query_stockmarket/query_dataset/stocktrade_query.db b/query_stockmarket/query_dataset/stocktrade_query.db deleted file mode 100644 index cae4d1e2e9..0000000000 --- a/query_stockmarket/query_dataset/stocktrade_query.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e9cbf3c4d05d2eeb919c5b207374aefd19aa02e40dca49f2688618beb180350d -size 964964352 diff --git a/query_usaspending/query_dataset/contracts.sql b/query_usaspending/query_dataset/contracts.sql deleted file mode 100644 index 1842f14d7d..0000000000 --- a/query_usaspending/query_dataset/contracts.sql +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7fdde155670e23568458570bdb17171b36f4789b49c95dde5461b03ca951a861 -size 4840083 diff --git a/query_usaspending/query_dataset/recipients.db b/query_usaspending/query_dataset/recipients.db deleted file mode 100644 index b1ec91c20c..0000000000 --- a/query_usaspending/query_dataset/recipients.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:008916d8764ba11e759be1499a78c0c090da2fa8b72ff409ed69623c2f1733b5 -size 225280 diff --git a/query_yelp/query_dataset/yelp_user.db b/query_yelp/query_dataset/yelp_user.db deleted file mode 100644 index fe38b021a1..0000000000 --- a/query_yelp/query_dataset/yelp_user.db +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:155ecf989e62e0d087ed9d4586893e6825ab56504bc5e62ad46d454067f9b730 -size 3420160 diff --git a/upload_datasets_to_hf.py b/upload_datasets_to_hf.py new file mode 100755 index 0000000000..19d6348e85 --- /dev/null +++ b/upload_datasets_to_hf.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +"""Upload all DataAgentBench dataset files to the Hugging Face Hub. 
# Usage notes for upload_datasets_to_hf.py
#
# This mirrors every file listed in ``dataset_manifest.tsv`` to a Hugging Face
# *dataset* repo, preserving the relative paths so that ``download.sh`` can pull
# them straight back into place.
#
# Run once (per data refresh) from the repo root:
#
#     export HF_TOKEN=hf_xxx   # token with write access to the org repo
#     python upload_datasets_to_hf.py --create
#
# Re-running is safe: by default the HF upload de-duplicates unchanged files.
#
# Requires: pip install "huggingface_hub>=0.23"
import argparse
import hashlib
import os
import sys

DEFAULT_REPO = "ruiyingm/DataAgentBench-data"
DEFAULT_MANIFEST = "dataset_manifest.tsv"


def load_manifest(path):
    """Parse the TSV manifest at *path*.

    Each non-blank, non-``#`` line must hold three tab-separated fields:
    relative path, sha256 hex digest, and size in bytes.

    Returns:
        A list of ``(rel_path, digest, size)`` tuples with ``size`` as ``int``.

    Raises:
        ValueError: if a line does not have exactly three fields or its size
            is not an integer; the message names the file and line number.
    """
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, raw in enumerate(f, 1):
            line = raw.rstrip("\n")
            if not line or line.startswith("#"):
                continue
            try:
                rel, digest, size = line.split("\t")
                entries.append((rel, digest, int(size)))
            except ValueError as exc:
                raise ValueError(
                    f"{path}:{lineno}: expected '<path>\\t<sha256>\\t<size>'"
                ) from exc
    return entries


def file_sha256(path):
    """Return the hex sha256 digest of the file at *path* (1 MiB chunks)."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def partition_entries(entries, verify=True):
    """Split manifest entries by the state of the corresponding local file.

    Args:
        entries: ``(rel_path, digest, size)`` tuples from ``load_manifest``.
        verify: when true, a present file must also match the manifest size
            and sha256; the size is compared first so an obviously wrong file
            is rejected without hashing it.

    Returns:
        ``(ready, missing, mismatched)`` where ``ready`` is the list of entries
        safe to upload and the other two are lists of relative paths.
    """
    ready, missing, mismatched = [], [], []
    for entry in entries:
        rel, digest, size = entry
        if not os.path.isfile(rel):
            missing.append(rel)
        elif verify and (os.path.getsize(rel) != size
                         or file_sha256(rel) != digest):
            mismatched.append(rel)
        else:
            ready.append(entry)
    return ready, missing, mismatched


def main():
    """Command-line entry point: upload every manifest file to the HF repo."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--repo-id", default=DEFAULT_REPO)
    ap.add_argument("--manifest", default=DEFAULT_MANIFEST)
    ap.add_argument("--token", default=os.environ.get("HF_TOKEN"))
    ap.add_argument("--create", action="store_true",
                    help="create the dataset repo if it does not exist")
    ap.add_argument("--private", action="store_true",
                    help="create the repo as private (default: public)")
    ap.add_argument("--skip-verify", action="store_true",
                    help="do not check local files against the manifest "
                         "size/sha256 before uploading")
    args = ap.parse_args()

    if not args.token:
        sys.exit("ERROR: provide a write token via --token or $HF_TOKEN")

    # Imported here so --help and argument errors work without the dependency
    # and a missing install produces an actionable message.
    try:
        from huggingface_hub import HfApi, create_repo
    except ImportError:
        sys.exit('ERROR: huggingface_hub is required: '
                 'pip install "huggingface_hub>=0.23"')

    entries, missing, mismatched = partition_entries(
        load_manifest(args.manifest), verify=not args.skip_verify)

    if missing:
        print("WARNING: these manifest files are missing locally and will be "
              "skipped:")
        for m in missing:
            print(f"  - {m}")

    # download.sh verifies against the manifest, so publishing a file that
    # disagrees with it would make every later download fail its checksum.
    if mismatched:
        print("ERROR: these local files do not match the manifest "
              "size/sha256:", file=sys.stderr)
        for m in mismatched:
            print(f"  - {m}", file=sys.stderr)
        sys.exit("Regenerate the manifest or pass --skip-verify to upload "
                 "anyway.")

    api = HfApi(token=args.token)

    if args.create:
        create_repo(args.repo_id, repo_type="dataset", token=args.token,
                    private=args.private, exist_ok=True)
        print(f"Repo ready: https://huggingface.co/datasets/{args.repo_id}")

    total = len(entries)
    for i, (rel, _digest, size) in enumerate(entries, 1):
        print(f"[{i}/{total}] uploading {rel} ({size / 1e6:.1f} MB) ...")
        api.upload_file(
            path_or_fileobj=rel,
            path_in_repo=rel,
            repo_id=args.repo_id,
            repo_type="dataset",
            commit_message=f"Add {rel}",
        )

    print(f"\nDone. Uploaded {total} files to {args.repo_id}.")


if __name__ == "__main__":
    main()