From 81bdd280193a52525f99062dca8d1ab99977902c Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 16:52:00 -0500 Subject: [PATCH 1/9] fix relative link to installation section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index abc3e94..b2e2e94 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ - [Overview](#-overview) - [Project Structure](#-project-structure) - [Pipeline Components](#-pipeline-components) -- [Installation](#-installation) +- [Installation](#%EF%B8%8F-installation) - [Usage](#-usage) - [1. Individual Beetle Extraction](#1-individual-beetle-extraction) - [2. Zero-Shot Object Detection](#2-zero-shot-object-detection) From 0e03654c5801eda4c4f90e7414ca4fd7f9bf9c2d Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 16:53:04 -0500 Subject: [PATCH 2/9] Set for release tomorrow --- CITATION.cff | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index d3b2f05..59f564a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -14,7 +14,6 @@ authors: affiliation: "Virginia Tech" - family-names: "East" given-names: "Alyson" - email: "sydne.record@maine.edu" affiliation: "The University of Maine" - family-names: "Campolongo" given-names: "Elizabeth G." @@ -59,6 +58,6 @@ keywords: - ground-beetles license: MIT version: "1.0.0" -date-released: "2025-11-XX" # Update before release! +date-released: "2025-12-17" # Update before release! 
#doi: Add version agnostic DOI on release type: software From 120296ae7e21fe61d7148de83a427fb3903be0f2 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 17:19:52 -0500 Subject: [PATCH 3/9] Set base directories at top of file for easier reuse --- scripts/resizing_individual_beetle_images.py | 32 ++++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/resizing_individual_beetle_images.py b/scripts/resizing_individual_beetle_images.py index e1bc0d8..9fa0e7a 100644 --- a/scripts/resizing_individual_beetle_images.py +++ b/scripts/resizing_individual_beetle_images.py @@ -6,25 +6,26 @@ from PIL import Image import numpy as np +# Set Base Directories +BASE_DIR = "2018-NEON-beetles" +ORIGINAL_GROUP_IMAGES_DIR = os.path.join(BASE_DIR, "group_images") +PROCESS_DIR = os.path.join(BASE_DIR, "processed_images") + def calculate_uniform_scaling_factors(): """ Calculate uniform scaling factors between original group images and BeetlePalooza resized images. 
Returns a dictionary mapping picture_id -> uniform_scale_factor """ - # Paths - original_group_images_dir = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/2018-NEON-beetles/group_images" - resized_group_images_dir = "/fs/ess/PAS2136/paloozas/BeetlePalooza-2024/Resized Images [Corrected from ISA]" - scaling_factors = {} print("Step 1: Calculating uniform scaling factors between original and resized group images...") # Get list of resized images - if not os.path.exists(resized_group_images_dir): - print(f"Error: Directory {resized_group_images_dir} does not exist") + if not os.path.exists(PROCESS_DIR): + print(f"Error: Directory {PROCESS_DIR} does not exist") return {} - resized_files = [f for f in os.listdir(resized_group_images_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))] + resized_files = [f for f in os.listdir(PROCESS_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png'))] print(f"Found {len(resized_files)} resized images") processed = 0 @@ -35,7 +36,7 @@ def calculate_uniform_scaling_factors(): # Find corresponding original image original_path = None for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']: - potential_path = os.path.join(original_group_images_dir, picture_id + ext) + potential_path = os.path.join(ORIGINAL_GROUP_IMAGES_DIR, picture_id + ext) if os.path.exists(potential_path): original_path = potential_path break @@ -45,8 +46,8 @@ def calculate_uniform_scaling_factors(): continue # Load both images and get dimensions - resized_path = os.path.join(resized_group_images_dir, resized_filename) - + resized_path = os.path.join(PROCESS_DIR, resized_filename) + try: with Image.open(original_path) as orig_img: orig_width, orig_height = orig_img.size @@ -82,7 +83,7 @@ def calculate_uniform_scaling_factors(): print(f" Std: {np.std(scale_values):.3f}") # Save scaling factors to JSON file for reference - output_json = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/beetlepalooza_processing_scripts/uniform_scaling_factors.json" + output_json = 
os.path.join(PROCESS_DIR, "uniform_scaling_factors.json") with open(output_json, 'w') as f: json.dump(scaling_factors, f, indent=2, sort_keys=True) print(f"Uniform scaling factors saved to: {output_json}") @@ -130,10 +131,9 @@ def resize_individual_images_uniform(): print("\nStep 2: Resizing individual specimen images using uniform scaling...") # Paths - csv_file = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/2018-NEON-beetles/individual_specimens.csv" - individual_images_base = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/2018-NEON-beetles" - output_dir = "/fs/ess/PAS2136/Hawaii-2025/beetles_intake/beetlepalooza_processing_scripts/individual_images_resized_uniform" - + csv_file = os.path.join(BASE_DIR, "individual_specimens.csv") + output_dir = os.path.join(PROCESS_DIR, "individual_images_resized_uniform") + # Create output directory os.makedirs(output_dir, exist_ok=True) @@ -163,7 +163,7 @@ def resize_individual_images_uniform(): continue # Full path to individual image - individual_path_full = os.path.join(individual_images_base, individual_path_rel) + individual_path_full = os.path.join(BASE_DIR, individual_path_rel) if not os.path.exists(individual_path_full): skipped += 1 From 7ac40600b57ff289d65ece848ef7466f1c05dbe5 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 17:20:55 -0500 Subject: [PATCH 4/9] Make explicit directory setting need --- scripts/resizing_individual_beetle_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/resizing_individual_beetle_images.py b/scripts/resizing_individual_beetle_images.py index 9fa0e7a..9761969 100644 --- a/scripts/resizing_individual_beetle_images.py +++ b/scripts/resizing_individual_beetle_images.py @@ -7,7 +7,7 @@ import numpy as np # Set Base Directories -BASE_DIR = "2018-NEON-beetles" +BASE_DIR = "path/to/2018-NEON-beetles" ORIGINAL_GROUP_IMAGES_DIR = os.path.join(BASE_DIR, "group_images") PROCESS_DIR = os.path.join(BASE_DIR, "processed_images") From 
6dd7e04df715da1bc07831f2db9c70ce00466f6e Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 17:38:51 -0500 Subject: [PATCH 5/9] restructure presentation to remove redundancy --- README.md | 177 ++++++++++++++++++++++++------------------------------ 1 file changed, 77 insertions(+), 100 deletions(-) diff --git a/README.md b/README.md index b2e2e94..394bb40 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,14 @@ - [Overview](#-overview) - [Project Structure](#-project-structure) - [Pipeline Components](#-pipeline-components) + - [1. Image Annotation and Extraction](#1-image-annotation-and-extraction) + - [2. Traditional Bounding Box Cropping (Individual Beetle Extraction)](#2-traditional-bounding-box-cropping) + - [3. Image Resizing with Uniform Scaling](#3-image-resizing-with-uniform-scaling) + - [4. Zero-Shot Object Detection](#4-zero-shot-object-detection) + - [5. Quality Control and Validation](#5-quality-control-and-validation) + - [6. NEON Data Analysis and Visualization](#6-neon-data-analysis-and-visualization) + - [7. Dataset Upload to Hugging Face](#7-dataset-upload-to-hugging-face) - [Installation](#%EF%B8%8F-installation) -- [Usage](#-usage) - - [1. Individual Beetle Extraction](#1-individual-beetle-extraction) - - [2. Zero-Shot Object Detection](#2-zero-shot-object-detection) - - [3. Quality Control and Validation](#3-quality-control-and-validation) - - [4. Data Visualization](#4-data-visualization) - [Data Sources](#-data-sources) - [Citation](#-citation) - [Acknowledgements](#acknowledgments) @@ -62,6 +64,8 @@ carabidae_beetle_processing/ ## 🔬 Pipeline Components +The pipeline and usage instructions are provided below. Please be sure to set up your coding environments appropriately for the needed portion of the pipeline (see [Installation](#%EF%B8%8F-installation) for detailed guidance). + ### 1. 
**Image Annotation and Extraction** **File:** `2018_neon_beetles_bbox.xml` @@ -70,7 +74,6 @@ CVAT (Computer Vision Annotation Tool) annotations containing: - 577 annotated images - Bounding box coordinates for individual beetles in group images - Image dimensions (5568 × 3712 pixels) -- Created: April 2025 **Format:** ```xml @@ -86,42 +89,32 @@ CVAT (Computer Vision Annotation Tool) annotations containing: Extracts individual beetle specimens from group images using CVAT XML bounding box annotations. Parses coordinates, crops specimens with optional padding, and saves as numbered PNG files with progress tracking. +#### Usage Instructions + +Extract individual beetles from group images using CVAT annotations: + +```bash +python scripts/2018_neon_beetles_get_individual_images.py \ + --xml_file annotations/2018_neon_beetles_bbox.xml \ + --images_dir /path/to/group_images/ \ + --output_dir /path/to/individual_beetles/ \ + --padding 0 +``` + +Outputs individual beetle images named `{original_name}_specimen_{N}.png`. + ### 3. **Image Resizing with Uniform Scaling** **Script:** `resizing_individual_beetle_images.py` -Aligns individual beetle crops with BeetlePalooza's Zooniverse-processed group images by applying uniform scaling factors. This enables accurate transfer of citizen science measurements from resized group images to individual specimens. +Aligns individual beetle crops with the 2018-NEON-Beetles Zooniverse-processed group images by applying uniform scaling factors. This enables accurate transfer of citizen science measurements from resized group images to individual specimens. Set proper base directories at the top of the script before use. **Workflow:** 1. Calculate uniform scaling factors (average of x and y) between original and resized group images 2. Apply scaling to all individual specimen images 3. Save scaling metadata and processing statistics to JSON -### 4. 
**Dataset Upload to Hugging Face** - -**Script:** `upload_dataset_to_hf.py` - -Utility script for uploading processed beetle datasets to Hugging Face Hub for public access and reproducibility. - -**Usage:** -```bash -export HF_TOKEN="your_hugging_face_token" - -python upload_dataset_to_hf.py \ - --folder_path /path/to/local/images \ - --repo_id imageomics/dataset-name \ - --path_in_repo images \ - --branch main -``` - -**Parameters:** -- `--folder_path`: Local directory containing files to upload -- `--repo_id`: Hugging Face repository identifier (org/repo-name) -- `--path_in_repo`: Subdirectory within the repository (default: "images") -- `--repo_type`: Repository type - "dataset" or "model" (default: "dataset") -- `--branch`: Target branch name (default: "main") - -### 5. **Zero-Shot Object Detection** +### 4. **Zero-Shot Object Detection** **Script:** `beetle_detection.py` | **Notebook:** `grounding_dino.ipynb` @@ -141,43 +134,87 @@ Optional parameters: `--model_id` (default: `IDEA-Research/grounding-dino-base`) The pipeline detects beetles using text prompts, filters by adaptive area thresholds, validates measurement points, applies NMS to remove duplicates, and selects optimal bounding boxes before saving crops and metadata. -### 6. **Inter-Annotator Agreement** +### 5. Quality Control and Validation + +#### Inter-Annotator Agreement **Script:** `inter_annotator.py` Quantifies measurement consistency between human annotators using three pairwise comparisons. Computes RMSE (measurement disagreement), R² (correlation strength), and average bias (systematic tendencies). Generates `InterAnnotatorAgreement.pdf` with scatter plots and console metrics report. -### 7. **Human vs. Automated System Validation** +```bash +python scripts/inter_annotator.py +``` + +Edit `DATA_PATH` and `ANNOTATOR_PAIRS` in the script to configure input data and comparisons. Outputs `InterAnnotatorAgreement.pdf` and console metrics. + +#### Human vs. 
Automated System **Script:** `calipers_vs_toras.py` Validates automated TORAS measurements against human caliper measurements (gold standard). Compares three annotators individually and averaged against the automated system using RMSE, R², and bias metrics. Generates `CalipersVsToras.pdf` with comparison plots. +```bash +python scripts/calipers_vs_toras.py +``` + +Edit configuration variables in the script for data paths and comparison pairs. Generates `CalipersVsToras.pdf` with validation metrics. -### 8. **NEON Data Analysis and Visualization** +### 6. **NEON Data Analysis and Visualization** **Script:** `Figure6and10.R` Analyzes NEON beetle data from PUUM site (Pu'u Maka'ala Natural Area Reserve, Hawaii) integrated with BeetlePalooza citizen science measurements. Retrieves data via NEON API, merges taxonomic identifications with morphometric measurements, and generates species abundance visualizations. Produces `BeetlePUUM_abundance.png` showing imaging status and merged analysis dataset. +Run R script for NEON data analysis: + +```bash +Rscript scripts/Figure6and10.R +``` + +Requires NEON API token saved in `NEON_Token.txt` (see [NEON token instructions](#neon-api-token)) and BeetlePalooza metadata (2018-NEON-Beetles `individual_metadata.csv`). Edit paths in script as needed. Produces `BeetlePUUM_abundance.png` showing species distributions. + **Requirements:** R packages: `ggplot2`, `dplyr`, `ggpubr`, `neonUtilities` +### 7. **Dataset Upload to Hugging Face** + +**Script:** `upload_dataset_to_hf.py` + +Utility script used to upload the processed beetle datasets to Hugging Face Hub for public access and reproducibility. 
+ +**Usage:** +```bash +export HF_TOKEN="your_hugging_face_token" + +python upload_dataset_to_hf.py \ + --folder_path /path/to/local/images \ + --repo_id imageomics/dataset-name \ + --path_in_repo images \ + --branch main +``` + +**Parameters:** +- `--folder_path`: Local directory containing files to upload +- `--repo_id`: Hugging Face repository identifier (org/repo-name) +- `--path_in_repo`: Subdirectory within the repository (default: "images") +- `--repo_type`: Repository type - "dataset" or "model" (default: "dataset") +- `--branch`: Target branch name (default: "main") + --- ## 🛠️ Installation ### Prerequisites -- **Python 3.10+** (for Python scripts and notebooks) -- **R 4.0+** (for R scripts) -- **Git** (for version control) +- **Python 3.10+** +- **R 4.0+** - **CUDA-capable GPU** (recommended for Grounding DINO, but not required) ### Python Setup 1. **Clone the repository:** ```bash - git clone https://github.com/mridulk97/carabidae_beetle_processing.git + git clone git@github.com:Imageomics/carabidae_beetle_processing.git cd carabidae_beetle_processing ``` @@ -212,71 +249,11 @@ For R script (`Figure6and10.R`): --- -## 🚀 Usage - -### 1. Individual Beetle Extraction - -Extract individual beetles from group images using CVAT annotations: - -```bash -python scripts/2018_neon_beetles_get_individual_images.py \ - --xml_file annotations/2018_neon_beetles_bbox.xml \ - --images_dir /path/to/group_images/ \ - --output_dir /path/to/individual_beetles/ \ - --padding 0 -``` - -Outputs individual beetle images named `{original_name}_specimen_{N}.png`. - -### 2. Zero-Shot Object Detection - -Run automated beetle detection: - -```bash -python scripts/beetle_detection.py \ - --csv_path data/metadata.csv \ - --image_dir data/group_images \ - --save_folder data/individual_images \ - --output_csv data/processed.csv -``` - -Optional parameters include `--model_id`, `--text` (detection prompt), `--box_threshold`, `--text_threshold`, `--iou_threshold`, and `--padding`. 
See Pipeline Components section for parameter details. - -### 3. Quality Control and Validation - -#### Inter-Annotator Agreement - -```bash -python scripts/inter_annotator.py -``` - -Edit `DATA_PATH` and `ANNOTATOR_PAIRS` in the script to configure input data and comparisons. Outputs `InterAnnotatorAgreement.pdf` and console metrics. - -#### Human vs. Automated System - -```bash -python scripts/calipers_vs_toras.py -``` - -Edit configuration variables in the script for data paths and comparison pairs. Generates `CalipersVsToras.pdf` with validation metrics. - -### 4. Data Visualization - -Run R script for NEON data analysis: - -```bash -Rscript scripts/Figure6and10.R -``` - -Requires NEON API token saved in `NEON_Token.txt` and BeetlePalooza metadata. Edit paths in script as needed. Produces `BeetlePUUM_abundance.png` showing species distributions. - ---- - ## 📊 Data Sources ### Hugging Face Datasets (Primary Access Point) -The processed datasets from this pipeline are available on Hugging Face: +The processed datasets from this pipeline are available on Hugging Face along with the original data: #### 1. Hawaii Beetles Dataset **Repository:** [imageomics/Hawaii-beetles](https://huggingface.co/datasets/imageomics/Hawaii-beetles) From 283b6821585f996815be98b00e8b676fde17ea84 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 17:40:03 -0500 Subject: [PATCH 6/9] Set citation to proper release month --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 394bb40..b5ea9b3 100644 --- a/README.md +++ b/README.md @@ -290,7 +290,7 @@ If you use this code or methodology, please cite both this repository and our pa @software{Rayeed_Carabidae_Beetle_Processing_2025, author = {Rayeed, S M and Khurana, Mridul and East, Alyson and Campolongo, Elizabeth G. 
and Stevens, Samuel and Wu, Jiaman and Taylor, Graham W.}, license = {MIT}, - month = nov, + month = dec, title = {{Carabidae Beetle Processing Pipeline}}, url = {https://github.com/Imageomics/carabidae_beetle_processing}, version = {1.0.0}, From 91b09ce9f302f064b96c2d0599d645f61b37fefd Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 17:41:36 -0500 Subject: [PATCH 7/9] Add Zenodo metadata file and its format test workflow --- .github/workflows/validate-zenodo.yaml | 23 +++++++++ .zenodo.json | 70 ++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 .github/workflows/validate-zenodo.yaml create mode 100644 .zenodo.json diff --git a/.github/workflows/validate-zenodo.yaml b/.github/workflows/validate-zenodo.yaml new file mode 100644 index 0000000..cb4ee12 --- /dev/null +++ b/.github/workflows/validate-zenodo.yaml @@ -0,0 +1,23 @@ +name: Check zenodo metadata + +on: + push: + paths: + - '.zenodo.json' + - '.github/workflows/validate-zenodo.yaml' + +jobs: + check-zenodo-metadata: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '22' + - name: Install dependencies + run: npm install zenodraft@0.14.1 + - name: Check .zenodo.json file + run: | + npx zenodraft metadata validate .zenodo.json diff --git a/.zenodo.json b/.zenodo.json new file mode 100644 index 0000000..ab76f7e --- /dev/null +++ b/.zenodo.json @@ -0,0 +1,70 @@ +{ + "creators": [ + { + "name": "Rayeed, S M", + "affiliation": "Rensselaer Polytechnic Institute" + }, + { + "name": "Khurana, Mridul", + "affiliation": "Virginia Tech" + }, + { + "name": "East, Alyson", + "affiliation": "The University of Maine" + }, + { + "name": "Campolongo, Elizabeth G.", + "affiliation": "The Ohio State University" + }, + { + "name": "Stevens, Samuel", + "affiliation": "The Ohio State University" + }, + { + "name": "Wu, Jiaman", + "affiliation": "The Ohio State University" + }, + { + "name": "Taylor, Graham W.", + 
"affiliation": "University of Guelph" + } + ], + "description": "Pipeline for processing, analyzing, and validating beetle specimen images and morphometric measurements from NEON (National Ecological Observatory Network) beetle specimens (specifically for the 2018 NEON Beetles and Hawaii Beetles datasets). The project focuses on Carabidae (ground beetles) and implements automated beetle detection and cropping, morphometric trait extraction, inter-annotator agreement analysis, human vs. automated system validation, and species distribution visualization.", + "keywords": [ + "imageomics", + "computer-vision", + "beetles", + "carabidae", + "morphometrics", + "neon", + "grounding-dino", + "zero-shot-detection", + "quality-control", + "biodiversity", + "ecology", + "animals", + "image", + "segmentation", + "species", + "elytra", + "basal pronotum", + "traits", + "annotation", + "measurements", + "pinned specimens", + "Hawaii", + "ground-beetles" + ], + "title": "Carabidae Beetle Processing Pipeline", + "version": "1.0.0", + "license": "MIT", + "publication_date": "2025-12-17", + "grants": [ + { + "id": "021nxhr62::2118240" + }, + { + "id": "021nxhr62::2330423" + } + ] +} From 37af7e0c85b2e20df4dc09ed58941bcea854aba1 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Tue, 16 Dec 2025 17:45:20 -0500 Subject: [PATCH 8/9] Remove extraneous comment --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 59f564a..3a3e9e0 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -58,6 +58,6 @@ keywords: - ground-beetles license: MIT version: "1.0.0" -date-released: "2025-12-17" # Update before release! 
+date-released: "2025-12-17" #doi: Add version agnostic DOI on release type: software From 530dc8507fb2f5d7ffbf0aa71bb03cd062516f3b Mon Sep 17 00:00:00 2001 From: Mridul Khurana Date: Thu, 18 Dec 2025 12:58:04 -0500 Subject: [PATCH 9/9] updated release date to 2025-12-18 in zenodo json and citation ciff --- .zenodo.json | 2 +- CITATION.cff | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index ab76f7e..547364e 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -58,7 +58,7 @@ "title": "Carabidae Beetle Processing Pipeline", "version": "1.0.0", "license": "MIT", - "publication_date": "2025-12-17", + "publication_date": "2025-12-18", "grants": [ { "id": "021nxhr62::2118240" diff --git a/CITATION.cff b/CITATION.cff index 3a3e9e0..da63406 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -58,6 +58,6 @@ keywords: - ground-beetles license: MIT version: "1.0.0" -date-released: "2025-12-17" +date-released: "2025-12-18" #doi: Add version agnostic DOI on release type: software