From efce749dac04d19443242892235501940c7951ba Mon Sep 17 00:00:00 2001 From: ying2212 Date: Tue, 12 May 2026 14:15:19 -0400 Subject: [PATCH 1/3] write SKILL to generate ingest source script --- .claude/skills/ingest-source/SKILL.md | 177 ++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 .claude/skills/ingest-source/SKILL.md diff --git a/.claude/skills/ingest-source/SKILL.md b/.claude/skills/ingest-source/SKILL.md new file mode 100644 index 0000000..a2568eb --- /dev/null +++ b/.claude/skills/ingest-source/SKILL.md @@ -0,0 +1,177 @@ +--- +name: ingest-source +description: Generate a Python ingest script that ingests sources (astronomical objects) into an AstroDB database from a parsed data table. Use this skill after match-schema has verified the source name, RA, Dec, and reference columns. Trigger when user says: "ingest sources", "ingest objects", "add new sources to database", or "ingest my data table". +compatibility: python, astropy, astrodb_utils, astroquery +--- + +# Ingest Sources Skill +Generate a Python script that ingests rows from a data table into the `Sources` table of an AstroDB SQLite database using `astrodb_utils.sources.ingest_source`. + +## Prerequisites +1. **A database file**: JSON data files with a `database.toml` settings file, following the astrodb-template-db structure. If the user doesn't have a database yet, run the `create-astrodb` skill first. +2. **Installed packages**: `astrodb_utils`, `astropy`, `astroquery` +3. **A data table**: CSV, ECSV, FITS, or other astropy-readable format with at minimum: a source name column and a discovery reference column +4. **Publications table populated**: every reference value must already exist in the `Publications` table. If not, tell user that the reference should be ingested first. +5. **Internet access (recommended)**: used to query SIMBAD for coordinates when RA/Dec are missing + +## Required Inputs +1. Path to the data table file (CSV, ECSV, FITS, etc.) +2. 
Path to `database.toml` (the project config file generated by `create-astrodb`) + +## Instrutions + +### Step 1: Parse the data table +Run the `parse-data-table` skill on the provided data file, then read the sidecar it produces: +```python +import json, os + +sidecar = "tmp/astrodb-parse-result.json" +meta = json.load(open(sidecar)) +file_path = meta["file_path"] # path to the data table +reader = meta["reader"] # "astropy" or "pandas" +fmt = meta["format"] # astropy format hint, or None +n_rows = meta["n_rows"] # number of rows +``` + +Load the table using the same reader that parse-data-table already verified: + +```python +from astropy.table import Table +import pandas as pd + +if reader == "astropy": + kwargs = {"format": fmt} if fmt else {} + data = Table.read(file_path, **kwargs) +else: + data = getattr(pd, meta["pandas_method"])(file_path) +``` + + +### Step 2: Confirm column mappings +Show the user the column names from the parsed result and ask them to identify: +- Which column is the **source name** (required) +- Which column is **RA** in decimal degrees (optional) +- Which column is **Dec** in decimal degrees (optional) +- Which column is the **discovery reference** (required - must exist in Publications table) +- Any optional columns: epoch, equinox, comment, other_reference + +Example prompt to user: +> The table has these columns: `source, ra_deg, dec_deg, reference, epoch` +> Which column is the source name? Which is the discovery reference? + + +### Step 3: Generate ingest_sources.py +Populate the script config with confirmed mappings and paths from the sidecar. +Write the script to `tmp/ingest_sources.py`. 
+ +### Step 4: Run the script +Execute `tmp/ingest_sources.py` with `SAVE_DB = False` (dry run) and report: +- How many sources were ingested successfully +- Any rows skipped with warnings +- That the database is still in preview mode + +### Step 5: Prompt to save +After a successful dry run (no errors or only expected warnings), ask the user: + +> Ingestion preview complete: X sources processed, Y added, Z skipped. +> Would you like to save these changes to the database? (Sets SAVE_DB = True and re-runs) + +Only re-run with `SAVE_DB = True` if the user explicitly confirms. Never save automatically. + +## Generated Script +Write `tmp/ingest_sources.py` with the confirmed values filled in: + +```python +import logging +from astropy.table import Table +import pandas as pd +from astrodb_utils import build_db_from_json +from astrodb_utils.sources import ingest_source + +# --- Logging --- +astrodb_utils_logger = logging.getLogger("astrodb_utils") +astrodb_utils_logger.setLevel(logging.INFO) +logger = logging.getLogger("astrodb_utils.ingest_sources") +logger.setLevel(logging.INFO) + +# --- Configuration --- +SAVE_DB = False # set True only after dry run confirms no errors + +# Load database — matches the structure created by the create-astrodb skill. +# SCHEMA_PATH must point to the cloned schema repo (base_path in build_db_from_json). 
+# If you don't have it yet, run: +# git clone https://github.com/astrodbtoolkit/astrodb-template-db.git tests/astrodb-template-db +SCHEMA_PATH = "tests/astrodb-template-db" # cloned schema repo +DB_NAME = "tests/astrodb-template-tests" # output .sqlite path (no extension) +SETTINGS_FILE = "database.toml" # matches data_path and felis_path + +db = build_db_from_json( + settings_file=SETTINGS_FILE, + base_path=SCHEMA_PATH, + db_name=DB_NAME, +) + +# --- Data table --- +# Filled from parse-data-table sidecar (tmp/astrodb-parse-result.json) +TABLE_PATH = "path/to/data_table.csv" # fill in +data = Table.read(TABLE_PATH) +logger.info(f"Loaded {len(data)} rows from {TABLE_PATH}") + +# --- Column mapping — filled from confirmed mappings in Step 2 --- +SOURCE_COL = "source" # required +RA_COL = "ra_deg" # set to None if not in table (SIMBAD fallback) +DEC_COL = "dec_deg" # set to None if not in table (SIMBAD fallback) +REFERENCE_COL = "reference" # required — must exist in Publications table + +# Optional columns — set to None if not present +EPOCH_COL = None +EQUINOX_COL = None +COMMENT_COL = None +OTHER_REF_COL = None + +# --- Ingest sources --- +# raise_error=True: stop on first error (good for dry runs and development) +# raise_error=False: skip bad rows and continue (good for bulk ingestion) +sources_added = 0 +for row in data: + try: + ingest_source( + db, + source=row[SOURCE_COL], + reference=row[REFERENCE_COL], + ra=row[RA_COL] if RA_COL else None, + dec=row[DEC_COL] if DEC_COL else None, + epoch=str(row[EPOCH_COL]) if EPOCH_COL else None, + equinox=str(row[EQUINOX_COL]) if EQUINOX_COL else None, + other_reference=str(row[OTHER_REF_COL]) if OTHER_REF_COL else None, + comment=str(row[COMMENT_COL]) if COMMENT_COL else None, + raise_error=True, + ) + sources_added += 1 + logger.info(f"Ingested: {row[SOURCE_COL]}") + except Exception as e: + logger.warning(f"Skipping {row[SOURCE_COL]}: {e}") + continue + +logger.info(f"Total sources ingested: {sources_added} / 
{len(data)}") + +# --- Save database --- +# Save path matches data_path in database.toml, consistent with create-astrodb skill. +if SAVE_DB: + db.save_database(directory="data/") + logger.info("Database saved to data/") +``` + +## Key Behaviors +1. **Missing RA/Dec**: if `RA_COL` or `DEC_COL` is `None`, `ingest_source` queries SIMBAD automatically. If SIMBAD has no match, that row is skipped with a warning. +2. **Duplicate sources**: if a source already exists, `ingest_source` adds the new name as an alternate in the `Names` table. With `raise_error=False`, duplicates are skipped with a warning. +3. **Missing reference**: `reference` must already be in `Publications` or ingestion fails. Remind the user to run `ingest_publication` first. +4. **Unicode dashes**: handled automatically by `ingest_source` (en dash, em dash, minus sign, figure dash → `-`). +5. **Column name defaults**: `ra_col_name="ra_deg"`, `dec_col_name="dec_deg"`, `epoch_col_name="epoch_year"` — these are the database column names, not the input table column names. + +## Output +Report: +1. Number of sources successfully ingested vs total rows +2. Any skipped rows with reasons from the WARNING logs +3. 
Confirm whether database was saved or is still in preview mode (`SAVE_DB = False`) + From bdfdae8ac73c7ba0ffc00d09e421d6b57d29ae22 Mon Sep 17 00:00:00 2001 From: ying2212 Date: Thu, 14 May 2026 13:43:42 -0400 Subject: [PATCH 2/3] restructure skill with cleaner step + instruction --- .claude/skills/astrodb-ingest-source/SKILL.md | 183 ++++++++++++++++++ .../references/ingest_source_api.md | 53 +++++ .claude/skills/ingest-source/SKILL.md | 177 ----------------- 3 files changed, 236 insertions(+), 177 deletions(-) create mode 100644 .claude/skills/astrodb-ingest-source/SKILL.md create mode 100644 .claude/skills/astrodb-ingest-source/references/ingest_source_api.md delete mode 100644 .claude/skills/ingest-source/SKILL.md diff --git a/.claude/skills/astrodb-ingest-source/SKILL.md b/.claude/skills/astrodb-ingest-source/SKILL.md new file mode 100644 index 0000000..4684a80 --- /dev/null +++ b/.claude/skills/astrodb-ingest-source/SKILL.md @@ -0,0 +1,183 @@ +--- +name: ingest-sources +description: "Generate and run a Python script that ingests sources (astronomical objects) into an AstroDB Sources table from a data table. Use this skill when the user says: ingest sources, ingest objects, add new sources to the database, add objects to SIMPLE, or provides a FITS/CSV/ECSV file and wants to populate the Sources table. Works standalone or as the step after match-schema." +compatibility: python, astropy, astrodb_utils, astroquery +--- + +# Ingest Sources Skill + +Generate and run a Python script that ingests rows from a data table into the `Sources` +table of an AstroDB SQLite database using `astrodb_utils.sources.ingest_source`. + +Read `references/ingest_source_api.md` before starting — it has the full signature, +parameter meanings, and common warnings with fixes. + +## Prerequisites + +1. **Database**: JSON data files + `database.toml` (astrodb-template-db layout). + If absent, run the `create-astrodb` skill first. +2. 
**Packages**: `astrodb_utils`, `astropy`, `astroquery` +3. **Data table**: FITS, CSV, ECSV, or any astropy-readable format, with at minimum + a source name column and a discovery reference column. +4. **Publications populated**: every reference value must already exist in `Publications`. + If not, tell the user to run `ingest_publication` first. +5. **Internet (recommended)**: used by `ingest_source` to query SIMBAD when RA/Dec + are not in the table. + +## Required Inputs +1. Path to the data table file (CSV, ECSV, FITS, etc.) +2. Path to `database.toml` — check in order: + 1. A path the user explicitly stated in the conversation + 2. `database.toml` in the current working directory (root of the project) + 3. If not found, ask the user for the path before continuing + +--- + +## Step 1: Load and inspect the data table + +```python +from astropy.table import Table +data = Table.read("path/to/file.fits") +# If auto-detect fails: Table.read(..., format="fits") +print(data.colnames) +print(data[:3]) +``` + +Show the user the **column names**, **dtypes**, and a **3-row preview**. + +--- + +## Step 2: Confirm column mappings + +Show the actual column names from Step 1 — **never assume defaults like `source` or +`ra_deg`**, since real catalogs use names like `Name`, `RA`, `Dec`, `object`, etc. + +Ask the user to confirm: + +| Role | Required? | Notes | +|------|-----------|-------| +| Source name | **Yes** | String column | +| Discovery reference | **Yes** | Must already exist in `Publications` | +| RA (decimal degrees) | No | If absent → SIMBAD fallback | +| Dec (decimal degrees) | No | If absent → SIMBAD fallback | +| Epoch | No | | +| Equinox | No | | +| Comment | No | | +| Other reference | No | | + +After confirmation, read the first value of the reference column — use it as `{REF}` +to name the output script (e.g. `Burg24`). + +Example prompt to user: +> The table has these columns: `Name, RA, Dec, Dist, Reference` +> Which column is the source name? 
Which is the discovery reference? + +--- + +## Step 3: Write `tmp/ingest_{REF}_sources.py` + +Fill in all values from Steps 1–2 and write the script to `tmp/ingest_{REF}_sources.py`. +Every variable below must contain a real value — never write placeholder text to the file. + +```python +import logging +from astropy.table import Table +from astrodb_utils import build_db_from_json +from astrodb_utils.sources import ingest_source + +logging.getLogger("astrodb_utils").setLevel(logging.INFO) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logging.basicConfig(format="%(levelname)s - %(message)s") + +SAVE_DB = False # set True only after a clean dry run + +# Adjust to match your project layout +SCHEMA_PATH = "tests/astrodb-template-db" +DB_NAME = "tests/astrodb-template-tests" +SETTINGS_FILE = "database.toml" + +db = build_db_from_json( + settings_file=SETTINGS_FILE, + base_path=SCHEMA_PATH, + db_name=DB_NAME, +) + +TABLE_PATH = "path/to/file.fits" # confirmed in Step 1 +SOURCE_COL = "Name" # confirmed in Step 2 — required +REFERENCE_COL = "Reference" # confirmed in Step 2 — required +RA_COL = "RA" # confirmed in Step 2 — set to None → SIMBAD fallback +DEC_COL = "Dec" # confirmed in Step 2 — set to None → SIMBAD fallback +EPOCH_COL = None # optional — set to column name if present +EQUINOX_COL = None # optional — set to column name if present +COMMENT_COL = None # optional — set to column name if present +OTHER_REF_COL = None # optional — set to column name if present + +data = Table.read(TABLE_PATH) +logger.info(f"Loaded {len(data)} rows from {TABLE_PATH}") + +sources_added = sources_skipped = 0 +for row in data: + source_name = str(row[SOURCE_COL]) + try: + ingest_source( + db, + source=source_name, + reference=str(row[REFERENCE_COL]), + ra=float(row[RA_COL]) if RA_COL else None, + dec=float(row[DEC_COL]) if DEC_COL else None, + epoch=str(row[EPOCH_COL]) if EPOCH_COL else None, + equinox=str(row[EQUINOX_COL]) if EQUINOX_COL else None, + 
other_reference=str(row[OTHER_REF_COL]) if OTHER_REF_COL else None, + comment=str(row[COMMENT_COL]) if COMMENT_COL else None, + raise_error=True, + ) + sources_added += 1 + logger.info(f"Ingested: {source_name}") + except Exception as e: + sources_skipped += 1 + logger.warning(f"Skipping {source_name}: {e}") + +logger.info(f"Done: {sources_added} ingested, {sources_skipped} skipped out of {len(data)} rows") + +if SAVE_DB: + db.save_database(directory="data/") + logger.info("Database saved to data/") +else: + logger.info("Dry run complete — NOT saved. Set SAVE_DB = True to persist.") +``` + +--- + +## Step 4: Run the script + +Run `tmp/ingest_{REF}_sources.py` with `SAVE_DB = False`. Report: + +- How many sources were ingested successfully +- Any rows skipped with their warning messages +- Confirmation that the database was **not** saved + +See `references/ingest_source_api.md` for the common warnings table and how to fix each one. + +--- + +## Step 5: Confirm and save + +After a successful dry run, ask the user: +> Ingestion preview complete: **X** ingested, **Y** skipped out of **Z** rows. +> Would you like to save these changes to the database? (Re-runs with `SAVE_DB = True`) + +**Never set `SAVE_DB = True` automatically** — only on explicit user confirmation. + +--- + +## Key Behaviors + +1. **Missing RA/Dec**: if `RA_COL = None`, `ingest_source` queries SIMBAD automatically. + If SIMBAD has no match, that row is skipped with a warning. +2. **Duplicate sources**: if a source already exists, `ingest_source` adds the new name + as an alternate in `Names` — it does not re-insert into `Sources`. +3. **Missing reference**: `reference` must already be in `Publications` or ingestion fails. + Remind the user to run `ingest_publication` first. +4. **Unicode dashes**: handled automatically by `ingest_source` + (en dash, em dash, minus sign, figure dash → `-`). 
\ No newline at end of file diff --git a/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md b/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md new file mode 100644 index 0000000..eeab2a7 --- /dev/null +++ b/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md @@ -0,0 +1,53 @@ +# astrodb_utils.sources API Reference + +Source of truth: https://github.com/astrodbtoolkit/astrodb_utils/blob/main/astrodb_utils/sources.py +Do NOT copy sources.py into this skill — always use the installed package. + +--- + +## ingest_source signature + +```python +from astrodb_utils.sources import ingest_source + +ingest_source( + db, # astrodbkit Database object (from build_db_from_json) + source, # str — source name + reference: str, # str — must exist in Publications table + *, + ra: float = None, # decimal degrees; None → SIMBAD lookup + dec: float = None, # decimal degrees; None → SIMBAD lookup + epoch: str = None, # e.g. "2000.0" + equinox: str = None, # e.g. "J2000" + other_reference: str = None, + comment: str = None, + raise_error: bool = True, # True = stop on error; False = warn and skip + search_db: bool = True, # True = check for duplicates before inserting + ra_col_name: str = "ra_deg", # column name in the DB Sources table for RA + dec_col_name: str = "dec_deg", # column name in the DB Sources table for Dec + epoch_col_name: str = "epoch_year", + use_simbad: bool = True, # query SIMBAD if RA/Dec missing or name unresolved +) +``` + +Returns None. Side effects: inserts into `Sources` and `Names`. +If source already exists: adds new name as alternate in `Names` only. + +--- + +## ra_col_name / dec_col_name + +These are column names **in the database Sources table** — not in the input data file. +Defaults (`ra_deg`, `dec_deg`) match astrodb-template-db. Only change if your DB schema differs. 
+ +--- + +## Common warnings and fixes + +| Warning | Cause | Fix | +|---------|-------|-----| +| `Discovery reference X missing or not in Publications` | Reference not in Publications | Run `ingest_publication` first | +| `Coordinates needed and could not be retrieved from SIMBAD` | No RA/Dec + SIMBAD can't resolve name | Provide RA/Dec columns, or check source name spelling | +| `More than one match for X` | Name resolves to multiple DB candidates | Investigate duplicates manually | +| `No internet connection, not using Simbad` | SIMBAD unreachable | Provide RA/Dec explicitly | +| `Coordinates do not match for X` | Provided RA/Dec >60 arcsec from DB entry | Check coordinate columns and units | \ No newline at end of file diff --git a/.claude/skills/ingest-source/SKILL.md b/.claude/skills/ingest-source/SKILL.md deleted file mode 100644 index a2568eb..0000000 --- a/.claude/skills/ingest-source/SKILL.md +++ /dev/null @@ -1,177 +0,0 @@ ---- -name: ingest-source -description: Generate a Python ingest script that ingests sources (astronomical objects) into an AstroDB database from a parsed data table. Use this skill after match-schema has verified the source name, RA, Dec, and reference columns. Trigger when user says: "ingest sources", "ingest objects", "add new sources to database", or "ingest my data table". -compatibility: python, astropy, astrodb_utils, astroquery ---- - -# Ingest Sources Skill -Generate a Python script that ingests rows from a data table into the `Sources` table of an AstroDB SQLite database using `astrodb_utils.sources.ingest_source`. - -## Prerequisites -1. **A database file**: JSON data files with a `database.toml` settings file, following the astrodb-template-db structure. If the user doesn't have a database yet, run the `create-astrodb` skill first. -2. **Installed packages**: `astrodb_utils`, `astropy`, `astroquery` -3. 
**A data table**: CSV, ECSV, FITS, or other astropy-readable format with at minimum: a source name column and a discovery reference column -4. **Publications table populated**: every reference value must already exist in the `Publications` table. If not, tell user that the reference should be ingested first. -5. **Internet access (recommended)**: used to query SIMBAD for coordinates when RA/Dec are missing - -## Required Inputs -1. Path to the data table file (CSV, ECSV, FITS, etc.) -2. Path to `database.toml` (the project config file generated by `create-astrodb`) - -## Instrutions - -### Step 1: Parse the data table -Run the `parse-data-table` skill on the provided data file, then read the sidecar it produces: -```python -import json, os - -sidecar = "tmp/astrodb-parse-result.json" -meta = json.load(open(sidecar)) -file_path = meta["file_path"] # path to the data table -reader = meta["reader"] # "astropy" or "pandas" -fmt = meta["format"] # astropy format hint, or None -n_rows = meta["n_rows"] # number of rows -``` - -Load the table using the same reader that parse-data-table already verified: - -```python -from astropy.table import Table -import pandas as pd - -if reader == "astropy": - kwargs = {"format": fmt} if fmt else {} - data = Table.read(file_path, **kwargs) -else: - data = getattr(pd, meta["pandas_method"])(file_path) -``` - - -### Step 2: Confirm column mappings -Show the user the column names from the parsed result and ask them to identify: -- Which column is the **source name** (required) -- Which column is **RA** in decimal degrees (optional) -- Which column is **Dec** in decimal degrees (optional) -- Which column is the **discovery reference** (required - must exist in Publications table) -- Any optional columns: epoch, equinox, comment, other_reference - -Example prompt to user: -> The table has these columns: `source, ra_deg, dec_deg, reference, epoch` -> Which column is the source name? Which is the discovery reference? 
- - -### Step 3: Generate ingest_sources.py -Populate the script config with confirmed mappings and paths from the sidecar. -Write the script to `tmp/ingest_sources.py`. - -### Step 4: Run the script -Execute `tmp/ingest_sources.py` with `SAVE_DB = False` (dry run) and report: -- How many sources were ingested successfully -- Any rows skipped with warnings -- That the database is still in preview mode - -### Step 5: Prompt to save -After a successful dry run (no errors or only expected warnings), ask the user: - -> Ingestion preview complete: X sources processed, Y added, Z skipped. -> Would you like to save these changes to the database? (Sets SAVE_DB = True and re-runs) - -Only re-run with `SAVE_DB = True` if the user explicitly confirms. Never save automatically. - -## Generated Script -Write `tmp/ingest_sources.py` with the confirmed values filled in: - -```python -import logging -from astropy.table import Table -import pandas as pd -from astrodb_utils import build_db_from_json -from astrodb_utils.sources import ingest_source - -# --- Logging --- -astrodb_utils_logger = logging.getLogger("astrodb_utils") -astrodb_utils_logger.setLevel(logging.INFO) -logger = logging.getLogger("astrodb_utils.ingest_sources") -logger.setLevel(logging.INFO) - -# --- Configuration --- -SAVE_DB = False # set True only after dry run confirms no errors - -# Load database — matches the structure created by the create-astrodb skill. -# SCHEMA_PATH must point to the cloned schema repo (base_path in build_db_from_json). 
-# If you don't have it yet, run: -# git clone https://github.com/astrodbtoolkit/astrodb-template-db.git tests/astrodb-template-db -SCHEMA_PATH = "tests/astrodb-template-db" # cloned schema repo -DB_NAME = "tests/astrodb-template-tests" # output .sqlite path (no extension) -SETTINGS_FILE = "database.toml" # matches data_path and felis_path - -db = build_db_from_json( - settings_file=SETTINGS_FILE, - base_path=SCHEMA_PATH, - db_name=DB_NAME, -) - -# --- Data table --- -# Filled from parse-data-table sidecar (tmp/astrodb-parse-result.json) -TABLE_PATH = "path/to/data_table.csv" # fill in -data = Table.read(TABLE_PATH) -logger.info(f"Loaded {len(data)} rows from {TABLE_PATH}") - -# --- Column mapping — filled from confirmed mappings in Step 2 --- -SOURCE_COL = "source" # required -RA_COL = "ra_deg" # set to None if not in table (SIMBAD fallback) -DEC_COL = "dec_deg" # set to None if not in table (SIMBAD fallback) -REFERENCE_COL = "reference" # required — must exist in Publications table - -# Optional columns — set to None if not present -EPOCH_COL = None -EQUINOX_COL = None -COMMENT_COL = None -OTHER_REF_COL = None - -# --- Ingest sources --- -# raise_error=True: stop on first error (good for dry runs and development) -# raise_error=False: skip bad rows and continue (good for bulk ingestion) -sources_added = 0 -for row in data: - try: - ingest_source( - db, - source=row[SOURCE_COL], - reference=row[REFERENCE_COL], - ra=row[RA_COL] if RA_COL else None, - dec=row[DEC_COL] if DEC_COL else None, - epoch=str(row[EPOCH_COL]) if EPOCH_COL else None, - equinox=str(row[EQUINOX_COL]) if EQUINOX_COL else None, - other_reference=str(row[OTHER_REF_COL]) if OTHER_REF_COL else None, - comment=str(row[COMMENT_COL]) if COMMENT_COL else None, - raise_error=True, - ) - sources_added += 1 - logger.info(f"Ingested: {row[SOURCE_COL]}") - except Exception as e: - logger.warning(f"Skipping {row[SOURCE_COL]}: {e}") - continue - -logger.info(f"Total sources ingested: {sources_added} / 
{len(data)}") - -# --- Save database --- -# Save path matches data_path in database.toml, consistent with create-astrodb skill. -if SAVE_DB: - db.save_database(directory="data/") - logger.info("Database saved to data/") -``` - -## Key Behaviors -1. **Missing RA/Dec**: if `RA_COL` or `DEC_COL` is `None`, `ingest_source` queries SIMBAD automatically. If SIMBAD has no match, that row is skipped with a warning. -2. **Duplicate sources**: if a source already exists, `ingest_source` adds the new name as an alternate in the `Names` table. With `raise_error=False`, duplicates are skipped with a warning. -3. **Missing reference**: `reference` must already be in `Publications` or ingestion fails. Remind the user to run `ingest_publication` first. -4. **Unicode dashes**: handled automatically by `ingest_source` (en dash, em dash, minus sign, figure dash → `-`). -5. **Column name defaults**: `ra_col_name="ra_deg"`, `dec_col_name="dec_deg"`, `epoch_col_name="epoch_year"` — these are the database column names, not the input table column names. - -## Output -Report: -1. Number of sources successfully ingested vs total rows -2. Any skipped rows with reasons from the WARNING logs -3. Confirm whether database was saved or is still in preview mode (`SAVE_DB = False`) - From bcaff60dde2e7e6b445fd93a73bc15cdfbc41c28 Mon Sep 17 00:00:00 2001 From: ying2212 Date: Mon, 18 May 2026 13:58:51 -0400 Subject: [PATCH 3/3] add DB schema column name mapping guidance --- .claude/skills/astrodb-ingest-source/SKILL.md | 71 ++++++++++++++----- .../references/ingest_source_api.md | 2 +- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/.claude/skills/astrodb-ingest-source/SKILL.md b/.claude/skills/astrodb-ingest-source/SKILL.md index 4684a80..e96e334 100644 --- a/.claude/skills/astrodb-ingest-source/SKILL.md +++ b/.claude/skills/astrodb-ingest-source/SKILL.md @@ -37,22 +37,22 @@ parameter meanings, and common warnings with fixes. 
```python from astropy.table import Table -data = Table.read("path/to/file.fits") +data = Table.read("path/to/file.fits") # astropy auto-detects .fits, .csv, .ecsv # If auto-detect fails: Table.read(..., format="fits") print(data.colnames) print(data[:3]) ``` -Show the user the **column names**, **dtypes**, and a **3-row preview**. +Show the user the **column names**, **dtypes**, and a **3-row preview** so they can confirm the mapping in the next step. --- ## Step 2: Confirm column mappings +Ask the user to confirm two things: **(A) input file columns** and **(B) DB schema column names**. -Show the actual column names from Step 1 — **never assume defaults like `source` or -`ra_deg`**, since real catalogs use names like `Name`, `RA`, `Dec`, `object`, etc. - -Ask the user to confirm: +### A. Input file columns + +Present the actual column names from Step 1 — **do not assume defaults**. | Role | Required? | Notes | |------|-----------|-------| @@ -60,8 +60,8 @@ Ask the user to confirm: | Discovery reference | **Yes** | Must already exist in `Publications` | | RA (decimal degrees) | No | If absent → SIMBAD fallback | | Dec (decimal degrees) | No | If absent → SIMBAD fallback | -| Epoch | No | | -| Equinox | No | | +| Epoch | No | e.g. `"2000.0"` | +| Equinox | No | e.g. `"J2000"` | | Comment | No | | | Other reference | No | | @@ -72,6 +72,27 @@ Example prompt to user: > The table has these columns: `Name, RA, Dec, Dist, Reference` > Which column is the source name? Which is the discovery reference? +### B. DB schema column names + +These are the column names **in the database `Sources` table** — not the input file. +They vary by database. 
**Always ask the user which DB they are targeting**, then use the +known defaults for that DB: + +| Database | ra_col_name | dec_col_name | epoch_col_name | +|----------|-------------|--------------|----------------| +| astrodb-template-db | `ra_deg` | `dec_deg` | `epoch_year` | +| SIMPLE-db | `ra` | `dec` | `epoch` | +| Unknown | **ask the user** | **ask the user** | **ask the user** | + +To confirm for an unknown DB, check the schema with: +```python +print(db.metadata.tables["Sources"].columns.keys()) +``` +**Example prompt:** +> Which database are you ingesting into — SIMPLE-db, astrodb-template-db, or another? +> (This determines the column names used internally for RA, Dec, and epoch.) + + --- ## Step 3: Write `tmp/ingest_{REF}_sources.py` @@ -103,19 +124,34 @@ db = build_db_from_json( db_name=DB_NAME, ) +# --Load data table-- TABLE_PATH = "path/to/file.fits" # confirmed in Step 1 +data = Table.read(TABLE_PATH) +logger.info(f"Loaded {len(data)} rows from {TABLE_PATH}") + +# --- Column mapping — filled from Step 2 confirmation --- +# Use the ACTUAL column names from your file (not assumed defaults) SOURCE_COL = "Name" # confirmed in Step 2 — required REFERENCE_COL = "Reference" # confirmed in Step 2 — required RA_COL = "RA" # confirmed in Step 2 — set to None → SIMBAD fallback DEC_COL = "Dec" # confirmed in Step 2 — set to None → SIMBAD fallback + +# Optional columns — set to None if not present in table EPOCH_COL = None # optional — set to column name if present EQUINOX_COL = None # optional — set to column name if present COMMENT_COL = None # optional — set to column name if present OTHER_REF_COL = None # optional — set to column name if present -data = Table.read(TABLE_PATH) -logger.info(f"Loaded {len(data)} rows from {TABLE_PATH}") +# --- DB schema column names — confirmed in Step 2B --- +# These are column names IN the database Sources table, not the input file. 
+# astrodb-template-db defaults: ra_deg, dec_deg, epoch_year +# SIMPLE-db uses: ra, dec, epoch +# To check your DB: print(db.metadata.tables["Sources"].columns.keys()) +RA_COL_NAME = "ra_deg" +DEC_COL_NAME = "dec_deg" +EPOCH_COL_NAME = "epoch_year" +# Ingest Loop sources_added = sources_skipped = 0 for row in data: source_name = str(row[SOURCE_COL]) @@ -130,6 +166,9 @@ for row in data: equinox=str(row[EQUINOX_COL]) if EQUINOX_COL else None, other_reference=str(row[OTHER_REF_COL]) if OTHER_REF_COL else None, comment=str(row[COMMENT_COL]) if COMMENT_COL else None, + ra_col_name=RA_COL_NAME, + dec_col_name=DEC_COL_NAME, + epoch_col_name=EPOCH_COL_NAME, raise_error=True, ) sources_added += 1 @@ -137,7 +176,7 @@ for row in data: except Exception as e: sources_skipped += 1 logger.warning(f"Skipping {source_name}: {e}") - + logger.info(f"Done: {sources_added} ingested, {sources_skipped} skipped out of {len(data)} rows") if SAVE_DB: @@ -159,7 +198,6 @@ Run `tmp/ingest_{REF}_sources.py` with `SAVE_DB = False`. Report: See `references/ingest_source_api.md` for the common warnings table and how to fix each one. ---- ## Step 5: Confirm and save @@ -169,10 +207,8 @@ After a successful dry run, ask the user: **Never set `SAVE_DB = True` automatically** — only on explicit user confirmation. ---- - ## Key Behaviors - + 1. **Missing RA/Dec**: if `RA_COL = None`, `ingest_source` queries SIMBAD automatically. If SIMBAD has no match, that row is skipped with a warning. 2. **Duplicate sources**: if a source already exists, `ingest_source` adds the new name @@ -180,4 +216,7 @@ After a successful dry run, ask the user: 3. **Missing reference**: `reference` must already be in `Publications` or ingestion fails. Remind the user to run `ingest_publication` first. 4. **Unicode dashes**: handled automatically by `ingest_source` - (en dash, em dash, minus sign, figure dash → `-`). \ No newline at end of file + (en dash, em dash, minus sign, figure dash → `-`). +5. 
**DB schema column names**: defaults (`ra_deg`/`dec_deg`/`epoch_year`) match + astrodb-template-db. SIMPLE-db uses `ra`/`dec`/`epoch`. Wrong values cause every row + to be skipped with a warning — always confirm the target DB in Step 2B. \ No newline at end of file diff --git a/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md b/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md index eeab2a7..2708abc 100644 --- a/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md +++ b/.claude/skills/astrodb-ingest-source/references/ingest_source_api.md @@ -40,7 +40,7 @@ If source already exists: adds new name as alternate in `Names` only. These are column names **in the database Sources table** — not in the input data file. Defaults (`ra_deg`, `dec_deg`) match astrodb-template-db. Only change if your DB schema differs. ---- +--- ## Common warnings and fixes