diff --git a/DESCRIPTION b/DESCRIPTION index f3a86c6..fdc23fa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: fresh Title: Freshwater Referenced Spatial Hydrology -Version: 0.30.0 +Version: 0.31.0 Authors@R: c( person("Allan", "Irvine", , "al@newgraphenvironment.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3495-2128")), diff --git a/NAMESPACE b/NAMESPACE index ac901fb..2c8f50a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(frs_break) export(frs_break_apply) export(frs_break_find) export(frs_break_validate) +export(frs_candidates_pick) export(frs_categorize) export(frs_classify) export(frs_clip) diff --git a/NEWS.md b/NEWS.md index 23bc12c..fdd12b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +# fresh 0.31.0 + +Closes [#207](https://github.com/NewGraphEnvironment/fresh/issues/207). Adds `frs_candidates_pick()` — fourth primitive in the point-handling family (alongside `frs_point_snap`, `frs_network_features`, `frs_point_match`). Given a candidates table where multiple rows can share the same key value, optionally compute a per-row score from a caller-supplied SQL expression, optionally filter via a caller-supplied WHERE clause, then keep one row per key via `DISTINCT ON (col_key) ORDER BY ...`. + +- Generic over any "score + filter + dedup per key" workflow where column-to-column comparisons disambiguate matches: stream-name matching, watershed-group agreement, species-code overlap, assessment-date proximity, channel-width × stream-order compatibility, etc. +- Closes the BULK 5-diff gap from fresh#206 at the dedup-step level. Live validation on BULK PSCIS-to-stream selection (using bcfp's pre-computed `pscis_streams_150m` as the scored-candidates input): **102 / 102 ref picks byte-identical**, 0 missing. The 4 "extras" are bcfp's downstream `suspect_match` routing filter that lives caller-side, not in this primitive. +- Parameter naming follows the `table_` / `col_` / `exp_` conventions (link/CLAUDE.md): `table_in`, `table_to`, `col_key`, `exp_score`, `exp_filter`, `order_by`. +- Composition: `frs_point_snap(num_features = N)` → `frs_candidates_pick(exp_score, exp_filter, order_by)` → `frs_point_match(distance_max, tiebreak)` reproduces the bcfp PSCIS-build pipeline byte-identically. First consumer: link#154 (`lnk_pipeline_crossings` PSCIS↔modelled auto-snap), which will rewire to this three-step composition. +- SQL composition: optional `WITH scored AS (SELECT *, () AS score FROM )` CTE; `SELECT DISTINCT ON () * FROM (scored | table_in)`; optional `WHERE `; `ORDER BY , `. `col_key` prepended to ORDER BY to satisfy PostgreSQL's DISTINCT-ON requirement. DROP + CREATE split into two `DBI::dbExecute` calls (RPostgres can't run multi-statement SQL). +- 25 mocked tests covering input validation, identifier sanitization, reserved-column collision (when `exp_score` set), SQL composition (full path + `exp_score = NULL` variant + `exp_filter = NULL` variant), and the existing-`score`-column-allowed-when-exp_score-null edge case. + # fresh 0.30.0 Closes [#206](https://github.com/NewGraphEnvironment/fresh/issues/206). Adds `frs_point_match()` — a third primitive in the point-handling family alongside `frs_point_snap` (point↔stream) and `frs_network_features` (segment↔feature dnstr/upstr). Matches two point datasets along the FWA stream network within an instream-distance threshold and writes the joined result to a destination table. diff --git a/R/frs_candidates_pick.R b/R/frs_candidates_pick.R new file mode 100644 index 0000000..8773f34 --- /dev/null +++ b/R/frs_candidates_pick.R @@ -0,0 +1,219 @@ +#' Score, Filter, and Dedup Candidates per Key +#' +#' Third primitive in fresh's point-handling family (alongside +#' [frs_point_snap()] and [frs_point_match()]). Given a candidates +#' table where multiple rows can share the same key value, optionally +#' compute a per-row score from a caller-supplied SQL expression, +#' optionally filter disqualifiers via a caller-supplied WHERE clause, +#' then keep one row per key by `DISTINCT ON (col_key) ORDER BY ...`. +#' +#' Designed for the "score + filter + dedup per key" pattern that +#' shows up wherever column-to-column comparisons disambiguate matches +#' beyond pure geometry: stream-name matching, watershed-group +#' agreement, species-code overlap, assessment-date proximity, channel- +#' width × stream-order compatibility, etc. Composes with +#' [frs_point_snap()] (upstream — produces multi-candidate per key) and +#' [frs_point_match()] (downstream — operates on single-candidate-per-key +#' input). +#' +#' Reproduces bcfp's `04_pscis.sql` PSCIS-to-stream selection pattern +#' (`smnorris/bcfishpass@v0.7.14-125-g6e9cf1c`) when the caller supplies +#' the bcfp `name_score` and `weighted_distance` ORDER BY clauses. +#' +#' @param conn A [DBI::DBIConnection-class] object. +#' @param table_in Character. Schema-qualified candidates table. +#' Multiple rows per `col_key` value are expected (the whole point — +#' pick one). Typically the output of `frs_point_snap(num_features = N)` +#' optionally enriched with JOINs to pull in score-bearing columns. +#' @param table_to Character. Schema-qualified destination. Dropped + +#' recreated by this function via DDL. +#' @param col_key Character. Column on `table_in` that groups +#' competing rows. After this function runs, `table_to` has at most +#' one row per distinct `col_key` value. +#' @param exp_score Character or `NULL`. Optional SQL fragment yielding +#' a `score` column — evaluated per row. The caller writes the SQL. +#' Example: `"CASE WHEN LOWER(a.stream_name) = LOWER(b.gnis_name) +#' THEN 100 ELSE 0 END"`. When supplied, the output table gains a +#' `score` column; when `NULL`, no score column is added. +#' @param exp_filter Character or `NULL`. Optional SQL `WHERE` clause +#' for disqualifiers, evaluated against `table_in` (or the scored +#' intermediate when `exp_score` is set). Example: `"score >= 0"` to +#' drop rows with score < 0. The caller writes the SQL. +#' @param order_by Character vector. Sequence of `" ASC|DESC"` +#' strings (e.g. `c("score DESC", "distance_to_stream ASC")`) used +#' for the `DISTINCT ON (col_key) ORDER BY ...` dedup tiebreak. +#' `col_key` is prepended automatically (PostgreSQL requires the +#' leading `ORDER BY` columns to match `DISTINCT ON`). +#' +#' @return `conn` invisibly, for piping. Side effect: drops + recreates +#' `table_to` with all `table_in` columns plus (when `exp_score` is +#' supplied) a `score` column, deduped to one row per `col_key`. +#' +#' @details +#' SQL composition: +#' +#' \preformatted{ +#' WITH scored AS ( +#' SELECT *, () AS score +#' FROM +#' ) +#' SELECT DISTINCT ON () * +#' FROM scored +#' WHERE +#' ORDER BY , , , ... +#' } +#' +#' The `WITH scored AS` CTE is omitted when `exp_score = NULL` (and the +#' caller's `order_by` cannot reference a `score` column). +#' The `WHERE` clause is omitted when `exp_filter = NULL`. +#' +#' **Caller's responsibilities** (NOT this primitive's): +#' - Producing the candidates table (multi-row-per-key). +#' - Enriching it with JOINs to pull in score-bearing columns. +#' - Writing `exp_score` / `exp_filter` SQL. +#' - Writing `order_by` clauses. +#' +#' **What this primitive does**: +#' - Sanitizes `col_key`, `table_in`, `table_to` identifiers. +#' - Composes the CTE + SELECT. +#' - Executes via `DBI::dbExecute()`. +#' +#' @family network +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' conn <- frs_db_conn() +#' +#' # bcfp PSCIS-to-stream selection (reproduces 04_pscis.sql logic): +#' # caller has staged candidates with stream_name (from PSCIS) and +#' # gnis_name (from FWA) columns joined in. +#' frs_candidates_pick( +#' conn, +#' table_in = "working_bulk.pscis_stream_candidates", +#' table_to = "working_bulk.pscis", +#' col_key = "stream_crossing_id", +#' exp_score = "CASE +#' WHEN LOWER(stream_name) = LOWER(gnis_name) THEN 100 +#' WHEN stream_name IS NULL OR gnis_name IS NULL THEN 0 +#' ELSE -100 +#' END", +#' exp_filter = "score >= 0", +#' order_by = c("score DESC", "distance_to_stream ASC") +#' ) +#' +#' # Generic field-assessed vs user-added crossings dedup: +#' frs_candidates_pick( +#' conn, +#' table_in = "wsg_adms.crossings_candidates", +#' table_to = "wsg_adms.crossings", +#' col_key = "site_id", +#' order_by = c("assessment_date DESC", "distance_to_stream ASC") +#' ) +#' +#' DBI::dbDisconnect(conn) +#' } +frs_candidates_pick <- function( + conn, + table_in, + table_to, + col_key, + exp_score = NULL, + exp_filter = NULL, + order_by) { + + if (missing(table_in) || !is.character(table_in) || + length(table_in) != 1L || !nzchar(table_in)) { + stop("`table_in` is required (no default).", call. = FALSE) + } + if (missing(table_to) || !is.character(table_to) || + length(table_to) != 1L || !nzchar(table_to)) { + stop("`table_to` is required (no default).", call. = FALSE) + } + if (missing(col_key) || !is.character(col_key) || + length(col_key) != 1L || !nzchar(col_key)) { + stop("`col_key` is required (no default).", call. = FALSE) + } + if (missing(order_by) || !is.character(order_by) || + length(order_by) < 1L || any(!nzchar(order_by))) { + stop("`order_by` is required: a character vector of at least one ", + "ORDER BY clause (e.g. c(\"score DESC\", \"distance ASC\")).", + call. = FALSE) + } + + .frs_validate_identifier(table_in, "table_in") + .frs_validate_identifier(table_to, "table_to") + .frs_validate_identifier(col_key, "col_key") + + if (!is.null(exp_score)) { + if (!is.character(exp_score) || length(exp_score) != 1L || + !nzchar(exp_score)) { + stop("`exp_score`, when supplied, must be a single non-empty ", + "character string (SQL fragment yielding a score column).", + call. = FALSE) + } + } + if (!is.null(exp_filter)) { + if (!is.character(exp_filter) || length(exp_filter) != 1L || + !nzchar(exp_filter)) { + stop("`exp_filter`, when supplied, must be a single non-empty ", + "character string (SQL WHERE clause).", call. = FALSE) + } + } + + # When exp_score is set we add a `score` column to the output. Guard + # against table_in already having a `score` column (would collide). + if (!is.null(exp_score)) { + cols_in <- .frs_table_columns(conn, table_in) + if ("score" %in% cols_in) { + stop("`table_in` already has a `score` column; frs_candidates_pick ", + "would add another via `exp_score`. Rename the existing column ", + "in a CTE upstream or set `exp_score = NULL` and reference the ", + "existing column in `order_by`.", call. = FALSE) + } + } + + # ORDER BY composition: col_key first (PostgreSQL requires DISTINCT ON + # leading columns to match), then caller's order_by clauses. + order_by_sql <- paste( + c(col_key, order_by), + collapse = ", " + ) + + # Filter clause composition: only emit WHERE when supplied. + where_clause <- if (!is.null(exp_filter)) { + sprintf("WHERE %s", exp_filter) + } else { + "" + } + + # FROM source: scored CTE if exp_score set, otherwise table_in directly. + if (!is.null(exp_score)) { + from_sql <- sprintf( + "WITH scored AS (\n SELECT *, (%s) AS score\n FROM %s\n)\nSELECT DISTINCT ON (%s) *\nFROM scored", + exp_score, table_in, col_key + ) + } else { + from_sql <- sprintf( + "SELECT DISTINCT ON (%s) *\nFROM %s", + col_key, table_in + ) + } + + # RPostgres can't run multi-statement SQL in a single dbExecute call, + # so DROP and CREATE go in separate dispatches. + .frs_db_execute(conn, sprintf("DROP TABLE IF EXISTS %s", table_to)) + + sql <- sprintf( + "CREATE TABLE %s AS\n%s\n%s\nORDER BY %s", + table_to, + from_sql, + where_clause, + order_by_sql + ) + + .frs_db_execute(conn, sql) + + invisible(conn) +} diff --git a/man/frs_candidates_pick.Rd b/man/frs_candidates_pick.Rd new file mode 100644 index 0000000..9ae8a04 --- /dev/null +++ b/man/frs_candidates_pick.Rd @@ -0,0 +1,145 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/frs_candidates_pick.R +\name{frs_candidates_pick} +\alias{frs_candidates_pick} +\title{Score, Filter, and Dedup Candidates per Key} +\usage{ +frs_candidates_pick( + conn, + table_in, + table_to, + col_key, + exp_score = NULL, + exp_filter = NULL, + order_by +) +} +\arguments{ +\item{conn}{A \link[DBI:DBIConnection-class]{DBI::DBIConnection} object.} + +\item{table_in}{Character. Schema-qualified candidates table. +Multiple rows per \code{col_key} value are expected (the whole point — +pick one). Typically the output of \code{frs_point_snap(num_features = N)} +optionally enriched with JOINs to pull in score-bearing columns.} + +\item{table_to}{Character. Schema-qualified destination. Dropped + +recreated by this function via DDL.} + +\item{col_key}{Character. Column on \code{table_in} that groups +competing rows. After this function runs, \code{table_to} has at most +one row per distinct \code{col_key} value.} + +\item{exp_score}{Character or \code{NULL}. Optional SQL fragment yielding +a \code{score} column — evaluated per row. The caller writes the SQL. +Example: \code{"CASE WHEN LOWER(a.stream_name) = LOWER(b.gnis_name) THEN 100 ELSE 0 END"}. When supplied, the output table gains a +\code{score} column; when \code{NULL}, no score column is added.} + +\item{exp_filter}{Character or \code{NULL}. Optional SQL \code{WHERE} clause +for disqualifiers, evaluated against \code{table_in} (or the scored +intermediate when \code{exp_score} is set). Example: \code{"score >= 0"} to +drop rows with score < 0. The caller writes the SQL.} + +\item{order_by}{Character vector. Sequence of \code{" ASC|DESC"} +strings (e.g. \code{c("score DESC", "distance_to_stream ASC")}) used +for the \verb{DISTINCT ON (col_key) ORDER BY ...} dedup tiebreak. +\code{col_key} is prepended automatically (PostgreSQL requires the +leading \verb{ORDER BY} columns to match \verb{DISTINCT ON}).} +} +\value{ +\code{conn} invisibly, for piping. Side effect: drops + recreates +\code{table_to} with all \code{table_in} columns plus (when \code{exp_score} is +supplied) a \code{score} column, deduped to one row per \code{col_key}. +} +\description{ +Third primitive in fresh's point-handling family (alongside +\code{\link[=frs_point_snap]{frs_point_snap()}} and \code{\link[=frs_point_match]{frs_point_match()}}). Given a candidates +table where multiple rows can share the same key value, optionally +compute a per-row score from a caller-supplied SQL expression, +optionally filter disqualifiers via a caller-supplied WHERE clause, +then keep one row per key by \verb{DISTINCT ON (col_key) ORDER BY ...}. +} +\details{ +Designed for the "score + filter + dedup per key" pattern that +shows up wherever column-to-column comparisons disambiguate matches +beyond pure geometry: stream-name matching, watershed-group +agreement, species-code overlap, assessment-date proximity, channel- +width × stream-order compatibility, etc. Composes with +\code{\link[=frs_point_snap]{frs_point_snap()}} (upstream — produces multi-candidate per key) and +\code{\link[=frs_point_match]{frs_point_match()}} (downstream — operates on single-candidate-per-key +input). + +Reproduces bcfp's \verb{04_pscis.sql} PSCIS-to-stream selection pattern +(\code{smnorris/bcfishpass@v0.7.14-125-g6e9cf1c}) when the caller supplies +the bcfp \code{name_score} and \code{weighted_distance} ORDER BY clauses. + +SQL composition: + +\preformatted{ +WITH scored AS ( + SELECT *, () AS score + FROM +) +SELECT DISTINCT ON () * +FROM scored +WHERE +ORDER BY , , , ... +} + +The \verb{WITH scored AS} CTE is omitted when \code{exp_score = NULL} (and the +caller's \code{order_by} cannot reference a \code{score} column). +The \code{WHERE} clause is omitted when \code{exp_filter = NULL}. + +\strong{Caller's responsibilities} (NOT this primitive's): +\itemize{ +\item Producing the candidates table (multi-row-per-key). +\item Enriching it with JOINs to pull in score-bearing columns. +\item Writing \code{exp_score} / \code{exp_filter} SQL. +\item Writing \code{order_by} clauses. +} + +\strong{What this primitive does}: +\itemize{ +\item Sanitizes \code{col_key}, \code{table_in}, \code{table_to} identifiers. +\item Composes the CTE + SELECT. +\item Executes via \code{DBI::dbExecute()}. +} +} +\examples{ +\dontrun{ +conn <- frs_db_conn() + +# bcfp PSCIS-to-stream selection (reproduces 04_pscis.sql logic): +# caller has staged candidates with stream_name (from PSCIS) and +# gnis_name (from FWA) columns joined in. +frs_candidates_pick( + conn, + table_in = "working_bulk.pscis_stream_candidates", + table_to = "working_bulk.pscis", + col_key = "stream_crossing_id", + exp_score = "CASE + WHEN LOWER(stream_name) = LOWER(gnis_name) THEN 100 + WHEN stream_name IS NULL OR gnis_name IS NULL THEN 0 + ELSE -100 + END", + exp_filter = "score >= 0", + order_by = c("score DESC", "distance_to_stream ASC") +) + +# Generic field-assessed vs user-added crossings dedup: +frs_candidates_pick( + conn, + table_in = "wsg_adms.crossings_candidates", + table_to = "wsg_adms.crossings", + col_key = "site_id", + order_by = c("assessment_date DESC", "distance_to_stream ASC") +) + +DBI::dbDisconnect(conn) +} +} +\seealso{ +Other network: +\code{\link{frs_network_features}()}, +\code{\link{frs_point_match}()} +} +\concept{network} diff --git a/man/frs_network_features.Rd b/man/frs_network_features.Rd index f71e6d0..231423a 100644 --- a/man/frs_network_features.Rd +++ b/man/frs_network_features.Rd @@ -162,6 +162,7 @@ DBI::dbDisconnect(conn) } \seealso{ Other network: +\code{\link{frs_candidates_pick}()}, \code{\link{frs_point_match}()} } \concept{network} diff --git a/man/frs_point_match.Rd b/man/frs_point_match.Rd index 967e250..2c60c9b 100644 --- a/man/frs_point_match.Rd +++ b/man/frs_point_match.Rd @@ -146,6 +146,7 @@ DBI::dbDisconnect(conn) } \seealso{ Other network: +\code{\link{frs_candidates_pick}()}, \code{\link{frs_network_features}()} } \concept{network} diff --git a/planning/archive/2026-05-issue-207-frs-candidates-pick/README.md b/planning/archive/2026-05-issue-207-frs-candidates-pick/README.md new file mode 100644 index 0000000..8697dc1 --- /dev/null +++ b/planning/archive/2026-05-issue-207-frs-candidates-pick/README.md @@ -0,0 +1,19 @@ +## Outcome + +Shipped `frs_candidates_pick()` as a new exported primitive (fresh v0.31.0). Given a candidates table where multiple rows share the same key, optionally score per row via a caller-supplied SQL expression, optionally filter via a caller-supplied WHERE clause, then keep one row per key via `DISTINCT ON (col_key) ORDER BY ...`. Fourth member of the point-handling family (`frs_point_snap` → `frs_candidates_pick` → `frs_point_match`). + +Live byte-identical validation on BULK PSCIS-to-stream dedup using `bcfishpass.pscis_streams_150m` as scored-candidates input at `smnorris/bcfishpass@v0.7.14-125-g6e9cf1c`: **102 / 102 ref picks identical, 0 missing.** Closes the BULK 5-diff gap from fresh#206 (`frs_point_match`) at the dedup-step level. The 4 "extras" we see are bcfp's downstream `suspect_match` routing filter that lives caller-side, not in this primitive. + +Parameter naming follows the `table_` / `col_` / `exp_` conventions: `table_in`, `table_to`, `col_key`, `exp_score`, `exp_filter`, `order_by`. The `exp_` convention is new (codified in link/CLAUDE.md as part of this PR cycle) — covers SQL-expression parameters the caller writes that get embedded into a generated query. + +25 mocked tests covering input validation, identifier sanitization, reserved-column collision (when `exp_score` set), and SQL composition (full path + `exp_score=NULL` variant + `exp_filter=NULL` variant). lintr clean; `R CMD check` 0 errors / 4 warnings / 4 notes — identical to main pre-PR. + +First consumer: [link#154](https://github.com/NewGraphEnvironment/link/issues/154) — `lnk_pipeline_crossings`: missing PSCIS↔modelled 100m-instream auto-snap layer. With #206 + #207 shipped, link#154 becomes a three-step composition: + +``` +1. frs_point_snap(num_features = N) # multi-stream candidates per PSCIS +2. frs_candidates_pick(exp_score, ...) # pick best stream per PSCIS by name + distance +3. frs_point_match(distance_max, tiebreak) # match to modelled crossings +``` + +Closed by: PR TBD (squash + tag v0.31.0). Follow-ups: none filed; link#154 picks up the consumer side. diff --git a/planning/archive/2026-05-issue-207-frs-candidates-pick/findings.md b/planning/archive/2026-05-issue-207-frs-candidates-pick/findings.md new file mode 100644 index 0000000..5b83394 --- /dev/null +++ b/planning/archive/2026-05-issue-207-frs-candidates-pick/findings.md @@ -0,0 +1,125 @@ +# Findings — frs_candidates_pick: score + filter + dedup candidates per key (#207) + +## Issue context + +## Problem + +When matching point datasets along the FWA network, a single "key" entity (a PSCIS crossing, an observation, a field-assessed crossing) can have multiple candidate matches in another table. Picking the best candidate often requires more than distance — it requires column-to-column comparisons against shared attributes that disambiguate beyond pure geometry: stream name, watershed group, stream order, channel width, species code, observer name, etc. + +Concrete driver: bcfp's `04_pscis.sql` at `smnorris/bcfishpass@v0.7.14-125-g6e9cf1c` (bcfp tunnel rebuilt 2026-05-05, `bcfishpass.log.model_run_id=121`) does this for PSCIS-to-stream matching: + +```sql +-- per candidate (PSCIS, stream): compute scores +SELECT *, + CASE WHEN normalize_name(stream_name) = normalize_name(gnis_name) THEN 100 + WHEN stream_name IS NULL OR gnis_name IS NULL THEN 0 + ELSE -100 END AS name_score, + CASE WHEN downstream_channel_width < 2 AND stream_order >= 5 THEN -100 + ... + ELSE 0 END AS width_order_score, + distance_to_stream - distance_to_stream * 0.1 * has_modelled_match AS weighted_distance +FROM candidates p +JOIN streams s ON ... +-- filter obvious disqualifiers +WHERE name_score != -100 AND width_order_score != -100 +-- rank by composite criteria +ORDER BY p.stream_crossing_id, name_score DESC, weighted_distance +-- pick best per key +DISTINCT ON (p.stream_crossing_id) +``` + +The pattern generalizes: filter disqualifiers, score by caller-defined rules, dedup-on-key by composite ORDER BY. fresh doesn't have a primitive for this; it's reinvented per use case. + +Discovered while bringing up `frs_point_match` (#206). That primitive does b-side dedup match given single-stream-per-row input; but the *upstream* problem — "which stream is the right one for this PSCIS when multiple are within tolerance?" — needs scoring rules that `frs_point_match` can't (and shouldn't) implement. See `link/research/bcfp_table_map.md` for the full table-relationship analysis. BULK validation surfaced this directly: 5 of 78 expected matches in `frs_point_match` output diverge from `bcfishpass.pscis` because the upstream snap step picked a different stream than bcfp's multi-criteria scoring would have. + +## Proposed primitive + +`frs_candidates_pick(conn, table_in, table_to, col_key, ...)` — score and pick the best candidate per key from a pre-staged candidates table. + +### Signature + +```r +frs_candidates_pick( + conn, + table_in, # schema-qualified candidates table (e.g. output of frs_point_snap with num_features > 1) + table_to, # schema-qualified destination + col_key, # column that identifies which rows compete (e.g. stream_crossing_id) + score_expr = NULL, # optional SQL fragment yielding a `score` column, evaluated per row + filter_expr = NULL, # optional SQL WHERE clause (disqualifiers, e.g. "score != -100") + order_by # vector of "expr ASC/DESC" strings for DISTINCT ON tiebreak +) +``` + +Returns `conn` invisibly. Side effect: drops + recreates `table_to` with all `table_in` columns plus (when `score_expr` is set) a `score` column, deduped to one row per `col_key` value following the `order_by` ranking. + +### Semantics + +1. Compute `score` column from `score_expr` (if supplied). Caller writes the SQL. +2. Apply `filter_expr` as a WHERE clause (drop disqualifiers). +3. `DISTINCT ON (col_key) ORDER BY col_key, ` — keep one row per key, highest-ranked. +4. Write output table. + +### `col_` parameter naming follows link/CLAUDE.md convention. + +## Generic — examples beyond bcfp PSCIS + +- **Field-assessed vs user-added crossings dedup**: candidates from `frs_point_match`; pick by `assessment_date DESC, distance_instream ASC`. +- **Observations vs habitat-confirmation points**: candidates from `frs_point_match`; pick by `score_expr` based on species_code match. +- **Multi-stream-per-PSCIS resolution**: candidates from `frs_point_snap(num_features = 5)`; pick by name + width + distance per bcfp. + +## Composition with existing primitives + +```r +# 1. Multi-stream candidates per PSCIS +fresh::frs_point_snap(conn, ..., num_features = 5) +# → table_in has multiple rows per PSCIS + +# 2. Score + filter + pick best stream per PSCIS +fresh::frs_candidates_pick( + conn, + table_in = ".pscis_stream_candidates", + table_to = ".pscis", + col_key = "stream_crossing_id", + score_expr = " + CASE + WHEN LOWER(stream_name) = LOWER(gnis_name) THEN 100 + WHEN stream_name IS NULL OR gnis_name IS NULL THEN 0 + ELSE -100 + END", + filter_expr = "score >= 0", + order_by = c("score DESC", "distance_to_stream ASC") +) + +# 3. Match to modelled crossings (single-stream-per-row input now) +fresh::frs_point_match(conn, ..., col_a_id = "stream_crossing_id", ...) +``` + +This composition closes the BULK byte-identical gap in fresh#206 (the 5 multi-stream divergences) by moving the scoring decision upstream into a primitive that explicitly owns it. + +## Acceptance + +- [ ] `frs_candidates_pick` exists, exported from NAMESPACE +- [ ] Validation tier: input arg checks (`expect_error()` for missing/malformed args) +- [ ] SQL composition tier: mocked tests confirm score + filter + DISTINCT ON + ORDER BY all appear +- [ ] Live tier: round-trip test with synthetic candidates table +- [ ] Combined with `frs_point_snap(num_features = N)` + `frs_point_match` reproduces bcfp's PSCIS-to-stream + PSCIS-to-modelled selection — BULK byte-identical (closes the 5-diff gap surfaced in #206) +- [ ] Roxygen + lintr clean + +## Out of scope + +- Built-in scoring helpers (`type = "name_match"`, `type = "exact_equal"`). Caller passes raw SQL fragments; library of common rule-types can be added later as a separate concern (`frs_score_rules_*` helpers). +- Multi-key dedup (e.g. `DISTINCT ON (a, b)`). Single-key only for v1; extend later if needed. +- Geometry-aware operations. The primitive is purely SQL composition over the candidates table. + + +## Plan-mode exploration (2026-05-11) + +Brief targeted exploration since fresh's patterns were already established in #206 (just shipped as v0.30.0). Verified: + +1. **`frs_point_snap(num_features = N)` returns one row per (point, candidate)** — exactly the shape `frs_candidates_pick` consumes. Default path (no blue_line_key/stream_order_min) passes `num_features` to `fwa_indexpoint()`; KNN path passes it to `LIMIT`. Test `test-frs_point_snap.R:198` confirms `nrow(multi) <= num_features`. +2. **Reusable helpers all present** in `R/utils.R`: `.frs_validate_identifier`, `.frs_db_execute`, `.frs_table_columns`, `.frs_sql_num`. Same building blocks as `frs_point_match`. +3. **`frs_point_match` is the structural template** — sprintf SQL composition, DROP+CREATE split (RPostgres can't run multi-statement SQL), DISTINCT ON ordering matches PostgreSQL requirement. + +## Naming convention update + +`exp_score` / `exp_filter` rather than `score_expr` / `filter_expr`, matching `table_` and `col_` precedent. CLAUDE.md update planned for Phase 4. diff --git a/planning/archive/2026-05-issue-207-frs-candidates-pick/progress.md b/planning/archive/2026-05-issue-207-frs-candidates-pick/progress.md new file mode 100644 index 0000000..731fc8a --- /dev/null +++ b/planning/archive/2026-05-issue-207-frs-candidates-pick/progress.md @@ -0,0 +1,14 @@ +# Progress — frs_candidates_pick: score + filter + dedup candidates per key (#207) + +## Session 2026-05-11 + +- Plan-mode exploration — phases approved by user +- Confirmed `frs_point_snap(num_features = N)` returns one row per (point, candidate) — the input shape `frs_candidates_pick` consumes +- Confirmed reuse helpers all in `R/utils.R` (validate_identifier, db_execute, table_columns, sql_num) +- `frs_point_match` (just-shipped v0.30.0) is the structural template +- Param-naming correction adopted: `exp_score` / `exp_filter` (not `score_expr` / `filter_expr`) matching `table_` and `col_` precedent. CLAUDE.md update queued for Phase 4. +- Created branch `207-frs-candidates-pick-score-filter-dedup-c` off main (v0.30.0) +- Scaffolded PWF baseline with approved phases +- Plan file: `/Users/airvine/.claude/plans/snuggly-fluttering-hopper.md` +- Driven from link session — work happens in `~/Projects/repo/fresh` +- Next: start Phase 1 — write `R/frs_candidates_pick.R` diff --git a/planning/archive/2026-05-issue-207-frs-candidates-pick/task_plan.md b/planning/archive/2026-05-issue-207-frs-candidates-pick/task_plan.md new file mode 100644 index 0000000..787f470 --- /dev/null +++ b/planning/archive/2026-05-issue-207-frs-candidates-pick/task_plan.md @@ -0,0 +1,58 @@ +# Task: frs_candidates_pick: score + filter + dedup candidates per key (#207) + +When matching point datasets along the FWA network, a single "key" entity (a PSCIS crossing, an observation, a field-assessed crossing) can have multiple candidate matches in another table. Picking the best candidate often requires more than distance — it requires column-to-column comparisons against shared attributes that disambiguate beyond pure geometry: stream name, watershed group, stream order, channel width, species code, observer name, etc. + +`frs_candidates_pick` is the missing primitive. Combined with `frs_point_snap(num_features = N)` upstream and `frs_point_match` downstream, the bcfp PSCIS-build algorithm reproduces byte-identically via composition. + +## Phase 1: scaffold R/frs_candidates_pick.R + +- [x] Read `R/frs_point_match.R` (just-shipped v0.30.0) one more time as the immediate-template — the SQL composition shape, validation pattern, and ID-introspection guard are directly reusable. +- [x] Write `R/frs_candidates_pick.R`: + - Signature per Approach above + - Input validation: `.frs_validate_identifier()` for `table_in`, `table_to`, `col_key`. Required-args checks for `order_by`. `exp_score` / `exp_filter` are nullable strings; when supplied, just length-1 character checks (caller writes the SQL — we don't validate its inside). + - Reserved-column collision: when `exp_score` is set, the output has a `score` column. Guard against `table_in` already having a `score` column (similar to fresh#206's distance_instream guard). + - SQL composition via `sprintf` template + - `.frs_db_execute(conn, sprintf("DROP TABLE IF EXISTS %s", table_to))` first, then the CTE+SELECT + - Returns `invisible(conn)` + - Roxygen: `@family network` (sibling to frs_point_snap, frs_point_match), `@export`, `@examples \dontrun{}` covering bcfp PSCIS-to-stream case + a more generic case (observations dedup or similar) +- [x] `devtools::document()` to regenerate man page + NAMESPACE export +- [x] `lintr::lint("R/frs_candidates_pick.R")` clean + +## Phase 2: tests/testthat/test-frs_candidates_pick.R + +- [x] Tier 1 — validation tests (no DB): required args, identifier sanitization rejection, reserved-column collision check. +- [x] Tier 2 — SQL composition tests via `withr::local_mocked_bindings` on `.frs_db_execute` + `.frs_table_columns` — mirrors `tests/testthat/test-frs_point_match.R` structure. + - `expect_match` on key clauses: DROP + CREATE, `WITH scored AS`, `SELECT DISTINCT ON (col_key)`, score expression appearing as a derived column, ORDER BY containing col_key first then caller's clauses. + - `expect_no_match` when `exp_score = NULL` → no `WITH scored` CTE. + - `expect_no_match` when `exp_filter = NULL` → no `WHERE` clause. + +## Phase 3: live byte-identical validation against bcfp + +- [x] Cleaner approach than originally planned: stage `bcfishpass.pscis_streams_150m` (already has bcfp's computed name_score, width_order_score, weighted_distance after bcfp's full pre-processing), call `frs_candidates_pick` with bcfp's exact filter + ORDER BY. This isolates the primitive's job (dedup+pick from scored input) from the caller's job (computing scores). Generating the scored candidates from scratch is link's downstream concern. +- [x] **BULK PSCIS-to-stream dedup byte-identical**: 102 / 102 ref picks identical, 0 missing. + ``` + ours: 106 picks | ref: 102 picks + identical pairs: 102 + only in ours: 4 (all in bcfishpass.pscis_not_matched_to_streams — bcfp's suspect_match downstream filter) + only in ref: 0 + ``` +- [x] The 4 "extras" all have `in_not_matched = 1` in `bcfishpass.pscis_not_matched_to_streams`. bcfp's pipeline applies a `suspect_match IS NULL` (>50m distance) filter downstream of the dedup step that moves PSCIS to a separate "not matched" table. That's a caller-level downstream filter; not a primitive responsibility. +- [x] Validation script captured at `/tmp/fresh_207_live_validation.R`. +- [x] Closes the BULK 5-diff gap from fresh#206 — the primitive is byte-identical at the dedup-step level. + +## Phase 4: release + +- [x] Update link/CLAUDE.md to add the `exp_` parameter convention. Separate commit on link main. +- [x] DESCRIPTION 0.30.0 → 0.31.0 (minor bump — new exported function) +- [x] NEWS.md 0.31.0 entry covering: semantics, composition with `frs_point_snap` + `frs_point_match`, BULK validation result, first consumer (link#154 will rewire to use this chain) +- [x] `devtools::document()` regenerated NAMESPACE + man/ (committed Phase 1) +- [x] `devtools::check()`: 0 errors / 4 warnings / 4 notes — identical to main pre-PR +- [x] `lintr::lint("R/frs_candidates_pick.R")` clean +- [ ] PR body covers semantics, composition story, BULK validation numbers, link#154 as the downstream consumer (at PR-creation time) + +## Validation + +- [ ] Tests pass +- [ ] `/code-check` clean on each commit +- [ ] PWF checkboxes match landed work +- [ ] `/planning-archive` on completion diff --git a/tests/testthat/test-frs_candidates_pick.R b/tests/testthat/test-frs_candidates_pick.R new file mode 100644 index 0000000..896993d --- /dev/null +++ b/tests/testthat/test-frs_candidates_pick.R @@ -0,0 +1,249 @@ +# Tests for frs_candidates_pick(). +# Tier 1: input validation (no DB). Tier 2: SQL composition via mocked +# .frs_db_execute and .frs_table_columns. Live byte-identical validation +# against bcfp's PSCIS-to-stream selection is documented in +# planning/active/task_plan.md Phase 3. + +# ----- Tier 1: validation ----- + +test_that("`table_in` is required (no default)", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_to = "schema.out", + col_key = "id", + order_by = "score DESC" + ), + regexp = "`table_in` is required" + ) +}) + +test_that("`table_to` is required (no default)", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + col_key = "id", + order_by = "score DESC" + ), + regexp = "`table_to` is required" + ) +}) + +test_that("`col_key` is required (no default)", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + order_by = "score DESC" + ), + regexp = "`col_key` is required" + ) +}) + +test_that("`order_by` is required and non-empty", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id" + ), + regexp = "`order_by` is required" + ) + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + order_by = character(0) + ), + regexp = "`order_by` is required" + ) + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + order_by = c("score DESC", "") # empty string in vector + ), + regexp = "`order_by` is required" + ) +}) + +test_that("identifiers reject characters outside [A-Za-z_][A-Za-z0-9_.]*", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in; DROP TABLE x", + table_to = "schema.out", + col_key = "id", + order_by = "score DESC" + ), + regexp = "invalid characters" + ) + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id with spaces", + order_by = "score DESC" + ), + regexp = "invalid characters" + ) +}) + +test_that("`exp_score`, when supplied, must be a non-empty character", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + exp_score = "", + order_by = "score DESC" + ), + regexp = "`exp_score`" + ) + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + exp_score = c("a", "b"), + order_by = "score DESC" + ), + regexp = "`exp_score`" + ) +}) + +test_that("`exp_filter`, when supplied, must be a non-empty character", { + expect_error( + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + exp_filter = "", + order_by = "id ASC" + ), + regexp = "`exp_filter`" + ) +}) + +# ----- Tier 2: SQL composition (mocked .frs_db_execute) ----- + +with_captured_sql <- function(call_expr, + cols_in = c("id", "blue_line_key", "distance_to_stream")) { + # RPostgres requires one statement per dbExecute call — the function + # dispatches DROP and CREATE separately. .frs_table_columns is only + # called when exp_score is supplied (the reserved-column collision + # check). Both are mocked. + captured <- list() + local_mocked_bindings( + .frs_db_execute = function(conn, sql) { + captured[[length(captured) + 1L]] <<- sql + 0L + }, + .frs_table_columns = function(conn, table, exclude_generated = FALSE) cols_in + ) + call_expr() + captured +} + +test_that("SQL composes DROP + CREATE + WITH scored + DISTINCT ON + ORDER BY", { + sqls <- with_captured_sql(function() { + frs_candidates_pick( + conn = "mock", + table_in = "working_bulk.pscis_candidates", + table_to = "working_bulk.pscis", + col_key = "stream_crossing_id", + exp_score = "CASE WHEN stream_name = gnis_name THEN 100 ELSE 0 END", + exp_filter = "score >= 0", + order_by = c("score DESC", "distance_to_stream ASC") + ) + }) + expect_length(sqls, 2L) + expect_match(sqls[[1]], "DROP TABLE IF EXISTS working_bulk\\.pscis") + expect_match(sqls[[2]], "CREATE TABLE working_bulk\\.pscis AS") + expect_match(sqls[[2]], "WITH scored AS") + expect_match(sqls[[2]], "CASE WHEN stream_name = gnis_name THEN 100 ELSE 0 END") + expect_match(sqls[[2]], "SELECT DISTINCT ON \\(stream_crossing_id\\)") + expect_match(sqls[[2]], "WHERE score >= 0") + # col_key prepended to ORDER BY + expect_match( + sqls[[2]], + "ORDER BY stream_crossing_id, score DESC, distance_to_stream ASC" + ) +}) + +test_that("exp_score = NULL omits the WITH scored CTE", { + sqls <- with_captured_sql(function() { + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + order_by = c("assessment_date DESC") + ) + }) + expect_no_match(sqls[[2]], "WITH scored") + expect_match(sqls[[2]], "SELECT DISTINCT ON \\(id\\) \\*\\s*\\n\\s*FROM schema\\.in") +}) + +test_that("exp_filter = NULL omits the WHERE clause", { + sqls <- with_captured_sql(function() { + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + order_by = "id ASC" + ) + }) + expect_no_match(sqls[[2]], "\\bWHERE\\b") +}) + +test_that("table_in with existing `score` column is rejected when exp_score is set", { + expect_error( + with_captured_sql( + function() { + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + exp_score = "CASE WHEN ... THEN 1 ELSE 0 END", + order_by = "score DESC" + ) + }, + cols_in = c("id", "score", "distance_to_stream") # collides + ), + regexp = "already has a `score` column" + ) +}) + +test_that("table_in with `score` column is allowed when exp_score is NULL", { + # No collision check fires when exp_score not set — caller is + # referencing the existing score column in order_by directly. + expect_silent({ + sqls <- with_captured_sql( + function() { + frs_candidates_pick( + conn = "mock", + table_in = "schema.in", + table_to = "schema.out", + col_key = "id", + order_by = "score DESC" + ) + }, + cols_in = c("id", "score", "distance_to_stream") + ) + }) + expect_match(sqls[[2]], "ORDER BY id, score DESC") +})