Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions R/query_osm_shapes_match_routes.R
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ osm_shapes_match_routes <- function(gtfs, q, geometry = TRUE, gtfs_match = "rout
)
}



# >> Validate OSM data
if (nrow(osm_route_name) == 0) { # Validate that there is an OSM match for GTFS route
warning_routes_missing <<- append(warning_routes_missing, route_name)
Expand Down Expand Up @@ -368,21 +366,31 @@ osm_shapes_match_routes <- function(gtfs, q, geometry = TRUE, gtfs_match = "rout
select(-initial_gtfs, -final_gtfs) |>
st_as_sf(sf_column_name="geometry")

# When multiple osm_id, return those with min distance_diff + points_diff + then stops_diff
if (length(unique(gtfs_route_name_result$osm_id)) < nrow(gtfs_route_name_result)) {
gtfs_route_name_result_unique = gtfs_route_name_result |>
group_by(osm_id) |>
slice_min(order_by = distance_diff + points_diff + stops_diff, with_ties = FALSE) |>
Comment on lines +369 to +373
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says to choose the row with min distance_diff + points_diff + then stops_diff, but slice_min(order_by = distance_diff + points_diff + stops_diff) uses a simple sum (and mixes meters with a stop count). This can change which shape is kept for a duplicated osm_id in unintuitive ways. Consider implementing a lexicographic tie-break (e.g., order by distance_diff, then points_diff, then stops_diff) and/or explicitly weighting/normalizing stops_diff so the score units are comparable.

Suggested change
# When multiple osm_id, return those with min distance_diff + points_diff + then stops_diff
if (length(unique(gtfs_route_name_result$osm_id)) < nrow(gtfs_route_name_result)) {
gtfs_route_name_result_unique = gtfs_route_name_result |>
group_by(osm_id) |>
slice_min(order_by = distance_diff + points_diff + stops_diff, with_ties = FALSE) |>
# When multiple osm_id, return those with min distance_diff, then points_diff, then stops_diff
if (length(unique(gtfs_route_name_result$osm_id)) < nrow(gtfs_route_name_result)) {
gtfs_route_name_result_unique = gtfs_route_name_result |>
group_by(osm_id) |>
arrange(distance_diff, points_diff, stops_diff, .by_group = TRUE) |>
slice(1L) |>

Copilot uses AI. Check for mistakes.
ungroup()

warning_osm_repeated <<- append(warning_osm_repeated, sprintf(
"`%s` %s has %d shapes, but the geometrical match returned only %d (out of %d) OSM routes\n>> `osm_id` for route: %s\n>> The ignored ones were: %s\n>> The duplicated ones were: %s",
"`%s` %s has %d shapes, but the geometrical match returned only %d (out of %d) OSM routes\n>> `osm_id` for route: %s\n>> The ignored ones were: %s\n>> The duplicated ones were: %s\n>> Returning shapes that have greatest geometrical match: %s\n>> Shapes ignored: %s",
gtfs_match, route_name, nrow(gtfs_route_name),
length(unique(gtfs_route_name_result$osm_id)), nrow(osm_route_name),
paste(osm_route_name$osm_id, collapse=", "),
# osm ignored
paste(setdiff(
union(gtfs_route_name_result$osm_id, osm_route_name$osm_id),
intersect(gtfs_route_name_result$osm_id, osm_route_name$osm_id)
), collapse=", "),
paste(unique(gtfs_route_name_result$osm_id[duplicated(gtfs_route_name_result$osm_id)]), collapse=", ")
# osm duplicated
paste(unique(gtfs_route_name_result$osm_id[duplicated(gtfs_route_name_result$osm_id)]), collapse=", "),
# shapes returned
paste(gtfs_route_name_result_unique$shape_id, collapse=", "),
# shapes ignored
paste(setdiff(gtfs_route_name$shape_id, gtfs_route_name_result_unique$shape_id), collapse=", ")
))
return(data.frame(
route_name=route_name
)) # Return NULL for failed elements
gtfs_route_name_result = gtfs_route_name_result_unique
}

return(gtfs_route_name_result)
Expand Down Expand Up @@ -422,15 +430,34 @@ osm_shapes_match_routes <- function(gtfs, q, geometry = TRUE, gtfs_match = "rout
}

not_found <- bind_rows( result[lengths(result)<=1] )
routes_shapes_n = gtfs$routes |> # Start on routes.txt to match line number with route_name
select(route_id, !!gtfs_match) |>
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

select(route_id, !!gtfs_match) is likely incorrect here because gtfs_match is a character scalar (validated earlier), and select() generally requires all_of(gtfs_match) / any_of(gtfs_match) (or sym(gtfs_match) + !!) for programmatic selection. As written, this can fail at runtime and break the new partial_match reporting.

Suggested change
select(route_id, !!gtfs_match) |>
select(route_id, all_of(gtfs_match)) |>

Copilot uses AI. Check for mistakes.
left_join(gtfs$trips |> select(route_id, trip_id, shape_id, direction_id), by="route_id") |>
left_join(shapes_sf, by="shape_id") |>
distinct(.data[[gtfs_match]], shape_id) |>
group_by(.data[[gtfs_match]]) |>
summarise(shapes_n = n())
partial_match = result_success |>
st_drop_geometry() |>
group_by(.data[[gtfs_match]]) |>
summarise(shapes_n = n()) |>
left_join(routes_shapes_n, by=gtfs_match) |>
rename(matched = shapes_n.x, gtfs = shapes_n.y) |>
filter(matched < gtfs)
Comment on lines +440 to +446
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

partial_match is computed unconditionally from result_success via st_drop_geometry(). When there are no successful matches, result_success is the result of bind_rows(list()) (a plain tibble, not an sf), so st_drop_geometry(result_success) will error before the later if (nrow(result_success) == 0) early-return. Guard this block with if (inherits(result_success, "sf") && nrow(result_success) > 0) (or move the nrow(result_success)==0 return above this computation).

Suggested change
partial_match = result_success |>
st_drop_geometry() |>
group_by(.data[[gtfs_match]]) |>
summarise(shapes_n = n()) |>
left_join(routes_shapes_n, by=gtfs_match) |>
rename(matched = shapes_n.x, gtfs = shapes_n.y) |>
filter(matched < gtfs)
if (inherits(result_success, "sf") && nrow(result_success) > 0) {
partial_match = result_success |>
st_drop_geometry() |>
group_by(.data[[gtfs_match]]) |>
summarise(shapes_n = n()) |>
left_join(routes_shapes_n, by=gtfs_match) |>
rename(matched = shapes_n.x, gtfs = shapes_n.y) |>
filter(matched < gtfs)
} else {
# No successful matches or result_success is not an sf: define empty partial_match
partial_match <- data.frame(
matched = integer(),
gtfs = integer()
)
partial_match[[gtfs_match]] <- character()
partial_match <- partial_match[, c(gtfs_match, "matched", "gtfs")]
}

Copilot uses AI. Check for mistakes.

warning_osm_unsorted_stops <- unique(warning_osm_unsorted_stops) # This warning list can have duplicates, ignore
errors <- length(warning_routes_missing) + length(warning_osm_repeated) + length(warning_osm_unsorted_stops) + length(warning_osm_stops_missing)
if (errors>0 || nrow(not_found)) {
w = sprintf(
"There were %d error(s) during the algorithm execution, which led to %d route(s) without a match (route(s) ignored), with the following `%s`:\n\n> %s\n",
"There were %d error(s) during the algorithm execution, which led to %d route(s) without a match (route(s) ignored), with the following `%s`:\n\n> %s\n\nAdditionally, %d routes had partial matches (only some of its shapes had a match):\n\n> %s\n",
errors,
Comment on lines 451 to 453
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The consolidated warning text now states that the %d error(s) "led to %d route(s) without a match (route(s) ignored)", but one of the counted "errors" (warning_osm_repeated) no longer causes the route to be ignored—only some shapes are dropped and returned as a partial match. This message can be misleading; consider splitting the summary into (a) routes ignored (not_found) and (b) routes partially matched, and only attributing the ignored count to conditions that actually remove routes from result_success.

Copilot uses AI. Check for mistakes.
nrow(not_found),
# Not found
gtfs_match,
paste(not_found$route_name, collapse="\n> ")
paste(not_found$route_name, collapse="\n> "),
# Partial match
nrow(partial_match),
paste(partial_match[[gtfs_match]], " (matched ", partial_match[["matched"]], " of ", partial_match[["gtfs"]], " shapes)", collapse="\n> ", sep = "")
)
warning(w)
if (!is.na(log_file)) cat(paste("WARNING! ", w, "\n"), file = log_file, append = TRUE)
Expand Down