From cf9e474a8f12ad7f86a53854b7d51d79b87dd6e6 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Fri, 22 Aug 2025 16:02:43 -0700 Subject: [PATCH 1/7] feat: add function for mode ploidy. --- R/ploidy.R | 30 ++++++++++++++++++++++++++++++ R/utils.R | 17 +++++++++++++++++ vignettes/various_functions.Rmd | 17 ++++++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/R/ploidy.R b/R/ploidy.R index b5f78b4..9334b08 100644 --- a/R/ploidy.R +++ b/R/ploidy.R @@ -40,3 +40,33 @@ weighted_ploidy <- function( return(sample_ploidy) } + + +#' find the mode CN per chromosome, then mode across the chromosomes +#' +#' Done per chromosome first so that large chromosomes don't dominate the +#' result. It is critical that the input be even bin level data, or this makes +#' no sense to do. Measures of the copy number states need to be done in evenly +#' sized bins. +#' +#' @param bin_df dataframe of bin level data +#' @param sample_col string column name identifying samples +#' @param cn_col string column name for copy number states +#' @param chrom_col string column name for chromosomes +#' @return tibble/dataframe of results by cell_id/sample +mode_ploidy <- function( + bin_df, + sample_col = "cell_id", + cn_col = "state", + chrom_col = "chr") { + bin_df |> + dplyr::group_by(.data[[sample_col]], .data[[chrom_col]]) |> + dplyr::summarise( + chr_mode = cust_mode(.data[[cn_col]]) + ) |> + dplyr::group_by(.data[[sample_col]]) |> + dplyr::summarise( + mode_ploidy = cust_mode(chr_mode) + ) |> + dplyr::ungroup() +} diff --git a/R/utils.R b/R/utils.R index 9f111ac..3fad3ae 100644 --- a/R/utils.R +++ b/R/utils.R @@ -223,3 +223,20 @@ make_cellid_matrix <- function(cell_df, name_col, val_col) { rownames(cell_mtx) <- dplyr::pull(cell_mtx_init, cell_id) return(cell_mtx) } + + +#' find most common element in a vector +#' +#' For ties, just the first one in the list is returned. +#' +#' @param x vector of values +#' @return most common element of input vector +#' @export +cust_mode <- function(x, na.rm = FALSE) { + if (na.rm) { + x <- x[!is.na(x)] + } + + ux <- unique(x) + return(ux[which.max(tabulate(match(x, ux)))]) +} diff --git a/vignettes/various_functions.Rmd b/vignettes/various_functions.Rmd index 38ccd9c..4d11611 100644 --- a/vignettes/various_functions.Rmd +++ b/vignettes/various_functions.Rmd @@ -283,6 +283,9 @@ filt_reads <- ex_reads |> c(filt_reads = nrow(filt_reads), input_reads = nrow(ex_reads)) ``` +
+
+
## Ploidy of samples @@ -294,7 +297,19 @@ We can get an idea of the ploidy of a sample by taking the mean of the copy numb This function does that: ```{r} -weighted_ploidy(ex_segs) |> +dlptools::weighted_ploidy(ex_segs) |> + dplyr::slice_head(n = 5) +``` + +
+ +Alternative, if you want integers, is to get the mode ploidy. The should only be done for evenly sized bins, or the mode wouldn't make much sense. + +```{r} +dlptools::mode_ploidy( + ex_reads, + sample_col = "cell_id" +) |> dplyr::slice_head(n = 5) ``` From 35debbc2d92a9923195e6bed45814f7abb211417 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Mon, 25 Aug 2025 10:56:41 -0700 Subject: [PATCH 2/7] feat: mark CN seg state relative to ploidy. --- NAMESPACE | 2 ++ R/cn_features.R | 52 +++++++++++++++++++++++++++++++++ man/cust_mode.Rd | 17 +++++++++++ man/make_cellid_matrix.Rd | 8 ++--- man/mode_ploidy.Rd | 31 ++++++++++++++++++++ man/ploidy_relative_cn_state.Rd | 27 +++++++++++++++++ 6 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 man/cust_mode.Rd create mode 100644 man/mode_ploidy.Rd create mode 100644 man/ploidy_relative_cn_state.Rd diff --git a/NAMESPACE b/NAMESPACE index 443f0e5..1005166 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(compute_tip_sibling_distances) export(convert_dists_to_pairwise) export(convert_long_reads_to_wide) export(create_expected_bins) +export(cust_mode) export(expand_length_to_bins) export(extract_sigminer_wang_features) export(extract_wu_features) @@ -54,6 +55,7 @@ export(mark_bins_overlapping_centromeres) export(mark_mask_regions) export(medicc_profiles_to_foreground) export(pairwise_bin_difference) +export(ploidy_relative_cn_state) export(plot_bg_state_highlight) export(plot_fg_state_highlight) export(plot_heatmap_of_tip_changes) diff --git a/R/cn_features.R b/R/cn_features.R index b284897..f5281fe 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -214,3 +214,55 @@ extract_sigminer_wang_features <- function(segs_df) { return(sm_tally$nmf_matrix) } + + +#' Could changes of state relative to ploidy. +#' +#' Marks CN segments as a gain or loss, relative to the mode ploidy of the +#' sample. +#' +#' @param segs_df a dataframe of CN segments +#' @param annotate_input boolean. return input dataframe annotating each +#' @param return_matrix boolean. Return a cell-by-feature matrix of counts. +#' @return tibble/dataframe of counts +#' @export +ploidy_relative_cn_state <- function( + segs_df, + sample_col = "cell_id", + annotate_input = FALSE, + return_matrix = FALSE) { + sample_mode_ploidy <- segs_df |> + reads_to_segs() |> + mode_ploidy(sample_col = sample_col) + + segs_df <- dplyr::left_join( + segs_df, + sample_mode_ploidy, + by = sample_col + ) + + segs_df <- segs_df |> + dplyr::mutate( + cn_v_ploidy = dplyr::case_when( + state < mode_ploidy ~ "ploidy-loss", + state > mode_ploidy ~ "ploidy-gain", + state == mode_ploidy ~ "ploidy-match" + ) + ) + + if (annotate_input) { + return(segs_df) + } + + feat_count <- segs_df |> + dplyr::group_by(.data[[sample_col]], .drop = FALSE) |> + dplyr::count(cn_v_ploidy) |> + dplyr::ungroup() + + if (return_matrix) { + feat_mtx <- make_cellid_matrix(feat_count, "cn_v_ploidy", "n") + return(feat_mtx) + } + + return(feat_count) +} diff --git a/man/cust_mode.Rd b/man/cust_mode.Rd new file mode 100644 index 0000000..2df51e8 --- /dev/null +++ b/man/cust_mode.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{cust_mode} +\alias{cust_mode} +\title{find most common element in a vector} +\usage{ +cust_mode(x, na.rm = FALSE) +} +\arguments{ +\item{x}{vector of values} +} +\value{ +most common element of input vector +} +\description{ +For ties, just the first one in the list is returned. +} diff --git a/man/make_cellid_matrix.Rd b/man/make_cellid_matrix.Rd index 87aa7c2..9809f96 100644 --- a/man/make_cellid_matrix.Rd +++ b/man/make_cellid_matrix.Rd @@ -25,12 +25,12 @@ can be pivoted out. } \examples{ cell_df <- data.frame( - cell_id = c("cell_1", "cell_1", "cell_1"), - some_col = c("A", "B", "C"), - some_val = 1:3 + cell_id = c("cell_1", "cell_1", "cell_1"), + some_col = c("A", "B", "C"), + some_val = 1:3 ) -make_cellid_matrix(cell_df, name_col="some_col", val_col="some_val") +make_cellid_matrix(cell_df, name_col = "some_col", val_col = "some_val") # A B C # cell_1 1 2 3 diff --git a/man/mode_ploidy.Rd b/man/mode_ploidy.Rd new file mode 100644 index 0000000..59dbbca --- /dev/null +++ b/man/mode_ploidy.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ploidy.R +\name{mode_ploidy} +\alias{mode_ploidy} +\title{find the mode CN per chromosome, then mode across the chromosomes} +\usage{ +mode_ploidy( + bin_df, + sample_col = "cell_id", + cn_col = "state", + chrom_col = "chr" +) +} +\arguments{ +\item{bin_df}{dataframe of bin level data} + +\item{sample_col}{string column name identifying samples} + +\item{cn_col}{string column name for copy number states} + +\item{chrom_col}{string column name for chromosomes} +} +\value{ +tibble/dataframe of results by cell_id/sample +} +\description{ +Done per chromosome first so that large chromosomes don't dominate the +result. It is critical that the input be even bin level data, or this makes +no sense to do. Measures of the copy number states need to be done in evenly +sized bins. +} diff --git a/man/ploidy_relative_cn_state.Rd b/man/ploidy_relative_cn_state.Rd new file mode 100644 index 0000000..67fd481 --- /dev/null +++ b/man/ploidy_relative_cn_state.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{ploidy_relative_cn_state} +\alias{ploidy_relative_cn_state} +\title{Could changes of state relative to ploidy.} +\usage{ +ploidy_relative_cn_state( + segs_df, + sample_col = "cell_id", + annotate_input = FALSE, + return_matrix = FALSE +) +} +\arguments{ +\item{segs_df}{a dataframe of CN segments} + +\item{annotate_input}{boolean. return input dataframe annotating each} + +\item{return_matrix}{boolean. Return a cell-by-feature matrix of counts.} +} +\value{ +tibble/dataframe of counts +} +\description{ +Marks CN segments as a gain or loss, relative to the mode ploidy of the +sample. +} From 5cda4de99d1365d92b0e05aba665f7d0acb282eb Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Wed, 27 Aug 2025 14:22:14 -0700 Subject: [PATCH 3/7] feat: add segment type marking. --- NAMESPACE | 3 + R/bin_filling.R | 30 +- R/centromere_marking.R | 136 ------- R/chromosome_layouts.R | 433 +++++++++++++++++++++++ R/cn_features.R | 2 +- R/utils.R | 8 + inst/extdata/hg19_gap.txt.gz | Bin 0 -> 7423 bytes inst/extdata/hg38_gap.txt.gz | Bin 0 -> 12368 bytes man/add_centromere_locations.Rd | 2 +- man/add_chromosome_length.Rd | 18 + man/add_telomere_positions.Rd | 23 ++ man/fill_state_from_neighbours.Rd | 2 +- man/import_telos_file.Rd | 28 ++ man/is_chr_used_in_chroms.Rd | 15 + man/load_chrom_info_file.Rd | 2 +- man/load_ucsc_centromeres.Rd | 2 +- man/mark_bins_overlapping_centromeres.Rd | 2 +- man/mark_segs_chromosome_span.Rd | 67 ++++ vignettes/feature_extraction.Rmd | 22 ++ 19 files changed, 624 insertions(+), 171 deletions(-) delete mode 100644 R/centromere_marking.R create mode 100644 R/chromosome_layouts.R create mode 100644 inst/extdata/hg19_gap.txt.gz create mode 100644 inst/extdata/hg38_gap.txt.gz create mode 100644 man/add_chromosome_length.Rd create mode 100644 man/add_telomere_positions.Rd create mode 100644 man/import_telos_file.Rd create mode 100644 man/is_chr_used_in_chroms.Rd create mode 100644 man/mark_segs_chromosome_span.Rd diff --git a/NAMESPACE b/NAMESPACE index 1005166..bded12f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(RAW_COPY_COLORS) export(STATE_COLORS) export(add_centromere_locations) export(add_missing_bins_for_cells) +export(add_telomere_positions) export(add_tip_ancestors_to_df) export(basic_tile_plot) export(cell_cn_profile) @@ -42,6 +43,7 @@ export(get_tips_that_avoid_redundant_comps) export(import_annotations_df) export(import_clones) export(import_dlp_files) +export(import_telos_file) export(import_tree) export(library_from_cell) export(load_chrom_info_file) @@ -53,6 +55,7 @@ export(make_tree_plot_obj) export(map_states_to_letters) export(mark_bins_overlapping_centromeres) export(mark_mask_regions) +export(mark_segs_chromosome_span) export(medicc_profiles_to_foreground) export(pairwise_bin_difference) export(ploidy_relative_cn_state) diff --git a/R/bin_filling.R b/R/bin_filling.R index 9b03eea..f47fa65 100644 --- a/R/bin_filling.R +++ b/R/bin_filling.R @@ -1,31 +1,3 @@ -#' loading UCSC chromosome length files -#' @param version default "hg19", can also load "hg38" -#' @return tibgble of chromosome, total length, etc. -#' @export -load_chrom_info_file <- function(version = c("hg19", "hg38")) { - version <- match.arg(version) - - chrom_files <- list( - hg19 = "hg19_chromInfo.txt.gz", - hg38 = "hg19_chromInfo.txt.gz" - ) - - chrom_info <- get_package_file_path(chrom_files[version]) |> - vroom::vroom( - col_names = c("chr", "total_length", "misc"), - show_col_types = FALSE - ) |> - dplyr::filter( - # remove the unnecessary chromosomes - stringr::str_detect(chr, "_|M", negate = TRUE) - ) |> - dplyr::mutate( - chr = stringr::str_replace(chr, "chr", "") - ) - - return(chrom_info) -} - #' For a given length, create bins of a given size #' #' e.g., length = 10, bin = 5, will get bins: 1-5, 6-10 @@ -128,7 +100,7 @@ add_missing_bins_for_cells <- function( #' other states to fill in. #' #' @param state_df dataframe of state data with NAs for some bins that need -#' to be filled in. E.g., after dlptools::running add_missing_bins_for_cells() +#' to be filled in. E.g., after dlptools::add_missing_bins_for_cells() #' @export #' @importFrom rlang .data fill_state_from_neighbours <- function(state_df, cols_to_fill = c("state")) { diff --git a/R/centromere_marking.R b/R/centromere_marking.R deleted file mode 100644 index cb505fa..0000000 --- a/R/centromere_marking.R +++ /dev/null @@ -1,136 +0,0 @@ -#' add centromere information to reads by chromosome -#' -#' See dlptools::read_and_prep_ucsg_cenrtomeres() for details of file origins. -#' -#' @param centro_file NULL or string to the path if you download yourself -#' @param hg19 boolean to target hg19 for loading -#' @param hg38 boolean to target hg38 for loading -#' @return input table with centromere information added by chromosome -#' @export -add_centromere_locations <- function( - reads_df, centro_file = NULL, version = c("hg19", "hg38")) { - version_choice <- match.arg(version) - - centros <- load_ucsc_centromeres( - centro_file = centro_file, version = version_choice - ) - - reads_df <- dplyr::left_join( - reads_df, - centros, - by = dplyr::join_by(chr == chrom) - ) - - return(reads_df) -} - -#' add boolean if bin overlaps with a centromere. -#' -#' Can optionally specify a padding to mark locations of bins as being close -#' enough to a centromere. Often bins near centromeres are corrupt in their -#' state calls. In the past, we have filtered within 3 Mb of centromeres. -#' -#' IMPORTANT! This adds the padding to each side of the centromere. So if you -#' specify a padding of 3 Mb, it will be within 3 Mb of the start and 3 Mb of -#' the end of the centromere. -#' -#' @param reads_df tibble of read data -#' @param padding int of number of BP to add to each side of the centromere -#' @param bin_start_col Default: start. column name of the start of bins. -#' @param bin_end_col Default: end column name of the end of bins. -#' @param version default 'hg19', or choose 'hg38' for locations of centromeres. -#' @return input table, but with a boolean 'within_centro' column added (and -#' potentially other centromere information columns, if needed) -#' @export -mark_bins_overlapping_centromeres <- function( - reads_df, - padding = 0, - bin_start_col = "start", - bin_end_col = "end", - version = c("hg19", "hg38")) { - version_choice <- match.arg(version) - columns_check <- all(c("centro_start", "centro_end") %in% colnames(reads_df)) - - if (!columns_check) { - warning(paste0( - "centromere locations not present, adding locations for ", - version_choice, - ". If that is not correct, set version to 'hg38')." - )) - - reads_df <- add_centromere_locations(reads_df = reads_df) - } - - reads_df <- reads_df |> - dplyr::mutate( - centromere_padding = padding, - overlaps_centro = ( - .data[[bin_start_col]] <= centro_end + padding & - .data[[bin_end_col]] >= centro_start - padding - ) - ) - - return(reads_df) -} - - -#' load and prep default centromerefiles from UCSC -#' -#' Cytoband files originally obtained from UCSC golden path genomes and parsed -#' to collect the total centromere spans. -#' -#' hg19: -#' https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz -#' hg38: -#' https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz -#' -#' TODO: can I stash the file if it's already loaded once? -#' -#' @param centro_file NULL or string to the path if you download yourself -#' @param hg19 boolean to target hg19 for loading -#' @param hg38 boolean to target hg38 for loading -#' @return tibble of parsed centromere spans -#' @export -load_ucsc_centromeres <- function( - version = c("hg19", "hg38"), - centro_file = NULL) { - version_choice <- match.arg(version) - - if (is.null(centro_file)) { - default_centro_files <- c( - # https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz - hg19 = "hg19_cytoBand.txt.gz", - # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz - hg38 = "hg38_cytoBand.txt.gz" - ) - - centro_file <- get_package_file_path(default_centro_files[version_choice]) - } - - # import file and filter to features of interest - centros <- vroom::vroom( - centro_file, - col_names = c("chrom", "start", "end", "arm_full_name", "feat"), - show_col_types = FALSE - ) |> - dplyr::filter(feat == "acen") |> - dplyr::mutate( - # removes the 'chr' from the chromosome name, e.g., chr1 -> 1 - chrom = stringr::str_extract(chrom, "([A-Z]|[0-9]+)") - ) - centros <- centros |> - dplyr::group_by(chrom) |> - dplyr::mutate( - centro_start = min(start), - centro_end = max(end), - centro_span = centro_end - centro_start, - arm = stringr::str_extract(arm_full_name, "[a-z]") - ) - - # pivot out to one line per chromosome of centromere measurements - centros <- centros |> - dplyr::select(-c(arm_full_name, feat)) |> - tidyr::pivot_wider(names_from = arm, values_from = c(start, end)) - - return(centros) -} diff --git a/R/chromosome_layouts.R b/R/chromosome_layouts.R new file mode 100644 index 0000000..b67aa44 --- /dev/null +++ b/R/chromosome_layouts.R @@ -0,0 +1,433 @@ +#' loading UCSC chromosome length files +#' @param version default "hg19", can also load "hg38" +#' @return tibgble of chromosome, total length, etc. +#' @export +load_chrom_info_file <- function(version = c("hg19", "hg38")) { + version <- match.arg(version) + + chrom_files <- list( + hg19 = "hg19_chromInfo.txt.gz", + hg38 = "hg38_chromInfo.txt.gz" + ) + + chrom_info <- get_package_file_path(chrom_files[version]) |> + vroom::vroom( + col_names = c("chr", "total_length", "misc"), + show_col_types = FALSE + ) |> + dplyr::filter( + # remove the unnecessary chromosomes + stringr::str_detect(chr, "_|M", negate = TRUE) + ) |> + dplyr::mutate( + chr = stringr::str_replace(chr, "chr", "") + ) + + return(chrom_info) +} + +#' add chromosome lengths to a cn dataframe +#' +#' Handles chromosomes specified with "chr#" or just #/X/Y. +#' +#' @param cn_df dataframe of copy number information +#' @param version string. hg19 (default) or hg38 +#' @param chrom_col string. name of column with chromosome information. +add_chromosome_length <- function(cn_df, version = c("hg19", "hg38"), chrom_col = "chr") { + chrom_info <- load_chrom_info_file(version = version) + + if (is_chr_used_in_chroms(cn_df[[chrom_col]])) { + chrom_info <- dplyr::mutate(chrom_info, chr = stringr::str_c("chr", chr)) + } + + cn_df <- dplyr::left_join( + cn_df, + chrom_info, + by = dplyr::join_by({{ chrom_col }} == chr) + ) + + return(cn_df) +} + + +#' add centromere information to reads by chromosome +#' +#' See dlptools::read_and_prep_ucsg_cenrtomeres() for details of file origins. +#' +#' @param centro_file NULL or string to the path if you download yourself +#' @param hg19 boolean to target hg19 for loading +#' @param hg38 boolean to target hg38 for loading +#' @return input table with centromere information added by chromosome +#' @export +add_centromere_locations <- function( + reads_df, centro_file = NULL, version = c("hg19", "hg38")) { + version_choice <- match.arg(version) + + centros <- load_ucsc_centromeres( + centro_file = centro_file, version = version_choice + ) + + reads_df <- dplyr::left_join( + reads_df, + centros, + by = dplyr::join_by(chr == chrom) + ) + + return(reads_df) +} + +#' add boolean if bin overlaps with a centromere. +#' +#' Can optionally specify a padding to mark locations of bins as being close +#' enough to a centromere. Often bins near centromeres are corrupt in their +#' state calls. In the past, we have filtered within 3 Mb of centromeres. +#' +#' IMPORTANT! This adds the padding to each side of the centromere. So if you +#' specify a padding of 3 Mb, it will be within 3 Mb of the start and 3 Mb of +#' the end of the centromere. +#' +#' @param reads_df tibble of read data +#' @param padding int of number of BP to add to each side of the centromere +#' @param bin_start_col Default: start. column name of the start of bins. +#' @param bin_end_col Default: end column name of the end of bins. +#' @param version default 'hg19', or choose 'hg38' for locations of centromeres. +#' @return input table, but with a boolean 'within_centro' column added (and +#' potentially other centromere information columns, if needed) +#' @export +mark_bins_overlapping_centromeres <- function( + reads_df, + padding = 0, + bin_start_col = "start", + bin_end_col = "end", + version = c("hg19", "hg38")) { + version_choice <- match.arg(version) + columns_check <- all(c("centro_start", "centro_end") %in% colnames(reads_df)) + + if (!columns_check) { + warning(paste0( + "centromere locations not present, adding locations for ", + version_choice, + ". If that is not correct, set version to 'hg38')." + )) + + reads_df <- add_centromere_locations(reads_df = reads_df) + } + + reads_df <- reads_df |> + dplyr::mutate( + centromere_padding = padding, + overlaps_centro = ( + .data[[bin_start_col]] <= centro_end + padding & + .data[[bin_end_col]] >= centro_start - padding + ) + ) + + return(reads_df) +} + + +#' load and prep default centromerefiles from UCSC +#' +#' Cytoband files originally obtained from UCSC golden path genomes and parsed +#' to collect the total centromere spans. +#' +#' hg19: +#' https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz +#' hg38: +#' https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz +#' +#' TODO: can I stash the file if it's already loaded once? +#' +#' @param centro_file NULL or string to the path if you download yourself +#' @param hg19 boolean to target hg19 for loading +#' @param hg38 boolean to target hg38 for loading +#' @return tibble of parsed centromere spans +#' @export +load_ucsc_centromeres <- function( + version = c("hg19", "hg38"), + centro_file = NULL) { + version_choice <- match.arg(version) + + if (is.null(centro_file)) { + default_centro_files <- c( + # https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz + hg19 = "hg19_cytoBand.txt.gz", + # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz + hg38 = "hg38_cytoBand.txt.gz" + ) + + centro_file <- get_package_file_path(default_centro_files[version_choice]) + } + + # import file and filter to features of interest + centros <- vroom::vroom( + centro_file, + col_names = c("chrom", "start", "end", "arm_full_name", "feat"), + show_col_types = FALSE + ) |> + dplyr::filter(feat == "acen") |> + dplyr::mutate( + # removes the 'chr' from the chromosome name, e.g., chr1 -> 1 + chrom = stringr::str_extract(chrom, "([A-Z]|[0-9]+)") + ) + centros <- centros |> + dplyr::group_by(chrom) |> + dplyr::mutate( + centro_start = min(start), + centro_end = max(end), + centro_span = centro_end - centro_start, + arm = stringr::str_extract(arm_full_name, "[a-z]") + ) + + # pivot out to one line per chromosome of centromere measurements + centros <- centros |> + dplyr::select(-c(arm_full_name, feat)) |> + tidyr::pivot_wider(names_from = arm, values_from = c(start, end)) + + return(centros) +} + + +#' load UCSC gap files for telomeres +#' +#' Honestly, this function is sort of pointless. The telomeres in both genome +#' versions are marked as 10Kb for p and q on all chromosomes. Telomeres are +#' variable and better estimates of their size exist, e.g., +#' https://www.nature.com/articles/s41467-024-48917-7 +#' +#' This function just serves as a way to set up easy marking of where CNs occur +#' on their respective chromosome. +#' +#' The only exception to the 10Kb size is chr17. It is not listed in the gap +#' file for hg19. chr17 is known to have small chromosomes. Here, I set the p +#' to 3000 Kb and q to 5000 Kb, inferred from the article referenced above. +#' +#' @param version string. hg19 (default) or hg38 +#' @return tibble of telomere information +#' @export +import_telos_file <- function(version = c("hg19", "hg38")) { + version_choice <- match.arg(version) + + default_telo_files <- c( + # https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/gap.txt.gz + hg19 = "hg19_gap.txt.gz", + # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/gap.txt.gz + hg38 = "hg38_gap.txt.gz" + ) + + telos_file <- get_package_file_path(default_telo_files[version_choice]) + + + # col names by inspection and checking against here by looking up gap table + # data format description + # https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=2018561924_ZAtdZC9CFEJw8BKiYeUjd2ImvhS7&clade=mammal&org=Human&db=hg19&hgta_group=allTables&hgta_track=hg19&hgta_table=gap&hgta_regionType=genome&position=chr7%3A155%2C592%2C223-155%2C605%2C565&hgta_outputType=primaryTable&hgta_outFileName= + telos <- readr::read_tsv( + telos_file, + col_names = c( + "bin", "chrom", "telostart", "teloend", "ix", "n", "size", "feat_type", "bridge" + ) + ) |> + dplyr::filter(feat_type == "telomere") |> + dplyr::mutate( + chrom = dplyr::if_else( + # removes the 'chr' from the chromosome name, e.g., chr1 -> 1 + # this is to facilitate matching with our reads/segs files that just + # have 0-0/X/Y + chrom %in% c("chrX", "chrY"), + stringr::str_extract(chrom, "[A-Z]"), + stringr::str_extract(chrom, "[0-9]+") + ) + ) + + # now mark p and q telos + telos <- telos |> + dplyr::mutate( + arm = dplyr::if_else(telostart == 0, "p", "q") + ) |> + dplyr::select(arm, chr = chrom, telostart, teloend) |> + tidyr::pivot_wider(names_from = arm, values_from = c(telostart, teloend)) + + # special case for chr17 + # for hg19, chr17 doesn't have telomeres listed, so need to fake some + # telomere data. + # marking p as 3000 and q as 5000 following: + # https://www.nature.com/articles/s41467-024-48917-7 + chr17_total_len <- load_chrom_info_file() |> + dplyr::filter(chr == 17) |> + dplyr::pull(total_length) + + chr17_telo <- data.frame( + chr = "17", + telostart_p = 0, + teloend_p = 3000, + telostart_q = chr17_total_len - 5000, + teloend_q = chr17_total_len + ) + + telos <- dplyr::bind_rows(telos, chr17_telo) |> + dplyr::relocate(chr, telostart_p, teloend_p) + + return(telos) +} + +#' add telomere positions to a CN df +#' +#' See [dlptools::import_telos_file()] but really just a convenience function. +#' Telomere positions in the UCSC gap file are all just 10Kb, except chr17, +#' which I set to 3Kb and 5Kb, again see linked function for details. +#' +#' @param cn_df dataframe of CN information, segments or read bins. Just needs +#' a 'chr' column. +#' @param version string. hg19 (default) or hg38, not that it makes a +#' difference. +#' @return input dataframe with new columns of telomere information +#' @export +add_telomere_positions <- function(cn_df, version = c("hg19", "hg38")) { + telos <- import_telos_file(version = version) + + cn_df <- dplyr::left_join(cn_df, telos, by = "chr") + + return(cn_df) +} + + +#' label CN segments based on relative chromosomal positions +#' +#' Uses centromere and telomere coordinates to label where a segment sits on a +#' chromosome, relative to telomeres and centromeres. +#' +#' Possible categories are: +#' 1. telomere bound (telo-bound) - segment touches a telomere +#' 2. centromere bound (centro-bound) - segment touches or crosses the +#' centromere +#' 3. arm (arm) - segment spans a whole are (*with conditions) +#' 4. whole chromosome (whole-chrom) - segment spans the entire chromosome +#' (*with conditions) +#' 5. intersitial (inert) - occuring within the chromosome, not touching the +#' centromere, telomeres, and not big enough to be an entire arm. +#' +#' You can set a min_bound_distance which reflects how close a feature needs to +#' be to be considered "touching". For example, we can considere a segment +#' telomere bound if within traditional 1 DLP bin by setting this distance to +#' 500k (default). This allows for some level of measurement error. +#' +#' Users can also set the proportion of the arm or whole chromosome a segment +#' needs to span to be considered either category. Default is spaning 90% of +#' either feature. Meaning, if the segment (end - start)/arm_length is at least +#' 90% of the arm_length, the segment is considered an "arm" spanning segment. +#' +#' This input dataframe requires several columns to be pregenerated. Users need +#' to run [dlptools::add_chromosome_length()], +#' [dlptools::add_centromere_locations()], and +#' [dlptools::add_telomere_positions()]. See the the [features vignette](https://molonc.github.io/dlptools/articles/feature_extraction.html) for more +#' details +#' +#' @param segs_df dataframe of CN segments +#' @param min_bound_distance integer. Distance to adjacent feature to be +#' considered associated with that feature. +#' @param min_span_of_arm float. Proportion of arm to cover to be considered an +#' arm segment. +#' @param min_span_of_chrom float. Proportion of the chromosome to cover to be +#' considered a whole chromosome segment. +#' @param acro_fix_whole_chrom. boolean. Whether to reset acrocentric +#' chromosome CN segments to "whole-chrom" if they span futher than the Q arm. +#' Honestly, probably not useful +#' @return input dataframe, annotated with segment scale information. Primary +#' column of interest being seg_span_event. +#' @export +mark_segs_chromosome_span <- function( + segs_df, + min_bound_distance = 5e5, # given scale of DLP, this should probably be 1 bin width, at least + min_span_of_chrom = 0.9, + min_span_of_arm = 0.9, + acro_fix_whole_chrom = FALSE) { + required_cols <- c("teloend_p", "telostart_q", "centro_start", "centro_end", "total_length") + if (!all(required_cols %in% colnames(segs_df))) { + stop( + paste( + "required columns missing! Need:", + paste(required_cols, collapse = ", "), + "see functions: dlptools::add_chromosome_length, dlptools::add_centromere_locations, dlptools::add_telomere_positions", + sep = "\n" + ) + ) + } + + segs_df <- segs_df |> + dplyr::mutate( + # find closest telomere to the CN event + telo_p_dist = start - teloend_p, + telo_q_dist = telostart_q - end, + telo_dist = dplyr::case_when( + # in telo or touching at both ends + telo_p_dist <= 0 & telo_q_dist <= 0 ~ 0, + # in or touching at one end + telo_p_dist <= 0 | telo_q_dist <= 0 ~ 0, + # closer to p + telo_p_dist <= telo_q_dist ~ telo_p_dist, + # closer to q + telo_p_dist > telo_q_dist ~ telo_q_dist, + ), + telo_bound = telo_dist <= min_bound_distance, + + # find where CN sits relative to centromere + centro_p_dist = centro_start - end, + centro_q_dist = start - centro_end, + centro_dist = dplyr::case_when( + # crossing centro somewhere + centro_p_dist <= 0 & centro_q_dist <= 0 ~ 0, + # on the p side + centro_p_dist >= 0 & centro_q_dist < 0 ~ centro_p_dist, + # on th q side + centro_p_dist < 0 & centro_q_dist >= 0 ~ centro_q_dist, + ), + spans_centro = end > centro_end & start < centro_start, + centro_bound = centro_dist <= min_bound_distance, + + # span of chromosome + spans_chrom = (end - start) / total_length >= min_span_of_chrom, + + # marking of arm events. + p_arm_length = centro_start - teloend_p, + q_arm_length = telostart_q - centro_end, + spans_arm = dplyr::case_when( + end < centro_start ~ (end - start) / p_arm_length >= min_span_of_arm, + start > centro_end ~ (end - start) / q_arm_length >= min_span_of_arm, + .default = FALSE + ), + + # trying to be simple and Shih et al. 2023 10.1038/s41586-023-06266-3 + # inspired + seg_span_event = dplyr::case_when( + spans_chrom ~ "whole-chrom", + centro_bound & telo_bound & !spans_centro ~ "arm", + !telo_bound & !centro_bound & !spans_chrom & spans_arm ~ "arm", + telo_bound & !centro_bound & !spans_chrom ~ "telo-bound", + centro_bound | spans_centro ~ "centro-bound", + .default = "inter" + ) + ) + + if (acro_fix_whole_chrom) { + # these chromosomes have p arms >10Mb. Short, but not crazy. Might not need + # this fix. + acro_chroms <- c("13", "14", "15", "21", "22") + + segs_df <- segs_df |> + dplyr::mutate( + # basically span of the q arm + acro_span = dplyr::if_else( + chr %in% acro_chroms & start > centro_start, + (end - start) / (total_length - centro_end), + NA + ), + seg_span_event = dplyr::case_when( + !(chr %in% acro_chroms) ~ seg_span_event, + acro_span >= q_span ~ "whole-chrom", + .default = seg_span_event + ) + ) + } + + return(segs_df) +} diff --git a/R/cn_features.R b/R/cn_features.R index f5281fe..5dbff35 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -232,7 +232,7 @@ ploidy_relative_cn_state <- function( annotate_input = FALSE, return_matrix = FALSE) { sample_mode_ploidy <- segs_df |> - reads_to_segs() |> + segs_to_reads() |> mode_ploidy(sample_col = sample_col) segs_df <- dplyr::left_join( diff --git a/R/utils.R b/R/utils.R index 3fad3ae..7e65c26 100644 --- a/R/utils.R +++ b/R/utils.R @@ -240,3 +240,11 @@ cust_mode <- function(x, na.rm = FALSE) { ux <- unique(x) return(ux[which.max(tabulate(match(x, ux)))]) } + + +#' detect if chromosomes labels include "chr" +#' @param chrom_vec vector of chromosome labels, probably pulled from a +#' dataframe of cn information. +is_chr_used_in_chroms <- function(chrom_vec) { + return(any(stringr::str_detect(unique(chrom_vec), "chr"))) +} diff --git a/inst/extdata/hg19_gap.txt.gz b/inst/extdata/hg19_gap.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..9d45f51c3e934bd1828b9c239af1833d230b772e GIT binary patch literal 7423 zcmVD!ELq;d4%^I76QeFGYfx?VnhexXabP{GfIF265&ef;|T?eXRD^7Q6`nTBCju8V|dgK0m#F`E$P2r)g~@<`?Dh@Wj-L z?zc*1E_nKQ3MX@DYqsK5VR(_=$FsIr-B0iKx)vuzxpxOpnV^)se(xCxQ}4t8S)g_A z-}fWs*CH7QkZBO^+#rYzo@B~R0P14pmQ4u%k#ezC5 za?gf2>)>L&ky=ozi%jllx|nqZ;r)wD)7f1`R9&moAeBYde;5c7mZLa!^-wENNLdH- z`1Ja6e7kJjv95g8jd<`S{QUU(`0@<%=WicgzdVg$%RZ{gE!>6&`6T=**^*mLN0eBQ z5KZEiZ;7?NCsnw!lLa9f264;$h|J3oX+<<>kS*fYGk3-D91(&So)qMZFkU92YZ%p9 zx;B>}s*~6?32TV-h?r~jNo^8)OGG!*!$<_%K{|>3N9R}@adxj7Mxrr0Ym>NBKD%>J z@bLoUAYa5?ZZ_g(sJyEtRN|q zgtF9#H*Sp=HpG4qwMoLB`QEKa3`uN=lqO+`sl!Iv-zqo7cJ1*LBpW#@L)Nryv-Wvh-?$`lQ(SWP14f5{VKfNQ1aeBF(&9c4RO%56K_@Wo({KTK@Xe^yGca?s>UeI&8c7Hyi(Q5^jxoTVihzn;MyyCs=wBJ1vC(5n&o&6LMEsI|N0? zW!=LNOPx?ln%ItD+2IJbfZn`S5F1N(%qz}s={OsxuWKPBTFsFB0DXYetk&vs6jtGz zRs-4qX=%bG--?FI2e}HM6J%uDpVxmQX>bq}SMcEpC>HKWyrAMiL`BvdfNf0Ybyk9H zP!$Z)X+e>6yV`cFNJOYd2l#+WS$OnS@`RyuM{x*1J`bpzIT&2zcrT*}?zP1YsF>QI zlDZ}4eMF9Ij#_ElWj1NnRzJsK*{&J6(W#aigu-ADuA8)!xDlb0P#&ojS!@szbSgT5 zhI$kSIr1^~dQuc~StQ#Y>*24@*a32S`1Iu;Brw!EaJa~x<9QwO?~floeR}@-$7&4b zuH5f6w%y%?QMVx~>87#CG@og(G+yRfP?ER`0QYl1m87DibrsRoPN~!~1S40V_I;x) z>}PIA0_aQoqtlGR&g_ueD=sAf>a+mdeThJi{W3$&*!J!1yu?RWue!c;Ki)(mGHK7u zF#ql)AmRt0E%Ps0X(T-$r7>_JUR@K3mMmTGv_{K7xmUZy7-zz8w(b2`Vv!T?tov>0rGw*_u zu*FK@g@d|g=RkfmxKRbwi45{dCBN_b5eX_g>wS4Bdg=Y+p`276x`taf9+joL(N_~O6xyk*A`c=dO>d0Mhq%P9E2)Aep z(CXmgqX9yt*37;Q01Diq zbL1Rfcq8wEWRP7Oh}50z7q39Odj+mm(U4ihI!ZhAiQ`F}e(Egy#b^szkiDYaYYWmw z+Luc}qU;*=5ptnyNJ=X?_S?*l{X4hkR&C?(@1=@N_RiPJF22M^1K4xTs!aIm-Z#|D z(Bs4%<#Iryw3RywTsLTwu<|}!1kh5)MbTc5WLW}qN%HRxtE6f0aWjrqbUf1U@BP@K zw?P-?0GmLr>6Nhsh*6w^p{rRyqkqj$20^iO5??Li;zL2h1GuFT6&>W17^8E=g&SZ4 zl=BbjOMq^#Mgj@w11$YYKy=pMdP5U}Spsx`S$6p$drZZ{$Jamg;XeQX0RR8WNGt&5 z6H_yDP{j^xdVy(dqiO&E00960lw8}6Ehi3r?!N&8Hhj&|NirABB+)#~|0`?R)SbO| zN0EYcn$=YW#$LYAZRS2Nz}EYK+sM!~A6GHMRtvHp|N0%e)pYXZ+L?T;itjToB7p%w zD{g#wiA9iK7^7g|ad*yh2aXjG@dP;m2eQ5NH;_0}&|(aOx!%xR`6iT66$Zpfbw;o) z-=R98|M|BGT$ni&OaLnICDn&RXO=;Q{rK-cupli-U@>t95IwN=Slt)EY+=d~O?xCG zc)RZ*>ku03#_jP?6|eo`L$64fcM?8L6~=0~AMf8ocF)d2c{=Mlv_hjjUUv?3t$@gO?i2kq>QHLZR$r$rw~77a`aA7PNr* zbPmOBB-J;+(4+prGuuou@w~&Zb+4wvu(=?2JGGnK^Q^v4Z?B`Aw88e-h|(r(>Cm>< zYE1ltonTRfUv@s4J zv!&Jiw#^53GiqwL;$t4G`L_RlY_)*CF&t>-LeW)uVCV(_Ey{*t1fjc7(?M@nQS$+e z`C=O|zpg#o@j%;gRwgdLePAjZ)|0UVgbaeXa0-NPdlTn=VE0F5k;Qkvq+x?mMfG7m z_PYI(Z{3i_G?t~n}ah_tI{)QuT6r!qx8K6ad68E5-gs3-RE+yHlwNh7na z96XqrY!hv$& z_lkbh$R>^jwJh-UVC)>$2jKxN7iEUCnD6XuBWu5V!xrehA!tB^VYX-I-mg?38?+x> zQDb<0{@f&g$|pzLlrVf9>>hx3h;bFUL@iD5?_^dzYHu8uwM~oKCis(z4d*mkB!s$5 zm(=+(U^A_-nhL>s_IxHE5oh)sc1{56Z1A(nla^huM|CG^dwl|+-KoNw(sa6Dt!)|7 zRh%k}P7zAdVq=nSA+V!^O=}Qx4dD@L zOo;3dVd;EO&=BNBre@m><;OG%2w+2~pCJ3p0#ypgc)RJJ%H(6tik~!CN6#cXok=b~ z%LkagO~khO$OB#|AFeimoA6iiA|Z>s|&KMiAK+#(BF`x74H-bVckGGqPI{;sS4B<1gjb2e{YJKzY8kEw zVHAZC!m^`vOo}I`iCDCWc-!drsiwV8d*Q2&qKC#j*a1sy%vMlB7n&|}@e=dFE)sje zzgWj}0j~Y!_R6%qq>B~HI5dZ3;5IBZ`hv>1c6y8*gUB47oK;q^j*zS@8OjN?PlY5K z=xwXOr!sL6PVY%abk2@%>smK6*4oVuOUy>h1#8Mm4GHk9{&~IlQ4qs&3Ny~&jYWK1 zcK-KZdKxtF*L62D!q@4^<$05IdK2q7TUL;;G&Nb>eJW4?P$@UCU33kVw& zVG%#pZmOT=bWqj~X9l765bC;rew(if_lP4UQ+V4grGbL8DT5u$jb6lsrK`#?A%TD& z6k`uBgeK%V>UT-lk9$#{HrFYS%$hWv+VD0AO4`mXnFrIaWe4R0)Hcw%L?01kMF5G& zsM}05)9CS9jSue|l0Mb;qp-nBJ==5e>Ja)AmVJ^n=0oie>Bk;Up}qEo@SGDj5JM`< zw-CAu-Bxx|(tQ0~5Te5H7fMjyklJ!5P5T73MjYEIA@wQcrH>f8Jqc799Fslk3_C(H zd(kQu-@}Y-8FkC~;GxO_i_G!lG*+AQ%}V*_LOEDgkD1s!1k+#*t>@x`KG;yO>1Yce zg2`(-6D+R_DtG#@AFK1Myie~R4&!+;c4MMe?VBlRDSyoCC!Md4aMy+P<%8(BP?m-| zB6j$khDW(DmP`}B&gQB%1HWRC5)%D9C2h-_2U2znl)Z4vAdQf;tpXO=RyecvDz3PA z$0?cSyP23*ujU82rycBog~aP2F7;)He2O3kjAoS=nUGYf=q9qS4~5) zr9XcD@3*KGvzW-67>#7fc^DhQ%JY8V3B$%d93l3`w3+Z5gy#U$fRkPn8w$s{!V|)yHC~nR0`|yK-h<%lyy=R;H7}!Z z#fM+mJKnF}KyYPnQLJr_-lRuhAk{1RZ%`%KAR#JHB5bXf%!Vi5PA}LVs0s(nt3S1} z+1C#(bA}hxeQ$F`WWZ}(t@*N-xrUy{y>}3tagEXBU|(Fve_z3<)N?N zE5gh#zYpS>)3*!4PH*=5V!f4R)QHk~`|TY5Jk&P9LNHm0*Xq1~;MSsY)6YF)cNlmm z@6_>zh+oCvm-Uv_-l9qy>@SD2hbDp2aP=#wr=AyNwy>UYU1&YNXbTyJI3Fo$P2N6@ z{c~{Pg&8|6NZfEm9LkvS^tAi(mn`6KEGi+G=_L2Wu?6pOQTKxCuunkoVgh4HUw*6J zy#EFUUk)q77A@Wx`|{)}@BY4jSOw=&wqN0#v8_Fthw!DBPs{i7S-@i1JBbgI!c>kI zbM&HEn(qdrC_9w!Y0yhVB46G}Sw_1k)&gd1Z%-VR)+TbfMc%K64aAVFXd7>3eA%o= zNYTr~?njMRs_D6^-|yD1$mtc&-XBkUpbE7Z>PvPEn&F8(TJm-=Bb%}R?U!GOa}ovz z<3?&4`<$3WzJSIJ#LJsdM^HwfryaI&v#9Oyqyj6D!(io^rBS(@&Nl9kf-O)sC^FLF zMVnw5`FhIo4a719HCJeea|vrD?5BJlvIB(ED)=ff>`>-yBz_hw*wudA7qHa0fKqB_ z6d$!0tDee!s?-59*IxUn-2 z8qB^izr2lKp?#$sfBgLOPvWZInZ*a?UJ*%0BBz&|t7s1pczw@tU)jf1U%cTjfMoRA_qQVCQ+^9=6SxwTQYPXvNCZonX*m92)un#7;5`kRUYWYFP6F zSpE74#+EvaUbNhHy%@XQ`k!#TDF?g7zvaNvm3y|z?zc1Sv3FSRmIfIG|As{l9|b+A zV|D5d9DIxdi4pI~t5mL<-eL4G$tM&H9TNehBg~3;V2nrivOIOa{*y39$k?;&mU$jJ z8A;wl7?6OSVF_WuJXKR$>^WKtK#ol2m>!ls|fc;aa`C!GtvzGDU!pn3F-W1>+pK;(1L_l;;C%a+aJ z&LW5-EswKW{!N1X7S<5#(k#Ms>zLF#vukt}9UQRkAtqfJOaKg(ub{BV`#j0}KBbX) zfl3hGv0CodW*}Y|OY3%QZ_QoYIA;D4l}Vwq)%UQ%7E~?-b3hgxx-#B;I?e-R{JR~r z*Lc&-W?(-qoc1tmJB)qltm`7cANay)GpdxOiQ2W0?2=!^G!PrFyj037`IjSn*D0{I zp$vFnF&Y-e^K;_fZDnY7n0f$6W;r5($djU_^rfNs3jhHB|3YLHU{?gZG|I>lv>+W^ z-x`Cae^9gm00030|Ad^$j^i*4M9=vfTf2Uz&kYO)J^zoXM@oqob#gKfsn}6uic7Js zuW>>#p?14Mq&J6gJ|Fh$GRoOE-}FSWU|J4xQ*wJ(O_0k!57>;? z`R~^^Fld@ou=QplAM^0uagUdiFJC~RP=4b8j0iRO7S7ZqR>{i~78&_qzJ1eTFuiW) zdP#BlN55bZ&MjAkjSN^?_#7bB9bwmpjJ+%8*r|ALoYGeGRg6{hVm*F-{`~<~AB-ev zL9|6e3ht`piqp9u(XJ@S8{cMwHMBdW(axsqmNit!jKt+r`QBZ0^zb0VhY3NRFwW);u!tkWP`@bR$Pt)VpGG z7jaVvn?{WxhjWB<`xE5r^zB@W_0v5Wmj^1V4feumXX!o!OPQ9$!{|eFcf#EvA5mSa z*~}5){flz}FQLwM$I%|3l@d_@DW2)F#gD5wL#qQUmPS=aSrhY*9(q)0d8>&3g;S`H z4}aKvxvtBZ@97r~u&k)V#4PCaH*Ii>KbYn|+%DYq3WK0+W2UK4Tk;N;T+I0;54XLN zlx-=bLf0a}2OcEimfI(s5<cc(zfHd+1D;EVn#G68J|#Y9|IWTVj^ zrYo%Fg=9#3*o@S?1hnXQRjT+oq0ysNbt9KI!)0;63Zg@qi4?TTFmid%fPNvBAS9hu zg<)Hayl($&IZs3(e}asI{thJhrEnG!A`{W~iguceV#ZO5VD)tSyqU?BK|GXE%oxq! z>&>FKQ!D>kCW^G)wz8|uIs>cHzP`8(41^yTzgS^dJ_EK;j7m>b*UqBSyF+kJZWWIg zu1&3p(MzU`lSw66-L2Tya$rt$F%ad$ZWb9-w_C0yM*76W2%E;OKCEqt3G+VmB(+Bq zR-D$p7MiJj-pPpUI&RF0!z#aZ(#>x%6IIx$&8K#hyHm0Wc6mS2@M)BVuDE?nJ0`)~ zsyWzosfkeBu{9IC+uQ5O)w1DcD!IwN9%4^Xjclnp55f;%-WZVbHVd$lRRUxsF{_96}F|| z++MdQ3NEgz=#`k=3j0!f;snZ1P??~&3@I3a z0U9Z%ZdyM71TbG5t$w;GJ;#VQAr|9c_!uXjOEA>oN+^xNSog47uM;3=Ld`)&HzTek zHvR(u0RR7doI#evAP7X)a~l;zz5m6ADkb*3CbRpC)ay1e3IS{9AV>mW$$*2G2xIBZu(&3y}28Pu*_^Y9nZ^$F3`~K}Omxv|9Xx zpmM>o3T#{GaFy;u^~?zP0pg%3T0$B@X4-Ii1#nrzs6BTgNZLK);r$Wds0qJqb=e>* zZw8YXcAsb*MN7mqf}|DLCGo5cMJPcOg)sn1TC}f^0Pz;Hm44vGa7*Jqp{Rf_IKGn} zC^J`BD3DGKf^GP6G>nMl<03S|ovZ(GTHB)MTI-MK*6iOxmh&JVjdcxidJjGn3mH)X z7(sK(gtE@FXZfc2)G?!r->c`CSO^r+6RdZ91qEn+AiTuFVE!9u#|9Wd)4uh3de)$r z29L2HB!C1>?GR!NaKXfa<6FlDmE3YArla|@M8C8{uLPC6mEtNE`qw5heIe+Zh3!0w ziI0~4Zu?-2pqg7!UIokyP$-;h5;XU8sK>_-pHUiJ!w@Yt!nZBg*$38Ej8YJBHy)BB zXdX0NnYfhtf+;sjltrhQdt}|s-}>jf1!@xpu?BCh`j@~S=Wh9 z(Sg+&B78vVunjdXNiawg&q`@JkzNp$J}T#&zwx)ufB9R!qdYHxbb{&CV)po8NCSr- zI6_5HIWfsQ>v98Zz9wE)W6jr$$JEp;TdQv`%OtWiCWO)vzUx*}NnVb}ZEF)~q)2lI zue7pVGa7d-wb8+(PTk5~eSv#q1JwwtUU)eNy>ijpxq%037Nj~Ll-j2QeqQ3A7X((p zz5);jyXqS83jb+e_MZ$pNb9s`7&3Kx2w{U=E`2fj4ak|%>k9op4{KVO}CW@Dg&)Rwd~P!Nupy? zmzzKWS9A?WR8u>5)* zgVz6T2JwWqV_L~L>9&NtwjFUfWT?jw=xpI@Xh!SsAx95+Q`gESF*vOZr>*F6^dt@z zpLs|jO7k5$$f7tExw@B55VFeEF8%(SgS=fcCfr}u|8Zb&BPeE$&;4KJ(j(S)za%_f zdcayVXT)_|xV&_M^`&Q6ll#2%f#n<)C*(I&K-~L<7aX-vLOjV9M0FS`u2z=XyNlz+ z3B=F|8KmN6Im@+dB&d6!LXHIOIdgDj=fJ~5Qw~5JwCXnRJ_73ur)pRWIw)1AvZM6< zfck;JfnFb=|N2{iZOP*86Qb+JiX$Y6DEOfz@$GQ@iV8y-PC)?JL9Naa<`9zuJ%JO@ xl&{zHB<&DS$J&7xR{jc>0pe=9(;D%3ItbVBbe?e;Ag)>G{{U$ojwbtK007E1XVCxv literal 0 HcmV?d00001 diff --git a/inst/extdata/hg38_gap.txt.gz b/inst/extdata/hg38_gap.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..1a6b842a1bf46c1270740d0ff67515d5b2bfc7b3 GIT binary patch literal 12368 zcmV-WFt5)aiwFpsS=M6!17~4yE_8Tw0F+wWj@&j9eeQe>S*#mBVX?sef?+t5b%59d zI7Wc|`#n{p)KZJxlgz{D*sddqWYwvQEH1Y%Uw_nFxz&Q7TfO~fqMvVnynlQ9@pk+E z{)@N{Gb*O4H_o@xR zIRTliIgYoT%M~GNEkW}pH!x!}wa}STrS9)myD3WtqCx{XueKE=+er9fY}rH_$l1oh zohf~%NNE)|15qHXY(Z)JQ`j6EM-xdPm-?C#yG4B>3gpsO6yGVmk>NlF@<4dl2Nbi; zgJRxDgA&Vce%a|eO9~u7%S1S-VC4?rcwewc3tVg>!{52aCeUkAuZ1_Hu?-sApmHJY zCnOc;0tcL^#rtV&1eI5$R^R~xr4xW;!|UoR8eBcS0jP=Rvtlo>EqbPvI|q)5*sk0* zC4X*T-+sRRc!&A>x6eO+{~pIiyKYbuggaa{9#IB@hb%n3?G!kIH;sUGP!e^Q zbsUDrO&)t^W!yWEB!Xu>6SfhG)0r68^Am9h@w{vhWg3?2KDC*p;rtpNI4Xs7J@2q#!u|@ z14Zu?ygfv10Jt+}^jsfuV05e92cV7(&cNxH{k=29iV96&bYojD-R=zd82$**k+QIK zi=|8D%To@V9tWPl24^gu=oJUlxFe@h)Ov}p10__cB@mPopycv9UE=G&L8Z}b0%RRX z%two9sveuHo?w&(kE-Xs3rvSg_A-Em6*A)`A@;KA1Xp+>^8^y0UlZujURaDK@Bn>L zJisq@d%O_#1o1dQ@0&=TC_4c;j}UuIMH3(m24NxFp*NFd=1@o+wz(ycjFY|~N!y-< z8URHUl?}y{Dx)NTQ(o(?eUIT^2$Rw_R0B!Xxee#cQp(P1ihxAn3uHWy$dk&jZY66H zv8V{eKz$(5@-iiTT$68hmz161 z6|1fT7yttLa)>8;Dm4;)0MvYt)bfEn3c{YCXPWnLdc56Zc06(#754$EM@He-&p=hs zWdJmEu-Ann`etuf07Z^BWRw8xtFVe9dt(r)6*y1;2p#H(9;=25KIZEq2W?mz02NVm zn5nDw*o+)GC{c+vz>$7*m9S0>HdLAepdRAxD~6Rj!+Nxa01irbt$*VKMgMu$Oh6bFN3Ll)C)|jX`{3K87Z#M zeD(GH$Inlne|!sq727DNpC}sU1RBnJajTdAVBEu)d_7nVOlGy5gMQaT*K~Y z9sL#A02<&LbtiK;TeksP09u&$yE)RE+5k1cYN1DS6zw#CBzSEOsjvxZfW=blzKw>; zB_}`uBJydm)U$aGA_8=Z{__KKBOT_JP3mH)Cv%tZ-2hPc8!eVfw3&O6n5G1<1T=#2 zS@JD+qD^9*#AVS2DcKoKJd_{peM!((g(?c3Nr2as81e;>0oJ1{Zu{yhTDcYjonHt4In%rrC4zN;@gLy+aR&4+jjY}0dnybZ#AVC9IsmReB?TXj{6@XStJ(|Ow z28hhuR)Se}bB6oov>YJjMr|dSM{|xwA^~$$Wef9t-$N!c2?mHH*cMAYnfKC%d9N|6 zSmlP3c}zwVM48O1r5?Wf`W7z(_8=@Kn@+yxskn#kh(|B3IsVGsE&5@Dvae$+l_&EqrL7-pHG?2!|ryvVM}+Nz?Oz7f_alyMpf zdI_*_CE9O}TVu0*faoPwn%H*p4k}3;I6^YP(wAu36VAA|619#YH*(T9F8f1iWe~P4~5+ z;8#d)081b%LX~YL&ICtN1wh}IAg8pe8l!33U9fh~OdqHZgaowoTbry;|Na-Ah2#FK ze){~!Pe`A1^orOt$B`x%Y4ULhJ4YlJHr${@--(l;xTe3wcYFRzn`!&%-H3 zW_-yL)3F#5_#w&mPW zwm0mk-5py6&;~$D-;Z)BOycAGd=wcd;S|p zc|0aIe?By#_YYw|mhbIr`F>wp1e>{e@v?Qr{&MKX`uS=wHcgibC9tf_WwTpWHypXq zk+bbc=ijhQ*qSh;Dpe!*d?q0_U=-{Cc?C1CMMyPLat=gpiimXpwguCw=Xhao=SK%L znJKwXntkepl4-yR3T=kh&k<(L-KRIBBVwZHV9Nq<1gn^2gc!zwY|A$X*&P92WOnsE z9}Hu%)+uKQ0;#M@5mt|1l|s+#<`a~SZT)iZ)Y<4YG6#gstdt-T7(gm_jgK8WBqGsd zt2#{0j zC*hOZyxm7RyHfnZ6HNG`CD#a!zznn z)$-yN8T*`GY2zkVKvlN< zJG%tMpDemBQ|*sjwi);@OxUXb^pR_+2gbc&B;kKThGW^(7Dhf?eZ}we`b;lW$LWEP zSV{Vy&|`x2>wLhyxQlNke*X1od-GeXiuL%N6fx5r{><*!-e4mHsQTQo`~f$y*eVt5 z_}H`X>JuAhBB-Z^%y-mT9#hUW-;4(Y;Sfch`slNc=QF?4neov^Ddy?r+n31Bwm8NG zq@IWlIze^NR`$uZaxkcZVTp=>3mO)k>g#wU%v7%!HK!7Q^0&kP&xP+3hc%?)KrNWI zC~W00#>cJHK#`+>ilu!~*}9BJ&cM=iXpv!g;5Wz(tZc`*R`IeCTpdgT_iQ*7y#00_?4*k?sxYpZ2 zN(aey(&ucf!2?sgZ(*;8<%o~p_3u;GIJo`CjIV#q_{_o>8QCCFOq5w&ZWQjklZHi+ zg8Gr23R6tssQ}L&J8ZYFKSGZ#wX_6J0ttW}q0i$Bqc?#DU}2%|s;A-H>HwXFExb%= zc%`?zZeS}1r?aUyKn1X{iXe|yHdrPViYc!lV9fSmwWzcoTsY+osTV6#ID`bca&{30>4&%L%t(&*nW@G5mCcogt7t)w$Kp^}vk_|HL%*7t9t>~q#qxRT ze*NJMpZ!>}JNq{GPUn7Mz{AaKnmgxjbC+$J8(wRlJKovz+y`CvaVkZLe0Zn(>)c`b zU|TspSopr?2OVi-#d&FzW{=Rke%+@f?hT3}4OisNj`(myZsas=ayD)Jm+SJ$%(gWf zeta_(x%u!A_1l{5+_Loy(^LJ?A5W)GJ>tuHymU>pr?7r}JZ*A!58tGmLy;hvS1ut) z9hNIW+(lN`67(3~dP*D1PO#Pv!WPR|g(+QjH6S^x6i`aWwIBe8wyECNJq!*G1pY4N zuVDaK*>_KRZHMbN$oRI633;bXlK?^OGNt$40vMUytAc}=spp``VOzG@<32@>yedJb zo{=l4@t3{ClBPK*{c#S0s?M7Tv^@fd>mR;j%-&d`Y!_ZJQM(|NIZfoD;Z?OZv(k8> zm}8LYRH!@Ry^#+wMf`Su={@dywRsQ0QBV|IXyDvvgxswvN$7QM8|-VoioO2GM+f?Oz-Vxw3oE(Iv- z8n7R7F<_~>YIQjI6y%n?2v5%A8J=i7uZ5&n(Z(xz-VLLKC=L=wX3_#Y5M^<1vw6zN z@h$hv^$}prs4OmRUdLmPQCC>7Ke1v(H#SS&rpnftQ?}NaP7ri}vbeH&8DFqYNdW6a zjmkX|MUZ}suP|r`LZxL&}#ilTkpi571zh`h64>Lvs z+bF%@mi21}yva4K(V3I2?7+xbUaq<@jCcDS;H#Coyjyf;^iB160`J@Ey>CkR6QwC* z9)Md=MeJJ(dMj^6P80z}R%i9GU3ASo{DD+;!_+|~Y5n$R1{OC9FsuVZ7wh8fT}~7$ zyp<>b`-;$gd9&=yKx)&Q173Ytd zIhNG?Pdn1nvpv4xqX@It0H~_5cLGd;XcRM8wk?fQ)_B8W2BZ-_>L!iQ;^1p2k)SDm zqGB-bTNs+jyA%$BXx#rK?};{MI1WBoRnZiA@jO#YeDJ2GEjG~*Hfk&Qpb>&mAxtMH z(G)^6_qPxqp1B$|i>7FzR`yOZ6`62Hvjl)NhchjE70nCZ|3JHF*e>GwmQfflfp*ax zPjriWs)Oi%PeU|E6Fr5YrCkst7tP^3>s~}Bu$K>_t%;Uso_+6-Gr2Er`RAsS*Xt!`z6J8=wW&N_*fu#Pcp>FR9f!4#!v4L7u% z;%tCGe6BaqFtWdXoViRGZWv0@S|jUXZ7&QwJA_CMm$8c05InF~9cvC1C?9S<=^bi+ zH>{_5Nl+|f;GZYc!b)SCaAJ+f*iaSLl`N)3AH+o69>Dlc*-OMRWNx{X;L(IG<2e#b z%c49oiNN-Ep64#k2RY>-_^boQ-?=Tb)3RbF^qeL`n5(~2TMuzKnZ&1&Nnre)$-9fg z(6E1x7-Go$oydEL=UJ>nhWL#Dc={AC!?TzQ`A>j5H9s!#Dg>1ndVm4!)K1HtGJNjY ziYRwII|a$wr+88!A{iDU235sh;wnTW!$QO$P5TrtiFM;-h(imqwAN4cM0C=+$_|E! z8PRDR$36j-kOqVtlPP=)>}PGw7p!jIu~2&~JEA5qCMJ#J2XJ!KeekgbgT{*8SY)-B z>HKfaZvN}lG%cwmbF|A%z#oyHu#P2%IBwcXYykX1G7)NTtt-X5v%dxpR~8eU`kOfJ zWi`N}g&`Apgj=C?25q80lUzvuasMxsmg2*rOSf~HOmtyfAB1Zk@vjL>HJPy3;`;px zU!EmV2>m7#Bj-iI1#IVTcC*N}vZ-;l4dz&#b$r42&cM9hV z#C$??Rg;X2E``S+Hwxz)B%%i32C6C>8NB-Yl<;XX(=|r9_;t^RQ?T3IV3Ti?Nqs%s zMo*eb1r0X)M*6Py`g?e`S$7)L1#NPuZ-f7Pn>9ZIE zkA8QnuZI_$CK>iM{A}{4bzD!zLOpXJvKv$b*UhcIAKu42+=kCj-Q4Qy;ZX&{GAzFQ zsheAUJ3Q1hjmAdkCfRmP{^ckL-cbzTM%$7$CV4iB?`r{C6M2?Ika?I&hNQ42i(fG$ z6A1mxC|N$EE-I{2Z||5UX+jJ#TFxgSwf+pTV~%Nv#BiK1;sYchjD(TNH3wzF@>sWC z;`LE>bmXM{n5L2A?wO5i39+Y?#LMUsULi=`txV{ z*N?4Mo|4(_mG9A|#?2Ooo!aPbbX{59aU=|qP6(=JVYrb=TgAE?0x_3~&K;r~`EY$g?6Es<@Okz=G838FoM44QQ z;Wj7!9-m5^K64RUl51~kZEbauWR{*Gk(IrJDw=`Uj*VR|)}ExrqXR;+g*N<#@W8|G z?xI<|8c11DQyQQ@LFM+boGN% z5h6J+^If})l3rvNKj zkP&q~1u+$ys()@|=Ql*>-I@`lO5D8or>^15E5sZFMayItEa+;(z-KCMEZ!VXYBeBwn z>D$@g6Z!>$RigCztmzNr(mhPk@3nSpOY^5-Q6KH(+zg~Hf6VRC&wd%13)DBQqU*44 zMbg0w!yyaVE9u_5#*JRK$*$l985G``uQ$G zQc#ao2Ivig^P6R$erXuSOkX=eyhA!-$}$k#>1~@He1mjpuFIg)O}`d|XNX1lc^V^& z#qu4}5t@^s5C3fEu|VoX3_@SHX=XGT!KEs8K9ASFR4Ka*E((jlh7!*FQlqfI&apy% zsvv)gHDtxT{&VQP|ANK|vN>sjoCCCcTTex}d`B2Q7Vaoc>Wz1zwhR>+%j<<*o&zy>A z^|0ddCC>LpIPrYcLO=r}mjWKQJ%WdeFDQLTwiQuk-?OwwPqFcG`u&Ju*F?1uH4oI6 zT_32@dg4##Z}h)7*cPC6(03@svHBYvo;%Wb!=@M%OHN_Bbz5aTd@Ic66;XhU6XUy= zC6^ofbpPxP2L;t$`Jr!23ve&tMzeo_=YVf(uZ%sNKD(A3O0oP2kNSuTW{*z$FzrO9 z4Ws@6POUWEQJ|lozNSz-vvx1}fn)#p61&#uJF)XprvRf`6)fl9&s2%WrY-^q`;NI8 z+3=Z`^Lvf({SRcMZEVn3G*?tFtmx@hFkHj_!%rnkFFH{UO-Z5G>7--b4ya*2ZKVpy zAp9tWKsB*Sed{`SAK{H!7PjRPLF!(gMYU5{@%zzlt1BDz{8m!1D%e8V|8I}K+a=R< z=|C2=&iXC$%-3#THS9lq^fciSWm*W-!%MX6O|A<6?E!9c1ltg{P&FLRsZVsT23S|R zc+GqKgz7txXv5J zWe4u?PD!@(Ue)EpOG{R1li9N1hK?wo!|+WyiZ=CTzHO@wAQz7MOG zZy(1^yuLarF;JguHd5`g1#iL8ssXf)`U`K91|eN)RsO3!`-Fb>=V2GGVRuMXF5~Ce zps7<=W*zo5?4La{)A0V#Dn`s{_PDES(q$cX7vC68N7Tv{ozI_ox-XYy54K}*YeG)` z`;anab8&~a`-I($BFa(3zmlJ|=0fTb_i%FysrQbQw<6}R#C-1SPQxRfK*Oe|NGmQ% zoP2YOMEru!dwfAYAEtK_kH|@$_Vo5rws(A!VjQd{a!x*W%>!6-S@mdp{Gc99Bq?*` zs}~bdGXKe2xTxYt*wBH^oQ1B88ej|7*7@>yz}t*N*|W_YfmDu*$(pP7dA`2jk!=gK z5m|Mje|I6TcMYWKC&##;-7_c<$H zK0bUbtvJOiHsV;5MeBTVVVkYA(tQuSH73in&UFu`MGKoI9TcTEFBwhcCS|_I61F4Q zNfW$)Bd47HwFt#WS>7}ALSL@IPuT;L0XY{ z{?Imw=<%d^KY11xy5UG_cCLzN#MK=)di^>(U-&_3pLmH$jDDZt^6HeW6YMqQgG7q7 zSqLXhwUa5|B;RV-k6U_S!BI#F`a5JQ`BgWmy>*7X@D?OX(!(ME`oC$f2v+9Jv)K!W7t1@n;OvY{GsGfqDpmo#OsIi z0M7xv16xt~e)}X~G?dRi&N$`F0IA#&IXIq<*WitG_Y2Av9hH=4ri|+8sWf6I;+<%X88_Y5b;_#O0u{R|C}_kg;Q-Bf_3qP-J-(|e9KVCCmz=_Q zOJb}p?FEOLhsxp@_N7&ScR~O4C%n1BiDpvt@?#%Va~{gSU!tiCkEka;A)2o?x;+U* zPVeKS_Y0?^Ok&DRYM?q3U}aru>#=`%v?G8VY*TjV>ME!d*tolu``1Sjh0elE6nuH! zvW0D%Pb)4C?v1bmkT{zt^4MJ&_QziSYLgx`r-|T}{UWnOGww7JWz9jX#{CxGpBHkmS ze+T7zGe}zt-wxmJYA9?3T6tkq8T%&g;c)3j6dii(QTtuJ$rxK^E;Ou(unCrixo*(R zcSLVuwp^8W0S1uFZ|+?iS2D(yLu;qVC!(0=0wG;Gt7eRCWluP~EgT58wd;`4=yo+! zs}uX0{13jLBuJ}4$^%D~F<$tw%LA_Gy;#^$DU)EHmoZ+N>TOt{alS`-)lsspQm}T& z(&*Tqupjejv-ZlHk(Fh|5KVEe!u_oJ4SOcBQ?^9ZvV{P3 zun5Hhd8DvoR1N=_?W?tAH=%Tz5tqlPd%9zqS8mfqYieSR3I;|U`%=|3ugTo`nxS2G zRBxjyPq$S5=s9j``zW}C?~amNlm&Nlcbe)kRDU@XE1jT^T|vGnlXl4T^?J&m;f6S-h;`E z!90@FY*K%_s9y+;?xGBIu<&tNoz`$*g!8H(*s}R!MpA>*?0Xt{8BZ?&)ag#czUy3f zVnrjA3s5rMe%&Z)HuD8g6XnM;)YsjklJp7DDMoXH4T7yk)hnQh?~IZ8R1x{feTuSJY7CEMoy_W6NG1=_E)0#)xD|`3WxRM3 z+sm>Ds9O}K<`ObWdi0a}IaKg@*jU2#;S0>x?|;m&dC(e?QRH;OKY+}cgp7(y`Lk~(vH`IG(2koPO&=Ps z{DL}nWs4?-|GoO1FS~m7>Fxpr~A4FxqDcq1h&!+b@pey6{;<;%m7K0dqSg{e8Mj;S{P9*Mx6b< zDUALD2zeXmIC1Jqe|yo{$Gn=OnRlGM(@J~j)bb81)akKN#3(1S2?@!5oXcnGAQZUN&7sq)w5xEztf?2$d=X3 z)_!82Wrus|^94saan=tf-gjc!;ixY>^h3`yA2T1{q0J&1X|xlsD$DZc`FUJBik%!J zzA2La!wl;nv}F(~fD`3SEpMv34>;}n@H4-@{^c5d)@y99=K4PX00960jGapogfIw1 z*K-@cXzqWp;WfID5N9^-x>BYTI-#L2SEn{I@p_`1Ak1w4dWYtvlbV)`t=uO#U7A7| z+`2I4aFyM|;0qZo(PhJQ5)L#yumFPbZ`+af{9B7dPe# zhs5Ep_v2z1f_!qX7TMNL`jqm`j(eT?f5zqM1&#TF=da_BdV{@e*XPJ zN6#Dptnc+q{elQem2Ab_mQ|Jr&(RJiiCnCvbT^+`UQ1AfAMyH84A$HfO=sQ2`SP}>;UN)O^%I`apRV-E7d z&9sX%@on)P(Ud;{oLhkcP1bQE+3)&aGi9Jd&_t;0m1VVRxyAQBlJYvl5m-?O4xSNn zLQt3*Q#wd?O&0=eSWG#?H0D8?;I9y66YU4ja=sGH<^7I{Zr4N7fp}I8PO%(aJt0l} z#0R3@B&iB-1pzzr`qP-m=n+AW5y37MwPNL?m`hz=pl_-Ib&gco9^sPq z1h|j=F%_W7d>1>!LA93v!W6t-U9M5v=M7Ta9;;-TmYWJtY3g+75RhL(3&=|4l|tJC zATyyp6zVXPGfd^DJnku)R;&=H5}gWmXtixA9rrwMh@{it^Iop1?vFCj5<3KhSIO7H zWaJo50-kPqhdB4x)TBuonhH--V{(t!Q*iz1ofNF}&^Yt)cI4OV+7NEoBsLp8Sk{p` zK*Flq0a+V#*E^W{%Q^%uYS9DJA>7#OGk|_=0a9@Fd;jztd+6REJC$sLsP4sWRRV5W zxeh3l$ZG%vM83LO5dMS6;)47G00960eVs9G!!Qg*=iJ5;EipYq(E=ShXA6)U^!Vi? zWw#ZhPUesgE4D1t;{Tr!&^TjzRs#M5kdRSzOislsxh&L)=RR&9&BGOXqy;U4RqHfu z+mVeJoSc@bE<8ypB7U(FCH@y=hfLvt*W6@_f>6>*{UB@%MIyjCPP_-NWIEuPBw03J zBPq4i;>tOE!J0jX#?S4!MrKV@+z7;Z&QaZ2RdmaE;GqFoYW#w~MfD&Yl=3G*9p`IK zh({f&?m!F4^xu3|Zm;j~niSK-`{T>W=$EU_q4?%a4rnVJ>xRpssajv2*`O_v>V_6@ zY(^O%d7-VKs9OX9`_W)yel@LtbS{Cpu^-Jgk}Vp-DMOZK&?Yd4&OfTI-$Z_hW+4cu zbAhTyFT@PrZLs!!u%pd{zJ~@|B#Dqu_y_hd>YHc2Ycn%94x5dDwsYjY!IsvIPP}oCZE2dfQoaG4{Afx~ssI2y CmhRdB literal 0 HcmV?d00001 diff --git a/man/add_centromere_locations.Rd b/man/add_centromere_locations.Rd index 5ebd865..6478d90 100644 --- a/man/add_centromere_locations.Rd +++ b/man/add_centromere_locations.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/centromere_marking.R +% Please edit documentation in R/chromosome_layouts.R \name{add_centromere_locations} \alias{add_centromere_locations} \title{add centromere information to reads by chromosome} diff --git a/man/add_chromosome_length.Rd b/man/add_chromosome_length.Rd new file mode 100644 index 0000000..5686d9e --- /dev/null +++ b/man/add_chromosome_length.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\name{add_chromosome_length} +\alias{add_chromosome_length} +\title{add chromosome lengths to a cn dataframe} +\usage{ +add_chromosome_length(cn_df, version = c("hg19", "hg38"), chrom_col = "chr") +} +\arguments{ +\item{cn_df}{dataframe of copy number information} + +\item{version}{string. hg19 (default) or hg38} + +\item{chrom_col}{string. name of column with chromosome information.} +} +\description{ +Handles chromosomes specified with "chr#" or just #/X/Y. +} diff --git a/man/add_telomere_positions.Rd b/man/add_telomere_positions.Rd new file mode 100644 index 0000000..cff59de --- /dev/null +++ b/man/add_telomere_positions.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\name{add_telomere_positions} +\alias{add_telomere_positions} +\title{add telomere positions to a CN df} +\usage{ +add_telomere_positions(cn_df, version = c("hg19", "hg38")) +} +\arguments{ +\item{cn_df}{dataframe of CN information, segments or read bins. Just needs +a 'chr' column.} + +\item{version}{string. hg19 (default) or hg38, not that it makes a +difference.} +} +\value{ +input dataframe with new columns of telomere information +} +\description{ +See \code{\link[=import_telos_file]{import_telos_file()}} but really just a convenience function. +Telomere positions in the UCSC gap file are all just 10Kb, except chr17, +which I set to 3Kb and 5Kb, again see linked function for details. +} diff --git a/man/fill_state_from_neighbours.Rd b/man/fill_state_from_neighbours.Rd index e6dfd3a..4d3ffd7 100644 --- a/man/fill_state_from_neighbours.Rd +++ b/man/fill_state_from_neighbours.Rd @@ -8,7 +8,7 @@ fill_state_from_neighbours(state_df, cols_to_fill = c("state")) } \arguments{ \item{state_df}{dataframe of state data with NAs for some bins that need -to be filled in. E.g., after dlptools::running add_missing_bins_for_cells()} +to be filled in. E.g., after dlptools::add_missing_bins_for_cells()} } \description{ This function fills in NAs of columns with the information from an upstream diff --git a/man/import_telos_file.Rd b/man/import_telos_file.Rd new file mode 100644 index 0000000..fed1dca --- /dev/null +++ b/man/import_telos_file.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\name{import_telos_file} +\alias{import_telos_file} +\title{load UCSC gap files for telomeres} +\usage{ +import_telos_file(version = c("hg19", "hg38")) +} +\arguments{ +\item{version}{string. hg19 (default) or hg38} +} +\value{ +tibble of telomere information +} +\description{ +Honestly, this function is sort of pointless. The telomeres in both genome +versions are marked as 10Kb for p and q on all chromosomes. Telomeres are +variable and better estimates of their size exist, e.g., +https://www.nature.com/articles/s41467-024-48917-7 +} +\details{ +This function just serves as a way to set up easy marking of where CNs occur +on their respective chromosome. + +The only exception to the 10Kb size is chr17. It is not listed in the gap +file for hg19. chr17 is known to have small chromosomes. Here, I set the p +to 3000 Kb and q to 5000 Kb, inferred from the article referenced above. +} diff --git a/man/is_chr_used_in_chroms.Rd b/man/is_chr_used_in_chroms.Rd new file mode 100644 index 0000000..c47a6a6 --- /dev/null +++ b/man/is_chr_used_in_chroms.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{is_chr_used_in_chroms} +\alias{is_chr_used_in_chroms} +\title{detect if chromosomes labels include "chr"} +\usage{ +is_chr_used_in_chroms(chrom_vec) +} +\arguments{ +\item{chrom_vec}{vector of chromosome labels, probably pulled from a +dataframe of cn information.} +} +\description{ +detect if chromosomes labels include "chr" +} diff --git a/man/load_chrom_info_file.Rd b/man/load_chrom_info_file.Rd index a93a162..c809b40 100644 --- a/man/load_chrom_info_file.Rd +++ b/man/load_chrom_info_file.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bin_filling.R +% Please edit documentation in R/chromosome_layouts.R \name{load_chrom_info_file} \alias{load_chrom_info_file} \title{loading UCSC chromosome length files} diff --git a/man/load_ucsc_centromeres.Rd b/man/load_ucsc_centromeres.Rd index 03c8d60..6af3c7d 100644 --- a/man/load_ucsc_centromeres.Rd +++ b/man/load_ucsc_centromeres.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/centromere_marking.R +% Please edit documentation in R/chromosome_layouts.R \name{load_ucsc_centromeres} \alias{load_ucsc_centromeres} \title{load and prep default centromerefiles from UCSC} diff --git a/man/mark_bins_overlapping_centromeres.Rd b/man/mark_bins_overlapping_centromeres.Rd index e0bac20..cb32121 100644 --- a/man/mark_bins_overlapping_centromeres.Rd +++ b/man/mark_bins_overlapping_centromeres.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/centromere_marking.R +% Please edit documentation in R/chromosome_layouts.R \name{mark_bins_overlapping_centromeres} \alias{mark_bins_overlapping_centromeres} \title{add boolean if bin overlaps with a centromere.} diff --git a/man/mark_segs_chromosome_span.Rd b/man/mark_segs_chromosome_span.Rd new file mode 100644 index 0000000..d27ecbc --- /dev/null +++ b/man/mark_segs_chromosome_span.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\name{mark_segs_chromosome_span} +\alias{mark_segs_chromosome_span} +\title{label CN segments based on relative chromosomal positions} +\usage{ +mark_segs_chromosome_span( + segs_df, + min_bound_distance = 5e+05, + min_span_of_chrom = 0.9, + min_span_of_arm = 0.9, + acro_fix_whole_chrom = FALSE +) +} +\arguments{ +\item{segs_df}{dataframe of CN segments} + +\item{min_bound_distance}{integer. Distance to adjacent feature to be +considered associated with that feature.} + +\item{min_span_of_chrom}{float. Proportion of the chromosome to cover to be +considered a whole chromosome segment.} + +\item{min_span_of_arm}{float. Proportion of arm to cover to be considered an +arm segment.} + +\item{acro_fix_whole_chrom.}{boolean. Whether to reset acrocentric +chromosome CN segments to "whole-chrom" if they span futher than the Q arm. +Honestly, probably not useful} +} +\value{ +input dataframe, annotated with segment scale information. Primary +column of interest being seg_span_event. +} +\description{ +Uses centromere and telomere coordinates to label where a segment sits on a +chromosome, relative to telomeres and centromeres. +} +\details{ +Possible categories are: +\enumerate{ +\item telomere bound (telo-bound) - segment touches a telomere +\item centromere bound (centro-bound) - segment touches or crosses the +centromere +\item arm (arm) - segment spans a whole are (*with conditions) +\item whole chromosome (whole-chrom) - segment spans the entire chromosome +(*with conditions) +\item intersitial (inert) - occuring within the chromosome, not touching the +centromere, telomeres, and not big enough to be an entire arm. +} + +You can set a min_bound_distance which reflects how close a feature needs to +be to be considered "touching". For example, we can considere a segment +telomere bound if within traditional 1 DLP bin by setting this distance to +500k (default). This allows for some level of measurement error. + +Users can also set the proportion of the arm or whole chromosome a segment +needs to span to be considered either category. Default is spaning 90\% of +either feature. Meaning, if the segment (end - start)/arm_length is at least +90\% of the arm_length, the segment is considered an "arm" spanning segment. + +This input dataframe requires several columns to be pregenerated. Users need +to run \code{\link[=add_chromosome_length]{add_chromosome_length()}}, +\code{\link[=add_centromere_locations]{add_centromere_locations()}}, and +\code{\link[=add_telomere_positions]{add_telomere_positions()}}. See the the \href{https://molonc.github.io/dlptools/articles/feature_extraction.html}{features vignette} for more +details +} diff --git a/vignettes/feature_extraction.Rmd b/vignettes/feature_extraction.Rmd index b483501..b6f26fb 100644 --- a/vignettes/feature_extraction.Rmd +++ b/vignettes/feature_extraction.Rmd @@ -106,3 +106,25 @@ sig_feats[1:4, 1:4]


+ +## More Features: ploidy informed CN change + +CN change relative to ploidy might be interesting to have a feature. Here we can mark 3 types of events for segments: + +1. CN is a gain relative to mode ploidy of sample/cell +2. CN is a loss relative to mode ploidy of sample/cell +3. CN is a match to mode ploidy + +Employs the function dlptools::mode_ploidy() to calculate mode ploidy of each sample/cell. + +```{r} +dlptools::ploidy_relative_cn_state( + segs_df, + # with option just to annotate the input df + # annotate_input = TRUE + # or, alternatively, return a count matrix + # return_matrix = TRUE +) |> + dplyr::slice_head(n = 6) +``` + From c4d3628e2b509cd7d9a8b908a89e4e4d9f961149 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Wed, 27 Aug 2025 16:02:54 -0700 Subject: [PATCH 4/7] feat: count features of segment span types. --- NAMESPACE | 4 +- R/chromosome_layouts.R | 65 +++++++++++-------- R/cn_features.R | 55 ++++++++++++++-- man/SEG_CHROM_EVENT_LABELS.Rd | 16 +++++ man/add_centromere_locations.Rd | 4 +- ..._state.Rd => extract_ploidy_cn_feature.Rd} | 10 +-- man/extract_segment_position_feature.Rd | 37 +++++++++++ man/extract_wu_features.Rd | 5 +- man/mark_segs_chromosome_span.Rd | 10 +-- vignettes/feature_extraction.Rmd | 2 +- 10 files changed, 164 insertions(+), 44 deletions(-) create mode 100644 man/SEG_CHROM_EVENT_LABELS.Rd rename man/{ploidy_relative_cn_state.Rd => extract_ploidy_cn_feature.Rd} (71%) create mode 100644 man/extract_segment_position_feature.Rd diff --git a/NAMESPACE b/NAMESPACE index bded12f..03c139b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(BAF_COLORS) export(CNV_COLOURS) export(DEFAULT_CONTINUOUS_COLOR_RANGE) export(RAW_COPY_COLORS) +export(SEG_CHROM_EVENT_LABELS) export(STATE_COLORS) export(add_centromere_locations) export(add_missing_bins_for_cells) @@ -25,6 +26,8 @@ export(convert_long_reads_to_wide) export(create_expected_bins) export(cust_mode) export(expand_length_to_bins) +export(extract_ploidy_cn_feature) +export(extract_segment_position_feature) export(extract_sigminer_wang_features) export(extract_wu_features) export(factor_column_mixedsort) @@ -58,7 +61,6 @@ export(mark_mask_regions) export(mark_segs_chromosome_span) export(medicc_profiles_to_foreground) export(pairwise_bin_difference) -export(ploidy_relative_cn_state) export(plot_bg_state_highlight) export(plot_fg_state_highlight) export(plot_heatmap_of_tip_changes) diff --git a/R/chromosome_layouts.R b/R/chromosome_layouts.R index b67aa44..0630ef4 100644 --- a/R/chromosome_layouts.R +++ b/R/chromosome_layouts.R @@ -1,3 +1,13 @@ +#' global of event labels for [dlptools::mark_segs_chromosome_span()] +#' +#' Set as a global, as useful in other contexts +#' @export +SEG_CHROM_EVENT_LABELS <- c( + arm = "arm", whole = "whole-chrom", telo = "telo-bound", centro = "centro-bound", + inter = "inter" +) + + #' loading UCSC chromosome length files #' @param version default "hg19", can also load "hg38" #' @return tibgble of chromosome, total length, etc. @@ -54,26 +64,27 @@ add_chromosome_length <- function(cn_df, version = c("hg19", "hg38"), chrom_col #' #' See dlptools::read_and_prep_ucsg_cenrtomeres() for details of file origins. #' +#' @param cn_df dataframe of cn states for read bins or segments #' @param centro_file NULL or string to the path if you download yourself #' @param hg19 boolean to target hg19 for loading #' @param hg38 boolean to target hg38 for loading #' @return input table with centromere information added by chromosome #' @export add_centromere_locations <- function( - reads_df, centro_file = NULL, version = c("hg19", "hg38")) { + cn_df, centro_file = NULL, version = c("hg19", "hg38")) { version_choice <- match.arg(version) centros <- load_ucsc_centromeres( centro_file = centro_file, version = version_choice ) - reads_df <- dplyr::left_join( - reads_df, + cn_df <- dplyr::left_join( + cn_df, centros, by = dplyr::join_by(chr == chrom) ) - return(reads_df) + return(cn_df) } #' add boolean if bin overlaps with a centromere. @@ -316,11 +327,10 @@ add_telomere_positions <- function(cn_df, version = c("hg19", "hg38")) { #' either feature. Meaning, if the segment (end - start)/arm_length is at least #' 90% of the arm_length, the segment is considered an "arm" spanning segment. #' -#' This input dataframe requires several columns to be pregenerated. Users need -#' to run [dlptools::add_chromosome_length()], +#' This function runs several other functions including: +#' [dlptools::add_chromosome_length()], #' [dlptools::add_centromere_locations()], and -#' [dlptools::add_telomere_positions()]. See the the [features vignette](https://molonc.github.io/dlptools/articles/feature_extraction.html) for more -#' details +#' [dlptools::add_telomere_positions()]. #' #' @param segs_df dataframe of CN segments #' @param min_bound_distance integer. Distance to adjacent feature to be @@ -329,6 +339,7 @@ add_telomere_positions <- function(cn_df, version = c("hg19", "hg38")) { #' arm segment. #' @param min_span_of_chrom float. Proportion of the chromosome to cover to be #' considered a whole chromosome segment. +#' @param version string. hg19 (default) or hg38 #' @param acro_fix_whole_chrom. boolean. Whether to reset acrocentric #' chromosome CN segments to "whole-chrom" if they span futher than the Q arm. #' Honestly, probably not useful @@ -340,20 +351,12 @@ mark_segs_chromosome_span <- function( min_bound_distance = 5e5, # given scale of DLP, this should probably be 1 bin width, at least min_span_of_chrom = 0.9, min_span_of_arm = 0.9, + version = c("hg19", "hg38"), acro_fix_whole_chrom = FALSE) { - required_cols <- c("teloend_p", "telostart_q", "centro_start", "centro_end", "total_length") - if (!all(required_cols %in% colnames(segs_df))) { - stop( - paste( - "required columns missing! Need:", - paste(required_cols, collapse = ", "), - "see functions: dlptools::add_chromosome_length, dlptools::add_centromere_locations, dlptools::add_telomere_positions", - sep = "\n" - ) - ) - } - segs_df <- segs_df |> + add_chromosome_length(version = version) |> + add_centromere_locations(version = version) |> + add_telomere_positions(version = version) |> dplyr::mutate( # find closest telomere to the CN event telo_p_dist = start - teloend_p, @@ -399,12 +402,12 @@ mark_segs_chromosome_span <- function( # trying to be simple and Shih et al. 2023 10.1038/s41586-023-06266-3 # inspired seg_span_event = dplyr::case_when( - spans_chrom ~ "whole-chrom", - centro_bound & telo_bound & !spans_centro ~ "arm", - !telo_bound & !centro_bound & !spans_chrom & spans_arm ~ "arm", - telo_bound & !centro_bound & !spans_chrom ~ "telo-bound", - centro_bound | spans_centro ~ "centro-bound", - .default = "inter" + spans_chrom ~ SEG_CHROM_EVENT_LABELS["whole"], # "whole-chrom", + centro_bound & telo_bound & !spans_centro ~ SEG_CHROM_EVENT_LABELS["arm"], + !telo_bound & !centro_bound & !spans_chrom & spans_arm ~ SEG_CHROM_EVENT_LABELS["arm"], + telo_bound & !centro_bound & !spans_chrom ~ SEG_CHROM_EVENT_LABELS["telo"], # "telo-bound", + centro_bound | spans_centro ~ SEG_CHROM_EVENT_LABELS["centro"], # "centro-bound" + .default = SEG_CHROM_EVENT_LABELS["inter"] ) ) @@ -423,11 +426,19 @@ mark_segs_chromosome_span <- function( ), seg_span_event = dplyr::case_when( !(chr %in% acro_chroms) ~ seg_span_event, - acro_span >= q_span ~ "whole-chrom", + acro_span >= q_span ~ SEG_CHROM_EVENT_LABELS["whole"], .default = seg_span_event ) ) } + segs_df <- dplyr::mutate( + segs_df, + seg_span_event = factor( + seg_span_event, + levels = unname(SEG_CHROM_EVENT_LABELS) + ) + ) + return(segs_df) } diff --git a/R/cn_features.R b/R/cn_features.R index 5dbff35..7d2cbd2 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -23,7 +23,8 @@ #' in their actual code is set to 1. This function follows their code, but can #' be altered. #' -#' @param segs_df a dataframe of CN segments +#' @param segs_df dataframe. CN segments +#' @param sample_col string. Name of the column with cell_id/other sample name #' @param state_bin_max int. Maximum CN to consider for bins. All CNs of this #' value and higher are grouped together. Default of 5 follows paper. #' @param bin_breaks floats, how to break up segment sizes. Bins will be one @@ -35,7 +36,7 @@ #' @param ... can pass change_split_val to alter critical value for AA/BB split #' @return default return is a tibble of feature counts for each cell id. #' @export -extract_wu_features <- function(segs_df, state_bin_max = 5, bin_breaks = NA, annotate_input = FALSE, return_matrix = FALSE, ...) { +extract_wu_features <- function(segs_df, sample_col = "cell_id", state_bin_max = 5, bin_breaks = NA, annotate_input = FALSE, return_matrix = FALSE, ...) { # paper code: https://github.com/XSLiuLab/single-cell-CNA-signature/blob/main/code/divide_feature.R # default segment size breakpoints @@ -61,7 +62,7 @@ extract_wu_features <- function(segs_df, state_bin_max = 5, bin_breaks = NA, ann !dplyr::if_any(c(cn_bin, seg_bin, seg_change, seg_shape), is.na) ) |> dplyr::group_by( - cell_id, cn_bin, seg_bin, seg_change, seg_shape, + .data[[sample_col]], cn_bin, seg_bin, seg_change, seg_shape, .drop = FALSE ) |> dplyr::count() |> @@ -221,12 +222,13 @@ extract_sigminer_wang_features <- function(segs_df) { #' Marks CN segments as a gain or loss, relative to the mode ploidy of the #' sample. #' -#' @param segs_df a dataframe of CN segments +#' @param segs_df dataframe. CN segments +#' @param sample_col string. Name of the column with cell_id/other sample name #' @param annotate_input boolean. return input dataframe annotating each #' @param return_matrix boolean. Return a cell-by-feature matrix of counts. #' @return tibble/dataframe of counts #' @export -ploidy_relative_cn_state <- function( +extract_ploidy_cn_feature <- function( segs_df, sample_col = "cell_id", annotate_input = FALSE, @@ -266,3 +268,46 @@ ploidy_relative_cn_state <- function( return(feat_count) } + + +#' count the segment-span-on-chromosome event types. +#' +#' Critical to this fuction is [dlptools::mark_segs_chromosome_span()]. It is +#' important to read and understand that function and its arguments. +#' +#' This function basically just calls [dlptools::mark_segs_chromosome_span()] +#' and summarizes the results. Arguments can be passed to that underlying +#' function. Passing no arguments means you are happy with the defaults. See +#' [dlptools::mark_segs_chromosome_span()] to understand what the defaults are. +#' +#' @param segs_df dataframe of CN segments +#' @param sample_col string. Name of the column with cell_id/other sample name +#' @param annotate_input boolean. return input dataframe annotating each +#' segment. +#' @param return_matrix boolean. Return a cell-by-feature matrix of counts. +#' @return tibble/dataframe of counts +#' @export +extract_segment_position_feature <- function( + segs_df, + sample_col = "cell_id", + annotate_input = FALSE, + return_matrix = FALSE, + ...) { + segs_df <- mark_segs_chromosome_span(segs_df, ...) + + if (annotate_input) { + return(segs_df) + } + + seg_span_counts <- segs_df |> + dplyr::group_by(.data[[sample_col]], .drop = FALSE) |> + dplyr::count(seg_span_event) |> + dplyr::ungroup() + + if (return_matrix) { + feat_mtx <- make_cellid_matrix(seg_span_counts, "seg_span_event", "n") + return(feat_mtx) + } + + return(seg_span_counts) +} diff --git a/man/SEG_CHROM_EVENT_LABELS.Rd b/man/SEG_CHROM_EVENT_LABELS.Rd new file mode 100644 index 0000000..d44bc73 --- /dev/null +++ b/man/SEG_CHROM_EVENT_LABELS.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\docType{data} +\name{SEG_CHROM_EVENT_LABELS} +\alias{SEG_CHROM_EVENT_LABELS} +\title{global of event labels for \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}} +\format{ +An object of class \code{character} of length 5. +} +\usage{ +SEG_CHROM_EVENT_LABELS +} +\description{ +Set as a global, as useful in other contexts +} +\keyword{datasets} diff --git a/man/add_centromere_locations.Rd b/man/add_centromere_locations.Rd index 6478d90..c9a9c85 100644 --- a/man/add_centromere_locations.Rd +++ b/man/add_centromere_locations.Rd @@ -5,12 +5,14 @@ \title{add centromere information to reads by chromosome} \usage{ add_centromere_locations( - reads_df, + cn_df, centro_file = NULL, version = c("hg19", "hg38") ) } \arguments{ +\item{cn_df}{dataframe of cn states for read bins or segments} + \item{centro_file}{NULL or string to the path if you download yourself} \item{hg19}{boolean to target hg19 for loading} diff --git a/man/ploidy_relative_cn_state.Rd b/man/extract_ploidy_cn_feature.Rd similarity index 71% rename from man/ploidy_relative_cn_state.Rd rename to man/extract_ploidy_cn_feature.Rd index 67fd481..ba012f7 100644 --- a/man/ploidy_relative_cn_state.Rd +++ b/man/extract_ploidy_cn_feature.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cn_features.R -\name{ploidy_relative_cn_state} -\alias{ploidy_relative_cn_state} +\name{extract_ploidy_cn_feature} +\alias{extract_ploidy_cn_feature} \title{Could changes of state relative to ploidy.} \usage{ -ploidy_relative_cn_state( +extract_ploidy_cn_feature( segs_df, sample_col = "cell_id", annotate_input = FALSE, @@ -12,7 +12,9 @@ ploidy_relative_cn_state( ) } \arguments{ -\item{segs_df}{a dataframe of CN segments} +\item{segs_df}{dataframe. CN segments} + +\item{sample_col}{string. Name of the column with cell_id/other sample name} \item{annotate_input}{boolean. return input dataframe annotating each} diff --git a/man/extract_segment_position_feature.Rd b/man/extract_segment_position_feature.Rd new file mode 100644 index 0000000..c80553f --- /dev/null +++ b/man/extract_segment_position_feature.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_segment_position_feature} +\alias{extract_segment_position_feature} +\title{count the segment-span-on-chromosome event types.} +\usage{ +extract_segment_position_feature( + segs_df, + sample_col = "cell_id", + annotate_input = FALSE, + return_matrix = FALSE, + ... +) +} +\arguments{ +\item{segs_df}{dataframe of CN segments} + +\item{sample_col}{string. Name of the column with cell_id/other sample name} + +\item{annotate_input}{boolean. return input dataframe annotating each +segment.} + +\item{return_matrix}{boolean. Return a cell-by-feature matrix of counts.} +} +\value{ +tibble/dataframe of counts +} +\description{ +Critical to this fuction is \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}. It is +important to read and understand that function and its arguments. +} +\details{ +This function basically just calls \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}} +and summarizes the results. Arguments can be passed to that underlying +function. Passing no arguments means you are happy with the defaults. See +\code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}} to understand what the defaults are. +} diff --git a/man/extract_wu_features.Rd b/man/extract_wu_features.Rd index 402a04c..fa684d3 100644 --- a/man/extract_wu_features.Rd +++ b/man/extract_wu_features.Rd @@ -6,6 +6,7 @@ \usage{ extract_wu_features( segs_df, + sample_col = "cell_id", state_bin_max = 5, bin_breaks = NA, annotate_input = FALSE, @@ -14,7 +15,9 @@ extract_wu_features( ) } \arguments{ -\item{segs_df}{a dataframe of CN segments} +\item{segs_df}{dataframe. CN segments} + +\item{sample_col}{string. Name of the column with cell_id/other sample name} \item{state_bin_max}{int. Maximum CN to consider for bins. All CNs of this value and higher are grouped together. Default of 5 follows paper.} diff --git a/man/mark_segs_chromosome_span.Rd b/man/mark_segs_chromosome_span.Rd index d27ecbc..4a0a008 100644 --- a/man/mark_segs_chromosome_span.Rd +++ b/man/mark_segs_chromosome_span.Rd @@ -9,6 +9,7 @@ mark_segs_chromosome_span( min_bound_distance = 5e+05, min_span_of_chrom = 0.9, min_span_of_arm = 0.9, + version = c("hg19", "hg38"), acro_fix_whole_chrom = FALSE ) } @@ -24,6 +25,8 @@ considered a whole chromosome segment.} \item{min_span_of_arm}{float. Proportion of arm to cover to be considered an arm segment.} +\item{version}{string. hg19 (default) or hg38} + \item{acro_fix_whole_chrom.}{boolean. Whether to reset acrocentric chromosome CN segments to "whole-chrom" if they span futher than the Q arm. Honestly, probably not useful} @@ -59,9 +62,8 @@ needs to span to be considered either category. Default is spaning 90\% of either feature. Meaning, if the segment (end - start)/arm_length is at least 90\% of the arm_length, the segment is considered an "arm" spanning segment. -This input dataframe requires several columns to be pregenerated. Users need -to run \code{\link[=add_chromosome_length]{add_chromosome_length()}}, +This function runs several other functions including: +\code{\link[=add_chromosome_length]{add_chromosome_length()}}, \code{\link[=add_centromere_locations]{add_centromere_locations()}}, and -\code{\link[=add_telomere_positions]{add_telomere_positions()}}. See the the \href{https://molonc.github.io/dlptools/articles/feature_extraction.html}{features vignette} for more -details +\code{\link[=add_telomere_positions]{add_telomere_positions()}}. } diff --git a/vignettes/feature_extraction.Rmd b/vignettes/feature_extraction.Rmd index b6f26fb..b3ac2b2 100644 --- a/vignettes/feature_extraction.Rmd +++ b/vignettes/feature_extraction.Rmd @@ -118,7 +118,7 @@ CN change relative to ploidy might be interesting to have a feature. Here we can Employs the function dlptools::mode_ploidy() to calculate mode ploidy of each sample/cell. ```{r} -dlptools::ploidy_relative_cn_state( +dlptools::extract_ploidy_cn_feature( segs_df, # with option just to annotate the input df # annotate_input = TRUE From 5686d661b061773fddc62daa8e3cfbea734755fd Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Thu, 28 Aug 2025 11:35:35 -0700 Subject: [PATCH 5/7] feat: document new CN attribute functions. Also does a bunch of refactoring to make functions more generally useable. --- NAMESPACE | 5 +- R/chromosome_layouts.R | 35 ++-- R/cn_features.R | 28 +--- R/colours.R | 2 + R/ploidy.R | 60 +++++++ man/SEG_CHROM_EVENT_LABELS.Rd | 16 -- man/extract_ploidy_cn_feature.Rd | 7 +- man/mark_cn_relative_to_ploidy.Rd | 37 +++++ man/mark_segs_chromosome_span.Rd | 2 +- vignettes/feature_extraction.Rmd | 265 +++++++++++++++++++++++++++++- vignettes/various_functions.Rmd | 197 +++++++++++++++------- 11 files changed, 534 insertions(+), 120 deletions(-) delete mode 100644 man/SEG_CHROM_EVENT_LABELS.Rd create mode 100644 man/mark_cn_relative_to_ploidy.Rd diff --git a/NAMESPACE b/NAMESPACE index 03c139b..77981cd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,10 +4,11 @@ export("%>%") export(ASCN_COLORS) export(ASCN_PHASE_COLORS) export(BAF_COLORS) +export(BASE_PLOT_THEME) export(CNV_COLOURS) export(DEFAULT_CONTINUOUS_COLOR_RANGE) +export(GEN_PLOT_COLS) export(RAW_COPY_COLORS) -export(SEG_CHROM_EVENT_LABELS) export(STATE_COLORS) export(add_centromere_locations) export(add_missing_bins_for_cells) @@ -57,9 +58,11 @@ export(make_corrupt_tree_heatmap) export(make_tree_plot_obj) export(map_states_to_letters) export(mark_bins_overlapping_centromeres) +export(mark_cn_relative_to_ploidy) export(mark_mask_regions) export(mark_segs_chromosome_span) export(medicc_profiles_to_foreground) +export(mode_ploidy) export(pairwise_bin_difference) export(plot_bg_state_highlight) export(plot_fg_state_highlight) diff --git a/R/chromosome_layouts.R b/R/chromosome_layouts.R index 0630ef4..7694179 100644 --- a/R/chromosome_layouts.R +++ b/R/chromosome_layouts.R @@ -1,13 +1,3 @@ -#' global of event labels for [dlptools::mark_segs_chromosome_span()] -#' -#' Set as a global, as useful in other contexts -#' @export -SEG_CHROM_EVENT_LABELS <- c( - arm = "arm", whole = "whole-chrom", telo = "telo-bound", centro = "centro-bound", - inter = "inter" -) - - #' loading UCSC chromosome length files #' @param version default "hg19", can also load "hg38" #' @return tibgble of chromosome, total length, etc. @@ -121,7 +111,7 @@ mark_bins_overlapping_centromeres <- function( ". If that is not correct, set version to 'hg38')." )) - reads_df <- add_centromere_locations(reads_df = reads_df) + reads_df <- add_centromere_locations(cn_df = reads_df) } reads_df <- reads_df |> @@ -314,7 +304,7 @@ add_telomere_positions <- function(cn_df, version = c("hg19", "hg38")) { #' 3. arm (arm) - segment spans a whole are (*with conditions) #' 4. whole chromosome (whole-chrom) - segment spans the entire chromosome #' (*with conditions) -#' 5. intersitial (inert) - occuring within the chromosome, not touching the +#' 5. intersitial (inter) - occuring within the chromosome, not touching the #' centromere, telomeres, and not big enough to be an entire arm. #' #' You can set a min_bound_distance which reflects how close a feature needs to @@ -353,6 +343,11 @@ mark_segs_chromosome_span <- function( min_span_of_arm = 0.9, version = c("hg19", "hg38"), acro_fix_whole_chrom = FALSE) { + event_labels <- c( + arm = "arm", whole = "whole-chrom", telo = "telo-bound", + centro = "centro-bound", inter = "inter" + ) + segs_df <- segs_df |> add_chromosome_length(version = version) |> add_centromere_locations(version = version) |> @@ -402,12 +397,12 @@ mark_segs_chromosome_span <- function( # trying to be simple and Shih et al. 2023 10.1038/s41586-023-06266-3 # inspired seg_span_event = dplyr::case_when( - spans_chrom ~ SEG_CHROM_EVENT_LABELS["whole"], # "whole-chrom", - centro_bound & telo_bound & !spans_centro ~ SEG_CHROM_EVENT_LABELS["arm"], - !telo_bound & !centro_bound & !spans_chrom & spans_arm ~ SEG_CHROM_EVENT_LABELS["arm"], - telo_bound & !centro_bound & !spans_chrom ~ SEG_CHROM_EVENT_LABELS["telo"], # "telo-bound", - centro_bound | spans_centro ~ SEG_CHROM_EVENT_LABELS["centro"], # "centro-bound" - .default = SEG_CHROM_EVENT_LABELS["inter"] + spans_chrom ~ event_labels["whole"], # "whole-chrom", + centro_bound & telo_bound & !spans_centro ~ event_labels["arm"], + !telo_bound & !centro_bound & !spans_chrom & spans_arm ~ event_labels["arm"], + telo_bound & !centro_bound & !spans_chrom ~ event_labels["telo"], # "telo-bound", + centro_bound | spans_centro ~ event_labels["centro"], # "centro-bound" + .default = event_labels["inter"] ) ) @@ -426,7 +421,7 @@ mark_segs_chromosome_span <- function( ), seg_span_event = dplyr::case_when( !(chr %in% acro_chroms) ~ seg_span_event, - acro_span >= q_span ~ SEG_CHROM_EVENT_LABELS["whole"], + acro_span >= q_span ~ event_labels["whole"], .default = seg_span_event ) ) @@ -436,7 +431,7 @@ mark_segs_chromosome_span <- function( segs_df, seg_span_event = factor( seg_span_event, - levels = unname(SEG_CHROM_EVENT_LABELS) + levels = unname(event_labels) ) ) diff --git a/R/cn_features.R b/R/cn_features.R index 7d2cbd2..9ecd4e9 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -217,10 +217,11 @@ extract_sigminer_wang_features <- function(segs_df) { } -#' Could changes of state relative to ploidy. +#' Count changes of state relative to ploidy. #' #' Marks CN segments as a gain or loss, relative to the mode ploidy of the -#' sample. +#' sample. Internally using [dlptools::mark_cn_relative_to_ploidy]. See that +#' function for argument details. #' #' @param segs_df dataframe. CN segments #' @param sample_col string. Name of the column with cell_id/other sample name @@ -229,29 +230,16 @@ extract_sigminer_wang_features <- function(segs_df) { #' @return tibble/dataframe of counts #' @export extract_ploidy_cn_feature <- function( - segs_df, + segs_df = NA, sample_col = "cell_id", annotate_input = FALSE, return_matrix = FALSE) { - sample_mode_ploidy <- segs_df |> - segs_to_reads() |> - mode_ploidy(sample_col = sample_col) - - segs_df <- dplyr::left_join( - segs_df, - sample_mode_ploidy, - by = sample_col + segs_df <- mark_cn_relative_to_ploidy( + in_df = segs_df, + df_type = "segs", + sample_col = sample_col ) - segs_df <- segs_df |> - dplyr::mutate( - cn_v_ploidy = dplyr::case_when( - state < mode_ploidy ~ "ploidy-loss", - state > mode_ploidy ~ "ploidy-gain", - state == mode_ploidy ~ "ploidy-match" - ) - ) - if (annotate_input) { return(segs_df) } diff --git a/R/colours.R b/R/colours.R index b918b9f..98351de 100644 --- a/R/colours.R +++ b/R/colours.R @@ -72,6 +72,7 @@ RAW_COPY_COLORS <- circlize::colorRamp2( CNV_COLOURS[1:(length(CNV_COLOURS) - 1)] ) +#' @export GEN_PLOT_COLS <- c( "#4269D0FF", "#3CA951FF", "#FF725CFF", "#EFB118FF", "#A463F2FF", @@ -79,6 +80,7 @@ GEN_PLOT_COLS <- c( "#9498A0FF", "#6CC5B0FF" ) +#' @export BASE_PLOT_THEME <- list( ggplot2::theme_minimal() + ggplot2::theme( diff --git a/R/ploidy.R b/R/ploidy.R index 9334b08..54cf561 100644 --- a/R/ploidy.R +++ b/R/ploidy.R @@ -54,6 +54,7 @@ weighted_ploidy <- function( #' @param cn_col string column name for copy number states #' @param chrom_col string column name for chromosomes #' @return tibble/dataframe of results by cell_id/sample +#' @export mode_ploidy <- function( bin_df, sample_col = "cell_id", @@ -70,3 +71,62 @@ mode_ploidy <- function( ) |> dplyr::ungroup() } + +#' Mark if CN states are gains or losses relative to cell ploidy +#' +#' CNs > 2 are not necessarily amplifications, and CNs < 2 are not necessarily +#' the only states of losses. Ploidy of samples may not be diploid. This +#' function will give you an idea if the CN you see is a gain or loss relative +#' to the ploidy of a sample. For example, if the sample has a ploidy of 4, +#' then a CN of 3 is a loss. +#' +#' Ploidy is inferred by mode CN state using [dlptools::mode_ploidy()] and +#' states > ploidy are marked as gains, states < ploidy losses, and states +#' matching ploidy as matched. +#' +#' @param in_df dataframe of CN states. +#' @param df_type string. "reads" (default) or "segs" for CN segments, which +#' will internally converted to bin based for mode calculation. +#' reads in order to infer mode ploidy. +#' @param sample_col string. Name of the column with cell_id/other sample name +#' @return input dataframe, with new columns of information. +#' @export +mark_cn_relative_to_ploidy <- function( + in_df, + df_type = c("reads", "segs"), + sample_col = "cell_id", + ...) { + df_type <- match.arg(df_type) + + event_labels <- c( + gain = "ploidy-gain", + match = "ploidy-match", + loss = "ploidy-loss" + ) + + if (df_type == "segs") { + reads_df <- segs_to_reads(in_df, ...) + } else if (df_type == "reads") { + reads_df <- in_df + } + + sample_mode_ploidy <- mode_ploidy(reads_df, sample_col = sample_col, ...) + + in_df <- dplyr::left_join( + in_df, + sample_mode_ploidy, + by = sample_col + ) + + in_df <- in_df |> + dplyr::mutate( + cn_v_ploidy = dplyr::case_when( + state < mode_ploidy ~ event_labels["loss"], + state > mode_ploidy ~ event_labels["gain"], + state == mode_ploidy ~ event_labels["match"] + ), + cn_v_ploidy = factor(cn_v_ploidy, levels = unname(event_labels)) + ) + + return(in_df) +} diff --git a/man/SEG_CHROM_EVENT_LABELS.Rd b/man/SEG_CHROM_EVENT_LABELS.Rd deleted file mode 100644 index d44bc73..0000000 --- a/man/SEG_CHROM_EVENT_LABELS.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chromosome_layouts.R -\docType{data} -\name{SEG_CHROM_EVENT_LABELS} -\alias{SEG_CHROM_EVENT_LABELS} -\title{global of event labels for \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}} -\format{ -An object of class \code{character} of length 5. -} -\usage{ -SEG_CHROM_EVENT_LABELS -} -\description{ -Set as a global, as useful in other contexts -} -\keyword{datasets} diff --git a/man/extract_ploidy_cn_feature.Rd b/man/extract_ploidy_cn_feature.Rd index ba012f7..066d86c 100644 --- a/man/extract_ploidy_cn_feature.Rd +++ b/man/extract_ploidy_cn_feature.Rd @@ -2,10 +2,10 @@ % Please edit documentation in R/cn_features.R \name{extract_ploidy_cn_feature} \alias{extract_ploidy_cn_feature} -\title{Could changes of state relative to ploidy.} +\title{Count changes of state relative to ploidy.} \usage{ extract_ploidy_cn_feature( - segs_df, + segs_df = NA, sample_col = "cell_id", annotate_input = FALSE, return_matrix = FALSE @@ -25,5 +25,6 @@ tibble/dataframe of counts } \description{ Marks CN segments as a gain or loss, relative to the mode ploidy of the -sample. +sample. Internally using \link{mark_cn_relative_to_ploidy}. See that +function for argument details. } diff --git a/man/mark_cn_relative_to_ploidy.Rd b/man/mark_cn_relative_to_ploidy.Rd new file mode 100644 index 0000000..357bf3a --- /dev/null +++ b/man/mark_cn_relative_to_ploidy.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ploidy.R +\name{mark_cn_relative_to_ploidy} +\alias{mark_cn_relative_to_ploidy} +\title{Mark if CN states are gains or losses relative to cell ploidy} +\usage{ +mark_cn_relative_to_ploidy( + in_df, + df_type = c("reads", "segs"), + sample_col = "cell_id", + ... +) +} +\arguments{ +\item{in_df}{dataframe of CN states.} + +\item{df_type}{string. "reads" (default) or "segs" for CN segments, which +will internally converted to bin based for mode calculation. +reads in order to infer mode ploidy.} + +\item{sample_col}{string. Name of the column with cell_id/other sample name} +} +\value{ +input dataframe, with new columns of information. +} +\description{ +CNs > 2 are not necessarily amplifications, and CNs < 2 are not necessarily +the only states of losses. Ploidy of samples may not be diploid. This +function will give you an idea if the CN you see is a gain or loss relative +to the ploidy of a sample. For example, if the sample has a ploidy of 4, +then a CN of 3 is a loss. +} +\details{ +Ploidy is inferred by mode CN state using \code{\link[=mode_ploidy]{mode_ploidy()}} and +states > ploidy are marked as gains, states < ploidy losses, and states +matching ploidy as matched. +} diff --git a/man/mark_segs_chromosome_span.Rd b/man/mark_segs_chromosome_span.Rd index 4a0a008..e1db366 100644 --- a/man/mark_segs_chromosome_span.Rd +++ b/man/mark_segs_chromosome_span.Rd @@ -48,7 +48,7 @@ centromere \item arm (arm) - segment spans a whole are (*with conditions) \item whole chromosome (whole-chrom) - segment spans the entire chromosome (*with conditions) -\item intersitial (inert) - occuring within the chromosome, not touching the +\item intersitial (inter) - occuring within the chromosome, not touching the centromere, telomeres, and not big enough to be an entire arm. } diff --git a/vignettes/feature_extraction.Rmd b/vignettes/feature_extraction.Rmd index b3ac2b2..cc808d3 100644 --- a/vignettes/feature_extraction.Rmd +++ b/vignettes/feature_extraction.Rmd @@ -14,11 +14,14 @@ knitr::opts_chunk$set( ) ``` -```{r setup} +```{r setup, message=FALSE, warning=FALSE} +library(tidyverse) library(dlptools) ``` -dlptools offers two feature extraction types: +## Features Defined by Other Publications + +dlptools offers two externally defined feature extraction types: 1. Wu et al. https://www.biorxiv.org/content/10.1101/2025.03.02.641098v1 2. Wang et al. https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1009557 @@ -107,7 +110,7 @@ sig_feats[1:4, 1:4]

-## More Features: ploidy informed CN change +## ploidy informed CN change CN change relative to ploidy might be interesting to have a feature. Here we can mark 3 types of events for segments: @@ -128,3 +131,259 @@ dlptools::extract_ploidy_cn_feature( dplyr::slice_head(n = 6) ``` +
+
+
+ +## Segment Span Types + +It might be useful to know where segments sit on chromosomes and how much of a chromosome they cover. We can label segments based on 5 categories: + +1. telomere bound (`telo-bound`) - segment touches a telomere +2. centromere bound (`centro-bound`) - segment touches or crosses the centromere +3. arm (`arm`) - segment spans a whole are (*with conditions) +4. whole chromosome (`whole-chrom`) - segment spans the entire chromosome (*with conditions) +5. intersitial (`inter`) - occuring within the chromosome, not touching the centromere, telomeres, and not big enough to be an entire arm. + +See `?dlptools::mark_segs_chromosome_span` for more details about this marking, as there are various options to tweak these definitions, such as: + +* how much of an arm a CN needs to cover to be considered an "arm-level" event +* how much of a chromosome a segment needs to cover to be a "whole-chromosome" event +* how close a seg needs to be to a telo/centromere to be considered "touching" + +We can mark a dataframe with the information: + +```{r, message=FALSE, warning=FALSE} +dlptools::mark_segs_chromosome_span( + segs_df, + # see ?dlptools::mark_segs_chromosome_span for more args +) |> + dplyr::relocate( + # key added columns + seg_span_event, telo_bound, centro_bound, spans_chrom, spans_arm + ) |> + dplyr::slice_head(n = 5) +``` + +
+
+ +More useful in the "feature" context, is to directly extract and count them up. This function bascially just calls the one above, with counting added: + +```{r, message=FALSE, warning=FALSE} +dlptools::extract_segment_position_feature( + segs_df + # with option just to annotate the input df + # annotate_input = TRUE + # or, alternatively, return a count matrix + # return_matrix = TRUE +) |> + dplyr::slice_head(n = 6) +``` + + +## Feature Combinations + +With some of these feature sets, we can combine them into one monster feature set. Is this a good idea? A bad idea? Is the output even interpretable? Are all ov these even useful? You'll have to decide that. + +The feature sets of: + +1. `dlptools::extract_wu_features` +2. `dlptools::extract_ploidy_cn_feature` +3. `dlptools::extract_segment_position_feature` + +are all "atomic", in that they describe aspects of individual segments. + +First, we can collect labels all all features for the segments: + +```{r, message=FALSE, warning=FALSE} +feat_anno_segs <- segs_df |> + dlptools::extract_wu_features(annotate_input = TRUE) |> + dlptools::extract_ploidy_cn_feature(annotate_input = TRUE) |> + dlptools::extract_segment_position_feature(annotate_input = TRUE) |> + dplyr::select( + cell_id, chr, start, end, + # these are then the feature columns + seg_bin, cn_bin, seg_change, seg_shape, cn_v_ploidy, seg_span_event + ) + + +dplyr::slice_head(feat_anno_segs, n = 5) +``` + + +Then, we can just count each category and leave the counts per cell per feature as our feature set for signature extraction. This would yield a 21 feature set (6 core feature types with: 5, 3, 3, 2, 3, 5 categories across them) + + + + +```{r, fig.width=8, fig.height=4, message=FALSE, warning=FALSE} +feat_cols <- c( + "seg_bin", "cn_bin", "seg_change", "seg_shape", "cn_v_ploidy", + "seg_span_event" +) + +total_counts <- purrr::map(feat_cols, \(feat_col) { + feat_anno_segs |> + dplyr::group_by(cell_id, dplyr::pick(feat_col), .drop = FALSE) |> + dplyr::count() |> + dplyr::mutate( + feature = feat_col + ) |> + dplyr::rename(category = feat_col) |> + dplyr::relocate(cell_id, feature, category) +}) |> + purrr::list_rbind() + + + +ggplot( + dplyr::filter(total_counts, cell_id %in% unique(total_counts$cell_id) + [1:4]), + aes(x = category, y = n, fill = feature) +) + + geom_col(show.legend = FALSE) + + facet_grid( + cell_id ~ feature, + scales = "free", + axes = "all_x", + switch = "x" + ) + + theme_classic() + + theme( + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + strip.placement = "outside" + ) +``` + + +
+
+ +We could opt to cross all of the features into a "mega-feature-set". But, this would be: + +* 4 features (containing 3, 5, 2, 3 categories across them) with from `dlptools::extract_wu_features` +* 1 feature with 3 features categories from `dlptools::extract_ploidy_cn_feature` +* 1 feature with 5 feature categories from `dlptools::extract_segment_position_feature` + +```{r} +all_feat_labels <- list( + levels(feat_anno_segs$seg_bin), + levels(feat_anno_segs$cn_bin), + levels(feat_anno_segs$seg_change), + levels(feat_anno_segs$seg_shape), + levels(feat_anno_segs$cn_v_ploidy), + levels(feat_anno_segs$seg_span_event) +) + +crossed_labels <- expand.grid(all_feat_labels) + +crossed_labels[1:4, ] +``` + + +```{r} +nrow(crossed_labels) +``` + + +making for a cross feature set with 1350 possible categories. That might be too detailed, and you should check if most categories are 0 for your data, indicating useless features. + +
+ +Also, probably want to ask if crossing the category actually makes sense and adds biological information. For example, here, you can't even have a small sized (< 5Mb) arm event (smallest arm is >10 Mb), so crossing size and chromosome location doesn't actually lead to fully crossed features or any gain in information (and these two are somewhat redundant). Or, does it matter if the segment shape parameter is a ploidy-gain or a ploidy-loss? Does knowing that distinguish processes or teach you anything biologically? (and that's only two parameters being crossed). + +
+ +But, forging ahead bravely: + +```{r, fig.width=8, fig.height=4} +# collect all possible feature labels + +crossed_counts <- feat_anno_segs |> + dplyr::group_by( + # sample ID column + cell_id, + # feature columns + seg_bin, cn_bin, seg_change, seg_shape, cn_v_ploidy, seg_span_event, + .drop = FALSE + ) |> + dplyr::count() |> + dplyr::mutate( + cross_feat = stringr::str_c( + seg_bin, cn_bin, seg_change, seg_shape, cn_v_ploidy, seg_span_event, + sep = ":" + ) + ) + + +ggplot( + dplyr::filter(crossed_counts, cell_id %in% unique(total_counts$cell_id) + [1:4]), + aes(x = cross_feat, y = n) +) + + geom_col() + + facet_grid(cell_id ~ ., axes = "all_x") + + theme_classic() + + theme( + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + ) + + xlab("Crossed Feature") +``` + +
+
+ +Or, you could do some sort of combination of cross and uncrossed? I don't know. This whole signature field is the wild west and people do all kinds of things. + +```{r, fig.width=8, fig.height=4, message=FALSE, warning=FALSE} +# count the Wu features as crossed: 90 feature categories +wu_crossed_counts <- dlptools::extract_wu_features(segs_df) |> + dplyr::select(cell_id, feat_cat, n) |> + dplyr::mutate( + feature_group = "Wu-crossed" + ) + +# do ploidy_cn_change and segment placement as uncrossed? +ploidy_counts <- dlptools::extract_ploidy_cn_feature(segs_df) |> + dplyr::rename(feat_cat = cn_v_ploidy) |> + dplyr::mutate( + feature_group = "ploidy" + ) + + +seg_pos_counts <- dlptools::extract_segment_position_feature(segs_df) |> + dplyr::rename(feat_cat = seg_span_event) |> + dplyr::mutate( + feature_group = "position" + ) + +all_feat_counts <- purrr::list_rbind(list( + wu_crossed_counts, ploidy_counts, seg_pos_counts +)) |> + dplyr::arrange(cell_id, feature_group) |> + dplyr::mutate( + feature_group = factor(feature_group, levels = unique(feature_group)) + ) + +ggplot( + dplyr::filter(all_feat_counts, cell_id %in% unique(total_counts$cell_id) + [1:4]), + aes(x = feat_cat, y = n, fill = feature_group) +) + + geom_col(show.legend = FALSE) + + facet_grid(cell_id ~ feature_group, axes = "all_x", scales = "free_x", space = "free") + + scale_fill_manual(values = dlptools::GEN_PLOT_COLS) + + theme_classic() + + theme( + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + ) + + xlab("Mixed Features") +``` + +
+
+
diff --git a/vignettes/various_functions.Rmd b/vignettes/various_functions.Rmd index 4d11611..a70f603 100644 --- a/vignettes/various_functions.Rmd +++ b/vignettes/various_functions.Rmd @@ -17,8 +17,8 @@ knitr::opts_chunk$set( ``` -```{r} -library(dplyr) +```{r, warning=FALSE, message=FALSE, echo=FALSE} +library(tidyverse) library(vroom) library(dlptools) ``` @@ -52,8 +52,10 @@ vroom::vroom( ) ``` +
+
-## Marking Centromere Locations +## Marking Centromere and Telomere Locations Centromeres are problematic for DLP data, the state calls are often inflated and highly inconsistent between adjacent bins. It's a messy region that we often want to omit. The default masking file (above) pretty much captures these regions, but sometimes we may want to be more specific. @@ -79,7 +81,7 @@ Alternatively, this can be done in parts: ```{r} # e.g., just adding the centromere information by chromosome dlptools::add_centromere_locations( - reads_df = ex_reads, + cn_df = ex_reads, version = "hg19" # default, don't need to specify. Alternative is "hg38" ) |> dplyr::select( @@ -91,6 +93,23 @@ dlptools::add_centromere_locations( ``` +Also available is telomere marking, but see `?dlptools::import_telos_file` for why this isn't the most in-depth or accurate thing: + +```{r} +dlptools::add_telomere_positions( + ex_reads + # version = "hg19" is the default +) |> + dplyr::relocate( + # added columns + telostart_p, teloend_p, telostart_q, teloend_q + ) |> + dplyr::slice_head(n = 5) +``` + +
+
+ ## Information From Cell IDs This is assuming our standard DLP cell ids, e.g., `AT23998-A138956A-R03-C34`. @@ -125,6 +144,8 @@ ex_reads |> dplyr::slice_sample(n = 5) ``` +
+
## Inferring Missing Bin Data @@ -235,6 +256,8 @@ dlptools::load_chrom_info_file( ``` +
+
## Reads to Segments @@ -251,6 +274,9 @@ ex_segs[1:4, ] *warning:* this function will leave some unexpected gaps when dataframes have been filtered and bins removed. Inspect carfully if you have dropped bins from your dataframe. +
+
+ ## Segments to Reads Split segments back into bins of some size. @@ -284,6 +310,98 @@ c(filt_reads = nrow(filt_reads), input_reads = nrow(ex_reads)) ```
+
+ +## Where Segments Occur + +Useful to know is where segments occur relative to chromosome features like the centromere and telomeres. + +There are a few key arguments to this function, like setting distances for how close segments need to be to telomeres and centromeres to be consider "bound", and minimum spans of arms/chromosomes. + +Please see `?dlptools::mark_segs_chromosome_span` for more information on how this works: + +```{r} +seg_locations <- dlptools::mark_segs_chromosome_span( + ex_segs, + # version = "hg19" assumed, but hg38 possible too. +) + +dplyr::relocate( + seg_locations, + # key added columns, more present too + seg_span_event, telo_bound, centro_bound, spans_chrom, spans_arm +) +``` + + +Visually, this is what we're doing: + +```{r, echo = FALSE, fig.width=8, fig.height=6, fig.cap="Labels of CN segments on chromosomes. Vertical lines indicate locations of telomeres and centromeres."} +chr_attrs <- dplyr::left_join( + dlptools::load_ucsc_centromeres(), + dlptools::import_telos_file(), + by = dplyr::join_by(chrom == chr) +) |> + dplyr::rename( + chr = chrom + ) |> + dplyr::select(chr, centro_start, centro_end, dplyr::contains("telo")) |> + tidyr::pivot_longer(cols = -chr) |> + dplyr::mutate( + plot_name = dplyr::case_when( + stringr::str_detect(name, "start") ~ "start", + stringr::str_detect(name, "end") ~ "end", + ), + section_id = dplyr::case_when( + stringr::str_detect(name, "centro") ~ "centro", + stringr::str_detect(name, "telo.*p") ~ "telo-p", + stringr::str_detect(name, "telo.*q") ~ "telo-q" + ) + ) |> + dplyr::select(chr, section_id, plot_name, value) + + +targ_chrs <- c(1, 2, 8, 13, 14) + +some_cells <- unique(seg_locations$cell_id)[1:5] + +seg_locations$chr <- factor( + seg_locations$chr, + levels = gtools::mixedsort(unique(seg_locations$chr)) +) + +ggplot( + dplyr::filter(seg_locations, cell_id %in% some_cells & chr %in% targ_chrs) +) + + geom_vline( + # add genome features. Hm, maybe just do as vlines + data = dplyr::filter(chr_attrs, chr %in% targ_chrs), + # aes(xmin=start, xmax=end, ymin=-1, ymax=-2, fill=section_id) + aes(xintercept = value), + color = "black" + ) + + geom_segment( + aes(x = start, xend = end, y = state, color = seg_span_event), + linewidth = 3 + ) + + facet_grid( + cell_id ~ chr, + scales = "free", + space = "free_x", + switch = "x", + axes = "all_x", + ) + + theme_classic() + + theme( + axis.text.x = element_blank(), + strip.placement = "outside", + strip.background = element_blank() + ) + + scale_color_manual(values = dlptools::GEN_PLOT_COLS) +``` + + +

@@ -313,7 +431,24 @@ dlptools::mode_ploidy( dplyr::slice_head(n = 5) ``` +
+ +Related, we can also infer if a CN change is a gain or loss, relative to ploidy: +```{r} +dlptools::mark_cn_relative_to_ploidy( + ex_reads, # or can pass segments df + df_type = "reads" # df_type = "segs" +) |> + dplyr::relocate( + # key added columns + mode_ploidy, cn_v_ploidy + ) |> + dplyr::slice_head(n = 5) +``` + +
+
## Long to Wide Reads (or segments) @@ -359,58 +494,8 @@ dlptools::ASCN_PHASE_COLORS # dlptools::BAF_COLORS ``` -## Data Importing - -If you've followed a semi-standard approach for downloading such as this: - -```bash -# bash -cd where/to/save/my_dlp/ - -ticket="SC-8382" -azcopy copy https://singlecellresults.blob.core.windows.net/results/${ticket}/results/annotation/ ${ticket} --recursive -azcopy copy https://singlecellresults.blob.core.windows.net/results/${ticket}/results/hmmcopy/ ${ticket} --recursive - -``` - -which produces a directory with this sort of structure: - -```bash -├── SC-8382 -│ ├── annotation -│ │ ├── metrics.csv.gz -│ │ ├── #[...other files ...] -│ └── hmmcopy -│ │ ├── #[...other files ...] -│ ├── reads.csv.gz -│ ├── segments.csv.gz -├── SC-8408 -│ ├── annotation -│ │ ├── metrics.csv.gz -│ │ ├── #[...other files ...] -│ └── hmmcopy -│ │ ├── #[...other files ...] -│ ├── reads.csv.gz -│ ├── segments.csv.gz -├── SC-8650 -``` - -then there are some functions that can help with loading data from these into a consistent dataframe: - -```{r, eval=FALSE} -dlp_dir <- "/where/to/save/my_dlp/" - -# loading metrics -metrics <- dlptools::import_dlp_files(dlp_dir, "metrics") - -# loading reads -reads_df <- dlptools::import_dlp_files(dlp_dir, "reads") - -# loading segments...but generally this isn't a great idea -# segs_df <- dlptools::import_dlp_files(dlp_dir, "segments") -``` - - +
+
## Other From a9d12f7b041056a9596277fb1f76883f70fa6e56 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Thu, 28 Aug 2025 11:38:49 -0700 Subject: [PATCH 6/7] chore: version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5d978bb..1d2b7ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: dlptools Title: DLPTools: Handling DLP+ Data -Version: 0.3.17 +Version: 0.3.18 Authors@R: person("Ben", "Furman", , "bfurman@bccrc.ca", role = c("aut", "cre")) Description: A collections of functions for basic manipulation and plotting of From 85fb251962603a0d061d9beea28719535529b6ab Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Thu, 28 Aug 2025 11:57:12 -0700 Subject: [PATCH 7/7] bugfix: move to direct imports. --- vignettes/feature_extraction.Rmd | 44 +++++++++++++++++++------------- vignettes/various_functions.Rmd | 23 ++++++++--------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/vignettes/feature_extraction.Rmd b/vignettes/feature_extraction.Rmd index cc808d3..f2cc62f 100644 --- a/vignettes/feature_extraction.Rmd +++ b/vignettes/feature_extraction.Rmd @@ -15,7 +15,7 @@ knitr::opts_chunk$set( ``` ```{r setup, message=FALSE, warning=FALSE} -library(tidyverse) +library(ggplot2) library(dlptools) ``` @@ -217,7 +217,7 @@ Then, we can just count each category and leave the counts per cell per feature -```{r, fig.width=8, fig.height=4, message=FALSE, warning=FALSE} +```{r, message=FALSE, warning=FALSE} feat_cols <- c( "seg_bin", "cn_bin", "seg_change", "seg_shape", "cn_v_ploidy", "seg_span_event" @@ -234,9 +234,9 @@ total_counts <- purrr::map(feat_cols, \(feat_col) { dplyr::relocate(cell_id, feature, category) }) |> purrr::list_rbind() +``` - - +```{r, echo=FALSE, fig.width=8, fig.height=4} ggplot( dplyr::filter(total_counts, cell_id %in% unique(total_counts$cell_id) [1:4]), @@ -257,7 +257,6 @@ ggplot( ) ``` -

@@ -298,7 +297,7 @@ Also, probably want to ask if crossing the category actually makes sense and add But, forging ahead bravely: -```{r, fig.width=8, fig.height=4} +```{r} # collect all possible feature labels crossed_counts <- feat_anno_segs |> @@ -316,8 +315,9 @@ crossed_counts <- feat_anno_segs |> sep = ":" ) ) +``` - +```{r, echo=FALSE, fig.width=8, fig.height=4} ggplot( dplyr::filter(crossed_counts, cell_id %in% unique(total_counts$cell_id) [1:4]), @@ -333,12 +333,13 @@ ggplot( xlab("Crossed Feature") ``` +

Or, you could do some sort of combination of cross and uncrossed? I don't know. This whole signature field is the wild west and people do all kinds of things. -```{r, fig.width=8, fig.height=4, message=FALSE, warning=FALSE} +```{r, message=FALSE, warning=FALSE} # count the Wu features as crossed: 90 feature categories wu_crossed_counts <- dlptools::extract_wu_features(segs_df) |> dplyr::select(cell_id, feat_cat, n) |> @@ -367,23 +368,30 @@ all_feat_counts <- purrr::list_rbind(list( dplyr::mutate( feature_group = factor(feature_group, levels = unique(feature_group)) ) +``` -ggplot( + +```{r, echo=FALSE, fig.width=8, fig.height=4} +ggplot2::ggplot( dplyr::filter(all_feat_counts, cell_id %in% unique(total_counts$cell_id) [1:4]), - aes(x = feat_cat, y = n, fill = feature_group) + ggplot2::aes(x = feat_cat, y = n, fill = feature_group) ) + - geom_col(show.legend = FALSE) + - facet_grid(cell_id ~ feature_group, axes = "all_x", scales = "free_x", space = "free") + - scale_fill_manual(values = dlptools::GEN_PLOT_COLS) + - theme_classic() + - theme( - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), + ggplot2::geom_col(show.legend = FALSE) + + ggplot2::facet_grid( + cell_id ~ feature_group, + axes = "all_x", scales = "free_x", space = "free" ) + - xlab("Mixed Features") + ggplot2::scale_fill_manual(values = dlptools::GEN_PLOT_COLS) + + ggplot2::theme_classic() + + ggplot2::theme( + axis.text.x = ggplot2::element_blank(), + axis.ticks.x = ggplot2::element_blank(), + ) + + ggplot2::xlab("Mixed Features") ``` +


diff --git a/vignettes/various_functions.Rmd b/vignettes/various_functions.Rmd index a70f603..b253dba 100644 --- a/vignettes/various_functions.Rmd +++ b/vignettes/various_functions.Rmd @@ -18,7 +18,6 @@ knitr::opts_chunk$set( ```{r, warning=FALSE, message=FALSE, echo=FALSE} -library(tidyverse) library(vroom) library(dlptools) ``` @@ -370,34 +369,34 @@ seg_locations$chr <- factor( levels = gtools::mixedsort(unique(seg_locations$chr)) ) -ggplot( +ggplot2::ggplot( dplyr::filter(seg_locations, cell_id %in% some_cells & chr %in% targ_chrs) ) + - geom_vline( + ggplot2::geom_vline( # add genome features. Hm, maybe just do as vlines data = dplyr::filter(chr_attrs, chr %in% targ_chrs), # aes(xmin=start, xmax=end, ymin=-1, ymax=-2, fill=section_id) - aes(xintercept = value), + ggplot2::aes(xintercept = value), color = "black" ) + - geom_segment( - aes(x = start, xend = end, y = state, color = seg_span_event), + ggplot2::geom_segment( + ggplot2::aes(x = start, xend = end, y = state, color = seg_span_event), linewidth = 3 ) + - facet_grid( + ggplot2::facet_grid( cell_id ~ chr, scales = "free", space = "free_x", switch = "x", axes = "all_x", ) + - theme_classic() + - theme( - axis.text.x = element_blank(), + ggplot2::theme_classic() + + ggplot2::theme( + axis.text.x = ggplot2::element_blank(), strip.placement = "outside", - strip.background = element_blank() + strip.background = ggplot2::element_blank() ) + - scale_color_manual(values = dlptools::GEN_PLOT_COLS) + ggplot2::scale_color_manual(values = dlptools::GEN_PLOT_COLS) ```