From ed33af4ff71e563570d3121a3a460342adbd5ade Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Tue, 9 Sep 2025 15:24:34 -0700 Subject: [PATCH 1/5] feat: add some process based feature extractions. --- DESCRIPTION | 2 +- NAMESPACE | 6 + R/chromosome_layouts.R | 64 ++++++- R/cn_features.R | 236 +++++++++++++++++++++++- R/utils.R | 18 +- man/confirm_cols_present.Rd | 20 ++ man/create_chrom_arm_intervals.Rd | 19 ++ man/create_chrom_window_intervals.Rd | 19 ++ man/extract_bp_per_arm.Rd | 21 +++ man/extract_bp_per_window.Rd | 22 +++ man/extract_breakpoints.Rd | 38 ++++ man/extract_changepoint.Rd | 51 +++++ man/extract_segment_position_feature.Rd | 2 +- man/extract_segment_sizes.Rd | 30 +++ man/is_chr_used_in_chroms.Rd | 3 + 15 files changed, 545 insertions(+), 6 deletions(-) create mode 100644 man/confirm_cols_present.Rd create mode 100644 man/create_chrom_arm_intervals.Rd create mode 100644 man/create_chrom_window_intervals.Rd create mode 100644 man/extract_bp_per_arm.Rd create mode 100644 man/extract_bp_per_window.Rd create mode 100644 man/extract_breakpoints.Rd create mode 100644 man/extract_changepoint.Rd create mode 100644 man/extract_segment_sizes.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 1d2b7ec..8bfe9bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: dlptools Title: DLPTools: Handling DLP+ Data -Version: 0.3.18 +Version: 0.3.19 Authors@R: person("Ben", "Furman", , "bfurman@bccrc.ca", role = c("aut", "cre")) Description: A collections of functions for basic manipulation and plotting of diff --git a/NAMESPACE b/NAMESPACE index 77981cd..f3bd013 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,11 +24,17 @@ export(chip_plot) export(compute_tip_sibling_distances) export(convert_dists_to_pairwise) export(convert_long_reads_to_wide) +export(create_chrom_arm_intervals) +export(create_chrom_window_intervals) export(create_expected_bins) export(cust_mode) export(expand_length_to_bins) +export(extract_bp_per_arm) +export(extract_bp_per_window) +export(extract_breakpoints) export(extract_ploidy_cn_feature) export(extract_segment_position_feature) +export(extract_segment_sizes) export(extract_sigminer_wang_features) export(extract_wu_features) export(factor_column_mixedsort) diff --git a/R/chromosome_layouts.R b/R/chromosome_layouts.R index d684fc4..7c5a823 100644 --- a/R/chromosome_layouts.R +++ b/R/chromosome_layouts.R @@ -21,7 +21,8 @@ load_chrom_info_file <- function(version = c("hg19", "hg38")) { ) |> dplyr::mutate( chr = stringr::str_replace(chr, "chr", "") - ) + ) |> + dplyr::select(-misc) return(chrom_info) } @@ -437,3 +438,64 @@ mark_segs_chromosome_span <- function( return(segs_df) } + +#' break a chromosome up into intervals of a defined window size +#' +#' @param window_size integer. The size of window to split the chromosome into. +#' @param genome_version string. "hg19" (default) or "hg38" +#' @return list. Named by chromosome, vectors of window starts. +#' @export +create_chrom_window_intervals <- function( + window_size, + genome_version = c("hg19", "hg38")) { + genome_version <- match.arg(genome_version) + + chr_info <- suppressWarnings( + chr_info <- load_chrom_info_file(version = genome_version) + ) + intervals <- purrr::map(chr_info$total_length, \(total_length) { + max_end <- total_length + window_size + intervals <- seq(1, max_end, window_size) + intervals + }) + names(intervals) <- chr_info$chr + return(intervals) +} + +#' create a list of intervals spanning chromosome arms +#' +#' Splits a chromosome at the middle of the centromere. Sets up intervals for +#' splitting each chromosome arm. +#' +#' @param genome_version string. "hg19" (default) or "hg38" +#' @return list. Named by chromosome, vectors of how to break a chromsome into +#' intervals of arms. +#' @export +create_chrom_arm_intervals <- function(genome_version = c("hg19", "hg38")) { + chrom_layouts <- suppressWarnings( + load_ucsc_centromeres(version = genome_version) + ) + chrom_lengths <- suppressWarnings( + load_chrom_info_file(version = genome_version) + ) + + chrom_info <- dplyr::left_join( + chrom_lengths, + chrom_layouts, + by = dplyr::join_by("chr" == "chrom") + ) + + intervals <- purrr::pmap( + dplyr::select( + chrom_info, + total_length, centro_start, centro_end + ), + \(total_length, centro_start, centro_end) { + middle_centro <- centro_start + round(((centro_end - centro_start) / 2)) + c(1, middle_centro, total_length + 1) + } + ) + names(intervals) <- chrom_info$chr + + return(intervals) +} diff --git a/R/cn_features.R b/R/cn_features.R index 9ecd4e9..c315ba4 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -260,7 +260,7 @@ extract_ploidy_cn_feature <- function( #' count the segment-span-on-chromosome event types. #' -#' Critical to this fuction is [dlptools::mark_segs_chromosome_span()]. It is +#' Critical to this function is [dlptools::mark_segs_chromosome_span()]. It is #' important to read and understand that function and its arguments. #' #' This function basically just calls [dlptools::mark_segs_chromosome_span()] @@ -299,3 +299,237 @@ extract_segment_position_feature <- function( return(seg_span_counts) } + + +extract_process_features <- function(segs_df) { + +} + +#' sizes of the segments +#' +#' This function is as simple as it sounds, end - start. +#' +#' Used as a setup for extracting process based features *á la*: +#' * Macintyre et al. 2018 +#' * Drews et al. 2018 +#' +#' Really, something like this should be used to generate values that then you +#' define categories for to count occurrences. +#' +#' Related functions include: [dlptools::extract_extract_changepoint] and +#' [dlptools::extract_breakpoints] +#' +#' @param segs_df dataframe. copy number segments for samples. +#' @return dataframe. sample ids and all observed segment sizes. +#' @export +extract_segment_sizes <- function(segs_df) { + confirm_cols_present(c("start", "end"), segs_df) + + seg_sizes <- segs_df |> + dplyr::mutate(seg_size = end - start + 1) |> + dplyr::select(cell_id, seg_size) + + return(seg_sizes) +} + +#' extract the CN change between adjacent segments. +#' +#' Change points are the change in copy number state between adjacent segments. +#' If one segment is 4 and the adjacent segment is 1, the change point is 3. +#' +#' Change points are based on the difference to the "left" adjacent segment, +#' when moving from BP 1 to the end of a chromosome. So if there are 3 +#' segments: 4 - 1 - 2, the change points would be: +#' |1 - 4| and |2 - 1| resulting in: 3, 1 +#' +#' For the first segment on a chromosome, +#' \href{Drews et al.}{https://www.nature.com/articles/s41586-022-04789-9} +#' compared it to a hypothetical diploid. So if the first segment on a +#' chromosome is 5, the change point would be 5 - 2 = 3. That's fine if the +#' base genome is diploid, but doesn't work so well for other ploidies, or +#' cases where you don't want to assume a diploid base case. +#' +#' `first_seg_correction` provides options over what to do. "diploid" for Drews +#' solution, "cn_mode" to compare to sample ploidy estimate based on mode, or +#' "ignore" to not count anything for first segments. +#' +#' @param segs_df dataframe. Sample copy number segments. +#' first_seg_correction +#' @param sample_col string. Name of column with cell/sample names +#' @param chrom_col string. Name of column with chromosome names +#' @param cn_col string. Name of column with segment copy number states. +#' @param ... can pass arguments to [dlptools::segs_to_reads] +#' @return dataframe. Sample IDs and the observed breakpoint counts per scope. +extract_changepoint <- function( + segs_df, + first_seg_correction = c("cn_mode", "diploid", "ignore"), + sample_col = "cell_id", + chrom_col = "chr", + cn_col = "state", + ...) { + first_seg_correction <- match.arg(first_seg_correction) + + if (first_seg_correction == "cn_mode") { + mode_ploidies <- segs_to_reads(segs_df, ...) |> + mode_ploidy( + sample_col = sample_col, + cn_col = cn_col, chrom_col = chrom_col + ) + + segs_df <- dplyr::left_join( + segs_df, mode_ploidies, + by = sample_col + ) |> + dplyr::rename( + first_seg_comp_val = mode_ploidy + ) + } else if (first_seg_correction == "diploid") { + segs_df <- dplyr::mutate(segs_df, first_seg_comp_val = 2) + } else if (first_seg_correction == "ignore") { + segs_df <- dplyr::mutate(segs_df, first_seg_comp_val = NA) + } + + change_points <- segs_df |> + dplyr::group_by(.data[[sample_col]], .data[[chrom_col]]) |> + dplyr::mutate( + left_seg = dplyr::lag(.data[[cn_col]]), + left_seg = dplyr::if_else( + is.na(left_seg), + first_seg_comp_val, + left_seg + ), + cn_change = abs(.data[[cn_col]] - left_seg) + ) |> + dplyr::ungroup() |> + dplyr::filter(!is.na(cn_change)) |> + dplyr::select(.data[[sample_col]], cn_change) + + return(change_points) +} + +#' convenience function to extracting breakpoints per window size +#' +#' just calling the generic [dlptools::extract_breakpoints], with some +#' pre-loaded options. See that function for details. +#' @return dataframe. Sample IDs and the observed breakpoint counts observed in +#' the specified window size. +#' @export +extract_bp_per_window <- function( + segs_df, + window_size = 10e6, # 10Mb is standard for most of these papers + sample_col = "cell_id", + chrom_col = "chr", + genome_version = c("hg19", "hg38")) { + extract_breakpoints( + segs_df = segs_df, + scope = "windows", + window_size = window_size, + genome_version = genome_version, + sample_col = sample_col, + chrom_col = chrom_col, + ) +} + +#' convenience function to extracting breakpoints per chromosome arm +#' +#' just calling the generic [dlptools::extract_breakpoints], with some +#' pre-loaded options. See that function for details. +#' @return dataframe. Sample IDs and the observed breakpoint counts on +#' chromosome arms. +#' @export +extract_bp_per_arm <- function( + segs_df, + sample_col = "cell_id", + chrom_col = "chr", + genome_version = c("hg19", "hg38")) { + extract_breakpoints( + segs_df = segs_df, + scope = "arms", + genome_version = genome_version, + sample_col = sample_col, + chrom_col = chrom_col, + ) +} + +#' extracting counts of breakpoints per user-defined scope +#' +#' Counting the number of breakpoints (i.e., transitions between copy number +#' segments) per arm or per window (typically 10Mb). +#' +#' @param segs_df dataframe. Copy number segments for samples +#' @param scope string. "windows" (default) or "arms", i.e., what to target for +#' counting +#' @param genome_version string. "hg19" (default) or "hg38" +#' @param window_size integer. How big of a window to use, if extracting counts +#' per a "windows" scope. Most publications use 10Mb, which is the default +#' (10e6) +#' @param sample_col string. Name of column with cell/sample names +#' @param chrom_col string. Name of column with chromosome names +#' @return dataframe. Sample IDs and the observed breakpoint counts per scope. +#' @export +extract_breakpoints <- function( + segs_df, + scope = c("windows", "arms"), + genome_version = c("hg19", "hg38"), + window_size = 10e6, # 10Mb is standard for most of these papers + sample_col = "cell_id", + chrom_col = "chr") { + scope <- match.arg(scope) + + genome_version <- match.arg(genome_version) + + confirm_cols_present("end", segs_df) + + chr_info <- suppressWarnings(load_chrom_info_file(version = genome_version)) + + # DLP can output bins longer than the chromosome to maintain it's 500Kb + # bin size + segs_df_p <- dplyr::left_join( + segs_df, chr_info, + by = setNames(chrom_col, "chr") + ) |> + dplyr::mutate( + end = dplyr::if_else(end > total_length, total_length, end) + ) + + # set up chromosome intervals, could then generalize to arms? + if (scope == "windows") { + intervals <- create_chrom_window_intervals( + window_size = window_size, + genome_version = genome_version + ) + } else if (scope == "arms") { + intervals <- create_chrom_arm_intervals( + genome_version = genome_version + ) + } + + to_process <- dplyr::distinct( + segs_df, + sample = .data[[sample_col]], + chrom = .data[[chrom_col]] + ) + + bp_counts <- purrr::map2_df( + to_process$sample, + to_process$chrom, + \(sample, chrom) { + ends <- segs_df_p |> + dplyr::filter( + .data[[sample_col]] == sample & + .data[[chrom_col]] == chrom + ) |> + dplyr::pull(end) + + counts <- table(cut(ends, breaks = intervals[[chrom]])) + + tibble::tibble( + !!sample_col := sample, + chrom = chrom, + breakpoints = unname(counts) + ) + } + ) + + return(bp_counts) +} diff --git a/R/utils.R b/R/utils.R index 7e65c26..27565c9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -224,7 +224,6 @@ make_cellid_matrix <- function(cell_df, name_col, val_col) { return(cell_mtx) } - #' find most common element in a vector #' #' For ties, just the first one in the list is returned. @@ -241,10 +240,25 @@ cust_mode <- function(x, na.rm = FALSE) { return(ux[which.max(tabulate(match(x, ux)))]) } - #' detect if chromosomes labels include "chr" +#' #' @param chrom_vec vector of chromosome labels, probably pulled from a #' dataframe of cn information. +#' @return boolean. TRUE if "chr" found in the strings. is_chr_used_in_chroms <- function(chrom_vec) { return(any(stringr::str_detect(unique(chrom_vec), "chr"))) } + +#' confirm if columns exist in dataframe +#' +#' @param cols vector of strings. Names of columns you are searching for in the +#' dataframe. +#' @param df. The dataframe to look in for the columns +#' @return TRUE or raises a stop +confirm_cols_present <- function(cols, df) { + if (!all(cols %in% colnames(df))) { + col_names <- paste(cols, collapse = ", ") + stop(paste0("need all columns: ", col_names)) + } + return(TRUE) +} diff --git a/man/confirm_cols_present.Rd b/man/confirm_cols_present.Rd new file mode 100644 index 0000000..5a23264 --- /dev/null +++ b/man/confirm_cols_present.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{confirm_cols_present} +\alias{confirm_cols_present} +\title{confirm if columns exist in dataframe} +\usage{ +confirm_cols_present(cols, df) +} +\arguments{ +\item{cols}{vector of strings. Names of columns you are searching for in the +dataframe.} + +\item{df.}{The dataframe to look in for the columns} +} +\value{ +TRUE or raises a stop +} +\description{ +confirm if columns exist in dataframe +} diff --git a/man/create_chrom_arm_intervals.Rd b/man/create_chrom_arm_intervals.Rd new file mode 100644 index 0000000..8f805ce --- /dev/null +++ b/man/create_chrom_arm_intervals.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\name{create_chrom_arm_intervals} +\alias{create_chrom_arm_intervals} +\title{create a list of intervals spanning chromosome arms} +\usage{ +create_chrom_arm_intervals(genome_version = c("hg19", "hg38")) +} +\arguments{ +\item{genome_version}{string. "hg19" (default) or "hg38"} +} +\value{ +list. Named by chromosome, vectors of how to break a chromsome into +intervals of arms. +} +\description{ +Splits a chromosome at the middle of the centromere. Sets up intervals for +splitting each chromosome arm. +} diff --git a/man/create_chrom_window_intervals.Rd b/man/create_chrom_window_intervals.Rd new file mode 100644 index 0000000..01a17db --- /dev/null +++ b/man/create_chrom_window_intervals.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chromosome_layouts.R +\name{create_chrom_window_intervals} +\alias{create_chrom_window_intervals} +\title{break a chromosome up into intervals of a defined window size} +\usage{ +create_chrom_window_intervals(window_size, genome_version = c("hg19", "hg38")) +} +\arguments{ +\item{window_size}{integer. The size of window to split the chromosome into.} + +\item{genome_version}{string. "hg19" (default) or "hg38"} +} +\value{ +list. Named by chromosome, vectors of window starts. +} +\description{ +break a chromosome up into intervals of a defined window size +} diff --git a/man/extract_bp_per_arm.Rd b/man/extract_bp_per_arm.Rd new file mode 100644 index 0000000..2aa326a --- /dev/null +++ b/man/extract_bp_per_arm.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_bp_per_arm} +\alias{extract_bp_per_arm} +\title{convenience function to extracting breakpoints per chromosome arm} +\usage{ +extract_bp_per_arm( + segs_df, + sample_col = "cell_id", + chrom_col = "chr", + genome_version = c("hg19", "hg38") +) +} +\value{ +dataframe. Sample IDs and the observed breakpoint counts on +chromosome arms. +} +\description{ +just calling the generic \link{extract_breakpoints}, with some +pre-loaded options. See that function for details. +} diff --git a/man/extract_bp_per_window.Rd b/man/extract_bp_per_window.Rd new file mode 100644 index 0000000..9625fe7 --- /dev/null +++ b/man/extract_bp_per_window.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_bp_per_window} +\alias{extract_bp_per_window} +\title{convenience function to extracting breakpoints per window size} +\usage{ +extract_bp_per_window( + segs_df, + window_size = 1e+07, + sample_col = "cell_id", + chrom_col = "chr", + genome_version = c("hg19", "hg38") +) +} +\value{ +dataframe. Sample IDs and the observed breakpoint counts observed in +the specified window size. +} +\description{ +just calling the generic \link{extract_breakpoints}, with some +pre-loaded options. See that function for details. +} diff --git a/man/extract_breakpoints.Rd b/man/extract_breakpoints.Rd new file mode 100644 index 0000000..e5e8ab5 --- /dev/null +++ b/man/extract_breakpoints.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_breakpoints} +\alias{extract_breakpoints} +\title{extracting counts of breakpoints per user-defined scope} +\usage{ +extract_breakpoints( + segs_df, + scope = c("windows", "arms"), + genome_version = c("hg19", "hg38"), + window_size = 1e+07, + sample_col = "cell_id", + chrom_col = "chr" +) +} +\arguments{ +\item{segs_df}{dataframe. Copy number segments for samples} + +\item{scope}{string. "windows" (default) or "arms", i.e., what to target for +counting} + +\item{genome_version}{string. "hg19" (default) or "hg38"} + +\item{window_size}{integer. How big of a window to use, if extracting counts +per a "windows" scope. Most publications use 10Mb, which is the default +(10e6)} + +\item{sample_col}{string. Name of column with cell/sample names} + +\item{chrom_col}{string. Name of column with chromosome names} +} +\value{ +dataframe. Sample IDs and the observed breakpoint counts per scope. +} +\description{ +Counting the number of breakpoints (i.e., transitions between copy number +segments) per arm or per window (typically 10Mb). +} diff --git a/man/extract_changepoint.Rd b/man/extract_changepoint.Rd new file mode 100644 index 0000000..ca8494d --- /dev/null +++ b/man/extract_changepoint.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_changepoint} +\alias{extract_changepoint} +\title{extract the CN change between adjacent segments.} +\usage{ +extract_changepoint( + segs_df, + first_seg_correction = c("cn_mode", "diploid", "ignore"), + sample_col = "cell_id", + chrom_col = "chr", + cn_col = "state", + ... +) +} +\arguments{ +\item{segs_df}{dataframe. Sample copy number segments. +first_seg_correction} + +\item{sample_col}{string. Name of column with cell/sample names} + +\item{chrom_col}{string. Name of column with chromosome names} + +\item{cn_col}{string. Name of column with segment copy number states.} + +\item{...}{can pass arguments to \link{segs_to_reads}} +} +\value{ +dataframe. Sample IDs and the observed breakpoint counts per scope. +} +\description{ +Change points are the change in copy number state between adjacent segments. +If one segment is 4 and the adjacent segment is 1, the change point is 3. +} +\details{ +Change points are based on the difference to the "left" adjacent segment, +when moving from BP 1 to the end of a chromosome. So if there are 3 +segments: 4 - 1 - 2, the change points would be: +|1 - 4| and |2 - 1| resulting in: 3, 1 + +For the first segment on a chromosome, +\href{Drews et al.}{https://www.nature.com/articles/s41586-022-04789-9} +compared it to a hypothetical diploid. So if the first segment on a +chromosome is 5, the change point would be 5 - 2 = 3. That's fine if the +base genome is diploid, but doesn't work so well for other ploidies, or +cases where you don't want to assume a diploid base case. + +\code{first_seg_correction} provides options over what to do. "diploid" for Drews +solution, "cn_mode" to compare to sample ploidy estimate based on mode, or +"ignore" to not count anything for first segments. +} diff --git a/man/extract_segment_position_feature.Rd b/man/extract_segment_position_feature.Rd index c80553f..fa59162 100644 --- a/man/extract_segment_position_feature.Rd +++ b/man/extract_segment_position_feature.Rd @@ -26,7 +26,7 @@ segment.} tibble/dataframe of counts } \description{ -Critical to this fuction is \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}. It is +Critical to this function is \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}. It is important to read and understand that function and its arguments. } \details{ diff --git a/man/extract_segment_sizes.Rd b/man/extract_segment_sizes.Rd new file mode 100644 index 0000000..d2c611b --- /dev/null +++ b/man/extract_segment_sizes.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_segment_sizes} +\alias{extract_segment_sizes} +\title{sizes of the segments} +\usage{ +extract_segment_sizes(segs_df) +} +\arguments{ +\item{segs_df}{dataframe. copy number segments for samples.} +} +\value{ +dataframe. sample ids and all observed segment sizes. +} +\description{ +This function is as simple as it sounds, end - start. +} +\details{ +Used as a setup for extracting process based features \emph{á la}: +\itemize{ +\item Macintyre et al. 2018 +\item Drews et al. 2018 +} + +Really, something like this should be used to generate values that then you +define categories for to count occurrences. + +Related functions include: \link{extract_extract_changepoint} and +\link{extract_breakpoints} +} diff --git a/man/is_chr_used_in_chroms.Rd b/man/is_chr_used_in_chroms.Rd index c47a6a6..0b90333 100644 --- a/man/is_chr_used_in_chroms.Rd +++ b/man/is_chr_used_in_chroms.Rd @@ -10,6 +10,9 @@ is_chr_used_in_chroms(chrom_vec) \item{chrom_vec}{vector of chromosome labels, probably pulled from a dataframe of cn information.} } +\value{ +boolean. TRUE if "chr" found in the strings. +} \description{ detect if chromosomes labels include "chr" } From d9432a60722ef3bd700591004f1ebcdd537923f5 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Tue, 9 Sep 2025 15:30:11 -0700 Subject: [PATCH 2/5] feat: allow flexible sample column name. --- R/cn_features.R | 7 +++++-- man/extract_segment_sizes.Rd | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/cn_features.R b/R/cn_features.R index c315ba4..cf84628 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -320,14 +320,17 @@ extract_process_features <- function(segs_df) { #' [dlptools::extract_breakpoints] #' #' @param segs_df dataframe. copy number segments for samples. +#' @param sample_col string. Name of column with cell/sample names #' @return dataframe. sample ids and all observed segment sizes. #' @export -extract_segment_sizes <- function(segs_df) { +extract_segment_sizes <- function( + segs_df, + sample_col = "cell_id") { confirm_cols_present(c("start", "end"), segs_df) seg_sizes <- segs_df |> dplyr::mutate(seg_size = end - start + 1) |> - dplyr::select(cell_id, seg_size) + dplyr::select(.data[[sample_col]], seg_size) return(seg_sizes) } diff --git a/man/extract_segment_sizes.Rd b/man/extract_segment_sizes.Rd index d2c611b..3bff2bf 100644 --- a/man/extract_segment_sizes.Rd +++ b/man/extract_segment_sizes.Rd @@ -4,10 +4,12 @@ \alias{extract_segment_sizes} \title{sizes of the segments} \usage{ -extract_segment_sizes(segs_df) +extract_segment_sizes(segs_df, sample_col = "cell_id") } \arguments{ \item{segs_df}{dataframe. copy number segments for samples.} + +\item{sample_col}{string. Name of column with cell/sample names} } \value{ dataframe. sample ids and all observed segment sizes. From 381164392d398f029d3635d3d3be51cc3c9b5df4 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Wed, 10 Sep 2025 15:26:49 -0700 Subject: [PATCH 3/5] feat: count oscillations --- NAMESPACE | 2 + R/cn_features.R | 84 +++++++++++++++++++++++++- R/oscilations.R | 114 +++++++++++++++++++++++++++++++++++ man/count_oscillations.Rd | 41 +++++++++++++ man/decide_if_oscillation.Rd | 33 ++++++++++ man/extract_oscillations.Rd | 73 ++++++++++++++++++++++ man/extract_wu_features.Rd | 3 +- 7 files changed, 347 insertions(+), 3 deletions(-) create mode 100644 R/oscilations.R create mode 100644 man/count_oscillations.Rd create mode 100644 man/decide_if_oscillation.Rd create mode 100644 man/extract_oscillations.Rd diff --git a/NAMESPACE b/NAMESPACE index f3bd013..00e17a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ export(chip_plot) export(compute_tip_sibling_distances) export(convert_dists_to_pairwise) export(convert_long_reads_to_wide) +export(count_oscillations) export(create_chrom_arm_intervals) export(create_chrom_window_intervals) export(create_expected_bins) @@ -32,6 +33,7 @@ export(expand_length_to_bins) export(extract_bp_per_arm) export(extract_bp_per_window) export(extract_breakpoints) +export(extract_oscillations) export(extract_ploidy_cn_feature) export(extract_segment_position_feature) export(extract_segment_sizes) diff --git a/R/cn_features.R b/R/cn_features.R index cf84628..49d88b0 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -11,7 +11,8 @@ #' They employ 4 base features, that they cross for 90 categories: #' 1. CN states: 5 bins, 0-1, 2, 3, 4, 5+ #' 2. segment size: 3 bins, <5 MB, 5-10Mb, 10Mb, -#' 3. segment shape: 3 bins, LL (low left, low right segment), HH (high left, high right segment), OT (other) +#' 3. segment shape: 3 bins, LL (low left, low right segment), HH (high left, +#' high right segment), OT (other) #' 4. segment change: 2 bins: AA (difference between surrounding segments <= #' some critical value), or BB #' @@ -31,7 +32,8 @@ #' more than breaks. Defaults follow paper. Default is < 5Mb, 5--10Mb, > 10Mb #' specified as c(5e6, 10e6 + 1). Internally, base::cut() is used, so 2 splits #' produces 3 bins. -#' @param annotate_input boolean. return input dataframe annotating each segment with the feature categories it falls into. +#' @param annotate_input boolean. return input dataframe annotating each +#' segment with the feature categories it falls into. #' @param return_matrix boolean. Return a cell-by-feature matrix of counts. #' @param ... can pass change_split_val to alter critical value for AA/BB split #' @return default return is a tibble of feature counts for each cell id. @@ -536,3 +538,81 @@ extract_breakpoints <- function( return(bp_counts) } + +#' extract the legths of chains of osscilating copy number segments +#' +#' Osscilating segments of copy numbers take from of N-M-N, e.g., 3-4-3. The +#' length of these chains reflects the number of oscillations. Chains are +#' evaluated in 3-segment intervals (the minimum required to recognize an +#' oscillation). Chromosomse with < 3 segments cannot have oscillations, and +#' will receive a 0. +#' +#' Chromosomes are evaluated in 3-segments sliding windows, with a window +#' receiving a 1 if the 3-set is an oscillation, and a 0 if not. Adjacent +#' oscillation sets are summed, non-osscilating sets are left as 0. +#' +#' Examples of Chain : length +#' * 2-3-4: 0 +#' * 3-4-3 : 1 +#' * 3-4-3-4: 2 +#' * 3-4-3-2: 1 0 (3-4-3 is a chain, 4-3-2 is not) +#' * 2-3-4-5: 0 0 +#' +#' An early paper arguing for the use of oscillations to detect chromothripsis +#' outlined that oscillations should be within 1 or 2 CN values. Using N-M-N +#' notation from above, this would mean that Ns should be the same value and M +#' would be within 1 or 2 copy values of N. Some pubilications relax these +#' constraints, and for example allow M to be any value other than N. This +#' means that 3-300-3 is counted as an oscillation by several publications. +#' Granted, this is an oscillation, but not likely to be one indicative of +#' chromothripsis. +#' +#' Others allow some flexiblity in the matching of the first and second N, +#' allowing them to be within some margin of difference. Componding these two +#' things leads to fairly extreme senarios, with some methods allowing chains +#' like 5-300-6 to count as an oscillation. +#' +#' These behaviours can be replicated here, but isn't recommended. By default, +#' Ns must match exactly, and M must be within 2 CNs. +#' +#' @param segs_df dataframe. Copy number segments for samples. +#' @param middle_bound integer. How many copy number values away the middle +#' segment can be from the end segments to count as an oscillation. Default 2 +#' (e.g., 3-4-3 and 3-5-3 both count as oscillations, but 3-7-3 would not). +#' @param ends_bound integer. How many CN values apart the two ends of a +#' 3-segment set are allow to be to count as an oscillation. Default is 0, +#' i.e., they must be the same copy number. +#' @param sample_col string. Name of column with cell/sample names +#' @param chrom_col string. Name of column with chromosome names +#' @param cn_col string. Name of column with segment copy number states. +#' @return dataframe. sample IDs and observed chain lengths. +#' @export +extract_oscillations <- function( + segs_df, + middle_bound = 2, + ends_bound = 0, + sample_col = "cell_id", + chrom_col = "chr", + cn_col = "state") { + # TODO: this is pretty gross. Is it really better that filtering, + # splitting, or something else? + res <- segs_df %>% + split(.[[sample_col]]) |> + purrr::map( + .x = _, \(x) split(x[[cn_col]], x[[chrom_col]]) + ) |> + purrr::map(\(samp, samp_name) { + purrr::map(samp, \(chrom) { + count_oscillations(chrom) + }) + }) + + sample_osc_counts <- res |> + purrr::list_flatten(name_spec = "{outer}&&&{inner}") |> + tibble::enframe() |> + tidyr::unnest_longer(value) |> + tidyr::separate_wider_delim(name, names = c(sample_col, "chrom"), "&&&") |> + dplyr::select(dplyr::any_of(sample_col), OSc = value) + + return(sample_osc_counts) +} diff --git a/R/oscilations.R b/R/oscilations.R new file mode 100644 index 0000000..5553f6f --- /dev/null +++ b/R/oscilations.R @@ -0,0 +1,114 @@ +#' decide if a 3 segment chain counts as an oscillation +#' +#' See [dlptools::extract_oscillations] for a discussion of oscillations and +#' recognizing them in sample copy number segments. +#' +#' @param left_cn int. CN value of segment on left side of the chain +#' @param middle_cn int. CN value of segment middle of the chain +#' @param left_cn int. CN value of segment on right side of the chain +#' @param middle_bound int|Inf. Default Inf. How different the middle CN is +#' allowed to be from either side to count as an oscillation. See +#' [dlptools::extract_oscillations] for a discussion of this +#' @param ends_bound int. Default 0. How different the ends of a 3-segment set +#' are allowed to be to count as an oscillation +#' @return boolean. True if 3 CNs constitute an oscillation, False if not. +decide_if_oscillation <- function( + left_cn, middle_cn, right_cn, + middle_bound = Inf, + ends_bound = 0) { + # are the ends the same, or within a defined degree of difference + ends_match <- abs(left_cn - right_cn) <= ends_bound + # is the middle and within any boundary of difference + middle_different <- ( + middle_cn != right_cn & abs(middle_cn - right_cn) <= middle_bound + ) + is_osc <- ends_match & middle_different + return(is_osc) +} + +#' count the length of oscillating chains of CN. +#' +#' Given a vector of CN values, detect and summarize lengths of any oscillating +#' chains that are present in the sequence, along with 0s for non-oscillating +#' sets. +#' +#' A full discussion of this function can be found in the documentation of. +#' [dlptools::extract_oscillations]. +#' +#' Examples: +#' * 3-2-3 = 1 +#' * 1-2-3 = 0 +#' * 3-2-3-1-2-3 = 1 0 +#' * 3-2-3-2 = 2 +#' * 3-2-3-2-1 = 2 0 +#' +#' @param cn_vals vector of ints. Segment state values. Ideally from the +#' chromosome of 1 sample. +#' @param middle_bound int|Inf. Default 2. How different the middle CN is +#' allowed to be from either side to count as an oscillation. See +#' [dlptools::extract_oscillations] for a discussion of this. Default of 2 +#' means the middle CN must be within 2 copy values of the flanking segments. +#' @param ends_bound int. Default 0. How different the ends of a 3-segment set +#' are allowed to be to count as an oscillation +#' @return vector of ints. +#' @export +count_oscillations <- function( + cn_vals, + middle_bound = 2, + ends_bound = 0) { + if (length(cn_vals) < 3) { + return(0) # or NA? + } + + osc_count <- 0 + total_osc <- c() + for (i in 3:length(cn_vals)) { + # since we start on position 3, we are effectively at the "right_cn" + is_osc <- decide_if_oscillation( + left_cn = cn_vals[i - 2], + middle_cn = cn_vals[i - 1], + right_cn = cn_vals[i], + middle_bound = middle_bound, + ends_bound = ends_bound + ) + # this would be a vanilla comparison, which most papers take, the + # defaults of the function reflect this case where basically the ends + # need to match, and the middle can be anything. + # is_osc = cn_vals[i-2] == cn_vals[i] && cn_vals[i-1] != cn_vals[i] + + is_end_of_seq <- i == length(cn_vals) + + if (is_osc) { + # accumulate a count for it being within an oscillation + osc_count <- osc_count + 1 + } else if (!is_osc && !is_end_of_seq) { + # if we are not in an oscillation and not at the terminal end + if (osc_count > 0) { + # store any oscillation we have accumulated + total_osc <- c(total_osc, osc_count) + # reset the oscillation count, as we are not in an oscillation + osc_count <- 0 + } + # add 0 for the current 3-set segment not being an oscillation + total_osc <- c(total_osc, 0) + } + + # set up cases to catch the last 3-set segment + if (is_end_of_seq) { + if (is_osc) { + # if we are ending on the oscillation, store the count + total_osc <- c(total_osc, osc_count) + } else if (!is_osc) { + # ending not on an oscillation + if (osc_count > 0) { + # store any accumulated oscillation + total_osc <- c(total_osc, osc_count) + } + # finally, store a 0 for the current 3-set segment not being an + # oscillation + total_osc <- c(total_osc, 0) + } + } + } + return(total_osc) +} diff --git a/man/count_oscillations.Rd b/man/count_oscillations.Rd new file mode 100644 index 0000000..006fb47 --- /dev/null +++ b/man/count_oscillations.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oscilations.R +\name{count_oscillations} +\alias{count_oscillations} +\title{count the length of oscillating chains of CN.} +\usage{ +count_oscillations(cn_vals, middle_bound = 2, ends_bound = 0) +} +\arguments{ +\item{cn_vals}{vector of ints. Segment state values. Ideally from the +chromosome of 1 sample.} + +\item{middle_bound}{int|Inf. Default 2. How different the middle CN is +allowed to be from either side to count as an oscillation. See +\link{extract_oscillations} for a discussion of this. Default of 2 +means the middle CN must be within 2 copy values of the flanking segments.} + +\item{ends_bound}{int. Default 0. How different the ends of a 3-segment set +are allowed to be to count as an oscillation} +} +\value{ +vector of ints. +} +\description{ +Given a vector of CN values, detect and summarize lengths of any oscillating +chains that are present in the sequence, along with 0s for non-oscillating +sets. +} +\details{ +A full discussion of this function can be found in the documentation of. +\link{extract_oscillations}. + +Examples: +\itemize{ +\item 3-2-3 = 1 +\item 1-2-3 = 0 +\item 3-2-3-1-2-3 = 1 0 +\item 3-2-3-2 = 2 +\item 3-2-3-2-1 = 2 0 +} +} diff --git a/man/decide_if_oscillation.Rd b/man/decide_if_oscillation.Rd new file mode 100644 index 0000000..e134dce --- /dev/null +++ b/man/decide_if_oscillation.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oscilations.R +\name{decide_if_oscillation} +\alias{decide_if_oscillation} +\title{decide if a 3 segment chain counts as an oscillation} +\usage{ +decide_if_oscillation( + left_cn, + middle_cn, + right_cn, + middle_bound = Inf, + ends_bound = 0 +) +} +\arguments{ +\item{left_cn}{int. CN value of segment on right side of the chain} + +\item{middle_cn}{int. CN value of segment middle of the chain} + +\item{middle_bound}{int|Inf. Default Inf. How different the middle CN is +allowed to be from either side to count as an oscillation. See +\link{extract_oscillations} for a discussion of this} + +\item{ends_bound}{int. Default 0. How different the ends of a 3-segment set +are allowed to be to count as an oscillation} +} +\value{ +boolean. True if 3 CNs constitute an oscillation, False if not. +} +\description{ +See \link{extract_oscillations} for a discussion of oscillations and +recognizing them in sample copy number segments. +} diff --git a/man/extract_oscillations.Rd b/man/extract_oscillations.Rd new file mode 100644 index 0000000..ee75b4c --- /dev/null +++ b/man/extract_oscillations.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cn_features.R +\name{extract_oscillations} +\alias{extract_oscillations} +\title{extract the legths of chains of osscilating copy number segments} +\usage{ +extract_oscillations( + segs_df, + middle_bound = 2, + ends_bound = 0, + sample_col = "cell_id", + chrom_col = "chr", + cn_col = "state" +) +} +\arguments{ +\item{segs_df}{dataframe. Copy number segments for samples.} + +\item{middle_bound}{integer. How many copy number values away the middle +segment can be from the end segments to count as an oscillation. Default 2 +(e.g., 3-4-3 and 3-5-3 both count as oscillations, but 3-7-3 would not).} + +\item{ends_bound}{integer. How many CN values apart the two ends of a +3-segment set are allow to be to count as an oscillation. Default is 0, +i.e., they must be the same copy number.} + +\item{sample_col}{string. Name of column with cell/sample names} + +\item{chrom_col}{string. Name of column with chromosome names} + +\item{cn_col}{string. Name of column with segment copy number states.} +} +\value{ +dataframe. sample IDs and observed chain lengths. +} +\description{ +Osscilating segments of copy numbers take from of N-M-N, e.g., 3-4-3. The +length of these chains reflects the number of oscillations. Chains are +evaluated in 3-segment intervals (the minimum required to recognize an +oscillation). Chromosomse with < 3 segments cannot have oscillations, and +will receive a 0. +} +\details{ +Chromosomes are evaluated in 3-segments sliding windows, with a window +receiving a 1 if the 3-set is an oscillation, and a 0 if not. Adjacent +oscillation sets are summed, non-osscilating sets are left as 0. + +Examples of Chain : length +\itemize{ +\item 2-3-4: 0 +\item 3-4-3 : 1 +\item 3-4-3-4: 2 +\item 3-4-3-2: 1 0 (3-4-3 is a chain, 4-3-2 is not) +\item 2-3-4-5: 0 0 +} + +An early paper arguing for the use of oscillations to detect chromothripsis +outlined that oscillations should be within 1 or 2 CN values. Using N-M-N +notation from above, this would mean that Ns should be the same value and M +would be within 1 or 2 copy values of N. Some pubilications relax these +constraints, and for example allow M to be any value other than N. This +means that 3-300-3 is counted as an oscillation by several publications. +Granted, this is an oscillation, but not likely to be one indicative of +chromothripsis. + +Others allow some flexiblity in the matching of the first and second N, +allowing them to be within some margin of difference. Componding these two +things leads to fairly extreme senarios, with some methods allowing chains +like 5-300-6 to count as an oscillation. + +These behaviours can be replicated here, but isn't recommended. By default, +Ns must match exactly, and M must be within 2 CNs. +} diff --git a/man/extract_wu_features.Rd b/man/extract_wu_features.Rd index fa684d3..528c12c 100644 --- a/man/extract_wu_features.Rd +++ b/man/extract_wu_features.Rd @@ -27,7 +27,8 @@ more than breaks. Defaults follow paper. Default is < 5Mb, 5--10Mb, > 10Mb specified as c(5e6, 10e6 + 1). Internally, base::cut() is used, so 2 splits produces 3 bins.} -\item{annotate_input}{boolean. return input dataframe annotating each segment with the feature categories it falls into.} +\item{annotate_input}{boolean. return input dataframe annotating each +segment with the feature categories it falls into.} \item{return_matrix}{boolean. Return a cell-by-feature matrix of counts.} From 065c9c8710dd7dd111bfaefe5de8e21584d78c1f Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Wed, 10 Sep 2025 16:15:39 -0700 Subject: [PATCH 4/5] docs: demonstrate new process feature functions. --- NAMESPACE | 1 + R/cn_features.R | 5 +- man/extract_changepoint.Rd | 6 +- man/extract_wu_features.Rd | 3 +- vignettes/feature_extraction.Rmd | 170 +++++++++++++++++++++++++++++-- 5 files changed, 174 insertions(+), 11 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 00e17a4..d930047 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ export(expand_length_to_bins) export(extract_bp_per_arm) export(extract_bp_per_window) export(extract_breakpoints) +export(extract_changepoint) export(extract_oscillations) export(extract_ploidy_cn_feature) export(extract_segment_position_feature) diff --git a/R/cn_features.R b/R/cn_features.R index 49d88b0..cfab6dd 100644 --- a/R/cn_features.R +++ b/R/cn_features.R @@ -359,12 +359,14 @@ extract_segment_sizes <- function( #' "ignore" to not count anything for first segments. #' #' @param segs_df dataframe. Sample copy number segments. -#' first_seg_correction +#' @param first_seg_correction string. Default: 'cn_mode'. Options include +#' "diploid" or "ignore". #' @param sample_col string. Name of column with cell/sample names #' @param chrom_col string. Name of column with chromosome names #' @param cn_col string. Name of column with segment copy number states. #' @param ... can pass arguments to [dlptools::segs_to_reads] #' @return dataframe. Sample IDs and the observed breakpoint counts per scope. +#' @export extract_changepoint <- function( segs_df, first_seg_correction = c("cn_mode", "diploid", "ignore"), @@ -530,7 +532,6 @@ extract_breakpoints <- function( tibble::tibble( !!sample_col := sample, - chrom = chrom, breakpoints = unname(counts) ) } diff --git a/man/extract_changepoint.Rd b/man/extract_changepoint.Rd index ca8494d..b4a9dd2 100644 --- a/man/extract_changepoint.Rd +++ b/man/extract_changepoint.Rd @@ -14,8 +14,10 @@ extract_changepoint( ) } \arguments{ -\item{segs_df}{dataframe. Sample copy number segments. -first_seg_correction} +\item{segs_df}{dataframe. Sample copy number segments.} + +\item{first_seg_correction}{string. Default: 'cn_mode'. Options include +"diploid" or "ignore".} \item{sample_col}{string. Name of column with cell/sample names} diff --git a/man/extract_wu_features.Rd b/man/extract_wu_features.Rd index 528c12c..004dad2 100644 --- a/man/extract_wu_features.Rd +++ b/man/extract_wu_features.Rd @@ -50,7 +50,8 @@ They employ 4 base features, that they cross for 90 categories: \enumerate{ \item CN states: 5 bins, 0-1, 2, 3, 4, 5+ \item segment size: 3 bins, <5 MB, 5-10Mb, 10Mb, -\item segment shape: 3 bins, LL (low left, low right segment), HH (high left, high right segment), OT (other) +\item segment shape: 3 bins, LL (low left, low right segment), HH (high left, +high right segment), OT (other) \item segment change: 2 bins: AA (difference between surrounding segments <= some critical value), or BB } diff --git a/vignettes/feature_extraction.Rmd b/vignettes/feature_extraction.Rmd index f2cc62f..fa6cc67 100644 --- a/vignettes/feature_extraction.Rmd +++ b/vignettes/feature_extraction.Rmd @@ -28,6 +28,28 @@ dlptools offers two externally defined feature extraction types: The Wang et al features are just a thin wrapper around [sigminer](https://github.com/ShixiangWang/sigminer?tab=readme-ov-file) functions. Check out that package for more tools and actual signature extraction. +
+ +As well, dlptools offers functions to extract the standard "**process based**" features, defines/inspired by: + +* Macintyre et al. 2018 +* Maclachlan et al. 2021 +* Drews et al. 2022 +* and others + +
+ +These features include: + +* segment size +* copy number change points +* breakpoint counts per arm or MB window +* length of oscillating copy number changes + +Often, extracting these features is a starting point, and they are further summarized into categories. These categories are either set by hand, or informed by mixture models. + +Some publications also include the actual CN of the segments (in categories) as a feature, but I haven't included a function for that...presumably you can pull that out of your data. +

@@ -39,6 +61,7 @@ Perhaps the workflow would look something like this: reads_df <- vroom::vroom("data/example_full_reads.tsv.gz") segs_df <- reads_df |> + dplyr::filter(!chr %in% c("X", "Y")) |> dlptools::mark_mask_regions() |> dlptools::mark_bins_overlapping_centromeres(padding = 3e6) |> dplyr::filter( @@ -64,6 +87,8 @@ dplyr::slice_head(segs_df, n = 5)

+### Wu et al. + For extracting Wu et al. features: ```{r} @@ -98,6 +123,8 @@ see `?dlptools::extract_wu_features` for more.

+### Wang et al., aka Sigminer + Alternatively, we can get Wang et al. features. Though if going this route, it would be a good idea to look more at [sigminer](https://github.com/ShixiangWang/sigminer?tab=readme-ov-file) itself: ```{r, warning=FALSE, message = FALSE} @@ -110,7 +137,140 @@ sig_feats[1:4, 1:4]

-## ploidy informed CN change +## Process-Based Features + +1. segment lengths. + +This is as simple as it sounds, and the function is unnecessary. These lengths would need to be subsequently categorized somehow. + +```{r} +dlptools::extract_segment_sizes( + segs_df = segs_df +) |> + dplyr::slice_head(n = 10) +``` + +
+ +2. Copy number change points + +These are the values of the change in copy number from segment to segment. E.g., 2-5 would yield a 3. + +See `?dlptools::extract_changepoint()` for an important discussion of the `first_seg_correction` parameter. + +```{r} +dlptools::extract_changepoint( + segs_df = segs_df +) |> + dplyr::slice_head(n = 10) +``` + +
+ +3. Extracting number of breakpoints per chromosome arm + +A breakpoint is a change in the copy number. + +```{r} +dlptools::extract_breakpoints( + segs_df = segs_df, + scope = "arms" +) |> + dplyr::slice_head(n = 10) + +# there is an alias of: dlptools::extract_bp_per_arm() +``` + +
+ +4. Extract breakpoints per MB window + +Typically, publications have used 10Mb. + +```{r} +dlptools::extract_breakpoints( + segs_df = segs_df, + scope = "windows" + # window_size = 10e6 # default +) |> + dplyr::slice_head(n = 10) + +# there is an alias of: dlptools::extract_bp_per_window() +``` + +
+ +5. lengths of copy number oscillations + + +Overall, this is a bit of a funny feature. But basically, chromosomes are assessed in 3-segment sets to determine if it counts as an oscillation. If there are more than one oscillations in a row, they are added up. If the 3-segment set is not an oscillation, it gets a 0. + +
+ +* 1-2-3 is not an oscillation, length 0 +* 2-3-2 is an oscillation of length 1 +* 2-3-2-3 is an oscillation of length 2 +* 2-3-2-3-5 is an oscillation of length 2 followed by a 0 for the non-oscillating component +* 1-2-3-4-5 has no oscillations, and returns 0 0 0 for the 3 possible 3-set segments + +
+ +I have dissected the code for various publications and packages that count this feature and they all work differently, some have outright bugs, and others lack some internal consistency in how they assess 0s. So, please read `?dlptools::extract_oscillations` for a long discussion of how exactly this is working. + +
+ +Overall, 0s are typically abundant, and should maybe be considered for removal...? I don't know, they just tend to overwhelm. Again, all publications using this feature have done so differently, so it's the wild west. Do what feels right. + +```{r} +dlptools::extract_oscillations( + segs_df = segs_df + # important arguments are: + # middle_bound = 2 # default + # ends_bound = 0 # default + # see function documentation for what these are +) |> + dplyr::slice_head(n = 10) +``` + +
+ +You can also play directly with `dlptools::count_oscillations` on vectors to see how the inernals of this function are working: + +```{r} +dlptools::count_oscillations(1:3) + +dlptools::count_oscillations(c(2, 3, 2)) + +dlptools::count_oscillations(c(2, 3, 2, 3)) + +dlptools::count_oscillations(c(2, 3, 2, 3, 8, 5, 4, 5)) + +# most other publications/package do not care about the middle value +dlptools::count_oscillations( + c(5, 1000, 5), + middle_bound = Inf +) + +# some publications would allow this as counting as an oscillation. Where the +# first and last segments are allowed to be within a certain distance of one +# another, and the middle value doesn't matter. +dlptools::count_oscillations( + c(5, 1000, 6), + middle_bound = Inf, + ends_bound = 1 +) +``` + + +
+
+
+ +## Other vaguely feature like things + +These could be considered features, maybe only within certain contexts. Maybe they are just more useful as descriptions of the data as a whole. + +### ploidy informed CN change CN change relative to ploidy might be interesting to have a feature. Here we can mark 3 types of events for segments: @@ -131,11 +291,9 @@ dlptools::extract_ploidy_cn_feature( dplyr::slice_head(n = 6) ``` -
-

-## Segment Span Types +### Segment Span Types It might be useful to know where segments sit on chromosomes and how much of a chromosome they cover. We can label segments based on 5 categories: @@ -182,7 +340,7 @@ dlptools::extract_segment_position_feature( ``` -## Feature Combinations +## Feature Combinations...? With some of these feature sets, we can combine them into one monster feature set. Is this a good idea? A bad idea? Is the output even interpretable? Are all ov these even useful? You'll have to decide that. @@ -337,7 +495,7 @@ ggplot(

-Or, you could do some sort of combination of cross and uncrossed? I don't know. This whole signature field is the wild west and people do all kinds of things. +Or, you could do some sort of combination of cross and uncrossed? I don't know. Again, this whole signature field is the wild west and people do all kinds of things. ```{r, message=FALSE, warning=FALSE} # count the Wu features as crossed: 90 feature categories From 8f6bae6bcd8e3a49461cf8cf00226effff8e8575 Mon Sep 17 00:00:00 2001 From: benjaminfurman Date: Wed, 10 Sep 2025 16:16:24 -0700 Subject: [PATCH 5/5] chore: bump versiond --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8bfe9bb..126423e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: dlptools Title: DLPTools: Handling DLP+ Data -Version: 0.3.19 +Version: 0.3.20 Authors@R: person("Ben", "Furman", , "bfurman@bccrc.ca", role = c("aut", "cre")) Description: A collections of functions for basic manipulation and plotting of