diff --git a/DESCRIPTION b/DESCRIPTION
index 5d978bb..1d2b7ec 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: dlptools
Title: DLPTools: Handling DLP+ Data
-Version: 0.3.17
+Version: 0.3.18
Authors@R:
person("Ben", "Furman", , "bfurman@bccrc.ca", role = c("aut", "cre"))
Description: A collections of functions for basic manipulation and plotting of
diff --git a/NAMESPACE b/NAMESPACE
index 443f0e5..77981cd 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,12 +4,15 @@ export("%>%")
export(ASCN_COLORS)
export(ASCN_PHASE_COLORS)
export(BAF_COLORS)
+export(BASE_PLOT_THEME)
export(CNV_COLOURS)
export(DEFAULT_CONTINUOUS_COLOR_RANGE)
+export(GEN_PLOT_COLS)
export(RAW_COPY_COLORS)
export(STATE_COLORS)
export(add_centromere_locations)
export(add_missing_bins_for_cells)
+export(add_telomere_positions)
export(add_tip_ancestors_to_df)
export(basic_tile_plot)
export(cell_cn_profile)
@@ -22,7 +25,10 @@ export(compute_tip_sibling_distances)
export(convert_dists_to_pairwise)
export(convert_long_reads_to_wide)
export(create_expected_bins)
+export(cust_mode)
export(expand_length_to_bins)
+export(extract_ploidy_cn_feature)
+export(extract_segment_position_feature)
export(extract_sigminer_wang_features)
export(extract_wu_features)
export(factor_column_mixedsort)
@@ -41,6 +47,7 @@ export(get_tips_that_avoid_redundant_comps)
export(import_annotations_df)
export(import_clones)
export(import_dlp_files)
+export(import_telos_file)
export(import_tree)
export(library_from_cell)
export(load_chrom_info_file)
@@ -51,8 +58,11 @@ export(make_corrupt_tree_heatmap)
export(make_tree_plot_obj)
export(map_states_to_letters)
export(mark_bins_overlapping_centromeres)
+export(mark_cn_relative_to_ploidy)
export(mark_mask_regions)
+export(mark_segs_chromosome_span)
export(medicc_profiles_to_foreground)
+export(mode_ploidy)
export(pairwise_bin_difference)
export(plot_bg_state_highlight)
export(plot_fg_state_highlight)
diff --git a/R/bin_filling.R b/R/bin_filling.R
index 9b03eea..f47fa65 100644
--- a/R/bin_filling.R
+++ b/R/bin_filling.R
@@ -1,31 +1,3 @@
-#' loading UCSC chromosome length files
-#' @param version default "hg19", can also load "hg38"
-#' @return tibgble of chromosome, total length, etc.
-#' @export
-load_chrom_info_file <- function(version = c("hg19", "hg38")) {
- version <- match.arg(version)
-
- chrom_files <- list(
- hg19 = "hg19_chromInfo.txt.gz",
- hg38 = "hg19_chromInfo.txt.gz"
- )
-
- chrom_info <- get_package_file_path(chrom_files[version]) |>
- vroom::vroom(
- col_names = c("chr", "total_length", "misc"),
- show_col_types = FALSE
- ) |>
- dplyr::filter(
- # remove the unnecessary chromosomes
- stringr::str_detect(chr, "_|M", negate = TRUE)
- ) |>
- dplyr::mutate(
- chr = stringr::str_replace(chr, "chr", "")
- )
-
- return(chrom_info)
-}
-
#' For a given length, create bins of a given size
#'
#' e.g., length = 10, bin = 5, will get bins: 1-5, 6-10
@@ -128,7 +100,7 @@ add_missing_bins_for_cells <- function(
#' other states to fill in.
#'
#' @param state_df dataframe of state data with NAs for some bins that need
-#' to be filled in. E.g., after dlptools::running add_missing_bins_for_cells()
+#' to be filled in. E.g., after dlptools::add_missing_bins_for_cells()
#' @export
#' @importFrom rlang .data
fill_state_from_neighbours <- function(state_df, cols_to_fill = c("state")) {
diff --git a/R/centromere_marking.R b/R/centromere_marking.R
deleted file mode 100644
index cb505fa..0000000
--- a/R/centromere_marking.R
+++ /dev/null
@@ -1,136 +0,0 @@
-#' add centromere information to reads by chromosome
-#'
-#' See dlptools::read_and_prep_ucsg_cenrtomeres() for details of file origins.
-#'
-#' @param centro_file NULL or string to the path if you download yourself
-#' @param hg19 boolean to target hg19 for loading
-#' @param hg38 boolean to target hg38 for loading
-#' @return input table with centromere information added by chromosome
-#' @export
-add_centromere_locations <- function(
- reads_df, centro_file = NULL, version = c("hg19", "hg38")) {
- version_choice <- match.arg(version)
-
- centros <- load_ucsc_centromeres(
- centro_file = centro_file, version = version_choice
- )
-
- reads_df <- dplyr::left_join(
- reads_df,
- centros,
- by = dplyr::join_by(chr == chrom)
- )
-
- return(reads_df)
-}
-
-#' add boolean if bin overlaps with a centromere.
-#'
-#' Can optionally specify a padding to mark locations of bins as being close
-#' enough to a centromere. Often bins near centromeres are corrupt in their
-#' state calls. In the past, we have filtered within 3 Mb of centromeres.
-#'
-#' IMPORTANT! This adds the padding to each side of the centromere. So if you
-#' specify a padding of 3 Mb, it will be within 3 Mb of the start and 3 Mb of
-#' the end of the centromere.
-#'
-#' @param reads_df tibble of read data
-#' @param padding int of number of BP to add to each side of the centromere
-#' @param bin_start_col Default: start. column name of the start of bins.
-#' @param bin_end_col Default: end column name of the end of bins.
-#' @param version default 'hg19', or choose 'hg38' for locations of centromeres.
-#' @return input table, but with a boolean 'within_centro' column added (and
-#' potentially other centromere information columns, if needed)
-#' @export
-mark_bins_overlapping_centromeres <- function(
- reads_df,
- padding = 0,
- bin_start_col = "start",
- bin_end_col = "end",
- version = c("hg19", "hg38")) {
- version_choice <- match.arg(version)
- columns_check <- all(c("centro_start", "centro_end") %in% colnames(reads_df))
-
- if (!columns_check) {
- warning(paste0(
- "centromere locations not present, adding locations for ",
- version_choice,
- ". If that is not correct, set version to 'hg38')."
- ))
-
- reads_df <- add_centromere_locations(reads_df = reads_df)
- }
-
- reads_df <- reads_df |>
- dplyr::mutate(
- centromere_padding = padding,
- overlaps_centro = (
- .data[[bin_start_col]] <= centro_end + padding &
- .data[[bin_end_col]] >= centro_start - padding
- )
- )
-
- return(reads_df)
-}
-
-
-#' load and prep default centromerefiles from UCSC
-#'
-#' Cytoband files originally obtained from UCSC golden path genomes and parsed
-#' to collect the total centromere spans.
-#'
-#' hg19:
-#' https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz
-#' hg38:
-#' https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
-#'
-#' TODO: can I stash the file if it's already loaded once?
-#'
-#' @param centro_file NULL or string to the path if you download yourself
-#' @param hg19 boolean to target hg19 for loading
-#' @param hg38 boolean to target hg38 for loading
-#' @return tibble of parsed centromere spans
-#' @export
-load_ucsc_centromeres <- function(
- version = c("hg19", "hg38"),
- centro_file = NULL) {
- version_choice <- match.arg(version)
-
- if (is.null(centro_file)) {
- default_centro_files <- c(
- # https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz
- hg19 = "hg19_cytoBand.txt.gz",
- # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
- hg38 = "hg38_cytoBand.txt.gz"
- )
-
- centro_file <- get_package_file_path(default_centro_files[version_choice])
- }
-
- # import file and filter to features of interest
- centros <- vroom::vroom(
- centro_file,
- col_names = c("chrom", "start", "end", "arm_full_name", "feat"),
- show_col_types = FALSE
- ) |>
- dplyr::filter(feat == "acen") |>
- dplyr::mutate(
- # removes the 'chr' from the chromosome name, e.g., chr1 -> 1
- chrom = stringr::str_extract(chrom, "([A-Z]|[0-9]+)")
- )
- centros <- centros |>
- dplyr::group_by(chrom) |>
- dplyr::mutate(
- centro_start = min(start),
- centro_end = max(end),
- centro_span = centro_end - centro_start,
- arm = stringr::str_extract(arm_full_name, "[a-z]")
- )
-
- # pivot out to one line per chromosome of centromere measurements
- centros <- centros |>
- dplyr::select(-c(arm_full_name, feat)) |>
- tidyr::pivot_wider(names_from = arm, values_from = c(start, end))
-
- return(centros)
-}
diff --git a/R/chromosome_layouts.R b/R/chromosome_layouts.R
new file mode 100644
index 0000000..7694179
--- /dev/null
+++ b/R/chromosome_layouts.R
@@ -0,0 +1,439 @@
+#' loading UCSC chromosome length files
+#' @param version default "hg19", can also load "hg38"
+#' @return tibgble of chromosome, total length, etc.
+#' @export
+load_chrom_info_file <- function(version = c("hg19", "hg38")) {
+ version <- match.arg(version)
+
+ chrom_files <- list(
+ hg19 = "hg19_chromInfo.txt.gz",
+ hg38 = "hg38_chromInfo.txt.gz"
+ )
+
+ chrom_info <- get_package_file_path(chrom_files[version]) |>
+ vroom::vroom(
+ col_names = c("chr", "total_length", "misc"),
+ show_col_types = FALSE
+ ) |>
+ dplyr::filter(
+ # remove the unnecessary chromosomes
+ stringr::str_detect(chr, "_|M", negate = TRUE)
+ ) |>
+ dplyr::mutate(
+ chr = stringr::str_replace(chr, "chr", "")
+ )
+
+ return(chrom_info)
+}
+
+#' add chromosome lengths to a cn dataframe
+#'
+#' Handles chromosomes specified with "chr#" or just #/X/Y.
+#'
+#' @param cn_df dataframe of copy number information
+#' @param version string. hg19 (default) or hg38
+#' @param chrom_col string. name of column with chromosome information.
+add_chromosome_length <- function(cn_df, version = c("hg19", "hg38"), chrom_col = "chr") {
+ chrom_info <- load_chrom_info_file(version = version)
+
+ if (is_chr_used_in_chroms(cn_df[[chrom_col]])) {
+ chrom_info <- dplyr::mutate(chrom_info, chr = stringr::str_c("chr", chr))
+ }
+
+ cn_df <- dplyr::left_join(
+ cn_df,
+ chrom_info,
+ by = dplyr::join_by({{ chrom_col }} == chr)
+ )
+
+ return(cn_df)
+}
+
+
+#' add centromere information to reads by chromosome
+#'
+#' See dlptools::read_and_prep_ucsg_cenrtomeres() for details of file origins.
+#'
+#' @param cn_df dataframe of cn states for read bins or segments
+#' @param centro_file NULL or string to the path if you download yourself
+#' @param hg19 boolean to target hg19 for loading
+#' @param hg38 boolean to target hg38 for loading
+#' @return input table with centromere information added by chromosome
+#' @export
+add_centromere_locations <- function(
+ cn_df, centro_file = NULL, version = c("hg19", "hg38")) {
+ version_choice <- match.arg(version)
+
+ centros <- load_ucsc_centromeres(
+ centro_file = centro_file, version = version_choice
+ )
+
+ cn_df <- dplyr::left_join(
+ cn_df,
+ centros,
+ by = dplyr::join_by(chr == chrom)
+ )
+
+ return(cn_df)
+}
+
+#' add boolean if bin overlaps with a centromere.
+#'
+#' Can optionally specify a padding to mark locations of bins as being close
+#' enough to a centromere. Often bins near centromeres are corrupt in their
+#' state calls. In the past, we have filtered within 3 Mb of centromeres.
+#'
+#' IMPORTANT! This adds the padding to each side of the centromere. So if you
+#' specify a padding of 3 Mb, it will be within 3 Mb of the start and 3 Mb of
+#' the end of the centromere.
+#'
+#' @param reads_df tibble of read data
+#' @param padding int of number of BP to add to each side of the centromere
+#' @param bin_start_col Default: start. column name of the start of bins.
+#' @param bin_end_col Default: end column name of the end of bins.
+#' @param version default 'hg19', or choose 'hg38' for locations of centromeres.
+#' @return input table, but with a boolean 'within_centro' column added (and
+#' potentially other centromere information columns, if needed)
+#' @export
+mark_bins_overlapping_centromeres <- function(
+ reads_df,
+ padding = 0,
+ bin_start_col = "start",
+ bin_end_col = "end",
+ version = c("hg19", "hg38")) {
+ version_choice <- match.arg(version)
+ columns_check <- all(c("centro_start", "centro_end") %in% colnames(reads_df))
+
+ if (!columns_check) {
+ warning(paste0(
+ "centromere locations not present, adding locations for ",
+ version_choice,
+ ". If that is not correct, set version to 'hg38')."
+ ))
+
+ reads_df <- add_centromere_locations(cn_df = reads_df)
+ }
+
+ reads_df <- reads_df |>
+ dplyr::mutate(
+ centromere_padding = padding,
+ overlaps_centro = (
+ .data[[bin_start_col]] <= centro_end + padding &
+ .data[[bin_end_col]] >= centro_start - padding
+ )
+ )
+
+ return(reads_df)
+}
+
+
+#' load and prep default centromerefiles from UCSC
+#'
+#' Cytoband files originally obtained from UCSC golden path genomes and parsed
+#' to collect the total centromere spans.
+#'
+#' hg19:
+#' https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz
+#' hg38:
+#' https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
+#'
+#' TODO: can I stash the file if it's already loaded once?
+#'
+#' @param centro_file NULL or string to the path if you download yourself
+#' @param hg19 boolean to target hg19 for loading
+#' @param hg38 boolean to target hg38 for loading
+#' @return tibble of parsed centromere spans
+#' @export
+load_ucsc_centromeres <- function(
+ version = c("hg19", "hg38"),
+ centro_file = NULL) {
+ version_choice <- match.arg(version)
+
+ if (is.null(centro_file)) {
+ default_centro_files <- c(
+ # https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz
+ hg19 = "hg19_cytoBand.txt.gz",
+ # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
+ hg38 = "hg38_cytoBand.txt.gz"
+ )
+
+ centro_file <- get_package_file_path(default_centro_files[version_choice])
+ }
+
+ # import file and filter to features of interest
+ centros <- vroom::vroom(
+ centro_file,
+ col_names = c("chrom", "start", "end", "arm_full_name", "feat"),
+ show_col_types = FALSE
+ ) |>
+ dplyr::filter(feat == "acen") |>
+ dplyr::mutate(
+ # removes the 'chr' from the chromosome name, e.g., chr1 -> 1
+ chrom = stringr::str_extract(chrom, "([A-Z]|[0-9]+)")
+ )
+ centros <- centros |>
+ dplyr::group_by(chrom) |>
+ dplyr::mutate(
+ centro_start = min(start),
+ centro_end = max(end),
+ centro_span = centro_end - centro_start,
+ arm = stringr::str_extract(arm_full_name, "[a-z]")
+ )
+
+ # pivot out to one line per chromosome of centromere measurements
+ centros <- centros |>
+ dplyr::select(-c(arm_full_name, feat)) |>
+ tidyr::pivot_wider(names_from = arm, values_from = c(start, end))
+
+ return(centros)
+}
+
+
+#' load UCSC gap files for telomeres
+#'
+#' Honestly, this function is sort of pointless. The telomeres in both genome
+#' versions are marked as 10Kb for p and q on all chromosomes. Telomeres are
+#' variable and better estimates of their size exist, e.g.,
+#' https://www.nature.com/articles/s41467-024-48917-7
+#'
+#' This function just serves as a way to set up easy marking of where CNs occur
+#' on their respective chromosome.
+#'
+#' The only exception to the 10Kb size is chr17. It is not listed in the gap
+#' file for hg19. chr17 is known to have small chromosomes. Here, I set the p
+#' to 3000 Kb and q to 5000 Kb, inferred from the article referenced above.
+#'
+#' @param version string. hg19 (default) or hg38
+#' @return tibble of telomere information
+#' @export
+import_telos_file <- function(version = c("hg19", "hg38")) {
+ version_choice <- match.arg(version)
+
+ default_telo_files <- c(
+ # https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/gap.txt.gz
+ hg19 = "hg19_gap.txt.gz",
+ # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/gap.txt.gz
+ hg38 = "hg38_gap.txt.gz"
+ )
+
+ telos_file <- get_package_file_path(default_telo_files[version_choice])
+
+
+ # col names by inspection and checking against here by looking up gap table
+ # data format description
+ # https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=2018561924_ZAtdZC9CFEJw8BKiYeUjd2ImvhS7&clade=mammal&org=Human&db=hg19&hgta_group=allTables&hgta_track=hg19&hgta_table=gap&hgta_regionType=genome&position=chr7%3A155%2C592%2C223-155%2C605%2C565&hgta_outputType=primaryTable&hgta_outFileName=
+ telos <- readr::read_tsv(
+ telos_file,
+ col_names = c(
+ "bin", "chrom", "telostart", "teloend", "ix", "n", "size", "feat_type", "bridge"
+ )
+ ) |>
+ dplyr::filter(feat_type == "telomere") |>
+ dplyr::mutate(
+ chrom = dplyr::if_else(
+ # removes the 'chr' from the chromosome name, e.g., chr1 -> 1
+ # this is to facilitate matching with our reads/segs files that just
+ # have 0-0/X/Y
+ chrom %in% c("chrX", "chrY"),
+ stringr::str_extract(chrom, "[A-Z]"),
+ stringr::str_extract(chrom, "[0-9]+")
+ )
+ )
+
+ # now mark p and q telos
+ telos <- telos |>
+ dplyr::mutate(
+ arm = dplyr::if_else(telostart == 0, "p", "q")
+ ) |>
+ dplyr::select(arm, chr = chrom, telostart, teloend) |>
+ tidyr::pivot_wider(names_from = arm, values_from = c(telostart, teloend))
+
+ # special case for chr17
+ # for hg19, chr17 doesn't have telomeres listed, so need to fake some
+ # telomere data.
+ # marking p as 3000 and q as 5000 following:
+ # https://www.nature.com/articles/s41467-024-48917-7
+ chr17_total_len <- load_chrom_info_file() |>
+ dplyr::filter(chr == 17) |>
+ dplyr::pull(total_length)
+
+ chr17_telo <- data.frame(
+ chr = "17",
+ telostart_p = 0,
+ teloend_p = 3000,
+ telostart_q = chr17_total_len - 5000,
+ teloend_q = chr17_total_len
+ )
+
+ telos <- dplyr::bind_rows(telos, chr17_telo) |>
+ dplyr::relocate(chr, telostart_p, teloend_p)
+
+ return(telos)
+}
+
+#' add telomere positions to a CN df
+#'
+#' See [dlptools::import_telos_file()] but really just a convenience function.
+#' Telomere positions in the UCSC gap file are all just 10Kb, except chr17,
+#' which I set to 3Kb and 5Kb, again see linked function for details.
+#'
+#' @param cn_df dataframe of CN information, segments or read bins. Just needs
+#' a 'chr' column.
+#' @param version string. hg19 (default) or hg38, not that it makes a
+#' difference.
+#' @return input dataframe with new columns of telomere information
+#' @export
+add_telomere_positions <- function(cn_df, version = c("hg19", "hg38")) {
+ telos <- import_telos_file(version = version)
+
+ cn_df <- dplyr::left_join(cn_df, telos, by = "chr")
+
+ return(cn_df)
+}
+
+
+#' label CN segments based on relative chromosomal positions
+#'
+#' Uses centromere and telomere coordinates to label where a segment sits on a
+#' chromosome, relative to telomeres and centromeres.
+#'
+#' Possible categories are:
+#' 1. telomere bound (telo-bound) - segment touches a telomere
+#' 2. centromere bound (centro-bound) - segment touches or crosses the
+#' centromere
+#' 3. arm (arm) - segment spans a whole are (*with conditions)
+#' 4. whole chromosome (whole-chrom) - segment spans the entire chromosome
+#' (*with conditions)
+#' 5. intersitial (inter) - occuring within the chromosome, not touching the
+#' centromere, telomeres, and not big enough to be an entire arm.
+#'
+#' You can set a min_bound_distance which reflects how close a feature needs to
+#' be to be considered "touching". For example, we can considere a segment
+#' telomere bound if within traditional 1 DLP bin by setting this distance to
+#' 500k (default). This allows for some level of measurement error.
+#'
+#' Users can also set the proportion of the arm or whole chromosome a segment
+#' needs to span to be considered either category. Default is spaning 90% of
+#' either feature. Meaning, if the segment (end - start)/arm_length is at least
+#' 90% of the arm_length, the segment is considered an "arm" spanning segment.
+#'
+#' This function runs several other functions including:
+#' [dlptools::add_chromosome_length()],
+#' [dlptools::add_centromere_locations()], and
+#' [dlptools::add_telomere_positions()].
+#'
+#' @param segs_df dataframe of CN segments
+#' @param min_bound_distance integer. Distance to adjacent feature to be
+#' considered associated with that feature.
+#' @param min_span_of_arm float. Proportion of arm to cover to be considered an
+#' arm segment.
+#' @param min_span_of_chrom float. Proportion of the chromosome to cover to be
+#' considered a whole chromosome segment.
+#' @param version string. hg19 (default) or hg38
+#' @param acro_fix_whole_chrom. boolean. Whether to reset acrocentric
+#' chromosome CN segments to "whole-chrom" if they span futher than the Q arm.
+#' Honestly, probably not useful
+#' @return input dataframe, annotated with segment scale information. Primary
+#' column of interest being seg_span_event.
+#' @export
+mark_segs_chromosome_span <- function(
+ segs_df,
+ min_bound_distance = 5e5, # given scale of DLP, this should probably be 1 bin width, at least
+ min_span_of_chrom = 0.9,
+ min_span_of_arm = 0.9,
+ version = c("hg19", "hg38"),
+ acro_fix_whole_chrom = FALSE) {
+ event_labels <- c(
+ arm = "arm", whole = "whole-chrom", telo = "telo-bound",
+ centro = "centro-bound", inter = "inter"
+ )
+
+ segs_df <- segs_df |>
+ add_chromosome_length(version = version) |>
+ add_centromere_locations(version = version) |>
+ add_telomere_positions(version = version) |>
+ dplyr::mutate(
+ # find closest telomere to the CN event
+ telo_p_dist = start - teloend_p,
+ telo_q_dist = telostart_q - end,
+ telo_dist = dplyr::case_when(
+ # in telo or touching at both ends
+ telo_p_dist <= 0 & telo_q_dist <= 0 ~ 0,
+ # in or touching at one end
+ telo_p_dist <= 0 | telo_q_dist <= 0 ~ 0,
+ # closer to p
+ telo_p_dist <= telo_q_dist ~ telo_p_dist,
+ # closer to q
+ telo_p_dist > telo_q_dist ~ telo_q_dist,
+ ),
+ telo_bound = telo_dist <= min_bound_distance,
+
+ # find where CN sits relative to centromere
+ centro_p_dist = centro_start - end,
+ centro_q_dist = start - centro_end,
+ centro_dist = dplyr::case_when(
+ # crossing centro somewhere
+ centro_p_dist <= 0 & centro_q_dist <= 0 ~ 0,
+ # on the p side
+ centro_p_dist >= 0 & centro_q_dist < 0 ~ centro_p_dist,
+ # on th q side
+ centro_p_dist < 0 & centro_q_dist >= 0 ~ centro_q_dist,
+ ),
+ spans_centro = end > centro_end & start < centro_start,
+ centro_bound = centro_dist <= min_bound_distance,
+
+ # span of chromosome
+ spans_chrom = (end - start) / total_length >= min_span_of_chrom,
+
+ # marking of arm events.
+ p_arm_length = centro_start - teloend_p,
+ q_arm_length = telostart_q - centro_end,
+ spans_arm = dplyr::case_when(
+ end < centro_start ~ (end - start) / p_arm_length >= min_span_of_arm,
+ start > centro_end ~ (end - start) / q_arm_length >= min_span_of_arm,
+ .default = FALSE
+ ),
+
+ # trying to be simple and Shih et al. 2023 10.1038/s41586-023-06266-3
+ # inspired
+ seg_span_event = dplyr::case_when(
+ spans_chrom ~ event_labels["whole"], # "whole-chrom",
+ centro_bound & telo_bound & !spans_centro ~ event_labels["arm"],
+ !telo_bound & !centro_bound & !spans_chrom & spans_arm ~ event_labels["arm"],
+ telo_bound & !centro_bound & !spans_chrom ~ event_labels["telo"], # "telo-bound",
+ centro_bound | spans_centro ~ event_labels["centro"], # "centro-bound"
+ .default = event_labels["inter"]
+ )
+ )
+
+ if (acro_fix_whole_chrom) {
+ # these chromosomes have p arms >10Mb. Short, but not crazy. Might not need
+ # this fix.
+ acro_chroms <- c("13", "14", "15", "21", "22")
+
+ segs_df <- segs_df |>
+ dplyr::mutate(
+ # basically span of the q arm
+ acro_span = dplyr::if_else(
+ chr %in% acro_chroms & start > centro_start,
+ (end - start) / (total_length - centro_end),
+ NA
+ ),
+ seg_span_event = dplyr::case_when(
+ !(chr %in% acro_chroms) ~ seg_span_event,
+ acro_span >= q_span ~ event_labels["whole"],
+ .default = seg_span_event
+ )
+ )
+ }
+
+ segs_df <- dplyr::mutate(
+ segs_df,
+ seg_span_event = factor(
+ seg_span_event,
+ levels = unname(event_labels)
+ )
+ )
+
+ return(segs_df)
+}
diff --git a/R/cn_features.R b/R/cn_features.R
index b284897..9ecd4e9 100644
--- a/R/cn_features.R
+++ b/R/cn_features.R
@@ -23,7 +23,8 @@
#' in their actual code is set to 1. This function follows their code, but can
#' be altered.
#'
-#' @param segs_df a dataframe of CN segments
+#' @param segs_df dataframe. CN segments
+#' @param sample_col string. Name of the column with cell_id/other sample name
#' @param state_bin_max int. Maximum CN to consider for bins. All CNs of this
#' value and higher are grouped together. Default of 5 follows paper.
#' @param bin_breaks floats, how to break up segment sizes. Bins will be one
@@ -35,7 +36,7 @@
#' @param ... can pass change_split_val to alter critical value for AA/BB split
#' @return default return is a tibble of feature counts for each cell id.
#' @export
-extract_wu_features <- function(segs_df, state_bin_max = 5, bin_breaks = NA, annotate_input = FALSE, return_matrix = FALSE, ...) {
+extract_wu_features <- function(segs_df, sample_col = "cell_id", state_bin_max = 5, bin_breaks = NA, annotate_input = FALSE, return_matrix = FALSE, ...) {
# paper code: https://github.com/XSLiuLab/single-cell-CNA-signature/blob/main/code/divide_feature.R
# default segment size breakpoints
@@ -61,7 +62,7 @@ extract_wu_features <- function(segs_df, state_bin_max = 5, bin_breaks = NA, ann
!dplyr::if_any(c(cn_bin, seg_bin, seg_change, seg_shape), is.na)
) |>
dplyr::group_by(
- cell_id, cn_bin, seg_bin, seg_change, seg_shape,
+ .data[[sample_col]], cn_bin, seg_bin, seg_change, seg_shape,
.drop = FALSE
) |>
dplyr::count() |>
@@ -214,3 +215,87 @@ extract_sigminer_wang_features <- function(segs_df) {
return(sm_tally$nmf_matrix)
}
+
+
+#' Count changes of state relative to ploidy.
+#'
+#' Marks CN segments as a gain or loss, relative to the mode ploidy of the
+#' sample. Internally using [dlptools::mark_cn_relative_to_ploidy]. See that
+#' function for argument details.
+#'
+#' @param segs_df dataframe. CN segments
+#' @param sample_col string. Name of the column with cell_id/other sample name
+#' @param annotate_input boolean. return input dataframe annotating each
+#' @param return_matrix boolean. Return a cell-by-feature matrix of counts.
+#' @return tibble/dataframe of counts
+#' @export
+extract_ploidy_cn_feature <- function(
+ segs_df = NA,
+ sample_col = "cell_id",
+ annotate_input = FALSE,
+ return_matrix = FALSE) {
+ segs_df <- mark_cn_relative_to_ploidy(
+ in_df = segs_df,
+ df_type = "segs",
+ sample_col = sample_col
+ )
+
+ if (annotate_input) {
+ return(segs_df)
+ }
+
+ feat_count <- segs_df |>
+ dplyr::group_by(.data[[sample_col]], .drop = FALSE) |>
+ dplyr::count(cn_v_ploidy) |>
+ dplyr::ungroup()
+
+ if (return_matrix) {
+ feat_mtx <- make_cellid_matrix(feat_count, "cn_v_ploidy", "n")
+ return(feat_mtx)
+ }
+
+ return(feat_count)
+}
+
+
+#' count the segment-span-on-chromosome event types.
+#'
+#' Critical to this fuction is [dlptools::mark_segs_chromosome_span()]. It is
+#' important to read and understand that function and its arguments.
+#'
+#' This function basically just calls [dlptools::mark_segs_chromosome_span()]
+#' and summarizes the results. Arguments can be passed to that underlying
+#' function. Passing no arguments means you are happy with the defaults. See
+#' [dlptools::mark_segs_chromosome_span()] to understand what the defaults are.
+#'
+#' @param segs_df dataframe of CN segments
+#' @param sample_col string. Name of the column with cell_id/other sample name
+#' @param annotate_input boolean. return input dataframe annotating each
+#' segment.
+#' @param return_matrix boolean. Return a cell-by-feature matrix of counts.
+#' @return tibble/dataframe of counts
+#' @export
+extract_segment_position_feature <- function(
+ segs_df,
+ sample_col = "cell_id",
+ annotate_input = FALSE,
+ return_matrix = FALSE,
+ ...) {
+ segs_df <- mark_segs_chromosome_span(segs_df, ...)
+
+ if (annotate_input) {
+ return(segs_df)
+ }
+
+ seg_span_counts <- segs_df |>
+ dplyr::group_by(.data[[sample_col]], .drop = FALSE) |>
+ dplyr::count(seg_span_event) |>
+ dplyr::ungroup()
+
+ if (return_matrix) {
+ feat_mtx <- make_cellid_matrix(seg_span_counts, "seg_span_event", "n")
+ return(feat_mtx)
+ }
+
+ return(seg_span_counts)
+}
diff --git a/R/colours.R b/R/colours.R
index b918b9f..98351de 100644
--- a/R/colours.R
+++ b/R/colours.R
@@ -72,6 +72,7 @@ RAW_COPY_COLORS <- circlize::colorRamp2(
CNV_COLOURS[1:(length(CNV_COLOURS) - 1)]
)
+#' @export
GEN_PLOT_COLS <- c(
"#4269D0FF", "#3CA951FF", "#FF725CFF",
"#EFB118FF", "#A463F2FF",
@@ -79,6 +80,7 @@ GEN_PLOT_COLS <- c(
"#9498A0FF", "#6CC5B0FF"
)
+#' @export
BASE_PLOT_THEME <- list(
ggplot2::theme_minimal() +
ggplot2::theme(
diff --git a/R/ploidy.R b/R/ploidy.R
index b5f78b4..54cf561 100644
--- a/R/ploidy.R
+++ b/R/ploidy.R
@@ -40,3 +40,93 @@ weighted_ploidy <- function(
return(sample_ploidy)
}
+
+
+#' find the mode CN per chromosome, then mode across the chromosomes
+#'
+#' Done per chromosome first so that large chromosomes don't dominate the
+#' result. It is critical that the input be even bin level data, or this makes
+#' no sense to do. Measures of the copy number states need to be done in evenly
+#' sized bins.
+#'
+#' @param bin_df dataframe of bin level data
+#' @param sample_col string column name identifying samples
+#' @param cn_col string column name for copy number states
+#' @param chrom_col string column name for chromosomes
+#' @return tibble/dataframe of results by cell_id/sample
+#' @export
+mode_ploidy <- function(
+ bin_df,
+ sample_col = "cell_id",
+ cn_col = "state",
+ chrom_col = "chr") {
+ bin_df |>
+ dplyr::group_by(.data[[sample_col]], .data[[chrom_col]]) |>
+ dplyr::summarise(
+ chr_mode = cust_mode(.data[[cn_col]])
+ ) |>
+ dplyr::group_by(.data[[sample_col]]) |>
+ dplyr::summarise(
+ mode_ploidy = cust_mode(chr_mode)
+ ) |>
+ dplyr::ungroup()
+}
+
+#' Mark if CN states are gains or losses relative to cell ploidy
+#'
+#' CNs > 2 are not necessarily amplifications, and CNs < 2 are not necessarily
+#' the only states of losses. Ploidy of samples may not be diploid. This
+#' function will give you an idea if the CN you see is a gain or loss relative
+#' to the ploidy of a sample. For example, if the sample has a ploidy of 4,
+#' then a CN of 3 is a loss.
+#'
+#' Ploidy is inferred by mode CN state using [dlptools::mode_ploidy()] and
+#' states > ploidy are marked as gains, states < ploidy losses, and states
+#' matching ploidy as matched.
+#'
+#' @param in_df dataframe of CN states.
+#' @param df_type string. "reads" (default) or "segs" for CN segments, which
+#' will internally converted to bin based for mode calculation.
+#' reads in order to infer mode ploidy.
+#' @param sample_col string. Name of the column with cell_id/other sample name
+#' @return input dataframe, with new columns of information.
+#' @export
+mark_cn_relative_to_ploidy <- function(
+ in_df,
+ df_type = c("reads", "segs"),
+ sample_col = "cell_id",
+ ...) {
+ df_type <- match.arg(df_type)
+
+ event_labels <- c(
+ gain = "ploidy-gain",
+ match = "ploidy-match",
+ loss = "ploidy-loss"
+ )
+
+ if (df_type == "segs") {
+ reads_df <- segs_to_reads(in_df, ...)
+ } else if (df_type == "reads") {
+ reads_df <- in_df
+ }
+
+ sample_mode_ploidy <- mode_ploidy(reads_df, sample_col = sample_col, ...)
+
+ in_df <- dplyr::left_join(
+ in_df,
+ sample_mode_ploidy,
+ by = sample_col
+ )
+
+ in_df <- in_df |>
+ dplyr::mutate(
+ cn_v_ploidy = dplyr::case_when(
+ state < mode_ploidy ~ event_labels["loss"],
+ state > mode_ploidy ~ event_labels["gain"],
+ state == mode_ploidy ~ event_labels["match"]
+ ),
+ cn_v_ploidy = factor(cn_v_ploidy, levels = unname(event_labels))
+ )
+
+ return(in_df)
+}
diff --git a/R/utils.R b/R/utils.R
index 9f111ac..7e65c26 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -223,3 +223,28 @@ make_cellid_matrix <- function(cell_df, name_col, val_col) {
rownames(cell_mtx) <- dplyr::pull(cell_mtx_init, cell_id)
return(cell_mtx)
}
+
+
+#' find most common element in a vector
+#'
+#' For ties, just the first one in the list is returned.
+#'
+#' @param x vector of values
+#' @return most common element of input vector
+#' @export
+cust_mode <- function(x, na.rm = FALSE) {
+ if (na.rm) {
+ x <- x[!is.na(x)]
+ }
+
+ ux <- unique(x)
+ return(ux[which.max(tabulate(match(x, ux)))])
+}
+
+
+#' detect if chromosomes labels include "chr"
+#' @param chrom_vec vector of chromosome labels, probably pulled from a
+#' dataframe of cn information.
+is_chr_used_in_chroms <- function(chrom_vec) {
+ return(any(stringr::str_detect(unique(chrom_vec), "chr")))
+}
diff --git a/inst/extdata/hg19_gap.txt.gz b/inst/extdata/hg19_gap.txt.gz
new file mode 100644
index 0000000..9d45f51
Binary files /dev/null and b/inst/extdata/hg19_gap.txt.gz differ
diff --git a/inst/extdata/hg38_gap.txt.gz b/inst/extdata/hg38_gap.txt.gz
new file mode 100644
index 0000000..1a6b842
Binary files /dev/null and b/inst/extdata/hg38_gap.txt.gz differ
diff --git a/man/add_centromere_locations.Rd b/man/add_centromere_locations.Rd
index 5ebd865..c9a9c85 100644
--- a/man/add_centromere_locations.Rd
+++ b/man/add_centromere_locations.Rd
@@ -1,16 +1,18 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/centromere_marking.R
+% Please edit documentation in R/chromosome_layouts.R
\name{add_centromere_locations}
\alias{add_centromere_locations}
\title{add centromere information to reads by chromosome}
\usage{
add_centromere_locations(
- reads_df,
+ cn_df,
centro_file = NULL,
version = c("hg19", "hg38")
)
}
\arguments{
+\item{cn_df}{dataframe of cn states for read bins or segments}
+
\item{centro_file}{NULL or string to the path if you download yourself}
\item{hg19}{boolean to target hg19 for loading}
diff --git a/man/add_chromosome_length.Rd b/man/add_chromosome_length.Rd
new file mode 100644
index 0000000..5686d9e
--- /dev/null
+++ b/man/add_chromosome_length.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chromosome_layouts.R
+\name{add_chromosome_length}
+\alias{add_chromosome_length}
+\title{add chromosome lengths to a cn dataframe}
+\usage{
+add_chromosome_length(cn_df, version = c("hg19", "hg38"), chrom_col = "chr")
+}
+\arguments{
+\item{cn_df}{dataframe of copy number information}
+
+\item{version}{string. hg19 (default) or hg38}
+
+\item{chrom_col}{string. name of column with chromosome information.}
+}
+\description{
+Handles chromosomes specified with "chr#" or just #/X/Y.
+}
diff --git a/man/add_telomere_positions.Rd b/man/add_telomere_positions.Rd
new file mode 100644
index 0000000..cff59de
--- /dev/null
+++ b/man/add_telomere_positions.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chromosome_layouts.R
+\name{add_telomere_positions}
+\alias{add_telomere_positions}
+\title{add telomere positions to a CN df}
+\usage{
+add_telomere_positions(cn_df, version = c("hg19", "hg38"))
+}
+\arguments{
+\item{cn_df}{dataframe of CN information, segments or read bins. Just needs
+a 'chr' column.}
+
+\item{version}{string. hg19 (default) or hg38, not that it makes a
+difference.}
+}
+\value{
+input dataframe with new columns of telomere information
+}
+\description{
+See \code{\link[=import_telos_file]{import_telos_file()}} but really just a convenience function.
+Telomere positions in the UCSC gap file are all just 10Kb, except chr17,
+which I set to 3Kb and 5Kb, again see linked function for details.
+}
diff --git a/man/cust_mode.Rd b/man/cust_mode.Rd
new file mode 100644
index 0000000..2df51e8
--- /dev/null
+++ b/man/cust_mode.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{cust_mode}
+\alias{cust_mode}
+\title{find most common element in a vector}
+\usage{
+cust_mode(x, na.rm = FALSE)
+}
+\arguments{
+\item{x}{vector of values}
+}
+\value{
+most common element of input vector
+}
+\description{
+For ties, just the first one in the list is returned.
+}
diff --git a/man/extract_ploidy_cn_feature.Rd b/man/extract_ploidy_cn_feature.Rd
new file mode 100644
index 0000000..066d86c
--- /dev/null
+++ b/man/extract_ploidy_cn_feature.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cn_features.R
+\name{extract_ploidy_cn_feature}
+\alias{extract_ploidy_cn_feature}
+\title{Count changes of state relative to ploidy.}
+\usage{
+extract_ploidy_cn_feature(
+ segs_df = NA,
+ sample_col = "cell_id",
+ annotate_input = FALSE,
+ return_matrix = FALSE
+)
+}
+\arguments{
+\item{segs_df}{dataframe. CN segments}
+
+\item{sample_col}{string. Name of the column with cell_id/other sample name}
+
+\item{annotate_input}{boolean. return input dataframe annotating each}
+
+\item{return_matrix}{boolean. Return a cell-by-feature matrix of counts.}
+}
+\value{
+tibble/dataframe of counts
+}
+\description{
+Marks CN segments as a gain or loss, relative to the mode ploidy of the
+sample. Internally using \link{mark_cn_relative_to_ploidy}. See that
+function for argument details.
+}
diff --git a/man/extract_segment_position_feature.Rd b/man/extract_segment_position_feature.Rd
new file mode 100644
index 0000000..c80553f
--- /dev/null
+++ b/man/extract_segment_position_feature.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cn_features.R
+\name{extract_segment_position_feature}
+\alias{extract_segment_position_feature}
+\title{count the segment-span-on-chromosome event types.}
+\usage{
+extract_segment_position_feature(
+ segs_df,
+ sample_col = "cell_id",
+ annotate_input = FALSE,
+ return_matrix = FALSE,
+ ...
+)
+}
+\arguments{
+\item{segs_df}{dataframe of CN segments}
+
+\item{sample_col}{string. Name of the column with cell_id/other sample name}
+
+\item{annotate_input}{boolean. return input dataframe annotating each
+segment.}
+
+\item{return_matrix}{boolean. Return a cell-by-feature matrix of counts.}
+}
+\value{
+tibble/dataframe of counts
+}
+\description{
+Critical to this fuction is \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}. It is
+important to read and understand that function and its arguments.
+}
+\details{
+This function basically just calls \code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}}
+and summarizes the results. Arguments can be passed to that underlying
+function. Passing no arguments means you are happy with the defaults. See
+\code{\link[=mark_segs_chromosome_span]{mark_segs_chromosome_span()}} to understand what the defaults are.
+}
diff --git a/man/extract_wu_features.Rd b/man/extract_wu_features.Rd
index 402a04c..fa684d3 100644
--- a/man/extract_wu_features.Rd
+++ b/man/extract_wu_features.Rd
@@ -6,6 +6,7 @@
\usage{
extract_wu_features(
segs_df,
+ sample_col = "cell_id",
state_bin_max = 5,
bin_breaks = NA,
annotate_input = FALSE,
@@ -14,7 +15,9 @@ extract_wu_features(
)
}
\arguments{
-\item{segs_df}{a dataframe of CN segments}
+\item{segs_df}{dataframe. CN segments}
+
+\item{sample_col}{string. Name of the column with cell_id/other sample name}
\item{state_bin_max}{int. Maximum CN to consider for bins. All CNs of this
value and higher are grouped together. Default of 5 follows paper.}
diff --git a/man/fill_state_from_neighbours.Rd b/man/fill_state_from_neighbours.Rd
index e6dfd3a..4d3ffd7 100644
--- a/man/fill_state_from_neighbours.Rd
+++ b/man/fill_state_from_neighbours.Rd
@@ -8,7 +8,7 @@ fill_state_from_neighbours(state_df, cols_to_fill = c("state"))
}
\arguments{
\item{state_df}{dataframe of state data with NAs for some bins that need
-to be filled in. E.g., after dlptools::running add_missing_bins_for_cells()}
+to be filled in. E.g., after dlptools::add_missing_bins_for_cells()}
}
\description{
This function fills in NAs of columns with the information from an upstream
diff --git a/man/import_telos_file.Rd b/man/import_telos_file.Rd
new file mode 100644
index 0000000..fed1dca
--- /dev/null
+++ b/man/import_telos_file.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chromosome_layouts.R
+\name{import_telos_file}
+\alias{import_telos_file}
+\title{load UCSC gap files for telomeres}
+\usage{
+import_telos_file(version = c("hg19", "hg38"))
+}
+\arguments{
+\item{version}{string. hg19 (default) or hg38}
+}
+\value{
+tibble of telomere information
+}
+\description{
+Honestly, this function is sort of pointless. The telomeres in both genome
+versions are marked as 10Kb for p and q on all chromosomes. Telomeres are
+variable and better estimates of their size exist, e.g.,
+https://www.nature.com/articles/s41467-024-48917-7
+}
+\details{
+This function just serves as a way to set up easy marking of where CNs occur
+on their respective chromosome.
+
+The only exception to the 10Kb size is chr17. It is not listed in the gap
+file for hg19. chr17 is known to have small chromosomes. Here, I set the p
+to 3000 Kb and q to 5000 Kb, inferred from the article referenced above.
+}
diff --git a/man/is_chr_used_in_chroms.Rd b/man/is_chr_used_in_chroms.Rd
new file mode 100644
index 0000000..c47a6a6
--- /dev/null
+++ b/man/is_chr_used_in_chroms.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{is_chr_used_in_chroms}
+\alias{is_chr_used_in_chroms}
+\title{detect if chromosomes labels include "chr"}
+\usage{
+is_chr_used_in_chroms(chrom_vec)
+}
+\arguments{
+\item{chrom_vec}{vector of chromosome labels, probably pulled from a
+dataframe of cn information.}
+}
+\description{
+detect if chromosomes labels include "chr"
+}
diff --git a/man/load_chrom_info_file.Rd b/man/load_chrom_info_file.Rd
index a93a162..c809b40 100644
--- a/man/load_chrom_info_file.Rd
+++ b/man/load_chrom_info_file.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/bin_filling.R
+% Please edit documentation in R/chromosome_layouts.R
\name{load_chrom_info_file}
\alias{load_chrom_info_file}
\title{loading UCSC chromosome length files}
diff --git a/man/load_ucsc_centromeres.Rd b/man/load_ucsc_centromeres.Rd
index 03c8d60..6af3c7d 100644
--- a/man/load_ucsc_centromeres.Rd
+++ b/man/load_ucsc_centromeres.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/centromere_marking.R
+% Please edit documentation in R/chromosome_layouts.R
\name{load_ucsc_centromeres}
\alias{load_ucsc_centromeres}
\title{load and prep default centromerefiles from UCSC}
diff --git a/man/make_cellid_matrix.Rd b/man/make_cellid_matrix.Rd
index 87aa7c2..9809f96 100644
--- a/man/make_cellid_matrix.Rd
+++ b/man/make_cellid_matrix.Rd
@@ -25,12 +25,12 @@ can be pivoted out.
}
\examples{
cell_df <- data.frame(
- cell_id = c("cell_1", "cell_1", "cell_1"),
- some_col = c("A", "B", "C"),
- some_val = 1:3
+ cell_id = c("cell_1", "cell_1", "cell_1"),
+ some_col = c("A", "B", "C"),
+ some_val = 1:3
)
-make_cellid_matrix(cell_df, name_col="some_col", val_col="some_val")
+make_cellid_matrix(cell_df, name_col = "some_col", val_col = "some_val")
# A B C
# cell_1 1 2 3
diff --git a/man/mark_bins_overlapping_centromeres.Rd b/man/mark_bins_overlapping_centromeres.Rd
index e0bac20..cb32121 100644
--- a/man/mark_bins_overlapping_centromeres.Rd
+++ b/man/mark_bins_overlapping_centromeres.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/centromere_marking.R
+% Please edit documentation in R/chromosome_layouts.R
\name{mark_bins_overlapping_centromeres}
\alias{mark_bins_overlapping_centromeres}
\title{add boolean if bin overlaps with a centromere.}
diff --git a/man/mark_cn_relative_to_ploidy.Rd b/man/mark_cn_relative_to_ploidy.Rd
new file mode 100644
index 0000000..357bf3a
--- /dev/null
+++ b/man/mark_cn_relative_to_ploidy.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ploidy.R
+\name{mark_cn_relative_to_ploidy}
+\alias{mark_cn_relative_to_ploidy}
+\title{Mark if CN states are gains or losses relative to cell ploidy}
+\usage{
+mark_cn_relative_to_ploidy(
+ in_df,
+ df_type = c("reads", "segs"),
+ sample_col = "cell_id",
+ ...
+)
+}
+\arguments{
+\item{in_df}{dataframe of CN states.}
+
+\item{df_type}{string. "reads" (default) or "segs" for CN segments, which
+will internally converted to bin based for mode calculation.
+reads in order to infer mode ploidy.}
+
+\item{sample_col}{string. Name of the column with cell_id/other sample name}
+}
+\value{
+input dataframe, with new columns of information.
+}
+\description{
+CNs > 2 are not necessarily amplifications, and CNs < 2 are not necessarily
+the only states of losses. Ploidy of samples may not be diploid. This
+function will give you an idea if the CN you see is a gain or loss relative
+to the ploidy of a sample. For example, if the sample has a ploidy of 4,
+then a CN of 3 is a loss.
+}
+\details{
+Ploidy is inferred by mode CN state using \code{\link[=mode_ploidy]{mode_ploidy()}} and
+states > ploidy are marked as gains, states < ploidy losses, and states
+matching ploidy as matched.
+}
diff --git a/man/mark_segs_chromosome_span.Rd b/man/mark_segs_chromosome_span.Rd
new file mode 100644
index 0000000..e1db366
--- /dev/null
+++ b/man/mark_segs_chromosome_span.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chromosome_layouts.R
+\name{mark_segs_chromosome_span}
+\alias{mark_segs_chromosome_span}
+\title{label CN segments based on relative chromosomal positions}
+\usage{
+mark_segs_chromosome_span(
+ segs_df,
+ min_bound_distance = 5e+05,
+ min_span_of_chrom = 0.9,
+ min_span_of_arm = 0.9,
+ version = c("hg19", "hg38"),
+ acro_fix_whole_chrom = FALSE
+)
+}
+\arguments{
+\item{segs_df}{dataframe of CN segments}
+
+\item{min_bound_distance}{integer. Distance to adjacent feature to be
+considered associated with that feature.}
+
+\item{min_span_of_chrom}{float. Proportion of the chromosome to cover to be
+considered a whole chromosome segment.}
+
+\item{min_span_of_arm}{float. Proportion of arm to cover to be considered an
+arm segment.}
+
+\item{version}{string. hg19 (default) or hg38}
+
+\item{acro_fix_whole_chrom.}{boolean. Whether to reset acrocentric
+chromosome CN segments to "whole-chrom" if they span futher than the Q arm.
+Honestly, probably not useful}
+}
+\value{
+input dataframe, annotated with segment scale information. Primary
+column of interest being seg_span_event.
+}
+\description{
+Uses centromere and telomere coordinates to label where a segment sits on a
+chromosome, relative to telomeres and centromeres.
+}
+\details{
+Possible categories are:
+\enumerate{
+\item telomere bound (telo-bound) - segment touches a telomere
+\item centromere bound (centro-bound) - segment touches or crosses the
+centromere
+\item arm (arm) - segment spans a whole are (*with conditions)
+\item whole chromosome (whole-chrom) - segment spans the entire chromosome
+(*with conditions)
+\item intersitial (inter) - occuring within the chromosome, not touching the
+centromere, telomeres, and not big enough to be an entire arm.
+}
+
+You can set a min_bound_distance which reflects how close a feature needs to
+be to be considered "touching". For example, we can considere a segment
+telomere bound if within traditional 1 DLP bin by setting this distance to
+500k (default). This allows for some level of measurement error.
+
+Users can also set the proportion of the arm or whole chromosome a segment
+needs to span to be considered either category. Default is spaning 90\% of
+either feature. Meaning, if the segment (end - start)/arm_length is at least
+90\% of the arm_length, the segment is considered an "arm" spanning segment.
+
+This function runs several other functions including:
+\code{\link[=add_chromosome_length]{add_chromosome_length()}},
+\code{\link[=add_centromere_locations]{add_centromere_locations()}}, and
+\code{\link[=add_telomere_positions]{add_telomere_positions()}}.
+}
diff --git a/man/mode_ploidy.Rd b/man/mode_ploidy.Rd
new file mode 100644
index 0000000..59dbbca
--- /dev/null
+++ b/man/mode_ploidy.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ploidy.R
+\name{mode_ploidy}
+\alias{mode_ploidy}
+\title{find the mode CN per chromosome, then mode across the chromosomes}
+\usage{
+mode_ploidy(
+ bin_df,
+ sample_col = "cell_id",
+ cn_col = "state",
+ chrom_col = "chr"
+)
+}
+\arguments{
+\item{bin_df}{dataframe of bin level data}
+
+\item{sample_col}{string column name identifying samples}
+
+\item{cn_col}{string column name for copy number states}
+
+\item{chrom_col}{string column name for chromosomes}
+}
+\value{
+tibble/dataframe of results by cell_id/sample
+}
+\description{
+Done per chromosome first so that large chromosomes don't dominate the
+result. It is critical that the input be even bin level data, or this makes
+no sense to do. Measures of the copy number states need to be done in evenly
+sized bins.
+}
diff --git a/vignettes/feature_extraction.Rmd b/vignettes/feature_extraction.Rmd
index b483501..f2cc62f 100644
--- a/vignettes/feature_extraction.Rmd
+++ b/vignettes/feature_extraction.Rmd
@@ -14,11 +14,14 @@ knitr::opts_chunk$set(
)
```
-```{r setup}
+```{r setup, message=FALSE, warning=FALSE}
+library(ggplot2)
library(dlptools)
```
-dlptools offers two feature extraction types:
+## Features Defined by Other Publications
+
+dlptools offers two externally defined feature extraction types:
1. Wu et al. https://www.biorxiv.org/content/10.1101/2025.03.02.641098v1
2. Wang et al. https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1009557
@@ -103,6 +106,292 @@ sig_feats <- extract_sigminer_wang_features(segs_df)
sig_feats[1:4, 1:4]
```
+
+
+
+
+## ploidy informed CN change
+
+CN change relative to ploidy might be interesting to have a feature. Here we can mark 3 types of events for segments:
+
+1. CN is a gain relative to mode ploidy of sample/cell
+2. CN is a loss relative to mode ploidy of sample/cell
+3. CN is a match to mode ploidy
+
+Employs the function dlptools::mode_ploidy() to calculate mode ploidy of each sample/cell.
+
+```{r}
+dlptools::extract_ploidy_cn_feature(
+ segs_df,
+ # with option just to annotate the input df
+ # annotate_input = TRUE
+ # or, alternatively, return a count matrix
+ # return_matrix = TRUE
+) |>
+ dplyr::slice_head(n = 6)
+```
+
+
+
+
+
+## Segment Span Types
+
+It might be useful to know where segments sit on chromosomes and how much of a chromosome they cover. We can label segments based on 5 categories:
+
+1. telomere bound (`telo-bound`) - segment touches a telomere
+2. centromere bound (`centro-bound`) - segment touches or crosses the centromere
+3. arm (`arm`) - segment spans a whole are (*with conditions)
+4. whole chromosome (`whole-chrom`) - segment spans the entire chromosome (*with conditions)
+5. intersitial (`inter`) - occuring within the chromosome, not touching the centromere, telomeres, and not big enough to be an entire arm.
+
+See `?dlptools::mark_segs_chromosome_span` for more details about this marking, as there are various options to tweak these definitions, such as:
+
+* how much of an arm a CN needs to cover to be considered an "arm-level" event
+* how much of a chromosome a segment needs to cover to be a "whole-chromosome" event
+* how close a seg needs to be to a telo/centromere to be considered "touching"
+
+We can mark a dataframe with the information:
+
+```{r, message=FALSE, warning=FALSE}
+dlptools::mark_segs_chromosome_span(
+ segs_df,
+ # see ?dlptools::mark_segs_chromosome_span for more args
+) |>
+ dplyr::relocate(
+ # key added columns
+ seg_span_event, telo_bound, centro_bound, spans_chrom, spans_arm
+ ) |>
+ dplyr::slice_head(n = 5)
+```
+
+
+
+
+More useful in the "feature" context, is to directly extract and count them up. This function bascially just calls the one above, with counting added:
+
+```{r, message=FALSE, warning=FALSE}
+dlptools::extract_segment_position_feature(
+ segs_df
+ # with option just to annotate the input df
+ # annotate_input = TRUE
+ # or, alternatively, return a count matrix
+ # return_matrix = TRUE
+) |>
+ dplyr::slice_head(n = 6)
+```
+
+
+## Feature Combinations
+
+With some of these feature sets, we can combine them into one monster feature set. Is this a good idea? A bad idea? Is the output even interpretable? Are all ov these even useful? You'll have to decide that.
+
+The feature sets of:
+
+1. `dlptools::extract_wu_features`
+2. `dlptools::extract_ploidy_cn_feature`
+3. `dlptools::extract_segment_position_feature`
+
+are all "atomic", in that they describe aspects of individual segments.
+
+First, we can collect labels all all features for the segments:
+
+```{r, message=FALSE, warning=FALSE}
+feat_anno_segs <- segs_df |>
+ dlptools::extract_wu_features(annotate_input = TRUE) |>
+ dlptools::extract_ploidy_cn_feature(annotate_input = TRUE) |>
+ dlptools::extract_segment_position_feature(annotate_input = TRUE) |>
+ dplyr::select(
+ cell_id, chr, start, end,
+ # these are then the feature columns
+ seg_bin, cn_bin, seg_change, seg_shape, cn_v_ploidy, seg_span_event
+ )
+
+
+dplyr::slice_head(feat_anno_segs, n = 5)
+```
+
+
+Then, we can just count each category and leave the counts per cell per feature as our feature set for signature extraction. This would yield a 21 feature set (6 core feature types with: 5, 3, 3, 2, 3, 5 categories across them)
+
+
+
+
+```{r, message=FALSE, warning=FALSE}
+feat_cols <- c(
+ "seg_bin", "cn_bin", "seg_change", "seg_shape", "cn_v_ploidy",
+ "seg_span_event"
+)
+
+total_counts <- purrr::map(feat_cols, \(feat_col) {
+ feat_anno_segs |>
+ dplyr::group_by(cell_id, dplyr::pick(feat_col), .drop = FALSE) |>
+ dplyr::count() |>
+ dplyr::mutate(
+ feature = feat_col
+ ) |>
+ dplyr::rename(category = feat_col) |>
+ dplyr::relocate(cell_id, feature, category)
+}) |>
+ purrr::list_rbind()
+```
+
+```{r, echo=FALSE, fig.width=8, fig.height=4}
+ggplot(
+ dplyr::filter(total_counts, cell_id %in% unique(total_counts$cell_id)
+ [1:4]),
+ aes(x = category, y = n, fill = feature)
+) +
+ geom_col(show.legend = FALSE) +
+ facet_grid(
+ cell_id ~ feature,
+ scales = "free",
+ axes = "all_x",
+ switch = "x"
+ ) +
+ theme_classic() +
+ theme(
+ axis.text.x = element_blank(),
+ axis.ticks.x = element_blank(),
+ strip.placement = "outside"
+ )
+```
+
+
+
+
+We could opt to cross all of the features into a "mega-feature-set". But, this would be:
+
+* 4 features (containing 3, 5, 2, 3 categories across them) with from `dlptools::extract_wu_features`
+* 1 feature with 3 features categories from `dlptools::extract_ploidy_cn_feature`
+* 1 feature with 5 feature categories from `dlptools::extract_segment_position_feature`
+
+```{r}
+all_feat_labels <- list(
+ levels(feat_anno_segs$seg_bin),
+ levels(feat_anno_segs$cn_bin),
+ levels(feat_anno_segs$seg_change),
+ levels(feat_anno_segs$seg_shape),
+ levels(feat_anno_segs$cn_v_ploidy),
+ levels(feat_anno_segs$seg_span_event)
+)
+
+crossed_labels <- expand.grid(all_feat_labels)
+
+crossed_labels[1:4, ]
+```
+
+
+```{r}
+nrow(crossed_labels)
+```
+
+
+making for a cross feature set with 1350 possible categories. That might be too detailed, and you should check if most categories are 0 for your data, indicating useless features.
+
+
+
+Also, probably want to ask if crossing the category actually makes sense and adds biological information. For example, here, you can't even have a small sized (< 5Mb) arm event (smallest arm is >10 Mb), so crossing size and chromosome location doesn't actually lead to fully crossed features or any gain in information (and these two are somewhat redundant). Or, does it matter if the segment shape parameter is a ploidy-gain or a ploidy-loss? Does knowing that distinguish processes or teach you anything biologically? (and that's only two parameters being crossed).
+
+
+
+But, forging ahead bravely:
+
+```{r}
+# collect all possible feature labels
+
+crossed_counts <- feat_anno_segs |>
+ dplyr::group_by(
+ # sample ID column
+ cell_id,
+ # feature columns
+ seg_bin, cn_bin, seg_change, seg_shape, cn_v_ploidy, seg_span_event,
+ .drop = FALSE
+ ) |>
+ dplyr::count() |>
+ dplyr::mutate(
+ cross_feat = stringr::str_c(
+ seg_bin, cn_bin, seg_change, seg_shape, cn_v_ploidy, seg_span_event,
+ sep = ":"
+ )
+ )
+```
+
+```{r, echo=FALSE, fig.width=8, fig.height=4}
+ggplot(
+ dplyr::filter(crossed_counts, cell_id %in% unique(total_counts$cell_id)
+ [1:4]),
+ aes(x = cross_feat, y = n)
+) +
+ geom_col() +
+ facet_grid(cell_id ~ ., axes = "all_x") +
+ theme_classic() +
+ theme(
+ axis.text.x = element_blank(),
+ axis.ticks.x = element_blank(),
+ ) +
+ xlab("Crossed Feature")
+```
+
+
+
+
+
+Or, you could do some sort of combination of cross and uncrossed? I don't know. This whole signature field is the wild west and people do all kinds of things.
+
+```{r, message=FALSE, warning=FALSE}
+# count the Wu features as crossed: 90 feature categories
+wu_crossed_counts <- dlptools::extract_wu_features(segs_df) |>
+ dplyr::select(cell_id, feat_cat, n) |>
+ dplyr::mutate(
+ feature_group = "Wu-crossed"
+ )
+
+# do ploidy_cn_change and segment placement as uncrossed?
+ploidy_counts <- dlptools::extract_ploidy_cn_feature(segs_df) |>
+ dplyr::rename(feat_cat = cn_v_ploidy) |>
+ dplyr::mutate(
+ feature_group = "ploidy"
+ )
+
+
+seg_pos_counts <- dlptools::extract_segment_position_feature(segs_df) |>
+ dplyr::rename(feat_cat = seg_span_event) |>
+ dplyr::mutate(
+ feature_group = "position"
+ )
+
+all_feat_counts <- purrr::list_rbind(list(
+ wu_crossed_counts, ploidy_counts, seg_pos_counts
+)) |>
+ dplyr::arrange(cell_id, feature_group) |>
+ dplyr::mutate(
+ feature_group = factor(feature_group, levels = unique(feature_group))
+ )
+```
+
+
+```{r, echo=FALSE, fig.width=8, fig.height=4}
+ggplot2::ggplot(
+ dplyr::filter(all_feat_counts, cell_id %in% unique(total_counts$cell_id)
+ [1:4]),
+ ggplot2::aes(x = feat_cat, y = n, fill = feature_group)
+) +
+ ggplot2::geom_col(show.legend = FALSE) +
+ ggplot2::facet_grid(
+ cell_id ~ feature_group,
+ axes = "all_x", scales = "free_x", space = "free"
+ ) +
+ ggplot2::scale_fill_manual(values = dlptools::GEN_PLOT_COLS) +
+ ggplot2::theme_classic() +
+ ggplot2::theme(
+ axis.text.x = ggplot2::element_blank(),
+ axis.ticks.x = ggplot2::element_blank(),
+ ) +
+ ggplot2::xlab("Mixed Features")
+```
+
+
diff --git a/vignettes/various_functions.Rmd b/vignettes/various_functions.Rmd
index 38ccd9c..b253dba 100644
--- a/vignettes/various_functions.Rmd
+++ b/vignettes/various_functions.Rmd
@@ -17,8 +17,7 @@ knitr::opts_chunk$set(
```
-```{r}
-library(dplyr)
+```{r, warning=FALSE, message=FALSE, echo=FALSE}
library(vroom)
library(dlptools)
```
@@ -52,8 +51,10 @@ vroom::vroom(
)
```
+
+
-## Marking Centromere Locations
+## Marking Centromere and Telomere Locations
Centromeres are problematic for DLP data, the state calls are often inflated and highly inconsistent between adjacent bins. It's a messy region that we often want to omit. The default masking file (above) pretty much captures these regions, but sometimes we may want to be more specific.
@@ -79,7 +80,7 @@ Alternatively, this can be done in parts:
```{r}
# e.g., just adding the centromere information by chromosome
dlptools::add_centromere_locations(
- reads_df = ex_reads,
+ cn_df = ex_reads,
version = "hg19" # default, don't need to specify. Alternative is "hg38"
) |>
dplyr::select(
@@ -91,6 +92,23 @@ dlptools::add_centromere_locations(
```
+Also available is telomere marking, but see `?dlptools::import_telos_file` for why this isn't the most in-depth or accurate thing:
+
+```{r}
+dlptools::add_telomere_positions(
+ ex_reads
+ # version = "hg19" is the default
+) |>
+ dplyr::relocate(
+ # added columns
+ telostart_p, teloend_p, telostart_q, teloend_q
+ ) |>
+ dplyr::slice_head(n = 5)
+```
+
+
+
+
## Information From Cell IDs
This is assuming our standard DLP cell ids, e.g., `AT23998-A138956A-R03-C34`.
@@ -125,6 +143,8 @@ ex_reads |>
dplyr::slice_sample(n = 5)
```
+
+
## Inferring Missing Bin Data
@@ -235,6 +255,8 @@ dlptools::load_chrom_info_file(
```
+
+
## Reads to Segments
@@ -251,6 +273,9 @@ ex_segs[1:4, ]
*warning:* this function will leave some unexpected gaps when dataframes have been filtered and bins removed. Inspect carfully if you have dropped bins from your dataframe.
+
+
+
## Segments to Reads
Split segments back into bins of some size.
@@ -283,6 +308,101 @@ filt_reads <- ex_reads |>
c(filt_reads = nrow(filt_reads), input_reads = nrow(ex_reads))
```
+
+
+
+## Where Segments Occur
+
+Useful to know is where segments occur relative to chromosome features like the centromere and telomeres.
+
+There are a few key arguments to this function, like setting distances for how close segments need to be to telomeres and centromeres to be consider "bound", and minimum spans of arms/chromosomes.
+
+Please see `?dlptools::mark_segs_chromosome_span` for more information on how this works:
+
+```{r}
+seg_locations <- dlptools::mark_segs_chromosome_span(
+ ex_segs,
+ # version = "hg19" assumed, but hg38 possible too.
+)
+
+dplyr::relocate(
+ seg_locations,
+ # key added columns, more present too
+ seg_span_event, telo_bound, centro_bound, spans_chrom, spans_arm
+)
+```
+
+
+Visually, this is what we're doing:
+
+```{r, echo = FALSE, fig.width=8, fig.height=6, fig.cap="Labels of CN segments on chromosomes. Vertical lines indicate locations of telomeres and centromeres."}
+chr_attrs <- dplyr::left_join(
+ dlptools::load_ucsc_centromeres(),
+ dlptools::import_telos_file(),
+ by = dplyr::join_by(chrom == chr)
+) |>
+ dplyr::rename(
+ chr = chrom
+ ) |>
+ dplyr::select(chr, centro_start, centro_end, dplyr::contains("telo")) |>
+ tidyr::pivot_longer(cols = -chr) |>
+ dplyr::mutate(
+ plot_name = dplyr::case_when(
+ stringr::str_detect(name, "start") ~ "start",
+ stringr::str_detect(name, "end") ~ "end",
+ ),
+ section_id = dplyr::case_when(
+ stringr::str_detect(name, "centro") ~ "centro",
+ stringr::str_detect(name, "telo.*p") ~ "telo-p",
+ stringr::str_detect(name, "telo.*q") ~ "telo-q"
+ )
+ ) |>
+ dplyr::select(chr, section_id, plot_name, value)
+
+
+targ_chrs <- c(1, 2, 8, 13, 14)
+
+some_cells <- unique(seg_locations$cell_id)[1:5]
+
+seg_locations$chr <- factor(
+ seg_locations$chr,
+ levels = gtools::mixedsort(unique(seg_locations$chr))
+)
+
+ggplot2::ggplot(
+ dplyr::filter(seg_locations, cell_id %in% some_cells & chr %in% targ_chrs)
+) +
+ ggplot2::geom_vline(
+ # add genome features. Hm, maybe just do as vlines
+ data = dplyr::filter(chr_attrs, chr %in% targ_chrs),
+ # aes(xmin=start, xmax=end, ymin=-1, ymax=-2, fill=section_id)
+ ggplot2::aes(xintercept = value),
+ color = "black"
+ ) +
+ ggplot2::geom_segment(
+ ggplot2::aes(x = start, xend = end, y = state, color = seg_span_event),
+ linewidth = 3
+ ) +
+ ggplot2::facet_grid(
+ cell_id ~ chr,
+ scales = "free",
+ space = "free_x",
+ switch = "x",
+ axes = "all_x",
+ ) +
+ ggplot2::theme_classic() +
+ ggplot2::theme(
+ axis.text.x = ggplot2::element_blank(),
+ strip.placement = "outside",
+ strip.background = ggplot2::element_blank()
+ ) +
+ ggplot2::scale_color_manual(values = dlptools::GEN_PLOT_COLS)
+```
+
+
+
+
+
## Ploidy of samples
@@ -294,11 +414,40 @@ We can get an idea of the ploidy of a sample by taking the mean of the copy numb
This function does that:
```{r}
-weighted_ploidy(ex_segs) |>
+dlptools::weighted_ploidy(ex_segs) |>
+ dplyr::slice_head(n = 5)
+```
+
+
+
+Alternative, if you want integers, is to get the mode ploidy. The should only be done for evenly sized bins, or the mode wouldn't make much sense.
+
+```{r}
+dlptools::mode_ploidy(
+ ex_reads,
+ sample_col = "cell_id"
+) |>
dplyr::slice_head(n = 5)
```
+
+
+Related, we can also infer if a CN change is a gain or loss, relative to ploidy:
+
+```{r}
+dlptools::mark_cn_relative_to_ploidy(
+ ex_reads, # or can pass segments df
+ df_type = "reads" # df_type = "segs"
+) |>
+ dplyr::relocate(
+ # key added columns
+ mode_ploidy, cn_v_ploidy
+ ) |>
+ dplyr::slice_head(n = 5)
+```
+
+
## Long to Wide Reads (or segments)
@@ -344,58 +493,8 @@ dlptools::ASCN_PHASE_COLORS
# dlptools::BAF_COLORS
```
-## Data Importing
-
-If you've followed a semi-standard approach for downloading such as this:
-
-```bash
-# bash
-cd where/to/save/my_dlp/
-
-ticket="SC-8382"
-azcopy copy https://singlecellresults.blob.core.windows.net/results/${ticket}/results/annotation/ ${ticket} --recursive
-azcopy copy https://singlecellresults.blob.core.windows.net/results/${ticket}/results/hmmcopy/ ${ticket} --recursive
-
-```
-
-which produces a directory with this sort of structure:
-
-```bash
-├── SC-8382
-│ ├── annotation
-│ │ ├── metrics.csv.gz
-│ │ ├── #[...other files ...]
-│ └── hmmcopy
-│ │ ├── #[...other files ...]
-│ ├── reads.csv.gz
-│ ├── segments.csv.gz
-├── SC-8408
-│ ├── annotation
-│ │ ├── metrics.csv.gz
-│ │ ├── #[...other files ...]
-│ └── hmmcopy
-│ │ ├── #[...other files ...]
-│ ├── reads.csv.gz
-│ ├── segments.csv.gz
-├── SC-8650
-```
-
-then there are some functions that can help with loading data from these into a consistent dataframe:
-
-```{r, eval=FALSE}
-dlp_dir <- "/where/to/save/my_dlp/"
-
-# loading metrics
-metrics <- dlptools::import_dlp_files(dlp_dir, "metrics")
-
-# loading reads
-reads_df <- dlptools::import_dlp_files(dlp_dir, "reads")
-
-# loading segments...but generally this isn't a great idea
-# segs_df <- dlptools::import_dlp_files(dlp_dir, "segments")
-```
-
-
+
+
## Other