Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 41 additions & 5 deletions src/deconstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,11 +859,45 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t
// jaccard clustering (using handles for now) on traversals
vector<pair<double, int64_t>> trav_cluster_info;
vector<int> unused_child_snarl_mapping;
vector<vector<int>> trav_clusters = cluster_traversals(graph, travs, sorted_travs,
vector<pair<handle_t, handle_t>>(),
cluster_threshold,
trav_cluster_info,
unused_child_snarl_mapping);
vector<vector<int>> trav_clusters;

// If a minimum-allele-length gate is set, skip clustering at sites
// where every used traversal's interior (non-boundary) sequence is
// shorter than the threshold. This lets the clustering be SV-only
// (default 50bp matches the standard SV size cutoff) while leaving
// small variants represented exactly as they are.
bool gate_clustering = false;
if (cluster_min_allele_len > 0 && cluster_threshold < 1.0) {
int64_t max_alt_len = 0;
for (size_t i = 0; i < travs.size(); ++i) {
if (!use_trav[i]) continue;
const Traversal& t = travs[i];
int64_t len = 0;
if (t.size() > 2) {
for (size_t k = 1; k + 1 < t.size(); ++k) {
len += graph->get_length(t[k]);
}
}
if (len > max_alt_len) max_alt_len = len;
}
gate_clustering = max_alt_len < cluster_min_allele_len;
}

if (gate_clustering) {
// Trivial clusters (one per used traversal) in sorted_travs order.
// Mirrors what cluster_traversals would return when nothing collapses.
trav_cluster_info.assign(travs.size(), make_pair(-1.0, (int64_t)0));
for (int idx : sorted_travs) {
trav_clusters.push_back({idx});
trav_cluster_info[idx] = make_pair(1.0, (int64_t)0);
}
} else {
trav_clusters = cluster_traversals(graph, travs, sorted_travs,
vector<pair<handle_t, handle_t>>(),
cluster_threshold,
trav_cluster_info,
unused_child_snarl_mapping);
}

#ifdef debug
cerr << "cluster priority";
Expand Down Expand Up @@ -1286,6 +1320,7 @@ void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHand
bool strict_conflicts,
bool long_ref_contig,
double cluster_threshold,
int64_t cluster_min_allele_len,
gbwt::GBWT* gbwt,
bool star_allele) {

Expand All @@ -1301,6 +1336,7 @@ void Deconstructor::deconstruct(vector<string> ref_paths, const PathPositionHand
this->gbwt_reference_samples = gbwtgraph::parse_reference_samples_tag(*gbwt);
}
this->cluster_threshold = cluster_threshold;
this->cluster_min_allele_len = cluster_min_allele_len;
this->gbwt = gbwt;
this->star_allele = star_allele;

Expand Down
6 changes: 6 additions & 0 deletions src/deconstructor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class Deconstructor : public VCFOutputCaller {
bool strict_conflicts,
bool long_ref_contig,
double cluster_threshold = 1.0,
int64_t cluster_min_allele_len = 0,
gbwt::GBWT* gbwt = nullptr,
bool star_allele = false);

Expand Down Expand Up @@ -188,6 +189,11 @@ class Deconstructor : public VCFOutputCaller {
// merge if identical (which is what deconstruct has always done)
double cluster_threshold = 1.0;

// only apply cluster_threshold at sites where the longest non-boundary
// traversal sequence is at least this many bp. 0 disables the gate
// (so cluster_threshold applies at every site).
int64_t cluster_min_allele_len = 0;

// use *-alleles to represent haplotypes that span the parent site but don't
// traverse the current nested site (e.g., a deletion that spans a nested SNP)
// only works with include_nested
Expand Down
18 changes: 18 additions & 0 deletions src/subcommand/deconstruct_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ void help_deconstruct(char** argv) {
<< " for reference if possible (i.e. only one ref sample)" << endl
<< " -L, --cluster F cluster traversals whose (handle) Jaccard coefficient" << endl
<< " is >= F together [1.0; experimental]" << endl
<< " --cluster-min-len N only apply -L clustering at sites where the longest" << endl
<< " non-boundary traversal is >= N bp (50 = SVs only," << endl
<< " 0 = always) [0]" << endl
<< " -R, --star-allele use *-alleles to represent haplotypes that span the" << endl
<< " parent but don't traverse nested sites (requires -a)" << endl
<< " -t, --threads N use N threads" << endl
Expand Down Expand Up @@ -94,8 +97,11 @@ int main_deconstruct(int argc, char** argv) {
bool untangle_traversals = false;
bool contig_only_ref = false;
double cluster_threshold = 1.0;
int64_t cluster_min_allele_len = 0;
bool star_allele = false;

constexpr int OPT_CLUSTER_MIN_LEN = 1000;

int c;
optind = 2; // force optind past command positional argument
while (true) {
Expand All @@ -118,6 +124,7 @@ int main_deconstruct(int argc, char** argv) {
{"strict-conflicts", no_argument, 0, 'S'},
{"contig-only-ref", no_argument, 0, 'C'},
{"cluster", required_argument, 0, 'L'},
{"cluster-min-len", required_argument, 0, OPT_CLUSTER_MIN_LEN},
{"star-allele", no_argument, 0, 'R'},
{"threads", required_argument, 0, 't'},
{"verbose", no_argument, 0, 'v'},
Expand Down Expand Up @@ -182,6 +189,12 @@ int main_deconstruct(int argc, char** argv) {
case 'L':
cluster_threshold = max(0.0, min(1.0, parse<double>(optarg)));
break;
case OPT_CLUSTER_MIN_LEN:
cluster_min_allele_len = parse<int64_t>(optarg);
if (cluster_min_allele_len < 0) {
logger.error() << "--cluster-min-len must be >= 0" << endl;
}
break;
case 'R':
star_allele = true;
break;
Expand All @@ -206,6 +219,10 @@ int main_deconstruct(int argc, char** argv) {
logger.error() << "-R can only be used with -a" << endl;
}

if (cluster_min_allele_len > 0 && cluster_threshold >= 1.0) {
logger.warn() << "--cluster-min-len has no effect without -L (cluster threshold < 1.0)" << endl;
}

// Read the graph
unique_ptr<PathHandleGraph> path_handle_graph_up;
unique_ptr<GBZGraph> gbz_graph;
Expand Down Expand Up @@ -422,6 +439,7 @@ int main_deconstruct(int argc, char** argv) {
strict_conflicts,
!contig_only_ref,
cluster_threshold,
cluster_min_allele_len,
gbwt_index,
star_allele);

Expand Down
16 changes: 14 additions & 2 deletions test/t/26_deconstruct.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 59
plan tests 63

vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg
vg index hla.vg -x hla.xg
Expand Down Expand Up @@ -156,7 +156,19 @@ is "$(tail -1 small_cluster_0.vcf | awk '{print $5}')" "GATTTGA,G" "cluster-free
is "$(tail -1 small_cluster_3.vcf | awk '{print $5}')" "G" "clustered deconstruction finds fewer alt alleles"
is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered deconstruction finds correct allele info"

rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf
# --cluster-min-len gates when the longest non-boundary traversal is below
# the threshold. small_cluster's only snarl has max interior length 6 bp.
vg deconstruct small_cluster.gfa -p x -L 0.3 --cluster-min-len 0 > small_cluster_3_off.vcf
is "$(tail -1 small_cluster_3_off.vcf | awk '{print $5}')" "G" "--cluster-min-len 0 is equivalent to clustering everywhere"
vg deconstruct small_cluster.gfa -p x -L 0.3 --cluster-min-len 50 > small_cluster_3_50.vcf
is "$(tail -1 small_cluster_3_50.vcf | awk '{print $5}')" "GATTTGA,G" "--cluster-min-len 50 gates clustering on a small site"
vg deconstruct small_cluster.gfa -p x -L 0.3 --cluster-min-len 6 > small_cluster_3_6.vcf
is "$(tail -1 small_cluster_3_6.vcf | awk '{print $5}')" "G" "--cluster-min-len at the site length still clusters"
vg deconstruct small_cluster.gfa -p x --cluster-min-len 50 2>/dev/null > small_cluster_min_only.vcf
diff small_cluster_0.vcf small_cluster_min_only.vcf
is "$?" 0 "--cluster-min-len without -L is a no-op"

rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf small_cluster_3_off.vcf small_cluster_3_50.vcf small_cluster_3_6.vcf small_cluster_min_only.vcf

# Nesting tests now use a two-step process:
# 1. Compute gref cover with vg paths
Expand Down
Loading