diff --git a/INSTALL b/INSTALL index bcdd2f4e3..b02782c3e 100644 --- a/INSTALL +++ b/INSTALL @@ -266,3 +266,22 @@ mingw-w64-x86_64-tools-git (The last is only needed for building libraries compatible with MSVC.) +Windows MSYS2/MINGW64 +--------------------- + +The configure script must be used as without it the compilation will +likely fail. + +Follow MSYS2 installation instructions at +https://www.msys2.org/wiki/MSYS2-installation/ + +Then relaunch to MSYS2 shell using the "MSYS2 MinGW x64" executable. +Once in that environment (check $MSYSTEM equals "MINGW64") install the +compilers using pacman -S and the following package list: + +base-devel mingw-w64-x86_64-toolchain +mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 +mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools +mingw-w64-x86_64-tools-git + +(The last is only needed for building libraries compatible with MSVC.) diff --git a/bam2bcf.c b/bam2bcf.c index 76a0d439b..1cd019f49 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -234,11 +234,14 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // particularly indicative of being a good REF match either, // at least not in low coverage. So require solid coverage // before we start utilising such quals. - b = 0; + if (b != 0) + b = 5; q = (int)bam_get_qual(p->b)[p->qpos]; seqQ = (3*seqQ + 2*q)/8; } if (_n > 20 && seqQ > 40) seqQ = 40; + // Note baseQ changes some output fields such as I16, but has no + // significant affect on "call". baseQ = p->aux>>8&0xff; is_diff = (b != 0); @@ -357,9 +360,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp); } + // Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed + // ratios. This is detrimental on Illumina, but beneficial on PacBio CCS. + // It's possibly related to the homopolyer error likelihoods or overall + // Indel accuracy. Maybe tie this in to the -h option? + r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype + + // TODO: account for the number of unassigned reads. If depth is 50, + // but AD is 5,7 then it may look like a variant but it's probably + // should be low quality. + return n; } diff --git a/bam2bcf.h b/bam2bcf.h index e778b8952..2ffca4c6a 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -156,7 +156,7 @@ extern "C" { int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); - int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); + int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, int ref_len); void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); #ifdef __cplusplus diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 108d50557..cd5e7956b 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,7 +1,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd. + Copyright (C) 2012-2014,2016-2017, 2021-2022 Genome Research Ltd. Author: Heng Li @@ -23,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +//#define CONS_DEBUG + #include #include #include @@ -40,6 +42,18 @@ KSORT_INIT_GENERIC(uint32_t) #define MAX_TYPES 64 +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef ABS +# define ABS(a) ((a)<0?-(a):(a)) +#endif + +#ifndef MAX +# define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + // Take a reference position tpos and convert to a query position (returned). // This uses the CIGAR string plus alignment c->pos to do the mapping. // @@ -74,8 +88,16 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, return last_y; } -// FIXME: check if the inserted sequence is consistent with the homopolymer run -// l is the relative gap length and l_run is the length of the homopolymer on the reference +// l is the relative gap length and l_run is the length of the homopolymer +// on the reference. +// +// Larger seqQ is good, so increasing tandemQ calls more indels, +// and longer l_run means fewer calls. It is capped later at 255. +// For short l_runs, the qual is simply based on size of indel +// larger ones being considered more likely to be real. +// Longer indels get assigned a score based on the relative indel size +// to homopolymer, where l_run base will have already been verified by +// the caller to ensure it's compatible. static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) { int q, qh; @@ -251,12 +273,29 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, return NULL; } t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; + for (i = 0; i < m; ++i) { + int sz = (int32_t)(aux[i] - MINUS_CONST); + int j; + for (j = i+1; j < m; j++) + if (aux[j] != aux[i]) + break; + + if (sz == 0 + || (j-i >= bca->min_support && + // Note, doesn't handle bca->per_sample_flt yet + (bca->per_sample_flt + || (double)(j-i) / n_tot >= bca->min_frac))) + types[t++] = sz; + i = j-1; + } free(aux); + if (t <= 1) { + free(types); + return NULL; + } + n_types = t; + // Find reference type; types[?] == 0) for (t = 0; t < n_types; ++t) if (types[t] == 0) break; @@ -269,142 +308,536 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, return types; } -// Part of bcf_call_gap_prep. -// -// Construct per-sample consensus. -// -// Returns an array of consensus seqs, -// or NULL on failure. -static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, - int pos, bcf_callaux_t *bca, const char *ref, - int left, int right) { - int i, k, s, L = right - left + 1, max_i, max2_i; - char **ref_sample; // returned - uint32_t *cns = NULL, max, max2; - char *ref0 = NULL, *r; - ref_sample = (char**) calloc(n, sizeof(char*)); - cns = (uint32_t*) calloc(L, 4); - ref0 = (char*) calloc(L, 1); - if (!ref_sample || !cns || !ref0) { - n = 0; - goto err; +// Increment ins["str"] and freq["str"] +#define NI 100 // number of alternative insertion sequences +// Could use a hash table too, but expectation is a tiny number of alternatives +typedef struct { + char *str[NI]; + int len[NI]; + int freq[NI]; +} str_freq; + +static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { + int j; + + for (j = 0; j < NI && sf->str[j]; j++) { + if (sf->len[j] == len && memcmp(sf->str[j], str, len) == 0) + break; + } + if (j >= NI) + return 0; // too many choices; discard + + sf->freq[j]+=freq; + if (!sf->str[j]) { + // new insertion + if (!(sf->str[j] = malloc(len+1))) + return -1; + memcpy(sf->str[j], str, len); + sf->len[j] = len; } - // Convert ref ASCII to 0-15. - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; + return 0; +} - // NB: one consensus per sample 'n', not per indel type. - // FIXME: consider fixing this. We should compute alignments vs - // types, not vs samples? Or types/sample combined? - for (s = 0; s < n; ++s) { - r = ref_sample[s] = (char*) calloc(L, 1); - if (!r) { - n = s-1; - goto err; - } +/* + * Compute the consensus for a specific indel type at pos. + * + * left_shift is the number of inserted(+) or deleted(-) bases added to + * the consensus before we get to pos. This is necessary so the alignment + * band is correct as it's expected to start at left/right edges in + * sync + * + * We accumulate into several buffers for counting base types: + * cons_base - consensus of data with p->indel == type, bases or gap + * ref_base - consensus of data with p->indel != type, bases or gap + * cons_ins - consensus of data with p->indel == type, insertions + * ref_ins - consensus of data with p->indel == type, bases or gap + * + * The purpose of cons_ins vs cons_base is if we have very low + * coverage due to nearly all reads being another type, then we can + * still get a robust consensus using the other data. If we don't + * have shallow data, then we'll not use as much of ref_base as we may + * have correlated variants. + * + * Eg: + * REF: AGCTATGAGGCTGATA + * SEQ: AGGTAGGAGGGTGATA (x1) + * SEQ: AGCTACGAGG*TGATA (x24) + * SEQ: AGCTACTAGG*TGATA (x24) + * + * Cons for no-del is Cs not Gs. Cannot trust it, so use N if shallow. + * CON: AGCTACNAGGGTGATA + * + * There are still some problems in cons_ins vs ref_ins assignment. + * We sometimes seem multiple similar-length insertions added at + * different locations. Ideally we'd like to consider these as all + * the same insertion if the size is the same and it's comparable seq. + */ +static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int ref_len, int left, int right, + int sample, int type, int biggest_del, + int *left_shift, int *right_shift, + int *band, int *tcon_len, int *cpos_pos) { + // Map ASCII ACGTN* to 012345 + static uint8_t base6[256] = { + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,5,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + //A C G *^ T + 4,0,4,1,4,4,4,2, 4,4,4,4,4,4,4,4, 4,4,4,4,3,3,4,4, 4,4,4,4,4,4,4,4, + 4,0,4,1,4,4,4,2, 4,4,4,4,4,4,4,4, 4,4,4,4,3,3,4,4, 4,4,4,4,4,4,4,4, + + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + }; + + // single base or del + int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base)); + // multi-base insertions + str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); + + // non-indel ref for all reads on this sample, rather than those just + // matching type. We use this for handling the case where we have a + // homozygous deletion being studied, but with 1 or 2 reads misaligned + // and containing a base there. + // + // Eg if the type[]=0 consensus is made up of a very small sample size, + // which is also enriched for highly error prone data. We can use + // the other reads from type[] != 0 to flesh out the consensus and + // improve accuracy. + int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base)); + str_freq *ref_ins = calloc(right - left + 1, sizeof(*ref_ins)); + int i, j, k, s = sample; + char **cons = NULL; + + if (!cons_base || !cons_ins || !ref_base || !ref_ins) + goto err; - memset(cns, 0, sizeof(int) * L); + //-------------------------------------------------- + // Accumulate sequences into cons_base and cons_ins arrays + int local_band_max = 0; // maximum absolute deviation from diagonal + for (i = 0; i < n_plp[s]; i++) { + const bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + int x = b->core.pos; // ref coordinate + int y = 0; // seq coordinate + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + + int local_band = 0; // current deviation from diagonal + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int len = cigar[k] >> BAM_CIGAR_SHIFT; + int base; + int skip_to = 0; + + switch(op) { + case BAM_CSOFT_CLIP: + y += len; + break; - // collect ref and non-ref counts in cns - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - int x = b->core.pos, y = 0; - - // TODO: pileup exposes pileup_ind, but we also need e.g. - // pileup_len to know how much of the current CIGAR op-len - // we've used (or have remaining). If we had that, we - // could start at p->qpos without having to scan through - // the entire CIGAR string until we find it. - // - // Without it about all we could do is have a side channel - // to cache the last known coords. Messy, so punt for now. - // This is no longer the bottle neck until we get to 1000s of - // CIGAR ops. - - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (x + l >= left) { - j = left - x > 0 ? left - x : 0; - int j_end = right - x < l ? right - x : l; - for (; j < j_end; j++) - // Append to cns. Note this is ref coords, - // so insertions aren't in cns and deletions - // will have lower coverage. - - // FIXME: want true consensus (with ins) per - // type, so we can independently compare each - // seq to each consensus and see which it - // matches best, so we get proper GT analysis. - cns[x+j-left] += - (bam_seqi(seq, y+j) == ref0[x+j-left]) - ? 1 // REF - : (1<<16); // ALT + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: { + // Can short-cut this with j_start and j_end based on + // x+len and left,right + for (j = 0; j < len; j++, x++, y++) { + if (x < left) continue; + if (x >= right) break; + + base = bam_seqi(seq, y); + if (p->indel == type) + // Convert 4-bit base ambig code to 0,1,2,3,4 range + cons_base[x-left][seq_nt16_int[base]]++; + else if (x != pos+1) // indel being assessed question + ref_base[x-left][seq_nt16_int[base]]++; + } + break; + } + + case BAM_CINS: { + if (x >= left && x < right) { + local_band += p->indel; + if (local_band_max < local_band) + local_band_max = local_band; + } + + char ins[1024]; + for (j = 0; j < len; j++, y++) { + if (x < left) continue; + if (x >= right) + break; + base = bam_seqi(seq, y); + if (j < 1024) + ins[j] = seq_nt16_int[base]; + } + + // Insertions come before a ref match. + // 5I 5M is IIIIIM M M M M events, not + // {IIIII,M} M M M M choice. So we need to include the + // next match in our sequence when choosing the consensus. + if (x >= left && x < right) { + int ilen = j<1024?j:1024; + if (p->indel == type /*&& x == pos+1*/) { + // Assume any ins of the same size is the same ins. + // (This rescues misaligned insertions.) + if (bcf_cgp_append_cons(&cons_ins[x-left], ins, + ilen, 1) < 0) + goto err; + } else if (x != pos+1){ + if (bcf_cgp_append_cons(&ref_ins[x-left], ins, + ilen, 1) < 0) + goto err; } - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - x += l; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - y += l; + } + break; + } + + case BAM_CDEL: + if (x >= left && x < right) { + local_band += p->indel; + if (local_band_max < -local_band) + local_band_max = -local_band; } - if (x > right) - break; + // Maybe not perfect for I/D combos, but likely sufficient. + for (j = 0; j < len; j++, x++) { + if (x < left) continue; + if (x >= right) break; + if ((p->indel == type && !p->is_del) || // starts here + (p->indel == 0 && p->is_del && len == -type)) // left + cons_base[x-left][5]++; + else if (x+len <= pos+1 || (skip_to && x > skip_to)) + ref_base[x-left][5]++; + else if (x <= pos && x+len > pos+1) { + // we have a deletion which overlaps pos, but + // isn't the same "type". We don't wish to + // include these as they may bias the + // evaluation by confirming against a + // secondary consensus produced with the other + // deletion. We set a marker for how long to + // skip adding to ref_base. + if (x > skip_to) + skip_to = x+len; + } + } + break; } } - // Determine a sample specific reference. - for (i = 0; i < right - left; ++i) - r[i] = ref0[i]; - - // Find deepest and 2nd deepest ALT region (max & max2). - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) - max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) - max2 = cns[i], max2_i = i; + // Also track the biggest deviation +/- from diagonal. We use + // this band observation in our BAQ alignment step. + if (*band < local_band_max) + *band = local_band_max; + } + + //-------------------------------------------------- + // Expand cons_base to include depth from ref_base/ref_ins + // Caveat: except at pos itself, where true ref is used if type != 0 + for (i = 0; i < right-left; i++) { + // Total observed depth + int t = cons_base[i][0] + cons_base[i][1] + cons_base[i][2] + + cons_base[i][3] + cons_base[i][4] + cons_base[i][5]; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + t += cons_ins[i].freq[j]; } - // Masks mismatches present in at least 70% of the reads with 'N'. - // This code is nREF/(nREF+n_ALT) >= 70% for deepest region. - // The effect is that at least 30% of bases differing to REF will - // use "N" in consensus, so we don't penalise ALT or REF when - // aligning against it. (A poor man IUPAC code) + // Similarly for depth on the non-ALT calls (NB: not necessarily + // REF as maybe it's other ALTs). + int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] + + ref_base[i][3] + ref_base[i][4] + ref_base[i][5]; + for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + r += ref_ins[i].freq[j]; + } + + // When evaluating this particular indel, we don't want to + // penalise alignments by SNP errors elsewhere. This can + // happen when we have low depth for a particular 'type'. // - // Why is it only done in two loci at most? - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) - max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) - max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; + // So add in a little data from ref_base/ref_ins. + double rfract = (r - t*2)*.75 / (r+1); + + if (rfract < 1.01 / (r+1e-10)) + rfract = 1.01 / (r+1e-10); // low depth compensation + + // TODO: consider limiting rfract so we never drown out the + // signal. We want to use the remaining data only to correct + // for sequencing errors in low depth alleles. If we get + // conflicts, it's better to use N than to change a base + // incase that variant is genuine. + if (i+left >= pos+1 && i+left < pos+1-biggest_del) { + // We're overlapping the current indel region, so + // we don't wish to bring in evidence from the other + // "type" data as it'll harm calling. + continue; + } else { + // Otherwise add in a portion of other data to + // boost low population numbers. + cons_base[i][0] += rfract * ref_base[i][0]; + cons_base[i][1] += rfract * ref_base[i][1]; + cons_base[i][2] += rfract * ref_base[i][2]; + cons_base[i][3] += rfract * ref_base[i][3]; + cons_base[i][4] += rfract * ref_base[i][4]; + cons_base[i][5] += rfract * ref_base[i][5]; + } + + // Similarly for insertions too; consider a different rfract here? + for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + if (bcf_cgp_append_cons(&cons_ins[i], + ref_ins[i].str[j], ref_ins[i].len[j], + rfract * ref_ins[i].freq[j]) < 0) + goto err; + } + } + + //-------------------------------------------------- + // Allocate consensus buffer, to worst case length + int max_len = right-left; + for (i = 0; i < right-left; i++) { + if (!cons_ins[i].str[0]) + continue; - //for (i = 0; i < right - left; ++i) - // fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); - //fputc('\n', stderr); + int ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].str[j] && ins < cons_ins[i].len[j]) + ins = cons_ins[i].len[j]; + } + max_len += ins; } + cons = malloc((max_len+1)*2 + sizeof(char *)*2); + if (!cons) + goto err; + cons[0] = (char *)&cons[2]; + cons[1] = cons[0] + max_len+1; + + //-------------------------------------------------- + // Merge insertions where they are the same length but different + // sequences. + // NB: we could just index by length and have accumulators for each, + // instead of storing separately and merging later (here). + // Ie str_freq.str is [NI][5] instead. + for (i = 0; i < right-left; i++) { + int ins[1024][5]; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; - free(ref0); - free(cns); + if (cons_ins[i].freq[j] == 0) + continue; // already merged - return ref_sample; + int l; + for (l = 0; l < cons_ins[i].len[j]; l++) { + // Append to relevant frequency counter, zero all others + ins[l][0] = ins[l][1] = ins[l][2] = ins[l][3] = ins[l][4] = 0; + uint8_t b = cons_ins[i].str[j][l]; + ins[l][b] = cons_ins[i].freq[j]; + } - err: - free(ref0); - free(cns); - if (ref_sample) { - for (s = 0; s < n; s++) - free(ref_sample[s]); - free(ref_sample); + // Merge other insertions of the same length to ins[] counters + for (k = j+1; k < NI; k++) { + if (!cons_ins[i].str[k]) + break; + if (cons_ins[i].len[k] != cons_ins[i].len[j]) + continue; + if (cons_ins[i].freq[k] == 0) + continue; // redundant? + + // Merge str[j] and str[k] + for (l = 0; l < cons_ins[i].len[k]; l++) { + uint8_t b = cons_ins[i].str[k][l]; + ins[l][b] += cons_ins[i].freq[k]; + } + cons_ins[i].freq[j] += cons_ins[i].freq[k]; + cons_ins[i].freq[k] = 0; + } + + // Now replace ins[j] with the consensus insertion of this len. + for (l = 0; l < cons_ins[i].len[j]; l++) { + int max_v = 0, base = 0; + int tot = ins[l][0] + ins[l][1] + ins[l][2] + + ins[l][3] + ins[l][4]; + if (max_v < ins[l][0]) max_v = ins[l][0], base = 0; + if (max_v < ins[l][1]) max_v = ins[l][1], base = 1; + if (max_v < ins[l][2]) max_v = ins[l][2], base = 2; + if (max_v < ins[l][3]) max_v = ins[l][3], base = 3; + if (max_v < ins[l][4]) max_v = ins[l][4], base = 4; + + cons_ins[i].str[j][l] = (max_v > 0.6*tot) ? base : 4; + } + } } - return NULL; +#define CONS_CUTOFF .40 // 40% needed for base vs N +#define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1] +#define CONS_CUTOFF_INC .40 // 40% to include any insertion cons[0] +#define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] HOM +#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N + + //-------------------------------------------------- + // Walk through the frequency arrays to call the consensus. + // We produce cons[0] and cons[1]. Both include strongly + // homozygous indels. Both also include the indel at 'pos'. + // However for heterozygous indels we call the most likely event + // for cons[0] and the less-likely alternative in cons[1]. + // TODO: a proper phase analysis so multiple events end up + // combining together into the correct consensus. + *left_shift = 0; + *right_shift = 0; + int cnum; + + // Het call filled out in cnum==0 (+ve or -ve). + // Used in cnum==1 to do the opposite of whichever way we did before. + int heti[1024] = {0}, hetd[1024] = {0}; + + *cpos_pos = -1; + for (cnum = 0; cnum < 2; cnum++) { + for (i = k = 0; i < right-left; i++) { + // Location in consensus matching the indel itself + if (i >= pos-left+1 && *cpos_pos == -1) + *cpos_pos = k; + + int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0; + for (j = 0; j < 6; j++) { + // Top 2 consensus calls + if (max_v < cons_base[i][j]) { + max_v2 = max_v, max_j2 = max_j; + max_v = cons_base[i][j], max_j = j; + } else if (max_v2 < cons_base[i][j]) { + max_v2 = cons_base[i][j], max_j2 = j; + } + tot += cons_base[i][j]; + } + + // +INS + int max_v_ins = 0, max_j_ins = 0; + int tot_ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].freq[j] == 0) + continue; // previously merged + + if (max_v_ins < cons_ins[i].freq[j]) + //if (i != pos-left+1 || cons_ins[i].len[j] == type) + max_v_ins = cons_ins[i].freq[j], max_j_ins = j; + tot_ins += cons_ins[i].freq[j]; + } + + // NB: tot is based on next matching base, so it includes + // everything with or without the insertion. + int tot_sum = tot; + int always_ins = + (i == pos-left+1 && type>0) || // current eval + max_v_ins > CONS_CUTOFF_INC2*tot_sum;// HOM + int het_ins = 0; + if (!always_ins && max_v_ins >= bca->min_support) { + // Candidate HET ins. + if (cnum == 0) { + het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum; + if (i < 1024) heti[i] = het_ins + ? 1 + : (max_v_ins > .3*tot_sum ? -1:0); + } else { + // HET but uncalled before + het_ins = i < 1024 ? (heti[i] == -1) : 0; + } + } + + if (always_ins || het_ins) { + if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { + // Insert bases + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { + if (cnum == 0) { + if (k < pos-left+*left_shift) + (*left_shift)++; + else + (*right_shift)++; + } + cons[cnum][k++] = cons_ins[i].str[max_j_ins][j]; + } + } else { + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) + cons[cnum][k++] = 4; // 'N'; + } + } + + // Call deletions & bases + int always_del = (type < 0 && i > pos-left && i <= pos-left-type) + || cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del + int het_del = 0; + if (!always_del && cons_base[i][5] >= bca->min_support) { + // Candidate HET del. + if (cnum == 0) { + het_del = cons_base[i][5] >= CONS_CUTOFF * tot; + if (i < 1024) { + if (i > pos-left && i <= pos-left-biggest_del) + hetd[i] = 0; + else + hetd[i] = het_del + ? 1 + : (cons_base[i][5] >= .3 * tot ? -1 : 0); + } + } else { + // HET del uncalled on cnum 0 + het_del = i < 1024 ? (hetd[i] == -1) : 0; + if (max_j == 5 && het_del == 0) { + max_v = max_v2; + max_j = max_j2; + } + } + } + if (always_del || het_del) { + // Deletion + if (k < pos-left+*left_shift) + (*left_shift)--; + else + (*right_shift)++; + } else { + // Finally the easy case - a non-indel base or an N + if (max_v > CONS_CUTOFF*tot) + cons[cnum][k++] = max_j; // "ACGTN*" + else if (max_v > 0) + cons[cnum][k++] = 4; // 'N'; + else { + cons[cnum][k] = left+k < ref_len + ? base6[(uint8_t)ref[left+k]] + : 4; + k++; + } + } + } + + tcon_len[cnum] = k; + } + + // TODO: replace by io_lib's string pool for rapid tidying. + // For now this isn't the bottleneck though. + for (i = 0; i < right-left; i++) { + for (j = 0; j < NI; j++) { + if (cons_ins[i].str[j]) + free(cons_ins[i].str[j]); + if (ref_ins[i].str[j]) + free(ref_ins[i].str[j]); + } + } + + err: + free(cons_base); + free(ref_base); + free(cons_ins); + free(ref_ins); + + return cons; } // The length of the homopolymer run around the current position @@ -427,11 +860,13 @@ static int bcf_cgp_l_run(const char *ref, int pos) { } -// Compute the consensus for this sample 's', minus indels which -// get added later. -static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, - int pos, int *types, int n_types, - int max_ins, int s) { +// Compute the insertion consensus for this sample 's' via a basic +// majority rule. +// +// TODO: merge this into bcf_cgp_consensus as another return value? +static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { int i, j, t, k; int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int)); if (!inscns_aux) @@ -478,25 +913,53 @@ static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, return inscns; } -#ifndef MIN -# define MIN(a,b) ((a)<(b)?(a):(b)) -#endif - // Part of bcf_call_gap_prep. // // Realign using BAQ to get an alignment score of a single read vs -// a haplotype consensus. +// a haplotype consensus. TODO: replace BAQ with something more robust. +// +// There are many coordinates, so let's explain them. +// - left, right, tbeg, tend, r_start and r_end are in aligned reference +// coordinates. +// left/right start from pos +/- indel_win_size. +// r_start/r_end are the BAM first and last mapped coord on the reference. +// tbeg and tend are the intersection of the two. +// - qbeg and qend are in BAM sequence coordinates +// - qpos is in sequence coordinates, relative to qbeg. +// +// To see what this means, we have illustrations with coordinates +// above the seqs in reference space and below the seqs in BAM seq space. +// +// Overlap left: +// tbeg tend +// r_start left pos r_end right +// REF :..............|--------------------#------:--------------|... +// SEQ :..............|--------------------#------| +// 0 qbeg qpos qend +// +// Overlap right: +// r_start tend +// left tbeg pos right r_end +// REF ...|--------------:-----#---------------------|...........: +// SEQ |-----#---------------------|...........: +// qbeg qpos qend +// 0 +// +// The "-" sequence is the bit passed in. +// Ie ref2 spans left..right and query spans qbeg..qend. +// We need to adjust ref2 therefore to tbeg..tend. // // Fills out score // Returns 0 on success, // <0 on error static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, - int type, uint8_t *ref2, uint8_t *query, + int type, int band, + uint8_t *ref1, uint8_t *ref2, uint8_t *query, int r_start, int r_end, int long_read, - int tbeg, int tend, + int tbeg, int tend1, int tend2, int left, int right, int qbeg, int qend, - int qpos, int max_deletion, + int pos, int qpos, int max_deletion, int *score) { // Illumina probaln_par_t apf = { 1e-4, 1e-2, 10 }; @@ -510,11 +973,38 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, } type = abs(type); - apf.bw = type + 3; - int l, sc; + if (band > (qend-qbeg)/2-3) + band = (qend-qbeg)/2-3; + apf.bw = band + 3; // or abs(l_ref - l_query), so we want to keep similar + + int l, sc1, sc2; const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; + // Trim poly_Ns at ends of ref. + // This helps to keep len(ref) and len(query) similar, to reduce + // band size and reduce the chance of -ve BAQ scores. + for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++) + if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) + break; + if (l > ABS(type)) + tbeg += l-ABS(type); + + for (l = tend1-tbeg-1; l >= 0; l--) + if (ref1[l + tbeg-left] != 4) + break; + l = tend1-tbeg-1 - l; + if (l > ABS(type)) + tend1 -= l-ABS(type); + + for (l = tend2-tbeg-1; l >= 0; l--) + if (ref2[l + tbeg-left] != 4) + break; + l = tend2-tbeg-1 - l; + if (l > ABS(type)) { + tend2 -= l-ABS(type); + } + // Get segment of quality, either ZQ tag or if absent QUAL. if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) return -1; @@ -531,21 +1021,63 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // The bottom 8 bits are length-normalised score while // the top bits are unnormalised. - sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, - query, qend - qbeg, qq, &apf, 0, 0); - if (sc < 0) { + // + // Try original cons and new cons and pick best. + // This doesn't reduce FN much (infact maybe adds very slightly), + // but it does reduce GT errors and is a slight reduction to FP. + sc2 = probaln_glocal(ref2 + tbeg - left, tend2 - tbeg, + query, qend - qbeg, qq, &apf, 0, 0); + + if (tend1 != tend2 || + memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, + tend1 - tbeg) != 0) + sc1 = probaln_glocal(ref1 + tbeg - left, tend1 - tbeg, + query, qend - qbeg, qq, &apf, 0, 0); + else + sc1 = INT_MAX; // skip + +#ifdef CONS_DEBUG + fprintf(stderr, "\nref1"); + fprintf(stderr, "%c ", + memcmp(ref1+tbeg-left, query, qend-qbeg)?':':'='); + for (int j = 0; j < tend1-tbeg; j++) + putc("ACGTN"[(uint8_t)ref1[j+tbeg-left]], stderr); + putc('\n', stderr); + fprintf(stderr, "ref2"); + fprintf(stderr, "%c ", + memcmp(ref2+tbeg-left, query, qend-qbeg)?':':'='); + for (int j = 0; j < tend2-tbeg; j++) + putc("ACGTN"[(uint8_t)ref2[j+tbeg-left]], stderr); + putc('\n', stderr); + fprintf(stderr, "qury: "); + for (int j = 0; j < qend-qbeg; j++) + putc("ACGTN"[(uint8_t)query[j]], stderr); + putc('\n', stderr); + fprintf(stderr, "sc1 %-9d sc2 %-9d ", sc1, sc2); +#endif + + if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; free(qq); return 0; } + if (sc1 < 0) { + // sc2 is already correct + } else if (sc2 < 0) { + sc2 = sc1; + } else { + // sc1 and sc2 both pass, so use best + if (sc2 > sc1) + sc2 = sc1; + } // used for adjusting indelQ below - l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; - *score = sc<<8 | MIN(255, l); + l = (int)((100. * sc2 / (qend - qbeg) + .499) * bca->indel_bias); + *score = sc2<<8 | MIN(255, l); rep_ele *reps, *elt, *tmp; uint8_t *seg = ref2 + tbeg - left; - int seg_len = tend - tbeg + type; + int seg_len = tend2 - tbeg; // Note: although seg moves (tbeg varies), ref2 is reused many times // so we could factor out some find_STR calls. However it's not the @@ -566,11 +1098,21 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // This is emphasised further if the sequence ends with // soft clipping. DL_FOREACH_SAFE(reps, elt, tmp) { - if (elt->start <= qpos && elt->end >= qpos) { - iscore += (elt->end-elt->start) / elt->rep_len; // c - if (elt->start+tbeg <= r_start || - elt->end+tbeg >= r_end) - iscore += 2*(elt->end-elt->start); + int str_beg = elt->start+tbeg; + int str_end = elt->end+tbeg; + + + if (str_beg <= pos && str_end >= pos) { + // Overlaps indel region; num repeat units. + iscore += (elt->end-elt->start) / elt->rep_len; + } +#define STR_HALO2 (2+2*elt->rep_len) + if (str_beg <= pos+STR_HALO2 && str_end >= pos-STR_HALO2) { + // Worst: extends beyond read end by >= 1 repeat unit + if (str_beg <= r_start-elt->rep_len) + iscore += 10*(r_start - str_beg)/elt->rep_len; + if (str_end >= r_end+elt->rep_len) + iscore += 10*(elt->end+tbeg - r_end)/elt->rep_len; } DL_DELETE(reps, elt); @@ -580,6 +1122,9 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // Apply STR score to existing indelQ l = (*score&0xff)*.8 + iscore*2; *score = (*score & ~0xff) | MIN(255, l); +#ifdef CONS_DEBUG + fprintf(stderr, " iscore %-4d l %d %d..%d\n\n", iscore, l, r_start, r_end); +#endif free(qq); @@ -601,6 +1146,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, for (s = K = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; + // Labelling is confusing here. + // sct is short for score. + // sc is score + t(type) + // Why aren't these variable names reversed? int *sct = &score[K*n_types], seqQ, indelQ; for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; for (t = 1; t < n_types; ++t) // insertion sort @@ -614,6 +1163,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, * compromise for multi-allelic indels. */ if ((sc[0]&0x3f) == ref_type) { + // sc >> 14 is the total score. It's been shifted by 8 + // from normalised score and 6 from type. indelQ = (sc[1]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); } else { @@ -622,8 +1173,14 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, indelQ = (sc[t]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); } - tmp = sc[0]>>6 & 0xff; + + tmp = sc[0]>>6 & 0xff; // normalised score + // reduce indelQ + // high score = bad, low score = good. + // low normalised scores leave indelQ unmodified + // high normalised scores set indelQ to 0 + // inbetween scores have a linear scale from indelQ to 0 indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); // Doesn't really help accuracy, but permits -h to take @@ -631,9 +1188,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (indelQ > seqQ) indelQ = seqQ; if (indelQ > 255) indelQ = 255; if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; - // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + + // use 22 bits in total + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; + sumq[sc[0]&0x3f] += indelQ; } } // determine bca->indel_types[] and bca->inscns @@ -692,14 +1250,14 @@ specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt. - 8: indel quality .. aux&0xff */ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, - bcf_callaux_t *bca, const char *ref) + bcf_callaux_t *bca, const char *ref, int ref_len) { if (ref == 0 || bca == 0) return -1; - int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins; - int *score, max_ref2; - int N, K, l_run, ref_type, n_alt; - char *inscns = 0, *ref2, *query, **ref_sample; + int i, s, t, n_types, *types = NULL, max_rd_len, left, right, max_ins; + int *score = NULL; + int N, K, l_run, ref_type, n_alt = -1; + char *inscns = NULL, *query = NULL; // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -715,54 +1273,63 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, &max_rd_len, &n_types, &ref_type, &N); if (!types) - return -1; + goto err; // calculate left and right boundary left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; right = pos + bca->indel_win_size; - if (types[0] < 0) right -= types[0]; + int del_size = types[0]<0 ? -types[0] : 0; + right += del_size; // in case the alignments stand out the reference for (i = pos; i < right; ++i) if (ref[i] == 0) break; right = i; - - /* The following call fixes a long-existing flaw in the INDEL - * calling model: the interference of nearby SNPs. However, it also - * reduces the power because sometimes, substitutions caused by - * indels are not distinguishable from true mutations. Multiple - * sequence realignment helps to increase the power. - * - * Masks mismatches present in at least 70% of the reads with 'N'. - */ - ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right); + // compute the likelihood given each type of indel for each read + max_ins = types[n_types - 1]; // max_ins is at least 0 // The length of the homopolymer run around the current position l_run = bcf_cgp_l_run(ref, pos); + int l_run_base = seq_nt16_table[(uint8_t)ref[pos+1]]; + int l_run_ins = 0; // construct the consensus sequence (minus indels, which are added later) - max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos, - types, n_types, max_ins, s); + // TODO: replace filling inscns[] with calc_consensus return + // so the merges of the insertion consensus for type[t] is + // reported directly. (It may need adjustment to avoid N) + inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos, + types, n_types, max_ins, s); if (!inscns) return -1; } - // compute the likelihood given each type of indel for each read - max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); - ref2 = (char*) calloc(max_ref2, 1); query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); score = (int*) calloc(N * n_types, sizeof(int)); bca->indelreg = 0; double nqual_over_60 = bca->nqual / 60.0; + int biggest_del = 0; + int biggest_ins = 0; + for (t = 0; t < n_types; t++) { + if (biggest_del > types[t]) + biggest_del = types[t]; + if (biggest_ins < types[t]) + biggest_ins = types[t]; + } + int band = biggest_ins - biggest_del; // NB del is -ve + for (t = 0; t < n_types; ++t) { int l, ir; - // compute indelreg + // Compute indelreg. This is the context in the reference. Eg: + // + // REF: AG--TTTC Inscns is "TT". + // SEQ: AGTTTTTC Indelreg is 3; next 3 "TTT" bases + // + // => GTTT GTTTTT is call. if (types[t] == 0) ir = 0; else if (types[t] > 0) @@ -773,40 +1340,46 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (ir > bca->indelreg) bca->indelreg = ir; - // Identify max deletion length - int max_deletion = 0; - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - if (max_deletion < -p->indel) - max_deletion = -p->indel; - } - } - // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { - // Construct ref2 from ref_sample, inscns and indels. - // This is now the true sample consensus (possibly prepended - // and appended with reference if sample data doesn't span - // the full length). - for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - - if (types[t] <= 0) - j += -types[t]; - else - for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; - - for (; j < right && ref[j]; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) - ref2[k] = 4; - - if (right > j) - right = j; - - // align each read to ref2 + char **tcons; + int left_shift, right_shift; + int tcon_len[2]; + int cpos_pos; + tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_len, + left, right, s, types[t], biggest_del, + &left_shift, &right_shift, &band, + tcon_len, &cpos_pos); +#ifdef CONS_DEBUG + { + int j; + for (j = 0; j < 2; j++) { + int k; + fprintf(stderr, "Cons%d @ %d %4d/%4d ", + j, pos, types[t], left_shift); + for (k = 0; k < tcon_len[j]; k++) { + if (k == cpos_pos) + putc('#', stderr); + putc("ACGTN"[(uint8_t)tcons[j][k]], stderr); + } + putc('\n', stderr); + } + } +#endif + + // Scan for base-runs in the insertion. + // We use this to avoid over-correction in est_seqQ when the + // insertion is not part of the neighbouring homopolymer. + int k = tcons[0][cpos_pos], j; + for (j = 0; j < types[t]; j++) + if (tcons[0][cpos_pos+j] != k) + break; + if (j && j == types[t]) + l_run_ins |= "\x1\x2\x4\x8\xf"[k]; // ACGTN + if (types[t] < 0) + l_run_ins |= 0xff; + + // align each read to consensus(es) for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; @@ -858,34 +1431,60 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, continue; // determine the start and end of sequences for alignment - // FIXME: loops over CIGAR multiple times int left2 = left, right2 = right; - if (p->b->core.l_qseq > 1000) { + int min_win_size = MAX(-biggest_del, biggest_ins); + min_win_size += ABS(left_shift) + ABS(right_shift); + { + rep_ele *reps, *elt, *tmp; + reps = find_STR(tcons[0], tcon_len[0], 0); + //int max_str = 0; + int tot_str = 0; + DL_FOREACH_SAFE(reps, elt, tmp) { + // if (max_str < elt->end - elt->start) + // max_str = elt->end - elt->start; + tot_str += elt->end - elt->start; + DL_DELETE(reps, elt); + free(elt); + } + + // Ideally max_str should be enough, but it's still not + // sufficient in longer range some repeats. + //min_win_size += max_str; + min_win_size += tot_str; + } + min_win_size += 10; + if (p->b->core.l_qseq > 1000) { // ||1 for 7f-long // long read data needs less context. It also tends to // have many more candidate indels to investigate so // speed here matters more. - if (pos - left >= bca->indel_win_size) - left2 += bca->indel_win_size/2; - if (right-pos >= bca->indel_win_size) - right2 -= bca->indel_win_size/2; + if (pos - left >= min_win_size) + left2 = MAX(left2, pos - min_win_size); + if (right-pos >= min_win_size) + right2 = MIN(right2, pos + min_win_size); } + // Genomic coords for first and last base of query + // alignment. This is only used in bcf_cgp_align_score + // for computing scores by looking for the proximity + // of STRs with the end of the query alignment. int r_start = p->b->core.pos; int r_end = bam_cigar2rlen(p->b->core.n_cigar, bam_get_cigar(p->b)) -1 + r_start; - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2, - 0, &tbeg); + // Map left2/right2 genomic coordinates to qbeg/qend + // query coordinates. The query may not span the + // entire left/right region, so this also returns the + // equivalent genomic coords for qbeg/qend in tbeg/tend. + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), + left2, 0, &tbeg); qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, 0, &tend) - qbeg; - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2, - 1, &tend); + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), + right2, 1, &tend); - if (types[t] < 0) { - int l = -types[t]; - tbeg = tbeg - l > left? tbeg - l : left; - } + int old_tend = tend; + int old_tbeg = tbeg; // write the query sequence for (l = qbeg; l < qend; ++l) @@ -895,53 +1494,59 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // RG platform field. int long_read = p->b->core.l_qseq > 1000; - // do realignment; this is the bottleneck + // tbeg and tend are the genomic locations equivalent + // to qbeg and qend on the sequence. + // These may being entirely within our left/right + // coordinates over which we've computed the + // consensus, or overlapping to left/right. + // + // We know an estimation of band, plus biggest indel, + // so we can trim tbeg/tend to a smaller region if we + // wish here. This speeds up BAQ scoring. + int wband = band + MAX(-biggest_del, biggest_ins)*2 + 20; + int tend1 = left + tcon_len[0] - (left2-left); + int tend2 = left + tcon_len[1] - (left2-left); + tend1 = MIN(tend1, old_tend + wband); + tend2 = MIN(tend2, old_tend + wband); + tbeg = MAX(left2, old_tbeg - wband); + + // do realignment; this is the bottleneck. + // + // Note low score = good, high score = bad. if (tend > tbeg) { - if (bcf_cgp_align_score(p, bca, types[t], - (uint8_t *)ref2 + left2-left, + if (bcf_cgp_align_score(p, bca, types[t], band, + (uint8_t *)tcons[0] + left2-left, + (uint8_t *)tcons[1] + left2-left, (uint8_t *)query, r_start, r_end, long_read, - tbeg, tend, left2, right2, - qbeg, qend, qpos, max_deletion, + tbeg, tend1, tend2, + left2, left + tcon_len[0], + qbeg, qend, pos,qpos, -biggest_del, &score[K*n_types + t]) < 0) { - score[K*n_types + t] = 0xffffff; - return -1; + goto err; } } else { // place holder large cost for reads that cover the // region entirely within a deletion (thus tend < tbeg). score[K*n_types + t] = 0xffffff; } -#if 0 - for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); - fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) - fputc("ACGTN"[(int)query[l]], stderr); - fputc('\n', stderr); - fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " - "qbeg=%d tbeg=%d score=%d\n", - pos, types[t], s, i, bam_get_qname(p->b), - qbeg, tbeg, sc); -#endif } + free(tcons); } } // compute indelQ + if (!(l_run_base & l_run_ins)) + l_run = 1; // different base type in ins to flanking region. n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, ref_type, types, n_types, score); + err: // free - free(ref2); free(query); free(score); - - for (i = 0; i < n; ++i) - free(ref_sample[i]); - - free(ref_sample); - free(types); free(inscns); + free(types); + free(inscns); return n_alt > 0? 0 : -1; } diff --git a/mpileup.c b/mpileup.c index fd5aa510e..35c9b2117 100644 --- a/mpileup.c +++ b/mpileup.c @@ -575,7 +575,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth && (bcf_callaux_clean(conf->bca, &conf->bc), - bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) + bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref, ref_len) >= 0)) { for (i = 0; i < conf->gplp->n; ++i) bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); @@ -1224,7 +1224,7 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; - mplp.indel_win_size = 110; + mplp.indel_win_size = 80; mplp.clevel = -1; hts_srand48(0); @@ -1410,9 +1410,9 @@ int main_mpileup(int argc, char *argv[]) char *tmp; mplp.indel_win_size = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); - if ( mplp.indel_win_size < 110 ) + if ( mplp.indel_win_size < 20 ) { - mplp.indel_win_size = 110; + mplp.indel_win_size = 20; fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size); } } diff --git a/str_finder.c b/str_finder.c index 800cbfef9..ebf26561d 100644 --- a/str_finder.c +++ b/str_finder.c @@ -1,7 +1,7 @@ /* str_finder.c -- Short Tandem Repeat finder. Originally from Crumble (https://github.com/jkbonfield/crumble) - Copyright (C) 2015-2016, 2021 Genome Research Ltd. + Copyright (C) 2015-2016, 2021-2022 Genome Research Ltd. Author: James Bonfield @@ -139,10 +139,10 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, */ rep_ele *find_STR(char *cons, int len, int lower_only) { int i, j; - uint32_t w = 0; + uint64_t w = 0; rep_ele *reps = NULL; - for (i = j = 0; i < len && j < 15; i++) { + for (i = j = 0; i < len && j < 26; i++) { if (cons[i] == '*') continue; w <<= 2; @@ -162,6 +162,18 @@ rep_ele *find_STR(char *cons, int len, int lower_only) { add_rep(&reps, cons, len, i, 6, lower_only, w); if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff)) add_rep(&reps, cons, len, i, 7, lower_only, w); + if (j>=15 && (w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + if (j>=17 && (w&0x003ffff) == ((w>>18)&0x003ffff)) + add_rep(&reps, cons, len, i, 9, lower_only, w); + if (j>=19 && (w&0x00fffff) == ((w>>20)&0x00fffff)) + add_rep(&reps, cons, len, i,10, lower_only, w); + if (j>=21 && (w&0x03fffff) == ((w>>22)&0x03fffff)) + add_rep(&reps, cons, len, i,11, lower_only, w); + if (j>=23 && (w&0x0ffffff) == ((w>>24)&0x0ffffff)) + add_rep(&reps, cons, len, i,12, lower_only, w); + if (j>=24 && (w&0x3ffffff) == ((w>>26)&0x3ffffff)) + add_rep(&reps, cons, len, i,13, lower_only, w); j++; } @@ -172,21 +184,33 @@ rep_ele *find_STR(char *cons, int len, int lower_only) { w <<= 2; w |= cons[i]; //printf("%3d %c w=%08x\n", i, cons[i], w); - if ((w&0xffff) == ((w>>16)&0xffff)) + if ((w&0xfffffff) == ((w>>28)&0xfffffff)) + add_rep(&reps, cons, len, i, 14, lower_only, w); + else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff)) + add_rep(&reps, cons, len, i, 13, lower_only, w); + else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff)) + add_rep(&reps, cons, len, i, 12, lower_only, w); + else if ((w&0x03fffff) == ((w>>22)&0x03fffff)) + add_rep(&reps, cons, len, i, 11, lower_only, w); + else if ((w&0x00fffff) == ((w>>20)&0x00fffff)) + add_rep(&reps, cons, len, i, 10, lower_only, w); + else if ((w&0x003ffff) == ((w>>18)&0x003ffff)) + add_rep(&reps, cons, len, i, 9, lower_only, w); + else if ((w&0xffff) == ((w>>16)&0xffff)) add_rep(&reps, cons, len, i, 8, lower_only, w); - else if ((w&0x3fff) == ((w>>14)&0x3fff)) + else if ((w&0x3fff) == ((w>>14)&0x3fff)) add_rep(&reps, cons, len, i, 7, lower_only, w); - else if ((w&0x0fff) == ((w>>12)&0x0fff)) + else if ((w&0x0fff) == ((w>>12)&0x0fff)) add_rep(&reps, cons, len, i, 6, lower_only, w); - else if ((w&0x03ff) == ((w>>10)&0x03ff)) + else if ((w&0x03ff) == ((w>>10)&0x03ff)) add_rep(&reps, cons, len, i, 5, lower_only, w); - else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) add_rep(&reps, cons, len, i, 4, lower_only, w); - else if ((w&0x003f) == ((w>> 6)&0x003f)) + else if ((w&0x003f) == ((w>> 6)&0x003f)) add_rep(&reps, cons, len, i, 3, lower_only, w); - else if ((w&0x000f) == ((w>> 4)&0x000f)) + else if ((w&0x000f) == ((w>> 4)&0x000f)) add_rep(&reps, cons, len, i, 2, lower_only, w); - else if ((w&0x0003) == ((w>> 2)&0x0003)) + else if ((w&0x0003) == ((w>> 2)&0x0003)) add_rep(&reps, cons, len, i, 1, lower_only, w); } diff --git a/test/mpileup/indel-AD.1.out b/test/mpileup/indel-AD.1.out index 9d785f963..6b28b5264 100644 --- a/test/mpileup/indel-AD.1.out +++ b/test/mpileup/indel-AD.1.out @@ -166,9 +166,9 @@ 000000F 535 . G A,<*> 0 . DP=125;I16=65,52,0,1,4309,171791,12,144,7020,421200,60,3600,2679,64385,25,625;QS=0.997223,0.00277713,0;SGB=-0.379885;RPBZ=-1.14518;MQBZ=0;MQSBZ=0;BQBZ=-1.74874;SCBZ=0.0762539;FS=0;MQ0F=0 PL:AD 0,255,255,255,255,255:117,1,0 000000F 536 . T G,A,<*> 0 . DP=125;I16=65,51,0,2,4274,171298,24,288,6960,417600,120,7200,2661,64041,48,1154;QS=0.994416,0.002792,0.002792,0;VDB=0.1;SGB=-0.453602;RPBZ=-1.69957;MQBZ=0;MQSBZ=0;BQBZ=-2.39373;SCBZ=-0.714873;FS=0;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:116,1,1,0 000000F 537 . A <*> 0 . DP=125;I16=65,53,0,0,4390,175290,0,0,7080,424800,0,0,2713,65375,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:118,0 -000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=37,25,30,27,2480,99200,2280,91200,3720,223200,3420,205200,1399,33485,1298,31150;QS=0.260049,0.739951;VDB=0.0363603;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;SCBZ=0.641853;FS=0;MQ0F=0 PL:AD 255,0,27:62,57 +000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=36,26,31,27,2480,99200,2320,92800,3720,223200,3480,208800,1405,33737,1313,31375;QS=0.385658,0.614342;VDB=0.0342923;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;SCBZ=0.641853;FS=0;MQ0F=0 PL:AD 255,0,193:62,58 000000F 538 . C <*> 0 . DP=65;I16=36,26,0,0,2195,86349,0,0,3720,223200,0,0,1432,34600,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,187,255:62,0 -000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=30,26,36,25,2240,89600,2440,97600,3360,201600,3660,219600,1279,30735,1380,33214;QS=0.218762,0.781238;VDB=0.0165787;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;SCBZ=-0.994262;FS=0;MQ0F=0 PL:AD 255,0,27:56,61 +000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=31,27,36,26,2320,92800,2480,99200,3480,208800,3720,223200,1318,31556,1405,33839;QS=0.360065,0.639935;VDB=0.0289027;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;SCBZ=-0.994262;FS=0;MQ0F=0 PL:AD 255,0,195:58,62 000000F 539 . T <*> 0 . DP=60;I16=29,26,0,0,2120,86238,0,0,3300,198000,0,0,1260,30374,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,166,255:55,0 000000F 540 . G <*> 0 . DP=124;I16=64,53,0,0,4130,161310,0,0,7020,421200,0,0,2703,65511,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:117,0 000000F 541 . G <*> 0 . DP=124;I16=64,53,0,0,4143,160525,0,0,7020,421200,0,0,2705,65703,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:117,0 @@ -286,11 +286,11 @@ 000000F 653 . A <*> 0 . DP=21;I16=9,12,0,0,804,31130,0,0,1260,75600,0,0,303,5555,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,63,255:21,0 000000F 654 . A <*> 0 . DP=21;I16=9,12,0,0,659,22061,0,0,1260,75600,0,0,285,5117,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,63,255:21,0 000000F 655 . C <*> 0 . DP=21;I16=9,12,0,0,664,22342,0,0,1260,75600,0,0,266,4666,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,63,255:21,0 -000000F 655 . CACAATACAA CACAA 0 . INDEL;IDV=6;IMF=0.285714;DP=21;I16=0,2,5,1,240,28800,720,86400,120,7200,360,21600,46,1060,100,1788;QS=0.0977444,0.902256;VDB=0.00018837;SGB=-0.616816;RPBZ=-2.81289;MQBZ=0;MQSBZ=0;SCBZ=-2.52262;FS=0;MQ0F=0 PL:AD 159,0,2:2,6 +000000F 655 . CACAATACAA CACAA 0 . INDEL;IDV=6;IMF=0.285714;DP=21;I16=0,0,5,1,0,0,720,86400,0,0,360,21600,0,0,100,1788;QS=0,1;VDB=0.00211394;SGB=-0.616816;RPBZ=-2.81289;MQBZ=0;MQSBZ=0;SCBZ=-2.52262;FS=0;MQ0F=0 PL:AD 179,18,0:0,6 000000F 656 . A <*> 0 . DP=11;I16=4,7,0,0,404,15690,0,0,660,39600,0,0,141,2411,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,33,255:11,0 000000F 657 . C <*> 0 . DP=11;I16=4,7,0,0,413,15607,0,0,660,39600,0,0,131,2189,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,33,255:11,0 000000F 658 . A <*> 0 . DP=10;I16=3,7,0,0,121,1651,0,0,600,36000,0,0,122,1986,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,30,79:10,0 -000000F 658 . AA AAATTA 0 . INDEL;IDV=2;IMF=0.125;DP=16;I16=2,1,1,2,300,30000,300,30000,180,10800,180,10800,59,1205,50,902;QS=0.31694,0.68306;VDB=0.00155977;SGB=-0.511536;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;SCBZ=-1.35678;FS=0;MQ0F=0 PL:AD 99,0,38:3,3 +000000F 658 . AA AAATTA 0 . INDEL;IDV=2;IMF=0.125;DP=16;I16=2,1,3,2,300,30000,500,50000,180,10800,300,18000,59,1205,76,1240;QS=0.111111,0.888889;VDB=0.000108203;SGB=-0.590765;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;SCBZ=-1.35678;FS=0;MQ0F=0 PL:AD 124,1,0:3,5 000000F 659 . A <*> 0 . DP=10;I16=3,5,0,0,86,1088,0,0,480,28800,0,0,75,1077,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,24,63:8,0 000000F 660 . T <*> 0 . DP=2;I16=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;QS=0,0;FS=0;MQ0F=0 PL:AD 0,0,0:0,0 000000F 661 . A <*> 0 . DP=8;I16=0,2,0,0,8,32,0,0,120,7200,0,0,26,340,0,0;QS=1,0;FS=0;MQ0F=0 PL:AD 0,6,7:2,0 diff --git a/test/mpileup/indel-AD.2.out b/test/mpileup/indel-AD.2.out index d932c5bd9..9cde5d51e 100644 --- a/test/mpileup/indel-AD.2.out +++ b/test/mpileup/indel-AD.2.out @@ -21,4 +21,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 11 75 . G <*> 0 . DP=68;I16=6,62,0,0,2437,87909,0,0,3770,217210,0,0,838,15940,0,0;QS=1,0;MQSBZ=0.140975;FS=0;MQ0F=0 PL:AD 0,205,255:68,0 -11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=5,9,1,5,1680,201600,720,86400,840,50400,174,5046,244,5778,147,3609;QS=0.730233,0.269767;VDB=0.00674908;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 83,0,244:14,6 +11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=4,8,1,5,1440,172800,720,86400,720,43200,174,5046,244,5778,147,3609;QS=0.702055,0.297945;VDB=0.001602;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 88,0,221:12,6 diff --git a/test/mpileup/indel-AD.3.out b/test/mpileup/indel-AD.3.out index 2dd65ab61..c19a1febf 100644 --- a/test/mpileup/indel-AD.3.out +++ b/test/mpileup/indel-AD.3.out @@ -21,4 +21,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 11 75 . G <*> 0 . DP=68;I16=6,62,0,0,2437,87909,0,0,3770,217210,0,0,838,15940,0,0;QS=1,0;MQSBZ=0.140975;FS=0;MQ0F=0 PL:AD 0,205,255:68,0 -11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=5,9,1,5,1680,201600,720,86400,840,50400,174,5046,244,5778,147,3609;QS=0.730233,0.269767;VDB=0.00674908;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 83,0,244:45,23 +11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=4,8,1,5,1440,172800,720,86400,720,43200,174,5046,244,5778,147,3609;QS=0.702055,0.297945;VDB=0.001602;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 88,0,221:43,25 diff --git a/test/mpileup/indel-AD.4.out b/test/mpileup/indel-AD.4.out index 21fc59740..d118a5daf 100644 --- a/test/mpileup/indel-AD.4.out +++ b/test/mpileup/indel-AD.4.out @@ -21,4 +21,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 11 75 . G <*> 0 . DP=68;I16=6,62,0,0,2437,87909,0,0,3770,217210,0,0,838,15940,0,0;QS=1,0;MQSBZ=0.140975;FS=0;MQ0F=0 PL:AD 0,205,255:68,0 -11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=5,9,1,5,1680,201600,720,86400,840,50400,174,5046,244,5778,147,3609;QS=0.730233,0.269767;VDB=0.00674908;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 83,0,244:62,6 +11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=4,8,1,5,1440,172800,720,86400,720,43200,174,5046,244,5778,147,3609;QS=0.702055,0.297945;VDB=0.001602;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 88,0,221:62,6 diff --git a/test/mpileup/mpileup.2.out b/test/mpileup/mpileup.2.out index 649046db6..1b363cf30 100644 --- a/test/mpileup/mpileup.2.out +++ b/test/mpileup/mpileup.2.out @@ -224,7 +224,7 @@ 17 300 . A <*> 0 . DP=27;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,204:8:0 0,21,210:7:0 17 301 . G <*> 0 . DP=25;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV 0,30,255:10:0 0,21,195:7:0 0,21,196:7:0 17 302 . T <*> 0 . DP=25;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV 0,30,231:10:0 0,21,172:7:0 0,21,202:7:0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 161,0,99:11:6 158,0,14:7:6 201,21,0:7:7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 157,0,98:11:6 158,0,14:7:6 201,21,0:7:7 17 303 . G <*> 0 . DP=25;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,21,197:7:0 0,21,195:7:0 17 304 . C <*> 0 . DP=27;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,206:8:0 0,24,200:8:0 17 305 . C <*> 0 . DP=27;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,213:8:0 0,24,211:8:0 diff --git a/test/mpileup/mpileup.4.out b/test/mpileup/mpileup.4.out index d7366fae6..c6ece56bf 100644 --- a/test/mpileup/mpileup.4.out +++ b/test/mpileup/mpileup.4.out @@ -228,7 +228,7 @@ 17 300 . A <*> 0 . DP=27;DPR=26,0;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:6,5,0,0:11,0 0,24,204:8:0:0:3,5,0,0:8,0 0,21,210:7:0:0:2,5,0,0:7,0 17 301 . G <*> 0 . DP=25;DPR=24,0;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,255:10:0:0:5,5,0,0:10,0 0,21,195:7:0:0:3,4,0,0:7,0 0,21,196:7:0:0:2,5,0,0:7,0 17 302 . T <*> 0 . DP=25;DPR=24,0;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,231:10:0:0:5,5,0,0:10,0 0,21,172:7:0:0:3,4,0,0:7,0 0,21,202:7:0:0:2,5,0,0:7,0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;DPR=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 161,0,99:11:6:6:1,4,4,2:5,6 158,0,14:7:6:0:1,0,2,4:1,6 201,21,0:7:7:0:0,0,2,5:0,7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;DPR=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 157,0,98:11:6:6:1,4,4,2:5,6 158,0,14:7:6:0:1,0,2,4:1,6 201,21,0:7:7:0:0,0,2,5:0,7 17 303 . G <*> 0 . DP=25;DPR=25,0;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,21,197:7:0:0:3,4,0,0:7,0 0,21,195:7:0:0:2,5,0,0:7,0 17 304 . C <*> 0 . DP=27;DPR=27,0;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,24,206:8:0:0:4,4,0,0:8,0 0,24,200:8:0:0:2,6,0,0:8,0 17 305 . C <*> 0 . DP=27;DPR=27,0;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,24,213:8:0:0:4,4,0,0:8,0 0,24,211:8:0:0:2,6,0,0:8,0 diff --git a/test/mpileup/mpileup.5.out b/test/mpileup/mpileup.5.out index c72170dbd..908c02be5 100644 --- a/test/mpileup/mpileup.5.out +++ b/test/mpileup/mpileup.5.out @@ -230,7 +230,7 @@ 17 300 . A <*> 0 . DP=27;ADF=11,0;ADR=15,0;AD=26,0;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:6,0:5,0:11,0 0,24,204:8:0:3,0:5,0:8,0 0,21,210:7:0:2,0:5,0:7,0 17 301 . G <*> 0 . DP=25;ADF=10,0;ADR=14,0;AD=24,0;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,30,255:10:0:5,0:5,0:10,0 0,21,195:7:0:3,0:4,0:7,0 0,21,196:7:0:2,0:5,0:7,0 17 302 . T <*> 0 . DP=25;ADF=10,0;ADR=14,0;AD=24,0;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,30,231:10:0:5,0:5,0:10,0 0,21,172:7:0:3,0:4,0:7,0 0,21,202:7:0:2,0:5,0:7,0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;ADF=2,8;ADR=4,11;AD=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 161,0,99:11:6:1,4:4,2:5,6 158,0,14:7:0:1,2:0,4:1,6 201,21,0:7:0:0,2:0,5:0,7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;ADF=2,8;ADR=4,11;AD=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 157,0,98:11:6:1,4:4,2:5,6 158,0,14:7:0:1,2:0,4:1,6 201,21,0:7:0:0,2:0,5:0,7 17 303 . G <*> 0 . DP=25;ADF=10,0;ADR=15,0;AD=25,0;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,21,197:7:0:3,0:4,0:7,0 0,21,195:7:0:2,0:5,0:7,0 17 304 . C <*> 0 . DP=27;ADF=11,0;ADR=16,0;AD=27,0;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,24,206:8:0:4,0:4,0:8,0 0,24,200:8:0:2,0:6,0:8,0 17 305 . C <*> 0 . DP=27;ADF=11,0;ADR=16,0;AD=27,0;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,24,213:8:0:4,0:4,0:8,0 0,24,211:8:0:2,0:6,0:8,0 diff --git a/test/mpileup/mpileup.6.out b/test/mpileup/mpileup.6.out index 4337f6de6..28d9abfb8 100644 --- a/test/mpileup/mpileup.6.out +++ b/test/mpileup/mpileup.6.out @@ -62,7 +62,7 @@ 17 283 . C <*> . . END=296;MinDP=5;QS=3,0 PL:DP 0,33,240:11 0,18,119:6 0,15,122:5 17 297 . C G,<*> 0 . DP=25;I16=9,15,1,0,901,34305,4,16,1138,59338,60,3600,445,9901,10,100;QS=2.98261,0.0173913,0;SGB=-0.556633;RPBZ=-1.24856;MQBZ=0.806872;MQSBZ=-3.22749;BQBZ=-1.67542;SCBZ=-0.368383;FS=0;MQ0F=0 PL:DP:DV 0,33,255,33,255,255:11:0 0,15,168,21,171,168:8:1 0,18,161,18,161,161:6:0 17 298 . A <*> . . END=301;MinDP=7;QS=3,0 PL:DP 0,30,231:10 0,21,172:7 0,21,189:7 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 161,0,99:11:6 158,0,14:7:6 201,21,0:7:7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 157,0,98:11:6 158,0,14:7:6 201,21,0:7:7 17 303 . G <*> . . END=334;MinDP=7;QS=3,0 PL:DP 0,30,235:10 0,21,197:7 0,21,195:7 17 335 . A G,<*> 0 . DP=32;I16=13,18,1,0,1084,40336,4,16,1589,87297,60,3600,555,11943,0,0;QS=2.98919,0.0108108,0;SGB=-0.556633;RPBZ=-1.67921;MQBZ=0.622171;MQSBZ=-2.25492;BQBZ=-1.68602;SCBZ=-0.258065;FS=0;MQ0F=0 PL:DP:DV 0,33,252,33,252,252:11:0 0,27,219,27,219,219:9:0 0,25,245,33,248,245:12:1 17 336 . A <*> . . MinDP=9;QS=3,0 PL:DP 0,33,255:11 0,27,212:9 0,36,255:12