From ffc34240764f76d8d2f69b796aedd6f461bbbf28 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 Nov 2023 12:33:23 +0000 Subject: [PATCH 1/8] Add a new indel caller. This is a combination of PR #1679 from 2022 coupled with more recent changes to replace BAQ with edlib for evaluating the alignment of reads against candidate alleles. Due to the complexity of rebasing many commits with many conflicts, these have been squashed together for simplicity. Please consult the original commit log in the PR for more detailed history. PR 1679 is a major restructuring of the indel caller that produces proper consensus sequences. TODO: paste collected commit messages here. The newer edlib changes are designed to dramatically increase the speed of indel calling on long-read data by use of a modern algorithm. Note edlib's alignments are very crude, with just +1 for all differences including substitutions and each indel base. However if we've computed our candidate allele consensus sequences correctly then aligning against the real allele should give a minimal score regardless of scoring regime, so the cost of an HMM or even affine weights does not help much. The lack of quality values is compensated for by detection of minimum quality within STRs. 
--- LICENSE | 25 + Makefile | 7 +- bam2bcf.c | 52 +- bam2bcf.h | 27 +- bam2bcf_edlib.c | 1768 +++++++++++++++++++++++++++++++++++++++++++++++ bam2bcf_indel.c | 13 +- edlib.c | 1547 +++++++++++++++++++++++++++++++++++++++++ edlib.h | 277 ++++++++ mpileup.c | 87 ++- str_finder.c | 98 ++- str_finder.h | 5 +- 11 files changed, 3874 insertions(+), 32 deletions(-) create mode 100644 bam2bcf_edlib.c create mode 100644 edlib.c create mode 100644 edlib.h diff --git a/LICENSE b/LICENSE index 586dafae5..dbe9739ea 100644 --- a/LICENSE +++ b/LICENSE @@ -772,3 +772,28 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +----------------------------------------------------------------------------- + +License for edlib.[ch] + +The MIT License (MIT) + +Copyright (c) 2014 Martin Šošić + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile index 3ee1772aa..79647fff3 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,10 @@ OBJS = main.o vcfindex.o tabix.o \ vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \ vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ - mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \ + mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o bam2bcf_edlib.o \ + read_consensus.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ - ccall.o em.o prob1.o kmin.o str_finder.o gff.o + ccall.o em.o prob1.o kmin.o str_finder.o gff.o edlib.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -234,6 +235,7 @@ vcfbuf_h = vcfbuf.h $(htslib_vcf_h) abuf_h = abuf.h $(htslib_vcf_h) dbuf_h = dbuf.h $(htslib_vcf_h) bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h) +edlib.h = edlib.h bam_sample_h = bam_sample.h $(htslib_sam_h) cigar_state_h = cigar_state.h $(htslib_hts_h) $(htslib_sam_h) read_consensus_h = read_consensus.h $(htslib_hts_h) $(htslib_sam_h) @@ -285,6 +287,7 @@ mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(hts bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) $(str_finder_h) bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) +bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) 
$(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) $(edlib.h) read_consensus.o: read_consensus.c $(read_consensus_h) $(cigar_state_h) $(bcftools_h) kheap.h bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h) version.o: version.h version.c diff --git a/bam2bcf.c b/bam2bcf.c index 88e25de1f..402d42687 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -268,6 +268,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } + // Detect if indel occurs anywhere in this sample + int indel_in_sample = 0; + if (bca->edlib) { + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel) indel_in_sample = 1; + } + } + // fill the bases array double nqual_over_60 = bca->nqual / 60.0; int ADR_ref_missed[4] = {0}; @@ -298,7 +307,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t b = p->aux>>16&0x3f; // indel type seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias - if ( !bca->indels_v20 ) + if (bca->edlib) { + if (indel_in_sample) { + seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias + } else { + // An indel in another sample, but not this. So just use + // basic sequence confidences. + q = bam_get_qual(p->b)[p->qpos]; + if (q > bca->max_baseQ) q = bca->max_baseQ; + seqQ = 99; + } + } + + if ( !bca->indels_v20 && !bca->edlib ) { /* This heuristics was introduced by e4e161068 and claims to fix #1446. 
However, we obtain @@ -341,6 +362,25 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } continue; } + + // FIXME: CHECK if this is still needed with edlib mode + // It's a slight variant on the one above guarded by --indels-2.0 + if (bca->edlib) { + if (indel_in_sample && p->indel == 0 && (q < _n/2 || _n > 20)) { + // high quality indel calls without p->indel set aren't + // particularly indicative of being a good REF match either, + // at least not in low coverage. So require solid coverage + // before we start utilising such quals. + if (b != 0) + b = 5; + q = (int)bam_get_qual(p->b)[p->qpos]; + seqQ = (3*seqQ + 2*q)/8; + } + if (_n > 20 && seqQ > 40) seqQ = 40; + } + + // Note baseQ changes some output fields such as I16, but has no + // significant affect on "call". baseQ = p->aux>>8&0xff; } else @@ -478,9 +518,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp); } + // Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed + // ratios. This is detrimental on Illumina, but beneficial on PacBio CCS. + // It's possibly related to the homopolyer error likelihoods or overall + // Indel accuracy. Maybe tie this in to the -h option? + r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype + + // TODO: account for the number of unassigned reads. If depth is 50, + // but AD is 5,7 then it may look like a variant but it probably + // should be low quality. 
+ return n; } diff --git a/bam2bcf.h b/bam2bcf.h index 955c022bf..11413ee83 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -122,7 +122,7 @@ typedef struct __bcf_callaux_t { // for internal uses int max_bases; int indel_types[4]; // indel lengths - int indel_win_size, indels_v20; + int indel_win_size, indels_v20, edlib; int maxins, indelreg; int read_len; char *inscns; @@ -130,6 +130,7 @@ typedef struct __bcf_callaux_t { errmod_t *e; void *rghash; float indel_bias; // adjusts indel score threshold; lower => call more. + float del_bias; // (-.9 < x < .9) error profile; >0 => more del, <0 => more ins int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm} unsigned int nnm[2]; // number of nm observations float nm[2]; // cumulative count of mismatches in ref and alt reads @@ -193,11 +194,35 @@ extern "C" { const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); + int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, + bcf_callaux_t *bca, const char *ref, int ref_len); + void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); int bcf_cgp_l_run(const char *ref, int pos); int est_indelreg(int pos, const char *ref, int l, char *ins4); +/* ---------------------------------------------------------------------- + * Shared between bam2bcf_indel.c and bam2bcf_edlib.c + */ + +// Take a reference position tpos and convert to a query position (returned). +// This uses the CIGAR string plus alignment c->pos to do the mapping. +// +// *_tpos is returned as tpos if query overlaps tpos, but for deletions +// it'll be either the start (is_left) or end (!is_left) ref position. 
+int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos); + +// Identify spft-clip length, position in seq, and clipped seq len +void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, + int *sc_len_r, int *slen_r, int *epos_r, int *end); + +// Compute the consensus for this sample 's', minus indels which +// get added later. +char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s); + #ifdef __cplusplus } #endif diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c new file mode 100644 index 000000000..b39310ddc --- /dev/null +++ b/bam2bcf_edlib.c @@ -0,0 +1,1768 @@ +/* bam2bcf_indel.c -- indel caller. + + Copyright (C) 2010, 2011 Broad Institute. + Copyright (C) 2012-2014,2016-2017, 2021-2023 Genome Research Ltd. + + Author: Heng Li + Petr Danecek + James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +//#define CONS_DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include "bam2bcf.h" +#include "str_finder.h" + +#include +// Is there no way to share these between the 3 implementations? +KSORT_INIT_STATIC_GENERIC(uint32_t) + +#define MINUS_CONST 0x10000000 + +#define MAX_TYPES 64 + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef ABS +# define ABS(a) ((a)<0?-(a):(a)) +#endif + +#ifndef MAX +# define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +// l is the relative gap length and l_run is the length of the homopolymer +// on the reference. +// +// Larger seqQ is good, so increasing tandemQ calls more indels, +// and longer l_run means fewer calls. It is capped later at 255. +// For short l_runs, the qual is simply based on size of indel +// larger ones being considered more likely to be real. +// Longer indels get assigned a score based on the relative indel size +// to homopolymer, where l_run base will have already been verified by +// the caller to ensure it's compatible. +static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run, int str_len) +{ + int q, qh; + // Short indels are more likely sequencing error than large ones. + // So "seqQ" scales with size of observation "l". + // + // Note openQ and extQ are error likelihoods in Phred scale. Hence high + // openQ means we're very unlikely to miscall an indel. + // Ie it's not the open/ext "costs" normally used in alignment; more the reverse. + // + // We use MIN(q,qh) below, so we can remove the q component by specifying + // a large -o parameter in mpileup. + q = bca->openQ + bca->extQ * (abs(l) - 1); + + // Orig method; best with Illumina (high openQ) +// qh = bca->tandemQ * (double)abs(l) / l_run + .499; + + // Penalise longer homopolymers quadratically more, but boost shorter ones. + // Best with CCS (low openQ) + //qh = 2 * bca->tandemQ * pow((double)abs(l) / l_run, 1.5) + .499; + + // (l/l_run)^1.26 for openQ=25 or ^1 for openQ=40. 
+// double openQ = MIN(40, bca->openQ); +// qh = (30/openQ) * bca->tandemQ +// * pow((double)abs(l) / l_run, 1/sqrt(openQ/40)) + .499; + + // Linear scaled on openQ too + qh = bca->tandemQ * (double)abs(l) / l_run + .499; + + // Generic maybe ? + // power = 1/sqrt(MIN(40,bca->openQ)/40.); + // qh = ... * pow((double)abs(l)/l_run, power) + + // bam2bcf.c caps has "if q>seqQ) q=seqQ" so it caps base qual 'q'. + // A 1bp indel would therefore have a maximum qual it could be considered based + // on open+ext. Hence why openQ is phred score indicating if the base is real + // or an over/under-call. (high openQ means high trust in base) + return q < qh? q : qh; +} + +// Part of bcf_call_gap_prep. +// +// Scans the pileup to identify all the different sizes of indels +// present. +// types[] returned is sorted by size, from smallest (maybe negative) to largest. +// +// Returns types and fills out n_types_r, max_rd_len_r and ref_type_r, +// or NULL on error. +static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int *max_rd_len_r, int *n_types_r, + int *ref_type_r, int *N_r) { + int i, j, t, s, N, m, max_rd_len, n_types; + int n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + int *types; + + // N is the total number of reads + for (s = N = 0; s < n; ++s) + N += n_plp[s]; + + bca->max_support = bca->max_frac = 0; + aux = (uint32_t*) calloc(N + 1, 4); + if (!aux) + return NULL; + + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type (REF) + + // Fill out aux[] array with all the non-zero indel sizes. + // Also tally number with indels (n_alt) and total (n_tot). + for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + + // FIXME: cache me in pileup struct. 
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + double frac = (double)na/nt; + if ( !indel_support_ok && na >= bca->min_support + && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) + bca->max_support = na, bca->max_frac = frac; + + n_alt += na; + n_tot += nt; + } + + // Sort aux[] and dedup + ks_introsort(uint32_t, m, aux); + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + + // Taking totals makes it hard to call rare indels (IMF filter) + if ( !bca->per_sample_flt ) + indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac + || n_alt < bca->min_support ) + ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); + return NULL; + } + + // Bail out if we have far too many types of indel + if (n_types >= MAX_TYPES) { + free(aux); + // TODO revisit how/whether to control printing this warning + if (hts_verbose >= 2) + fprintf(stderr, "[%s] excessive INDEL alleles at position %d. " + "Skip the position.\n", __func__, pos + 1); + return NULL; + } + + // To prevent long stretches of N's to be mistaken for indels + // (sometimes thousands of bases), check the number of N's in the + // sequence and skip places where half or more reference bases are Ns. + int nN=0, i_end = pos + (2*bca->indel_win_size < max_rd_len + ?2*bca->indel_win_size : max_rd_len); + for (i=pos; i(i-pos) ) { + free(aux); + return NULL; + } + + // Finally fill out the types[] array detailing the size of insertion + // or deletion. 
+ types = (int*)calloc(n_types, sizeof(int)); + if (!types) { + free(aux); + return NULL; + } + t = 0; + for (i = 0; i < m; ++i) { + int sz = (int32_t)(aux[i] - MINUS_CONST); + int j; + for (j = i+1; j < m; j++) + if (aux[j] != aux[i]) + break; + + if (sz == 0 + || (j-i >= bca->min_support && + // Note, doesn't handle bca->per_sample_flt yet + (bca->per_sample_flt + || (double)(j-i) / n_tot >= bca->min_frac))) + types[t++] = sz; + i = j-1; + } + free(aux); + + if (t <= 1) { + free(types); + return NULL; + } + n_types = t; + + // Find reference type; types[?] == 0) + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + + *ref_type_r = t; + *n_types_r = n_types; + *max_rd_len_r = max_rd_len; + *N_r = N; + + return types; +} + +// Increment ins["str"] and freq["str"] +#define NI 100 // number of alternative insertion sequences +// Could use a hash table too, but expectation is a tiny number of alternatives +typedef struct { + char *str[NI]; + int len[NI]; + int freq[NI]; +} str_freq; + +static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { + int j; + + for (j = 0; j < NI && sf->str[j]; j++) { + if (sf->len[j] == len && memcmp(sf->str[j], str, len) == 0) + break; + } + if (j >= NI) + return 0; // too many choices; discard + + sf->freq[j]+=freq; + if (!sf->str[j]) { + // new insertion + if (!(sf->str[j] = malloc(len+1))) + return -1; + memcpy(sf->str[j], str, len); + sf->len[j] = len; + } + + return 0; +} + +/* + * Compute the consensus for a specific indel type at pos. + * + * left_shift is the number of inserted(+) or deleted(-) bases added to + * the consensus before we get to pos. 
This is necessary so the alignment + * band is correct as it's expected to start at left/right edges in + * sync + * + * We accumulate into several buffers for counting base types: + * cons_base - consensus of data with p->indel == type, bases or gap + * ref_base - consensus of data with p->indel != type, bases or gap + * cons_ins - consensus of data with p->indel == type, insertions + * ref_ins - consensus of data with p->indel != type, insertions + * + * The purpose of the ref_* buffers alongside the cons_* ones is that if + * we have very low coverage due to nearly all reads being another type, + * then we can still get a robust consensus using the other data. If we + * don't have shallow data, then we'll not use as much of ref_base as we + * may have correlated variants. + * + * Eg: + * REF: AGCTATGAGGCTGATA + * SEQ: AGGTAGGAGGGTGATA (x1) + * SEQ: AGCTACGAGG*TGATA (x24) + * SEQ: AGCTACTAGG*TGATA (x24) + * + * Cons for no-del is Cs not Gs. Cannot trust it, so use N if shallow. + * CON: AGCTACNAGGGTGATA + * + * There are still some problems in cons_ins vs ref_ins assignment. + * We sometimes see multiple similar-length insertions added at + * different locations. Ideally we'd like to consider these as all + * the same insertion if the size is the same and it's comparable seq. 
+ */ +static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + int ref_len, int left, int right, + int sample, int type, int biggest_del, + int *left_shift, int *right_shift, + int *band, int *tcon_len, int *cpos_pos) { + // Map ASCII ACGTN* to 012345 + static uint8_t base6[256] = { + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,5,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + //A C G *^ T + 4,0,4,1,4,4,4,2, 4,4,4,4,4,4,4,4, 4,4,4,4,3,3,4,4, 4,4,4,4,4,4,4,4, + 4,0,4,1,4,4,4,2, 4,4,4,4,4,4,4,4, 4,4,4,4,3,3,4,4, 4,4,4,4,4,4,4,4, + + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + }; + + // single base or del + int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base)); + // multi-base insertions + str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); + + // non-indel ref for all reads on this sample, rather than those just + // matching type. We use this for handling the case where we have a + // homozygous deletion being studied, but with 1 or 2 reads misaligned + // and containing a base there. + // + // Eg if the type[]=0 consensus is made up of a very small sample size, + // which is also enriched for highly error prone data. We can use + // the other reads from type[] != 0 to flesh out the consensus and + // improve accuracy. 
+ int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base)); + str_freq *ref_ins = calloc(right - left + 1, sizeof(*ref_ins)); + int i, j, k, s = sample; + char **cons = NULL; + + if (!cons_base || !cons_ins || !ref_base || !ref_ins) + goto err; + + //-------------------------------------------------- + // Accumulate sequences into cons_base and cons_ins arrays + int local_band_max = 0; // maximum absolute deviation from diagonal + for (i = 0; i < n_plp[s]; i++) { + const bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + int x = b->core.pos; // ref coordinate + int y = 0; // seq coordinate + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + + int local_band = 0; // current deviation from diagonal + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int len = cigar[k] >> BAM_CIGAR_SHIFT; + int base; + int skip_to = 0; + + switch(op) { + case BAM_CSOFT_CLIP: + y += len; + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: { + // Can short-cut this with j_start and j_end based on + // x+len and left,right + for (j = 0; j < len; j++, x++, y++) { + if (x < left) continue; + if (x >= right) break; + + base = bam_seqi(seq, y); + if (p->indel == type) + // Convert 4-bit base ambig code to 0,1,2,3,4 range + cons_base[x-left][seq_nt16_int[base]]++; + else if (x != pos+1) // indel being assessed question + ref_base[x-left][seq_nt16_int[base]]++; + } + break; + } + + case BAM_CINS: { + if (x >= left && x < right) { + local_band += p->indel; + if (local_band_max < local_band) + local_band_max = local_band; + } + + char ins[1024]; + for (j = 0; j < len; j++, y++) { + if (x < left) continue; + if (x >= right) + break; + base = bam_seqi(seq, y); + if (j < 1024) + ins[j] = seq_nt16_int[base]; + } + + // Insertions come before a ref match. + // 5I 5M is IIIIIM M M M M events, not + // {IIIII,M} M M M M choice. So we need to include the + // next match in our sequence when choosing the consensus. 
+ if (x >= left && x < right) { + int ilen = j<1024?j:1024; + if (p->indel == type /*&& x == pos+1*/) { + // Assume any ins of the same size is the same ins. + // (This rescues misaligned insertions.) + if (bcf_cgp_append_cons(&cons_ins[x-left], ins, + ilen, 1) < 0) + goto err; + } else if (x != pos+1){ + if (bcf_cgp_append_cons(&ref_ins[x-left], ins, + ilen, 1) < 0) + goto err; + } + } + break; + } + + case BAM_CDEL: + if (x >= left && x < right) { + local_band += p->indel; + if (local_band_max < -local_band) + local_band_max = -local_band; + } + + // Maybe not perfect for I/D combos, but likely sufficient. + for (j = 0; j < len; j++, x++) { + if (x < left) continue; + if (x >= right) break; + if ((p->indel == type && !p->is_del) || // starts here + (p->indel == 0 && p->is_del && len == -type)) // left + cons_base[x-left][5]++; + else if (x+len <= pos+1 || (skip_to && x > skip_to)) + ref_base[x-left][5]++; + else if (x <= pos && x+len > pos+1) { + // we have a deletion which overlaps pos, but + // isn't the same "type". We don't wish to + // include these as they may bias the + // evaluation by confirming against a + // secondary consensus produced with the other + // deletion. We set a marker for how long to + // skip adding to ref_base. + if (x > skip_to) + skip_to = x+len; + } + } + break; + } + } + + // Also track the biggest deviation +/- from diagonal. We use + // this band observation in our BAQ alignment step. + if (*band < local_band_max) + *band = local_band_max; + } + + //-------------------------------------------------- + // Expand cons_base to include depth from ref_base/ref_ins + // Caveat: except at pos itself, where true ref is used if type != 0 + + // Note this harms PB-CCS test at chr1:10171880. + // We could retest this heuristic further maybe. 
+ for (i = 0; i < right-left; i++) { + // Total observed depth + int t = cons_base[i][0] + cons_base[i][1] + cons_base[i][2] + + cons_base[i][3] + cons_base[i][4] + cons_base[i][5]; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + t += cons_ins[i].freq[j]; + } + + // Similarly for depth on the non-ALT calls (NB: not necessarily + // REF as maybe it's other ALTs). + int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] + + ref_base[i][3] + ref_base[i][4] + ref_base[i][5]; + for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + r += ref_ins[i].freq[j]; + } + + // When evaluating this particular indel, we don't want to + // penalise alignments by SNP errors elsewhere. This can + // happen when we have low depth for a particular 'type'. + // + // So add in a little data from ref_base/ref_ins. + double rfract = (r - t*2)*.75 / (r+1); + + if (rfract < 1.01 / (r+1e-10)) + rfract = 1.01 / (r+1e-10); // low depth compensation + + // TODO: consider limiting rfract so we never drown out the + // signal. We want to use the remaining data only to correct + // for sequencing errors in low depth alleles. If we get + // conflicts, it's better to use N than to change a base + // incase that variant is genuine. + if (i+left >= pos+1 && i+left < pos+1-biggest_del) { + // We're overlapping the current indel region, so + // we don't wish to bring in evidence from the other + // "type" data as it'll harm calling. + continue; + } else { + // Otherwise add in a portion of other data to + // boost low population numbers. + cons_base[i][0] += rfract * ref_base[i][0]; + cons_base[i][1] += rfract * ref_base[i][1]; + cons_base[i][2] += rfract * ref_base[i][2]; + cons_base[i][3] += rfract * ref_base[i][3]; + cons_base[i][4] += rfract * ref_base[i][4]; + cons_base[i][5] += rfract * ref_base[i][5]; + } + + // Similarly for insertions too; consider a different rfract here? 
+ for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + if (bcf_cgp_append_cons(&cons_ins[i], + ref_ins[i].str[j], ref_ins[i].len[j], + rfract * ref_ins[i].freq[j]) < 0) + goto err; + } + } + + //-------------------------------------------------- + // Allocate consensus buffer, to worst case length + int max_len = right-left; + for (i = 0; i < right-left; i++) { + if (!cons_ins[i].str[0]) + continue; + + int ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].str[j] && ins < cons_ins[i].len[j]) + ins = cons_ins[i].len[j]; + } + max_len += ins; + } + cons = malloc((max_len+1)*2 + sizeof(char *)*2); + if (!cons) + goto err; + cons[0] = (char *)&cons[2]; + cons[1] = cons[0] + max_len+1; + + //-------------------------------------------------- + // Merge insertions where they are the same length but different + // sequences. + // NB: we could just index by length and have accumulators for each, + // instead of storing separately and merging later (here). + // Ie str_freq.str is [NI][5] instead. + for (i = 0; i < right-left; i++) { + int ins[1024][5]; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + + if (cons_ins[i].freq[j] == 0) + continue; // already merged + + int l; + for (l = 0; l < cons_ins[i].len[j]; l++) { + // Append to relevant frequency counter, zero all others + ins[l][0] = ins[l][1] = ins[l][2] = ins[l][3] = ins[l][4] = 0; + uint8_t b = cons_ins[i].str[j][l]; + ins[l][b] = cons_ins[i].freq[j]; + } + + // Merge other insertions of the same length to ins[] counters + for (k = j+1; k < NI; k++) { + if (!cons_ins[i].str[k]) + break; + if (cons_ins[i].len[k] != cons_ins[i].len[j]) + continue; + if (cons_ins[i].freq[k] == 0) + continue; // redundant? 
+ + // Merge str[j] and str[k] + for (l = 0; l < cons_ins[i].len[k]; l++) { + uint8_t b = cons_ins[i].str[k][l]; + ins[l][b] += cons_ins[i].freq[k]; + } + cons_ins[i].freq[j] += cons_ins[i].freq[k]; + cons_ins[i].freq[k] = 0; + } + + // Now replace ins[j] with the consensus insertion of this len. + for (l = 0; l < cons_ins[i].len[j]; l++) { + int max_v = 0, base = 0; + int tot = ins[l][0] + ins[l][1] + ins[l][2] + + ins[l][3] + ins[l][4]; + if (max_v < ins[l][0]) max_v = ins[l][0], base = 0; + if (max_v < ins[l][1]) max_v = ins[l][1], base = 1; + if (max_v < ins[l][2]) max_v = ins[l][2], base = 2; + if (max_v < ins[l][3]) max_v = ins[l][3], base = 3; + if (max_v < ins[l][4]) max_v = ins[l][4], base = 4; + + cons_ins[i].str[j][l] = (max_v > 0.6*tot) ? base : 4; + } + } + } + +#define CONS_CUTOFF .40 // 40% needed for base vs N +#define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1] +#define CONS_CUTOFF_INC .40 // 40% to include any insertion cons[0] +#define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] HOM +#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N + + //-------------------------------------------------- + // Walk through the frequency arrays to call the consensus. + // We produce cons[0] and cons[1]. Both include strongly + // homozygous indels. Both also include the indel at 'pos'. + // However for heterozygous indels we call the most likely event + // for cons[0] and the less-likely alternative in cons[1]. + // TODO: a proper phase analysis so multiple events end up + // combining together into the correct consensus. + *left_shift = 0; + *right_shift = 0; + int cnum; + + // Het call filled out in cnum==0 (+ve or -ve). + // Used in cnum==1 to do the opposite of whichever way we did before. 
+ int heti[1024] = {0}, hetd[1024] = {0}; + + *cpos_pos = -1; + for (cnum = 0; cnum < 2; cnum++) { + for (i = k = 0; i < right-left; i++) { + // Location in consensus matching the indel itself + if (i >= pos-left+1 && *cpos_pos == -1) + *cpos_pos = k; + + int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0; + for (j = 0; j < 6; j++) { + // Top 2 consensus calls + if (max_v < cons_base[i][j]) { + max_v2 = max_v, max_j2 = max_j; + max_v = cons_base[i][j], max_j = j; + } else if (max_v2 < cons_base[i][j]) { + max_v2 = cons_base[i][j], max_j2 = j; + } + tot += cons_base[i][j]; + } + + // +INS + int max_v_ins = 0, max_j_ins = 0; + int tot_ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].freq[j] == 0) + continue; // previously merged + + if (max_v_ins < cons_ins[i].freq[j]) + //if (i != pos-left+1 || cons_ins[i].len[j] == type) + max_v_ins = cons_ins[i].freq[j], max_j_ins = j; + tot_ins += cons_ins[i].freq[j]; + } + + // NB: tot is based on next matching base, so it includes + // everything with or without the insertion. + int tot_sum = tot; + int always_ins = + (i == pos-left+1 && type>0) || // current eval + max_v_ins > CONS_CUTOFF_INC2*tot_sum;// HOM + int het_ins = 0; + if (!always_ins && max_v_ins >= bca->min_support) { + // Candidate HET ins. + if (cnum == 0) { + het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum; + if (i < 1024) heti[i] = het_ins + ? 1 + : (max_v_ins > .3*tot_sum ? -1:0); + } else { + // HET but uncalled before + het_ins = i < 1024 ? 
(heti[i] == -1) : 0; + } + } + + if (always_ins || het_ins) { + if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { + // Insert bases + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { + if (cnum == 0) { + if (k < pos-left+*left_shift) + (*left_shift)++; + else + (*right_shift)++; + } + cons[cnum][k++] = cons_ins[i].str[max_j_ins][j]; + } + } else { + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) + cons[cnum][k++] = 4; // 'N'; + } + } + + // Call deletions & bases + int always_del = (type < 0 && i > pos-left && i <= pos-left-type) + || cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del + int het_del = 0; + if (!always_del && cons_base[i][5] >= bca->min_support) { + // Candidate HET del. + if (cnum == 0) { + het_del = cons_base[i][5] >= CONS_CUTOFF * tot; + if (i < 1024) { + if (i > pos-left && i <= pos-left-biggest_del) + hetd[i] = 0; + else + hetd[i] = het_del + ? 1 + : (cons_base[i][5] >= .3 * tot ? -1 : 0); + } + } else { + // HET del uncalled on cnum 0 + het_del = i < 1024 ? (hetd[i] == -1) : 0; + if (max_j == 5 && het_del == 0) { + max_v = max_v2; + max_j = max_j2; + } + } + } + if (always_del || het_del) { + // Deletion + if (k < pos-left+*left_shift) + (*left_shift)--; + else + (*right_shift)++; + } else { + // Finally the easy case - a non-indel base or an N + if (max_v > CONS_CUTOFF*tot) + cons[cnum][k++] = max_j; // "ACGTN*" + else if (max_v > 0) + cons[cnum][k++] = 4; // 'N'; + else { + cons[cnum][k] = left+k < ref_len + ? base6[(uint8_t)ref[left+k]] + : 4; + k++; + } + } + } + + tcon_len[cnum] = k; + } + + // TODO: replace by io_lib's string pool for rapid tidying. + // For now this isn't the bottleneck though. 
+ for (i = 0; i < right-left; i++) { + for (j = 0; j < NI; j++) { + if (cons_ins[i].str[j]) + free(cons_ins[i].str[j]); + if (ref_ins[i].str[j]) + free(ref_ins[i].str[j]); + } + } + + err: + free(cons_base); + free(ref_base); + free(cons_ins); + free(ref_ins); + + return cons; +} + +// A rename of bcf_cgp_calc_cons from bam2bcf_indel.c +// +// Compute the insertion consensus for this sample 's' via a basic +// majority rule. +// +// TODO: merge this into bcf_cgp_consensus as another return value? +static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { + return bcf_cgp_calc_cons(n, n_plp, plp, pos, types, n_types, max_ins, s); +} + +#define MAX(a,b) ((a)>(b)?(a):(b)) +#define MIN(a,b) ((a)<(b)?(a):(b)) + +// Compile with LIBS="-L. -ldl -ledlib" CLD=g++ + +// This is faster than ksw and BAQ, meaning we can use larger --indel-size and +// get a more accurate context, improving alignments further. This *may* +// compensate for reduced sensitivity. 
+#include "edlib.h" +int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, + double m, double del_bias) +{ + EdlibAlignConfig cfg = + edlibNewAlignConfig( + //ABS(type)+ABS(l_ref-l_query)+10, + -1, // k; use small positive for faster alignment + EDLIB_MODE_HW, // mode + EDLIB_TASK_LOC, // task + //EDLIB_TASK_PATH, // for manual alignment scoring + NULL, // additionalEqualities + 0); // additionalEqualitiesLength + EdlibAlignResult r = + edlibAlign((char *)query, l_query, (char *)ref, l_ref, cfg); + + if (r.status != EDLIB_STATUS_OK || r.numLocations < 1 || + !r.endLocations || !r.startLocations) { + edlibFreeAlignResult(r); + return INT_MAX; + } + + int score; +// score = m*r.editDistance; // Illumina: ie -0*(glen - l_query) + +#if 0 + // Alignment based score, scaled by average sequence quality + int i, indel=0; + for (i = score = 0; i < r.alignmentLength; i++) { + switch(r.alignment[i]) { + case 0: indel=0; break; // match + case 3: score++; indel=0; break; // mismatch + case 1: case 2: // indel + score+=indel?4:2; + indel=1; + break; + } + } + score *= m/2; +#elif 0 + // Alignment based score, using per-base sequence quality + int i, indel=0, qpos = 0; + for (i = score = 0; i < r.alignmentLength; i++) { + switch(r.alignment[i]) { + case 0: indel=0; qpos++; break; // match + case 3: score+=qq[qpos]/2; indel=0; qpos++; break; // mismatch + case 1: // ins + case 2: // del + score+=(indel?2:1)*qq[qpos]; + indel=1; + qpos += r.alignment[i]==2; + break; + } + } +#elif 0 + // BEST for PB + // + // Alignment based score, using per-base sequence quality. + // Eg params for PacBio CCS. + // This is *marginally* better than the naive t_len-l_query below, but it's + // 34% slower mpileup for CCS. Perhaps not worth the trade off? 
+ int i; + double fscore = 0; + for (i = score = 0; i < r.alignmentLength; i++) { + switch(r.alignment[i]) { + case 0: break; // match +// case 3: fscore+=0.5; break; // mismatch +// case 1: fscore+=1.0; break; // ins; higher qual +// case 2: fscore+=0.6; break; // del; more often an error + case 3: score+= 5; break; // mismatch + case 1: score+=10; break; // ins; higher qual + case 2: score+= 6; break; // del; more often an error + } + } + score *= m/10; +// score = m*fscore; +#elif 0 + // As above, but accounting for minimum quality in STR region instead. + // BAD + int i; + double fscore = 0; + for (i = score = 0; i < r.alignmentLength; i++) { + switch(r.alignment[i]) { + case 0: break; // match + case 3: fscore++; break; // mismatch + case 1: fscore+=m2min/m; break; // ins; higher qual + case 2: fscore+=0.6*m2min/m; break; // del; more often an error + } + } + score = fscore*m; +#elif 0 + // As above, but factoring in quality. + // BAD + int i, qpos = 0; + double fscore = 0; + for (i = score = 0; i < r.alignmentLength; i++) { + switch(r.alignment[i]) { + case 0: qpos++; break; // match + case 3: fscore+=qq[qpos++]; break; // mismatch + case 1: fscore+=qq[qpos++]; break; // ins; higher qual + //case 2: fscore+=0.6*m; break; // del; more often an error + case 2: fscore+=.6*qq[qpos];break; // del; more often an error + } + } + score = fscore; +#endif + + +// int nins = 0, ndel = 0, nmis = 0; +// for (i = score = 0; i < r.alignmentLength; i++) { +// switch (r.alignment[i]) { +// case 1: nins++; break; +// case 2: ndel++; break; +// case 3: nmis++; break; +// } +// } +// assert((*r.endLocations - *r.startLocations + 1) - l_query == ndel-nins); +// +// Then score = f(nins,ndel,nmis). +// Could also track nis_o,nins_e,ndel_o,ndel_e for open/extend. + +#if 1 + int t_len = *r.endLocations - *r.startLocations + 1; + + // Aligned target length minus query length is an indication of the number + // of insertions and/or deletions. 
+ // + // For CIGAR 10M1I10M t_len > l_query ("AC" / "ATC") + // For CIGAR 10M1D10M t_len < l_query ("ATC" / "AC") + // Hence t_len-l_query is -ve for net insertions and +ve for net deletions. + // If we compute nins and ndel directly via walking though EDLIB_TASK_PATH + // we'll see t_len-l_query == ndel-nins. + // + // If a technology has a significantly higher chance of making deletion + // errors than insertion errors, then we would view deletions as less + // indicative of this sequence not coming from this candidate allele than + // if it had insertion (as the deletions are more likely to be errors + // rather than real, relative to the insertions). Hence we can skew the + // score by the net delta of num_del - num_ins. + // + // Note this is an approximation that doesn't account for multiple + // insertions and deletions within the same sequence, but it is much faster + // as it doesn't require EDLIB_TASK_PATH to be computed. + // + // Given editDistance is +1 for every mismatch, insertion and deletion, + // provided the t_len-l_query multiplier < 1 then this is always +ve. + + score = m*(r.editDistance - del_bias*(t_len - l_query)); +#endif + +#if 0 + // DEBUG: dump out the sequence alignment + { + char rseq[1024], *rcp = rseq, qseq[1024], *qcp = qseq; + + int i, rpos = 0, qpos = 0; + for (i = 0; i < r.alignmentLength; i++) { + switch(r.alignment[i]) { + case 0: // match + case 3: // mismath + *rcp++ = "ACGTN"[ref[rpos++]]; + *qcp++ = "ACGTN"[query[qpos++]]; + break; + case 1: // ins + *rcp++ = '-'; + *qcp++ = "ACGTN"[query[qpos++]]; + break; + case 2: // del + *rcp++ = "ACGTN"[ref[rpos++]]; + *qcp++ = '-'; + break; + } + } + *rcp = 0; + *qcp = 0; + fprintf(stderr, "Ref %s\n", rseq); + fprintf(stderr, "Seq %s\n", qseq); + fprintf(stderr, "Score %d t-l %d\n", score, t_len - l_query); + } +#endif + + edlibFreeAlignResult(r); + return score; +} + +// Part of bcf_call_gap_prep. 
+// +// Realign using BAQ to get an alignment score of a single read vs +// a haplotype consensus. TODO: replace BAQ with something more robust. +// +// There are many coordinates, so let's explain them. +// - left, right, tbeg, tend, r_start and r_end are in aligned reference +// coordinates. +// left/right start from pos +/- indel_win_size. +// r_start/r_end are the BAM first and last mapped coord on the reference. +// tbeg and tend are the intersection of the two. +// - qbeg and qend are in BAM sequence coordinates +// - qpos is in sequence coordinates, relative to qbeg. +// +// To see what this means, we have illustrations with coordinates +// above the seqs in reference space and below the seqs in BAM seq space. +// +// Overlap left: +// tbeg tend +// r_start left pos r_end right +// REF :..............|--------------------#------:--------------|... +// SEQ :..............|--------------------#------| +// 0 qbeg qpos qend +// +// Overlap right: +// r_start tend +// left tbeg pos right r_end +// REF ...|--------------:-----#---------------------|...........: +// SEQ |-----#---------------------|...........: +// qbeg qpos qend +// 0 +// +// The "-" sequence is the bit passed in. +// Ie ref2 spans left..right and query spans qbeg..qend. +// We need to adjust ref2 therefore to tbeg..tend. +// +// Fills out score +// Returns 0 on success, +// <0 on error +static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, + int type, int band, + uint8_t *ref1, uint8_t *ref2, uint8_t *query, + int r_start, int r_end, int long_read, + int tbeg, int tend1, int tend2, + int left, int right, + int qbeg, int qend, + int pos, int qpos, int max_deletion, + double qavg, double del_bias, int *score, + int *str_len1_p, int *str_len2_p) { + int atype = abs(type); + int l, sc1, sc2; + const uint8_t *qual = bam_get_qual(p->b), *bq = NULL; + uint8_t *qq; + + // Trim poly_Ns at ends of ref. 
+ // This helps to keep len(ref) and len(query) similar, to reduce + // band size and reduce the chance of -ve BAQ scores. + for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++) + if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) + break; + if (l > atype) + tbeg += l-atype; + + for (l = tend1-tbeg-1; l >= 0; l--) + if (ref1[l + tbeg-left] != 4) + break; + l = tend1-tbeg-1 - l; + if (l > atype) + tend1 -= l-atype; + + for (l = tend2-tbeg-1; l >= 0; l--) + if (ref2[l + tbeg-left] != 4) + break; + l = tend2-tbeg-1 - l; + if (l > atype) { + tend2 -= l-atype; + } + + // Get segment of quality, either ZQ tag or if absent QUAL. + if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) + return -1; + //bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + //if (bq) ++bq; // skip type + double m = 0; + for (l = qbeg; l < qend; ++l) { + int qval = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qval > 30) + qval = 30; + if (qval < 7) + qval = 7; + qq[l - qbeg] = qval; + m += qval; + } + m /= (qend - qbeg); // avg qual + + // Identify STRs in ref covering the indel up to + // (or close to) the end of the sequence. + // Those having an indel and right at the sequence + // end do not confirm the total length of indel + // size. Specifically a *lack* of indel at the + // end, where we know indels occur in other + // sequences, is a possible reference bias. + // + // This is emphasised further if the sequence ends with + // soft clipping. + // FIXME: need to make this work on IUPAC? 
+ rep_ele *reps, *elt, *tmp; + uint8_t *seg = ref2 + tbeg - left; + int seg_len = tend2 - tbeg; + reps = find_STR((char *)seg, seg_len, 0); + int iscore = 0; + double m2 = 0; + int mn = 0, m2min = INT_MAX; + int str_len1 = *str_len1_p, str_len2 = *str_len2_p; + DL_FOREACH_SAFE(reps, elt, tmp) { + if (elt->start <= qpos && elt->end >= qpos) { + iscore += (elt->end-elt->start) / elt->rep_len; // c + if (str_len1 < elt->end-elt->start) + str_len1 = elt->end-elt->start; + if (str_len2 < (elt->end-elt->start) / elt->rep_len) + str_len2 = (elt->end-elt->start) / elt->rep_len; + for (l = MAX(qbeg, elt->start); + l < MIN(qend, elt->end); + l++, mn++) { + m2 += qq[l-qbeg]; + if (m2min > qq[l-qbeg]) + m2min = qq[l-qbeg]; + } + if (elt->start+tbeg <= r_start || + elt->end+tbeg >= r_end) + iscore += 2*(elt->end-elt->start); + } + + DL_DELETE(reps, elt); + free(elt); + } + *str_len1_p = str_len1; + *str_len2_p = str_len2; + if (mn) + m2 /= mn; + else + m2 = m2min = qavg; + + // The bottom 8 bits are length-normalised score while + // the top bits are unnormalised. + // + // Try original cons and new cons and pick best. + // This doesn't reduce FN much (infact maybe adds very slightly), + // but it does reduce GT errors and is a slight reduction to FP. 
+ //m = MIN(30, (m2+m2min)/2); // best so far + m = MIN(30, m2min); + +#if 1 + // edlib + + //double mm = (m+m2)/2; + //double mm = m2min; + double mm = m; + sc2 = edlib_glocal(ref2 + tbeg - left, tend2 - tbeg, + query, qend - qbeg, mm, del_bias); + + if (tend1 != tend2 || + memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, + tend1 - tbeg) != 0) + sc1 = edlib_glocal(ref1 + tbeg - left, tend1 - tbeg, + query, qend - qbeg, mm, del_bias); + else + sc1 = INT_MAX; // skip +#endif + +#if 0 + // BAQ + + int SC1, SC2; + probaln_par_t apf = { 1e-4, 1e-2, 10 }; + if (long_read) { + apf.d = 1e-3; + apf.e = 1e-1; + } + + if (band > (qend-qbeg)/2-3) + band = (qend-qbeg)/2-3; + apf.bw = band + 3; // or abs(l_ref - l_query), so we want to keep similar + + SC2 = probaln_glocal(ref2 + tbeg - left, tend2 - tbeg, + query, qend - qbeg, qq, &apf, 0, 0); + + if (tend1 != tend2 || + memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, + tend1 - tbeg) != 0) + SC1 = probaln_glocal(ref1 + tbeg - left, tend1 - tbeg, + query, qend - qbeg, qq, &apf, 0, 0); + else + SC1 = INT_MAX; // skip + + sc1 = SC1; + sc2 = SC2; +#endif + + // Find the best of the two alignments + if (sc1 < 0 && sc2 < 0) { + *score = 0xffffff; + free(qq); + return 0; + } + if (sc1 < 0) { + // sc2 is already correct + } else if (sc2 < 0) { + sc2 = sc1; + } else { + // sc1 and sc2 both pass, so use best + if (sc2 > sc1) + sc2 = sc1; + } + +#if 0 + // Old indel-tweak-jkb1:bam2bcf_indel.c code + l = (int)((100. * sc2 / (qend - qbeg) + .499) * bca->indel_bias); + *score = sc2<<8 | MIN(255, l); + l = (*score&0xff)*.8 + iscore*2; + *score = (*score & ~0xff) | MIN(255, l); + free(qq); + return 0; +#endif + + // Sc is overall alignment score, in top 24 bits (SeqQ). It's based + // purely on the scores for the whole alignment. + // We also have a separate indel score in bottom 8 bits (IndelQ). 
+ // This is a function of all sorts of attributes of the candidate indel + // itself, such as STR length and the presence of poor quality bases. + + // Used for adjusting indelQ below. Lower l is more likely to call + // (--FN, ++FP). (NB CLI --indel_bias is 1/indel_bias var). + // Starts as average score per base, and then adjusted based on seq + // complexity / quality. + + l = .5*(100. * sc2 / (qend - qbeg) + .499); + l += iscore*(qavg/(m2min+1.0) + qavg/m2); + + *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias/10); + + // NOTE: indel_bias now seems to have a very minimal impact on scoring. + // Why is this so? + + free(qq); + + return 0; +} + +// Part of bcf_call_gap_prep. +// +// Returns n_alt on success +// -1 on failure + +// TODO: almost identical to bam2bcf_indel.c's copy, so we could share +// the code and add a check on bca->edlib. +static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, + bcf_callaux_t *bca, char *inscns, + int l_run, int max_ins, + int ref_type, int *types, int n_types, + double qavg, int *score, + int str_len1, int str_len2) { + // FIXME: n_types has a maximum; no need to alloc - use a #define? + int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp; + memset(sumq, 0, n_types * sizeof(int)); + + // Confusing variable naming and bit usage. + // + // score[] is low 8 bits normalised (by len) alignment score + // top 24 bits full alignment score + // This gets cast into "sct"; mnemonic score-per-indel-type. + // + // sc = (score<<6) | type (index to types[] array for indel size) + // So sc>>14 = score>>(14-6) = score>>8. Ie full alignment score + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + // Labelling is confusing here. + // sct is short for score. + // sc is score + t(type) + // Why aren't these variable names reversed? 
+ int *sct = &score[K*n_types], seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + // sc >> 14 is the total score. It's been shifted by 8 + // from normalised score and 6 from type. + indelQ = (sc[1]>>14) - (sc[0]>>14); + // &0x3f is type number + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run, str_len1); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1); + } + + if (1) { + int qpos = p->qpos, l; + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + int min_q = qual[qpos]; +// // scan left +// char base = bam_seqi(seq, qpos); +// for (l = qpos; l >= 0; l--) { +// if (bam_seqi(seq, l) != base) +// break; +// if (min_q > qual[l]) +// min_q = qual[l]; +// } + + // scan right (including site of indel) + char base = bam_seqi(seq, qpos+1); + for (l = qpos+1; l < p->b->core.l_qseq; l++) { + if (min_q > qual[l]) + min_q = qual[l]; + if (bam_seqi(seq, l) != base) + break; + } + + // seqQ mod needed for PacBio. + + // We reduce -h so homopolymers get reduced likelihood of being + // called, but then optionally increase or decrease from there + // based on base quality. Hence lack of low quality bases in + // homopolymer will rescue the score back again, reducing FNs. 
+ seqQ += MIN(qavg/20, min_q - qavg/10); + indelQ += MIN(qavg/20, min_q - qavg/5); + + if (seqQ < 0) seqQ = 0; + if (indelQ < 0) indelQ = 0; + } + + // This is the length-normalised score from bcf_cgp_align_score + tmp = sc[0]>>6 & 0xff; + + // reduce indelQ + // high score = bad, low score = good. + // low normalised scores leave indelQ unmodified + // high normalised scores set indelQ to 0 + // inbetween scores have a linear scale from indelQ to 0 + indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + + // Doesn't really help accuracy, but permits -h to take + // affect still. + if (indelQ > seqQ) indelQ = seqQ; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + + // use 22 bits in total + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; + sumq[sc[0]&0x3f] += indelQ; + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + if (bca->maxins && !bca->inscns) + return -1; + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + if (bca->maxins) + memcpy(&bca->inscns[t * bca->maxins], + &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 
0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; + //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + } + } + + return n_alt; +} + +/* +FIXME: with high number of samples, do we handle IMF correctly? Is it +fraction of indels across entire data set, or just fraction for this +specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt. + */ + +/* + notes: + - n .. number of samples + - the routine sets bam_pileup1_t.aux of each read as follows: + - 6: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff + */ +int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, + bcf_callaux_t *bca, const char *ref, int ref_len) +{ + if (ref == 0 || bca == 0) return -1; + + int i, s, t, n_types, *types = NULL, max_rd_len, left, right, max_ins; + int *score = NULL; + int N, K, l_run, ref_type, n_alt = -1; + char *inscns = NULL, *query = NULL; + + // determine if there is a gap + for (s = N = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) + if (plp[s][i].indel != 0) break; + if (i < n_plp[s]) break; + } + if (s == n) + // there is no indel at this position. + return -1; + + // Find average base quality over this region + double qavg = 30, qsum = 0, qcount = 0; + int qmax = 0; + for (s = 0; s < n; s++) { + for (i = 0; i < n_plp[s]; i++) { +#define QWIN 50 + bam_pileup1_t *p = plp[s] + i; + int kstart = p->qpos - QWIN > 0 ? p->qpos - QWIN : 0; + int kend = p->qpos + QWIN < p->b->core.l_qseq + ? 
p->qpos + QWIN : p->b->core.l_qseq; + uint8_t *qual = bam_get_qual(p->b); + int k; + for (k = kstart; k < kend; k++) { + qsum += qual[k]; + qcount++; + if (qmax < qual[k]) + qmax = qual[k]; + } + } + } + qavg = (qsum+1) / (qcount+1); + //qavg = (qavg + qmax)/2; // bias avg toward maximum observed. + + // find out how many types of indels are present + types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, + &max_rd_len, &n_types, &ref_type, &N); + if (!types) + goto err; + + + // calculate left and right boundary +#if 0 + left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; + right = pos + bca->indel_win_size; +#else + int max_indel = 20*MAX(ABS(types[0]), ABS(types[n_types-1])) + + bca->indel_win_size/4; + if (max_indel > bca->indel_win_size) + max_indel = bca->indel_win_size; + left = pos > max_indel ? pos - max_indel : 0; + right = pos + max_indel; +#endif + + int del_size = types[0]<0 ? -types[0] : 0; + right += del_size; + + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + + // compute the likelihood given each type of indel for each read + max_ins = types[n_types - 1]; // max_ins is at least 0 + + // The length of the homopolymer run around the current position + l_run = bcf_cgp_l_run(ref, pos); + int l_run_base = seq_nt16_table[(uint8_t)ref[pos+1]]; + int l_run_ins = 0; + + // construct the consensus sequence (minus indels, which are added later) + if (max_ins > 0) { + // TODO: replace filling inscns[] with calc_consensus return + // so the merges of the insertion consensus for type[t] is + // reported directly. 
(It may need adjustment to avoid N) + inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos, + types, n_types, max_ins, s); + if (!inscns) + return -1; + } + + query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); + score = (int*) calloc(N * n_types, sizeof(int)); + bca->indelreg = 0; + double nqual_over_60 = bca->nqual / 60.0; + + int biggest_del = 0; + int biggest_ins = 0; + for (t = 0; t < n_types; t++) { + if (biggest_del > types[t]) + biggest_del = types[t]; + if (biggest_ins < types[t]) + biggest_ins = types[t]; + } + int band = biggest_ins - biggest_del; // NB del is -ve + + int str_len1 = l_run, str_len2 = l_run/4; + for (t = 0; t < n_types; ++t) { + int l, ir; + + // Compute indelreg. This is the context in the reference. Eg: + // + // REF: AG--TTTC Inscns is "TT". + // SEQ: AGTTTTTC Indelreg is 3; next 3 "TTT" bases + // + // => GTTT GTTTTT is call. + if (types[t] == 0) + ir = 0; + else if (types[t] > 0) + ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else + ir = est_indelreg(pos, ref, -types[t], 0); + + if (ir > bca->indelreg) + bca->indelreg = ir; + + // Realignment score, computed via BAQ + for (s = K = 0; s < n; ++s) { + char **tcons; + int left_shift, right_shift; + int tcon_len[2]; + int cpos_pos; + tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_len, + left, right, s, types[t], biggest_del, + &left_shift, &right_shift, &band, + tcon_len, &cpos_pos); + // TODO: Consensus for a deletion shouldn't match the + // consensus for type 0. Eg consider + // vv vv + // REF: AATGTGTGAACAA REF: AATGTG--AACAA + // T0: AATGTG--AACAA T0: AATGTG--AACAA + // T-2: AA--TGTGAATAA T-2: AA--TGTGAATAA: + // + // On left: both T0 and T-2 are the same length, as it's + // just a deletion that moved. We may end up assigning + // reads to an indel allele based on the SNP they have and + // not the actual indel. + // There *is* a deletion here though, but only 1. How do + // we call it once only? 
Need to replace entire region + // with a reassembly. + // + // On right: T0 and T-2 have same length again, but there + // isn't an indel as it's ins+del vs del+ins. They're + // also the same length as the REF for this region. + // Hence likelihood of this variant existing is tied in + // with their equal and high similarity with/to the ref. + // + // We could do an alignment of tcons[0] and tcons[1] and check + // whether their differences are consistent with (ie the + // hamming distance is at least ABS(types[t]/2). I don't think + // it'll rescue many FPs though. + +#ifdef CONS_DEBUG + { + int j; + for (j = 0; j < 2; j++) { + int k; + fprintf(stderr, "Cons%d @ %d %4d/%4d ", + j, pos, types[t], left_shift); + for (k = 0; k < tcon_len[j]; k++) { + if (k == cpos_pos) + putc('#', stderr); + putc("ACGTN"[(uint8_t)tcons[j][k]], stderr); + } + putc('\n', stderr); + } + } +#endif + + // Scan for base-runs in the insertion. + // We use this to avoid over-correction in est_seqQ when the + // insertion is not part of the neighbouring homopolymer. + int k = tcons[0][cpos_pos], j; + for (j = 0; j < types[t]; j++) + if (tcons[0][cpos_pos+j] != k) + break; + if (j && j == types[t]) + l_run_ins |= "\x1\x2\x4\x8\xf"[k]; // ACGTN + if (types[t] < 0) + l_run_ins |= 0xff; + + // align each read to consensus(es) + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + + // Some basic ref vs alt stats. + int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual; + imq *= nqual_over_60; + + int sc_len, slen, epos, sc_end; + + // Only need to gather stats on one type, as it's + // identical calculation for all the subsequent ones + // and we're sharing the same stats array + if (t == 0) { + // Gather stats for INFO field to aid filtering. + // mq and sc_len not very helpful for filtering, but could + // help in assigning a better QUAL value. + // + // Pos is slightly useful. + // Base qual can be useful, but need qual prior to BAQ? 
+ // May need to cache orig quals in aux tag so we can fetch + // them even after mpileup step. + get_pos(bca, p, &sc_len, &slen, &epos, &sc_end); + + assert(imq >= 0 && imq < bca->nqual); + assert(epos >= 0 && epos < bca->npos); + assert(sc_len >= 0 && sc_len < 100); + if (p->indel) { + bca->ialt_mq[imq]++; + bca->ialt_scl[sc_len]++; + bca->ialt_pos[epos]++; + } else { + bca->iref_mq[imq]++; + bca->iref_scl[sc_len]++; + bca->iref_pos[epos]++; + } + } + + int qbeg, qpos, qend, tbeg, tend, kk; + uint8_t *seq = bam_get_seq(p->b); + uint32_t *cigar = bam_get_cigar(p->b); + if (p->b->core.flag & BAM_FUNMAP) continue; + + // FIXME: the following loop should be better moved outside; + // nonetheless, realignment should be much slower anyway. + for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) + break; + if (kk < p->b->core.n_cigar) + continue; + + // determine the start and end of sequences for alignment + int left2 = left, right2 = right; + int min_win_size = MAX(-biggest_del, biggest_ins); + min_win_size += ABS(left_shift) + ABS(right_shift); + { + rep_ele *reps, *elt, *tmp; + reps = find_STR(tcons[0], tcon_len[0], 0); + //int max_str = 0; + int tot_str = 0; + DL_FOREACH_SAFE(reps, elt, tmp) { + // if (max_str < elt->end - elt->start) + // max_str = elt->end - elt->start; + tot_str += elt->end - elt->start; + DL_DELETE(reps, elt); + free(elt); + } + + // Ideally max_str should be enough, but it's still not + // sufficient in longer range some repeats. + //min_win_size += max_str; + min_win_size += tot_str; + } + min_win_size += 10; + if (p->b->core.l_qseq > 1000) { // ||1 for 7f-long + // long read data needs less context. It also tends to + // have many more candidate indels to investigate so + // speed here matters more. 
+ if (pos - left >= min_win_size) + left2 = MAX(left2, pos - min_win_size); + if (right-pos >= min_win_size) + right2 = MIN(right2, pos + min_win_size); + } + + // Genomic coords for first and last base of query + // alignment. This is only used in bcf_cgp_align_score + // for computing scores by looking for the proximity + // of STRs with the end of the query alignment. + int r_start = p->b->core.pos; + int r_end = bam_cigar2rlen(p->b->core.n_cigar, + bam_get_cigar(p->b)) + -1 + r_start; + + // Map left2/right2 genomic coordinates to qbeg/qend + // query coordinates. The query may not span the + // entire left/right region, so this also returns the + // equivalent genomic coords for qbeg/qend in tbeg/tend. + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), + left2, 0, &tbeg); + qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, + 0, &tend) - qbeg; + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), + right2, 1, &tend); + + int old_tend = tend; + int old_tbeg = tbeg; + + // write the query sequence + for (l = qbeg; l < qend; ++l) + query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; + + // A fudge for now. Consider checking SAM header for + // RG platform field. + int long_read = p->b->core.l_qseq > 1000; + + // tbeg and tend are the genomic locations equivalent + // to qbeg and qend on the sequence. + // These may being entirely within our left/right + // coordinates over which we've computed the + // consensus, or overlapping to left/right. + // + // We know an estimation of band, plus biggest indel, + // so we can trim tbeg/tend to a smaller region if we + // wish here. This speeds up BAQ scoring. + int wband = band + MAX(-biggest_del, biggest_ins)*2 + 20; + int tend1 = left + tcon_len[0] - (left2-left); + int tend2 = left + tcon_len[1] - (left2-left); + tend1 = MIN(tend1, old_tend + wband); + tend2 = MIN(tend2, old_tend + wband); + tbeg = MAX(left2, old_tbeg - wband); + + // do realignment; this is the bottleneck. 
+ // + // Note low score = good, high score = bad. + if (tend > tbeg) { + //fprintf(stderr, "Num %d\n", i); + if (bcf_cgp_align_score(p, bca, types[t], band, + (uint8_t *)tcons[0] + left2-left, + (uint8_t *)tcons[1] + left2-left, + (uint8_t *)query, + r_start, r_end, long_read, + tbeg, tend1, tend2, + left2, left + tcon_len[0], + qbeg, qend, pos,qpos, -biggest_del, + qavg, bca->del_bias, + &score[K*n_types + t], + &str_len1, &str_len2) < 0) { + goto err; + } + } else { + // place holder large cost for reads that cover the + // region entirely within a deletion (thus tend < tbeg). + score[K*n_types + t] = 0xffffff; + } + } + free(tcons); + } + } + + // compute indelQ + if (!(l_run_base & l_run_ins)) + l_run = 1; // different base type in ins to flanking region. + n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, + ref_type, types, n_types, qavg, score, + str_len1, str_len2); + + err: + // free + free(query); + free(score); + free(types); + free(inscns); + + return n_alt > 0? 0 : -1; +} diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index faedc3fef..975504f8a 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -45,7 +45,7 @@ KSORT_INIT_GENERIC(uint32_t) // // *_tpos is returned as tpos if query overlaps tpos, but for deletions // it'll be either the start (is_left) or end (!is_left) ref position. 
-static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) { // x = pos in ref, y = pos in query seq int k, x = c->pos, y = 0, last_y = 0; @@ -98,8 +98,8 @@ inline int est_indelreg(int pos, const char *ref, int l, char *ins4) } // Identify spft-clip length, position in seq, and clipped seq len -static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, - int *sc_len_r, int *slen_r, int *epos_r, int *end) { +void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, + int *sc_len_r, int *slen_r, int *epos_r, int *end) { bam1_t *b = p->b; int sc_len = 0, sc_dist = -1, at_left = 1; int epos = p->qpos, slen = b->core.l_qseq; @@ -155,6 +155,7 @@ static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p, // // Scans the pileup to identify all the different sizes of indels // present. +// types[] returned is sorted by size, from smallest (maybe negative) to largest. // // Returns types and fills out n_types_r, max_rd_len_r and ref_type_r, // or NULL on error. @@ -429,9 +430,9 @@ int bcf_cgp_l_run(const char *ref, int pos) { // Compute the consensus for this sample 's', minus indels which // get added later. -static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, - int pos, int *types, int n_types, - int max_ins, int s) { +char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { int i, j, t, k; int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int)); if (!inscns_aux) diff --git a/edlib.c b/edlib.c new file mode 100644 index 000000000..23ed8f9e9 --- /dev/null +++ b/edlib.c @@ -0,0 +1,1547 @@ +/* + * A cut down C translated of the C++ edlib.cpp file. 
+ * Taken from edlib v0.1.0-166-g931be2b + */ + +#include +#include +#include +#include + +#include "edlib.h" + +typedef uint64_t Word; +static const int WORD_SIZE = 64; // Size of Word in bits +static const Word WORD_1 = (Word)1; +static const Word HIGH_BIT_MASK = 1LL << 63; // 100..00 +//#define MAX_UCHAR 255 +#define MAX_UCHAR 7 // better cache usage for our data + +#ifndef MAX +#define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +#if 0 +// Data needed to find alignment. +typedef struct AlignmentData { + Word* Ps; + Word* Ms; + int* scores; + int* firstBlocks; + int* lastBlocks; +} AlignmentData; + +static AlignmentData *CreateAlignmentData(int maxNumBlocks, int targetLength) { + AlignmentData *d = malloc(sizeof(*d)); + + // We build a complete table and mark first and last block for each column + // (because algorithm is banded so only part of each columns is used). + // TODO: do not build a whole table, but just enough blocks for each column. + d->Ps = malloc(maxNumBlocks * targetLength * sizeof(*d->Ps)); + d->Ms = malloc(maxNumBlocks * targetLength * sizeof(*d->Ms)); + d->scores = malloc(maxNumBlocks * targetLength * sizeof(*d->scores)); + d->firstBlocks = malloc(targetLength * sizeof(*d->firstBlocks)); + d->lastBlocks = malloc(targetLength * sizeof(*d->lastBlocks)); + + return d; +} + +static void DestroyAlignmentData(AlignmentData *d) { + free(d->Ps); + free(d->Ms); + free(d->scores); + free(d->firstBlocks); + free(d->lastBlocks); +} +#endif + +typedef struct Block { + Word P; // Pvin + Word M; // Mvin + int score; // score of last cell in block; +} Block; + + +/** + * Defines equality relation on alphabet characters. + * By default each character is always equal only to itself, but you can also provide additional equalities. 
+ */ +typedef struct EqualityDefinition { + bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1]; +} EqualityDefinition; + +static EqualityDefinition * +CreateEqualityDefinition(const char *alphabet, int alphabet_size, + const EdlibEqualityPair* additionalEqualities, + const int additionalEqualitiesLength) { + EqualityDefinition *ed = malloc(sizeof(*ed)); + + for (size_t i = 0; i < alphabet_size; i++) { + for (size_t j = 0; j < alphabet_size; j++) { + ed->matrix[i][j] = (i == j); + } + } + if (additionalEqualities != NULL) { + for (int i = 0; i < additionalEqualitiesLength; i++) { + const char *firstTransformed = strchr(alphabet, additionalEqualities[i].first); + const char *secondTransformed = strchr(alphabet, additionalEqualities[i].second); + if (firstTransformed && alphabet_size) { + ed->matrix[firstTransformed - alphabet][secondTransformed - alphabet] = + ed->matrix[secondTransformed - alphabet][firstTransformed - alphabet] + = true; + } + } + } + + return ed; +} + +/** + * @param a Element from transformed sequence. + * @param b Element from transformed sequence. + * @return True if a and b are defined as equal, false otherwise. + */ +static inline const /* attribute pure or const? 
*/ +bool equalityDefinition_areEqual(const EqualityDefinition *ed, unsigned char a, unsigned char b) { + return ed->matrix[a][b]; +} + +static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks, + int queryLength, + const unsigned char* target, int targetLength, + int k, EdlibAlignMode mode, + int* bestScore_, int** positions_, int* numPositions_); + +#if 0 +static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks, + int queryLength, + const unsigned char* target, int targetLength, + int k, int* bestScore_, + int* position_, bool findAlignment, + AlignmentData** alignData, int targetStopPosition); +#endif + +#if 0 +static int obtainAlignment( + const unsigned char* query, const unsigned char* rQuery, int queryLength, + const unsigned char* target, const unsigned char* rTarget, int targetLength, + const EqualityDefinition* equalityDefinition, int alphabetLength, int bestScore, + unsigned char** alignment, int* alignmentLength); + +static int obtainAlignmentHirschberg( + const unsigned char* query, const unsigned char* rQuery, int queryLength, + const unsigned char* target, const unsigned char* rTarget, int targetLength, + const EqualityDefinition* equalityDefinition, int alphabetLength, int bestScore, + unsigned char** alignment, int* alignmentLength); + +static int obtainAlignmentTraceback(int queryLength, int targetLength, + int bestScore, const AlignmentData* alignData, + unsigned char** alignment, int* alignmentLength); +#endif + +static char *transformSequences(const char* queryOriginal, int queryLength, + const char* targetOriginal, int targetLength, + unsigned char** queryTransformed, + unsigned char** targetTransformed, + int *alphabet_size); + +static inline int ceilDiv(int x, int y); + +static inline unsigned char* createReverseCopy(const unsigned char* seq, int length); + +static inline Word* buildPeq(const int alphabetLength, + const unsigned char* query, + const int queryLength, + const EqualityDefinition* 
equalityDefinition); + + +/** + * Main edlib method. + */ +EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength, + const char* const targetOriginal, const int targetLength, + const EdlibAlignConfig config) { + EdlibAlignResult result; + result.status = EDLIB_STATUS_OK; + result.editDistance = -1; + result.endLocations = result.startLocations = NULL; + result.numLocations = 0; + result.alignment = NULL; + result.alignmentLength = 0; + result.alphabetLength = 0; + + /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/ + unsigned char* query, * target; + int alphabet_size; + char *alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength, + &query, &target, &alphabet_size); + result.alphabetLength = alphabet_size; + /*-------------------------------------------------------*/ + + // Handle special situation when at least one of the sequences has length 0. + if (queryLength == 0 || targetLength == 0) { + if (config.mode == EDLIB_MODE_NW) { + result.editDistance = MAX(queryLength, targetLength); + result.endLocations = malloc(sizeof(int) * 1); + result.endLocations[0] = targetLength - 1; + result.numLocations = 1; + } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) { + result.editDistance = queryLength; + result.endLocations = malloc(sizeof(int) * 1); + result.endLocations[0] = -1; + result.numLocations = 1; + } else { + result.status = EDLIB_STATUS_ERROR; + } + + free(query); + free(target); + free(alphabet); + return result; + } + + /*--------------------- INITIALIZATION ------------------*/ + int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers + int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks + EqualityDefinition *equalityDefinition = + CreateEqualityDefinition(alphabet, alphabet_size, config.additionalEqualities, config.additionalEqualitiesLength); + Word* Peq = buildPeq(alphabet_size, query, 
queryLength, equalityDefinition); + /*-------------------------------------------------------*/ + + /*------------------ MAIN CALCULATION -------------------*/ + // TODO: Store alignment data only after k is determined? That could make things faster. +// int positionNW; // Used only when mode is NW. +// AlignmentData* alignData = NULL; + bool dynamicK = false; + int k = config.k; + if (k < 0) { // If valid k is not given, auto-adjust k until solution is found. + dynamicK = true; + k = WORD_SIZE; // Gives better results than smaller k. + } + + do { + if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) { + myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks, + queryLength, target, targetLength, + k, config.mode, &(result.editDistance), + &(result.endLocations), &(result.numLocations)); + } else { // mode == EDLIB_MODE_NW +// myersCalcEditDistanceNW(Peq, W, maxNumBlocks, +// queryLength, target, targetLength, +// k, &(result.editDistance), &positionNW, +// false, &alignData, -1); + } + k *= 2; + } while(dynamicK && result.editDistance == -1); + + if (result.editDistance >= 0) { // If there is solution. + // If NW mode, set end location explicitly. + if (config.mode == EDLIB_MODE_NW) { + result.endLocations = malloc(sizeof(int) * 1); + result.endLocations[0] = targetLength - 1; + result.numLocations = 1; + } + + // Find starting locations. + if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) { + result.startLocations = malloc(result.numLocations * sizeof(int)); + if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations. + const unsigned char* rTarget = createReverseCopy(target, targetLength); + const unsigned char* rQuery = createReverseCopy(query, queryLength); + // Peq for reversed query. 
+ Word* rPeq = buildPeq(alphabet_size, rQuery, queryLength, equalityDefinition); + for (int i = 0; i < result.numLocations; i++) { + int endLocation = result.endLocations[i]; + if (endLocation == -1) { + // NOTE: Sometimes one of optimal solutions is that query starts before target, like this: + // AAGG <- target + // CCTT <- query + // It will never be only optimal solution and it does not happen often, however it is + // possible and in that case end location will be -1. What should we do with that? + // Should we just skip reporting such end location, although it is a solution? + // If we do report it, what is the start location? -4? -1? Nothing? + // TODO: Figure this out. This has to do in general with how we think about start + // and end locations. + // Also, we have alignment later relying on this locations to limit the space of it's + // search -> how can it do it right if these locations are negative or incorrect? + result.startLocations[i] = 0; // I put 0 for now, but it does not make much sense. + } else { + int bestScoreSHW, numPositionsSHW; + int* positionsSHW; + myersCalcEditDistanceSemiGlobal( + rPeq, W, maxNumBlocks, + queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1, + result.editDistance, EDLIB_MODE_SHW, + &bestScoreSHW, &positionsSHW, &numPositionsSHW); + // Taking last location as start ensures that alignment will not start with insertions + // if it can start with mismatches instead. + result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1]; + free(positionsSHW); + } + } + free((void *)rTarget); + free((void *)rQuery); + free(rPeq); + } else { // If mode is SHW or NW + for (int i = 0; i < result.numLocations; i++) { + result.startLocations[i] = 0; + } + } + } + +#if 0 + // Find alignment -> all comes down to finding alignment for NW. + // Currently we return alignment only for first pair of locations. 
+ if (config.task == EDLIB_TASK_PATH) { + int alnStartLocation = result.startLocations[0]; + int alnEndLocation = result.endLocations[0]; + const unsigned char* alnTarget = target + alnStartLocation; + const int alnTargetLength = alnEndLocation - alnStartLocation + 1; + const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength); + const unsigned char* rQuery = createReverseCopy(query, queryLength); + obtainAlignment(query, rQuery, queryLength, + alnTarget, rAlnTarget, alnTargetLength, + equalityDefinition, alphabet_size, result.editDistance, + &(result.alignment), &(result.alignmentLength)); + free((void *)rAlnTarget); + free((void *)rQuery); + } +#endif + } + /*-------------------------------------------------------*/ + + //--- Free memory ---// + free(Peq); + free(query); + free(target); + free(alphabet); + free(equalityDefinition); +// DestroyAlignmentData(alignData); + //-------------------// + + return result; +} + +#if 0 +char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength, + const EdlibCigarFormat cigarFormat) { + if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) { + return 0; + } + + // Maps move code from alignment to char in cigar. + // 0 1 2 3 + char moveCodeToChar[] = {'=', 'I', 'D', 'X'}; + if (cigarFormat == EDLIB_CIGAR_STANDARD) { + moveCodeToChar[0] = moveCodeToChar[3] = 'M'; + } + + vector* cigar = new vector(); + char lastMove = 0; // Char of last move. 0 if there was no previous move. + int numOfSameMoves = 0; + for (int i = 0; i <= alignmentLength; i++) { + // if new sequence of same moves started + if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) { + // Write number of moves to cigar string. + int numDigits = 0; + for (; numOfSameMoves; numOfSameMoves /= 10) { + cigar->push_back('0' + numOfSameMoves % 10); + numDigits++; + } + reverse(cigar->end() - numDigits, cigar->end()); + // Write code of move to cigar string. 
+ cigar->push_back(lastMove); + // If not at the end, start new sequence of moves. + if (i < alignmentLength) { + // Check if alignment has valid values. + if (alignment[i] > 3) { + delete cigar; + return 0; + } + numOfSameMoves = 0; + } + } + if (i < alignmentLength) { + lastMove = moveCodeToChar[alignment[i]]; + numOfSameMoves++; + } + } + cigar->push_back(0); // Null character termination. + char* cigar_ = malloc(cigar->size() * sizeof(char)); + memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char)); + delete cigar; + + return cigar_; +} +#endif + +/** + * Build Peq table for given query and alphabet. + * Peq is table of dimensions alphabetLength+1 x maxNumBlocks. + * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0. + * NOTICE: free returned array with free()! + */ +static inline Word* buildPeq(const int alphabetLength, + const unsigned char* const query, + const int queryLength, + const EqualityDefinition* equalityDefinition) { + int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); + // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard. + Word* Peq = malloc((alphabetLength + 1) * maxNumBlocks * sizeof(*Peq)); + + // Build Peq (1 is match, 0 is mismatch). 
NOTE: last column is wildcard(symbol that matches anything) with just 1s +#if 0 + for (int symbol = 0; symbol <= alphabetLength; symbol++) { + for (int b = 0; b < maxNumBlocks; b++) { + if (symbol < alphabetLength) { + Peq[symbol * maxNumBlocks + b] = 0; + for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { + Peq[symbol * maxNumBlocks + b] <<= 1; + // NOTE: We pretend like query is padded at the end with W wildcard symbols + if (r >= queryLength || equalityDefinition_areEqual(equalityDefinition, query[r], symbol)) + Peq[symbol * maxNumBlocks + b] += 1; + } + } else { // Last symbol is wildcard, so it is all 1s + Peq[symbol * maxNumBlocks + b] = (Word)-1; + } + } + } +#else + // Optimised Peq building avoiding branching. + for (int symbol = 0; symbol < alphabetLength; symbol++) { + for (int b = 0; b < maxNumBlocks; b++) { + Word PeqW = 0; + for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { + PeqW = (PeqW<<1) + + (r >= queryLength + || equalityDefinition_areEqual(equalityDefinition, + query[r], symbol)); + } + Peq[symbol * maxNumBlocks + b] = PeqW; + } + } + { + int symbol = alphabetLength; + for (int b = 0; b < maxNumBlocks; b++) { + // Last symbol is wildcard, so it is all 1s + Peq[symbol * maxNumBlocks + b] = (Word)-1; + } + } +#endif + return Peq; +} + + +/** + * Returns new sequence that is reverse of given sequence. + * Free returned array with free() + */ +static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) { + unsigned char* rSeq = malloc(length); + for (int i = 0; i < length; i++) { + rSeq[i] = seq[length - i - 1]; + } + return rSeq; +} + +/** + * Corresponds to Advance_Block function from Myers. + * Calculates one word(block), which is part of a column. + * Highest bit of word (one most to the left) is most bottom cell of block from column. + * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1]. + * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0. 
+ * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0. + * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch. + * @param [in] hin Will be +1, 0 or -1. + * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0. + * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0. + * @param [out] hout Will be +1, 0 or -1. + */ +static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin, + Word *PvOut, Word *MvOut) { + // hin can be 1, -1 or 0. + // 1 -> 00...01 + // 0 -> 00...00 + // -1 -> 11...11 (2-complement) + + Word hinIsNeg = (Word)(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1 + + Word Xv = Eq | Mv; + // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1; + Eq |= hinIsNeg; + Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; + + Word Ph = Mv | ~(Xh | Pv); + Word Mh = Pv & Xh; + + int hout = 0; + // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1; + hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1); + // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1; + hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1); + + Ph <<= 1; + Mh <<= 1; + + // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1; + Mh |= hinIsNeg; + // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1; + Ph |= (Word)((hin + 1) >> 1); + + *PvOut = Mh | ~(Xv | Ph); + *MvOut = Ph & Xv; + + return hout; +} + +/** + * Does ceiling division x / y. + * Note: x and y must be non-negative and x + y must not overflow. + */ +static inline int ceilDiv(const int x, const int y) { + return x % y ? x / y + 1 : x / y; +} + +static inline int min(const int x, const int y) { + return x < y ? x : y; +} + +static inline int max(const int x, const int y) { + return x > y ? x : y; +} + + +/** + * @param [in] block + * @return Values of cells in block, starting with bottom cell in block. 
+ */ +static inline int *getBlockCellValues(const Block block) { + int *scores = malloc(WORD_SIZE * sizeof(*scores)); + int score = block.score; + Word mask = HIGH_BIT_MASK; + for (int i = 0; i < WORD_SIZE - 1; i++) { + scores[i] = score; + if (block.P & mask) score--; + if (block.M & mask) score++; + mask >>= 1; + } + scores[WORD_SIZE - 1] = score; + return scores; +} + +/** + * Writes values of cells in block into given array, starting with first/top cell. + * @param [in] block + * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. + */ +static inline void readBlock(const Block block, int* const dest) { + int score = block.score; + Word mask = HIGH_BIT_MASK; + for (int i = 0; i < WORD_SIZE - 1; i++) { + dest[WORD_SIZE - 1 - i] = score; + if (block.P & mask) score--; + if (block.M & mask) score++; + mask >>= 1; + } + dest[0] = score; +} + +/** + * Writes values of cells in block into given array, starting with last/bottom cell. + * @param [in] block + * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. + */ +static inline void readBlockReverse(const Block block, int* const dest) { + int score = block.score; + Word mask = HIGH_BIT_MASK; + for (int i = 0; i < WORD_SIZE - 1; i++) { + dest[i] = score; + if (block.P & mask) score--; + if (block.M & mask) score++; + mask >>= 1; + } + dest[WORD_SIZE - 1] = score; +} + +/** + * @param [in] block + * @param [in] k + * @return True if all cells in block have value larger than k, otherwise false. + */ +static inline bool allBlockCellsLarger(const Block block, const int k) { + int *scores = getBlockCellValues(block); + for (int i = 0; i < WORD_SIZE; i++) { + if (scores[i] <= k) { + free(scores); + return false; + } + } + + free(scores); + return true; +} + + +/** + * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods. + * @param [in] Peq Query profile. 
/**
 * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods.
 * @param [in] Peq  Query profile (one row of maxNumBlocks words per target symbol).
 * @param [in] W  Size of padding in last block.
 *                TODO: Calculate this directly from query, instead of passing it.
 * @param [in] maxNumBlocks  Number of blocks needed to cover the whole query.
 *                           TODO: Calculate this directly from query, instead of passing it.
 * @param [in] queryLength
 * @param [in] target
 * @param [in] targetLength
 * @param [in] k  Largest score of interest.  Clamped to queryLength in HW
 *                mode and lowered to the best score as soon as one is found.
 * @param [in] mode  EDLIB_MODE_HW or EDLIB_MODE_SHW
 * @param [out] bestScore_  Edit distance, or -1 if no score <= k was found.
 * @param [out] positions_  Array of 0-indexed positions in target at which best score was found.
 *                          At most MAX_POS positions are kept.  NULL when bestScore_ is -1.
 *                          Make sure to free this array with free().
 * @param [out] numPositions_  Number of positions in the positions_ array.
 * @return Status; every return in this function yields EDLIB_STATUS_OK.
 *
 * NOTE(review): none of the malloc() results below are checked before use.
 */
static int myersCalcEditDistanceSemiGlobal(
        const Word* const Peq, const int W, const int maxNumBlocks,
        const int queryLength,
        const unsigned char* const target, const int targetLength,
        int k, const EdlibAlignMode mode,
        int* const bestScore_, int** const positions_, int* const numPositions_) {
    // Outputs start out as "nothing found".
    *positions_ = NULL;
    *numPositions_ = 0;

    // firstBlock is 0-based index of first block in Ukkonen band.
    // lastBlock is 0-based index of last block in Ukkonen band.
    int firstBlock = 0;
    int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers
    Block *bl; // Current block

    // One Block per query block; the column is updated in place.
    Block* blocks = malloc(maxNumBlocks * sizeof(*blocks));

    // For HW, solution will never be larger than queryLength.
    if (mode == EDLIB_MODE_HW) {
        k = min(queryLength, k);
    }

    // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
    // This gives speed up of about 2 times for small k.
    const int STRONG_REDUCE_NUM = 2048;

    // Initialize P, M and score
    bl = blocks;
    for (int b = 0; b <= lastBlock; b++) {
        bl->score = (b + 1) * WORD_SIZE;
        bl->P = (Word)(-1); // All 1s
        bl->M = (Word)(0);
        bl++;
    }

    int bestScore = -1;
#define MAX_POS 100 // maximum number of positions returned.
    // Positions sharing the current best score; extra ones are dropped.
    int positions[MAX_POS];
    int npositions = 0;
    const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized;
    const unsigned char* targetChar = target;
    for (int c = 0; c < targetLength; c++) { // for each column
        // Row of the query profile for this target symbol.
        const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;

        //----------------------- Calculate column -------------------------//
        int hout = startHout;
        bl = blocks + firstBlock;
        Peq_c += firstBlock;
        for (int b = firstBlock; b <= lastBlock; b++) {
            // The horizontal delta leaving one block feeds the next one.
            hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, &bl->P, &bl->M);
            bl->score += hout;
            bl++; Peq_c++;
        }
        // Step back so that both point at the last block of the band.
        bl--; Peq_c--;
        //------------------------------------------------------------------//

        //---------- Adjust number of blocks according to Ukkonen ----------//
        if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block
            && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block
            // If score of left block is not too big, calculate one more block
            lastBlock++; bl++; Peq_c++;
            bl->P = (Word)(-1); // All 1s
            bl->M = (Word)(0);
            bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, &bl->P, &bl->M);
        } else {
            // Cheap reduction: drop trailing blocks whose last cell is
            // at least k + WORD_SIZE.
            while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
                lastBlock--; bl--; Peq_c--;
            }
        }

        // Every some columns, do some expensive but also more efficient block reducing.
        // This is important!
        //
        // Reduce the band by decreasing last block if possible.
        if (c % STRONG_REDUCE_NUM == 0) {
            while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
                lastBlock--; bl--; Peq_c--;
            }
        }
        // For HW, even if all cells are > k, there still may be solution in next
        // column because starting conditions at upper boundary are 0.
        // That means that first block is always candidate for solution,
        // and we can never end calculation before last column.
        if (mode == EDLIB_MODE_HW && lastBlock == -1) {
            lastBlock++; bl++; Peq_c++;
        }

        // Reduce band by increasing first block if possible. Not applicable to HW.
        if (mode != EDLIB_MODE_HW) {
            while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
                firstBlock++;
            }
            if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks
                while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
                    firstBlock++;
                }
            }
        }

        // If the band no longer exists, finish: report what was found so far.
        if (lastBlock < firstBlock) {
            *bestScore_ = bestScore;
            if (bestScore != -1) {
                *positions_ = malloc(npositions * sizeof(int));
                *numPositions_ = npositions;
                memcpy(*positions_, positions, npositions * sizeof(int));
            }
            free(blocks);
            return EDLIB_STATUS_OK;
        }
        //------------------------------------------------------------------//

        //------------------------- Update best score ----------------------//
        if (lastBlock == maxNumBlocks - 1) {
            int colScore = bl->score;
            if (colScore <= k) { // Scores > k don't have correct values (so we cannot use them), but are certainly > k.
                // NOTE: Score that I find in column c is actually score from column c-W
                if (bestScore == -1 || colScore <= bestScore) {
                    if (colScore != bestScore) {
                        // Strictly better score: forget positions of the old one.
                        npositions = 0;
                        bestScore = colScore;
                        // Change k so we will look only for equal or better
                        // scores than the best found so far.
                        k = bestScore;
                    }
                    if (npositions < MAX_POS)
                        positions[npositions++] = c - W;
                }
            }
        }
        //------------------------------------------------------------------//

        targetChar++;
    }


    // Obtain results for last W columns from last column.
    // NOTE(review): bl is only repositioned inside the column loop, so this
    // presumably relies on targetLength > 0 - confirm against callers.
    if (lastBlock == maxNumBlocks - 1) {
        int *blockScores = getBlockCellValues(*bl);
        for (int i = 0; i < W; i++) {
            int colScore = blockScores[i + 1];
            if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
                if (colScore != bestScore) {
                    npositions = 0;
                    k = bestScore = colScore;
                }
                if (npositions < MAX_POS)
                    positions[npositions++] = targetLength - W + i;
            }
        }
        free(blockScores);
    }

    // Hand the collected positions to the caller as a heap array.
    *bestScore_ = bestScore;
    if (bestScore != -1) {
        *positions_ = malloc(npositions * sizeof(int));
        *numPositions_ = npositions;
        memcpy(*positions_, positions, npositions * sizeof(int));
    }

    free(blocks);
    return EDLIB_STATUS_OK;
}
+ */ +#if 0 +static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks, + const int queryLength, + const unsigned char* const target, const int targetLength, + int k, int* const bestScore_, + int* const position_, const bool findAlignment, + AlignmentData** const alignData, const int targetStopPosition) { + if (targetStopPosition > -1 && findAlignment) { + // They can not be both set at the same time! + return EDLIB_STATUS_ERROR; + } + + // Each STRONG_REDUCE_NUM column is reduced in more expensive way. + const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation + + if (k < abs(targetLength - queryLength)) { + *bestScore_ = *position_ = -1; + return EDLIB_STATUS_OK; + } + + k = min(k, max(queryLength, targetLength)); // Upper bound for k + + // firstBlock is 0-based index of first block in Ukkonen band. + // lastBlock is 0-based index of last block in Ukkonen band. + int firstBlock = 0; + // This is optimal now, by my formula. + int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1; + Block* bl; // Current block + + Block* blocks = malloc(maxNumBlocks * sizeof(*blocks)); + + // Initialize P, M and score + bl = blocks; + for (int b = 0; b <= lastBlock; b++) { + bl->score = (b + 1) * WORD_SIZE; + bl->P = (Word)(-1); // All 1s + bl->M = (Word)(0); + bl++; + } + + // If we want to find alignment, we have to store needed data. 
+ if (findAlignment) + *alignData = new AlignmentData(maxNumBlocks, targetLength); + else if (targetStopPosition > -1) + *alignData = new AlignmentData(maxNumBlocks, 1); + else + *alignData = NULL; + + const unsigned char* targetChar = target; + for (int c = 0; c < targetLength; c++) { // for each column + const Word* Peq_c = Peq + *targetChar * maxNumBlocks; + + //----------------------- Calculate column -------------------------// + int hout = 1; + bl = blocks + firstBlock; + for (int b = firstBlock; b <= lastBlock; b++) { + hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M); + bl->score += hout; + bl++; + } + bl--; + //------------------------------------------------------------------// + // bl now points to last block + + // Update k. I do it only on end of column because it would slow calculation too much otherwise. + // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up. + k = min(k, bl->score + + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1) + + (lastBlock == maxNumBlocks - 1 ? W : 0)); + + //---------- Adjust number of blocks according to Ukkonen ----------// + //--- Adjust last block ---// + // If block is not beneath band, calculate next block. Only next because others are certainly beneath band. + if (lastBlock + 1 < maxNumBlocks + && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also! + ((lastBlock + 1) * WORD_SIZE - 1 + > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) { + lastBlock++; bl++; + bl->P = (Word)(-1); // All 1s + bl->M = (Word)(0); + int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M); + bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout; + hout = newHout; + } + + // While block is out of band, move one block up. + // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it. 
+ // I could consider adding that max part, for optimal performance. + while (lastBlock >= firstBlock + && (bl->score >= k + WORD_SIZE + || ((lastBlock + 1) * WORD_SIZE - 1 > + // TODO: Does not work if do not put +1! Why??? + k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) { + lastBlock--; bl--; + } + //-------------------------// + + //--- Adjust first block ---// + // While outside of band, advance block + while (firstBlock <= lastBlock + && (blocks[firstBlock].score >= k + WORD_SIZE + || ((firstBlock + 1) * WORD_SIZE - 1 < + blocks[firstBlock].score - k - targetLength + queryLength + c))) { + firstBlock++; + } + //--------------------------/ + + + // TODO: consider if this part is useful, it does not seem to help much + if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction + while (lastBlock >= firstBlock) { + // If all cells outside of band, remove block + vector scores = getBlockCellValues(*bl); + int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; + int r = lastBlock * WORD_SIZE + numCells - 1; + bool reduce = true; + for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { + // TODO: Does not work if do not put +1! Why??? + if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) { + reduce = false; + break; + } + r--; + } + if (!reduce) break; + lastBlock--; bl--; + } + + while (firstBlock <= lastBlock) { + // If all cells outside of band, remove block + vector scores = getBlockCellValues(blocks[firstBlock]); + int numCells = firstBlock == maxNumBlocks - 1 ? 
WORD_SIZE - W : WORD_SIZE; + int r = firstBlock * WORD_SIZE + numCells - 1; + bool reduce = true; + for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { + if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) { + reduce = false; + break; + } + r--; + } + if (!reduce) break; + firstBlock++; + } + } + + + // If band stops to exist finish + if (lastBlock < firstBlock) { + *bestScore_ = *position_ = -1; + free(blocks); + return EDLIB_STATUS_OK; + } + //------------------------------------------------------------------// + + + //---- Save column so it can be used for reconstruction ----// + if (findAlignment && c < targetLength) { + bl = blocks + firstBlock; + for (int b = firstBlock; b <= lastBlock; b++) { + (*alignData)->Ps[maxNumBlocks * c + b] = bl->P; + (*alignData)->Ms[maxNumBlocks * c + b] = bl->M; + (*alignData)->scores[maxNumBlocks * c + b] = bl->score; + bl++; + } + (*alignData)->firstBlocks[c] = firstBlock; + (*alignData)->lastBlocks[c] = lastBlock; + } + //----------------------------------------------------------// + //---- If this is stop column, save it and finish ----// + if (c == targetStopPosition) { + for (int b = firstBlock; b <= lastBlock; b++) { + (*alignData)->Ps[b] = (blocks + b)->P; + (*alignData)->Ms[b] = (blocks + b)->M; + (*alignData)->scores[b] = (blocks + b)->score; + } + (*alignData)->firstBlocks[0] = firstBlock; + (*alignData)->lastBlocks[0] = lastBlock; + *bestScore_ = -1; + *position_ = targetStopPosition; + free(blocks); + return EDLIB_STATUS_OK; + } + //----------------------------------------------------// + + targetChar++; + } + + if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated + // Obtain best score from block -> it is complicated because query is padded with W cells + int bestScore = getBlockCellValues(blocks[lastBlock])[W]; + if (bestScore <= k) { + *bestScore_ = bestScore; + *position_ = targetLength - 1; + free(blocks); + return EDLIB_STATUS_OK; + } + } + + 
*bestScore_ = *position_ = -1; + free(blocks); + return EDLIB_STATUS_OK; +} +#endif + +#if 0 +/** + * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix, + * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength). + * @param [in] queryLength Normal length, without W. + * @param [in] targetLength Normal length, without W. + * @param [in] bestScore Best score. + * @param [in] alignData Data obtained during finding best score that is useful for finding alignment. + * @param [out] alignment Alignment. + * @param [out] alignmentLength Length of alignment. + * @return Status code. + */ +static int obtainAlignmentTraceback(const int queryLength, const int targetLength, + const int bestScore, const AlignmentData* const alignData, + unsigned char** const alignment, int* const alignmentLength) { + const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); + const int W = maxNumBlocks * WORD_SIZE - queryLength; + + *alignment = malloc((queryLength + targetLength - 1) * sizeof(unsigned char)); + *alignmentLength = 0; + int c = targetLength - 1; // index of column + int b = maxNumBlocks - 1; // index of block in column + int currScore = bestScore; // Score of current cell + int lScore = -1; // Score of left cell + int uScore = -1; // Score of upper cell + int ulScore = -1; // Score of upper left cell + Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block + Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block + // True if block to left exists and is in band + bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]; + // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the + // calculation as both lP and lM should be initialized at some moment later (but compiler can not + // detect it since this initialization is guaranteed by "business" logic). 
+ Word lP = 0, lM = 0; + if (thereIsLeftBlock) { + lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left + lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left + } + currP <<= W; + currM <<= W; + int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos + + // TODO(martin): refactor this whole piece of code. There are too many if-else statements, + // it is too easy for a bug to hide and too hard to effectively cover all the edge-cases. + // We need better separation of logic and responsibilities. + while (true) { + if (c == 0) { + thereIsLeftBlock = true; + lScore = b * WORD_SIZE + blockPos + 1; + ulScore = lScore - 1; + } + + // TODO: improvement: calculate only those cells that are needed, + // for example if I calculate upper cell and can move up, + // there is no need to calculate left and upper left cell + //---------- Calculate scores ---------// + if (lScore == -1 && thereIsLeftBlock) { + lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left + for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) { + if (lP & HIGH_BIT_MASK) lScore--; + if (lM & HIGH_BIT_MASK) lScore++; + lP <<= 1; + lM <<= 1; + } + } + if (ulScore == -1) { + if (lScore != -1) { + ulScore = lScore; + if (lP & HIGH_BIT_MASK) ulScore--; + if (lM & HIGH_BIT_MASK) ulScore++; + } + else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) { + // This is the case when upper left cell is last cell in block, + // and block to left is not in band so lScore is -1. + ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1]; + } + } + if (uScore == -1) { + uScore = currScore; + if (currP & HIGH_BIT_MASK) uScore--; + if (currM & HIGH_BIT_MASK) uScore++; + currP <<= 1; + currM <<= 1; + } + //-------------------------------------// + + // TODO: should I check if there is upper block?
+ + //-------------- Move --------------// + // Move up - insertion to target - deletion from query + if (uScore != -1 && uScore + 1 == currScore) { + currScore = uScore; + lScore = ulScore; + uScore = ulScore = -1; + if (blockPos == 0) { // If entering new (upper) block + if (b == 0) { // If there are no cells above (only boundary cells) + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up + for (int i = 0; i < c + 1; i++) // Move left until end + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; + break; + } else { + blockPos = WORD_SIZE - 1; + b--; + currP = alignData->Ps[c * maxNumBlocks + b]; + currM = alignData->Ms[c * maxNumBlocks + b]; + if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { + thereIsLeftBlock = true; + lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations + lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; + } else { + thereIsLeftBlock = false; + // TODO(martin): There may not be left block, but there can be left boundary - do we + // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this. 
+ } + } + } else { + blockPos--; + lP <<= 1; + lM <<= 1; + } + // Mark move + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; + } + // Move left - deletion from target - insertion to query + else if (lScore != -1 && lScore + 1 == currScore) { + currScore = lScore; + uScore = ulScore; + lScore = ulScore = -1; + c--; + if (c == -1) { // If there are no cells to the left (only boundary cells) + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left + int numUp = b * WORD_SIZE + blockPos + 1; + for (int i = 0; i < numUp; i++) // Move up until end + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; + break; + } + currP = lP; + currM = lM; + if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { + thereIsLeftBlock = true; + lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; + lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; + } else { + if (c == 0) { // If there are no cells to the left (only boundary cells) + thereIsLeftBlock = true; + lScore = b * WORD_SIZE + blockPos + 1; + ulScore = lScore - 1; + } else { + thereIsLeftBlock = false; + } + } + // Mark move + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; + } + // Move up left - (mis)match + else if (ulScore != -1) { + unsigned char moveCode = ulScore == currScore ? 
EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH; + currScore = ulScore; + uScore = lScore = ulScore = -1; + c--; + if (c == -1) { // If there are no cells to the left (only boundary cells) + (*alignment)[(*alignmentLength)++] = moveCode; // Move left + int numUp = b * WORD_SIZE + blockPos; + for (int i = 0; i < numUp; i++) // Move up until end + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; + break; + } + if (blockPos == 0) { // If entering upper left block + if (b == 0) { // If there are no more cells above (only boundary cells) + (*alignment)[(*alignmentLength)++] = moveCode; // Move up left + for (int i = 0; i < c + 1; i++) // Move left until end + (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; + break; + } + blockPos = WORD_SIZE - 1; + b--; + currP = alignData->Ps[c * maxNumBlocks + b]; + currM = alignData->Ms[c * maxNumBlocks + b]; + } else { // If entering left block + blockPos--; + currP = lP; + currM = lM; + currP <<= 1; + currM <<= 1; + } + // Set new left block + if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { + thereIsLeftBlock = true; + lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; + lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; + } else { + if (c == 0) { // If there are no cells to the left (only boundary cells) + thereIsLeftBlock = true; + lScore = b * WORD_SIZE + blockPos + 1; + ulScore = lScore - 1; + } else { + thereIsLeftBlock = false; + } + } + // Mark move + (*alignment)[(*alignmentLength)++] = moveCode; + } else { + // Reached end - finished! + break; + } + //----------------------------------// + } + + *alignment = realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)); + reverse(*alignment, *alignment + (*alignmentLength)); + return EDLIB_STATUS_OK; +} + + +/** + * Finds one possible alignment that gives optimal score (bestScore). 
+ * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough, + * it will solve them using traceback algorithm. + * @param [in] query + * @param [in] rQuery Reversed query. + * @param [in] queryLength + * @param [in] target + * @param [in] rTarget Reversed target. + * @param [in] targetLength + * @param [in] equalityDefinition + * @param [in] alphabetLength + * @param [in] bestScore Best(optimal) score. + * @param [out] alignment Sequence of edit operations that make target equal to query. + * @param [out] alignmentLength Length of alignment. + * @return Status code. + */ +static int obtainAlignment( + const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, + const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, + const EqualityDefinition* equalityDefinition, const int alphabetLength, const int bestScore, + unsigned char** const alignment, int* const alignmentLength) { + + // Handle special case when one of sequences has length of 0. + if (queryLength == 0 || targetLength == 0) { + *alignmentLength = targetLength + queryLength; + *alignment = malloc((*alignmentLength) * sizeof(unsigned char)); + for (int i = 0; i < *alignmentLength; i++) { + (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT; + } + return EDLIB_STATUS_OK; + } + + const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); + const int W = maxNumBlocks * WORD_SIZE - queryLength; + int statusCode; + + // TODO: think about reducing number of memory allocations in alignment functions, probably + // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg, + // and it could also be done for alignments - we could have one big array for alignment that would be + // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results. 
+ + // If estimated memory consumption for traceback algorithm is smaller than 1MB use it, + // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal. + long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength + + 2ll * sizeof(int) * targetLength; + if (alignmentDataSize < 1024 * 1024) { + int score_, endLocation_; // Used only to call function. + AlignmentData* alignData = NULL; + Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); + myersCalcEditDistanceNW(Peq, W, maxNumBlocks, + queryLength, + target, targetLength, + bestScore, + &score_, &endLocation_, true, &alignData, -1); + //assert(score_ == bestScore); + //assert(endLocation_ == targetLength - 1); + + statusCode = obtainAlignmentTraceback(queryLength, targetLength, + bestScore, alignData, alignment, alignmentLength); + free(alignData); + free(Peq); + } else { + statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength, + target, rTarget, targetLength, + equalityDefinition, alphabetLength, bestScore, + alignment, alignmentLength); + } + return statusCode; +} + + +/** + * Finds one possible alignment that gives optimal score (bestScore). + * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together. + * @param [in] query + * @param [in] rQuery Reversed query. + * @param [in] queryLength + * @param [in] target + * @param [in] rTarget Reversed target. + * @param [in] targetLength + * @param [in] alphabetLength + * @param [in] bestScore Best(optimal) score. + * @param [out] alignment Sequence of edit operations that make target equal to query. + * @param [out] alignmentLength Length of alignment. + * @return Status code. 
+ */ +static int obtainAlignmentHirschberg( + const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, + const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, + const EqualityDefinition* equalityDefinition, const int alphabetLength, const int bestScore, + unsigned char** const alignment, int* const alignmentLength) { + + const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); + const int W = maxNumBlocks * WORD_SIZE - queryLength; + + Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); + Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition); + + // Used only to call functions. + int score_, endLocation_; + + // Divide dynamic matrix into two halves, left and right. + const int leftHalfWidth = targetLength / 2; + const int rightHalfWidth = targetLength - leftHalfWidth; + + // Calculate left half. + AlignmentData* alignDataLeftHalf = NULL; + int leftHalfCalcStatus = myersCalcEditDistanceNW( + Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore, + &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1); + + // Calculate right half. + AlignmentData* alignDataRightHalf = NULL; + int rightHalfCalcStatus = myersCalcEditDistanceNW( + rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore, + &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1); + + free(Peq); + free(rPeq); + + if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) { + if (alignDataLeftHalf) free(alignDataLeftHalf); + if (alignDataRightHalf) free(alignDataRightHalf); + return EDLIB_STATUS_ERROR; + } + + // Unwrap the left half. + int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0]; + int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0]; + // TODO: avoid this allocation by using some shared array?
+ // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index) + // and ending with scoresLeftEndIdx row (0-indexed). + int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE; + int* scoresLeft = malloc(scoresLeftLength * sizeof(int)); + for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) { + Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx], + alignDataLeftHalf->scores[blockIdx]); + readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE); + } + int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE; + // If last block contains padding, shorten the length of scores for the length of padding. + if (lastBlockIdxLeft == maxNumBlocks - 1) { + scoresLeftLength -= W; + } + + // Unwrap the right half (I also reverse it while unwrapping). + int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0]; + int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0]; + int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE; + int* scoresRight = malloc(scoresRightLength * sizeof(int)); + int* scoresRightOriginalStart = scoresRight; + for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) { + Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx], + alignDataRightHalf->scores[blockIdx]); + readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE); + } + int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE; + // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do), + // move pointer forward to remove the padding (that is why we remember originalStart).
+ if (scoresRightStartIdx < 0) { + //assert(scoresRightStartIdx == -1 * W); + scoresRight += W; + scoresRightStartIdx += W; + scoresRightLength -= W; + } + + free(alignDataLeftHalf); + free(alignDataRightHalf); + + //--------------------- Find the best move ----------------// + // Find the query/row index of cell in left column which together with its lower right neighbour + // from right column gives the best score (when summed). We also have to consider boundary cells + // (those cells at -1 indexes). + // x| + // -+- + // |x + int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1); + int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1, + scoresRightStartIdx + scoresRightLength - 2); + int leftScore = -1, rightScore = -1; + int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through. + bool queryIdxLeftAlignmentFound = false; + for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) { + leftScore = scoresLeft[queryIdx - scoresLeftStartIdx]; + rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx]; + if (leftScore + rightScore == bestScore) { + queryIdxLeftAlignment = queryIdx; + queryIdxLeftAlignmentFound = true; + break; + } + } + // Check boundary cells. 
+ if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) { + leftScore = leftHalfWidth; + rightScore = scoresRight[0]; + if (leftScore + rightScore == bestScore) { + queryIdxLeftAlignment = -1; + queryIdxLeftAlignmentFound = true; + } + } + if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength + && scoresRightStartIdx + scoresRightLength == queryLength) { + leftScore = scoresLeft[scoresLeftLength - 1]; + rightScore = rightHalfWidth; + if (leftScore + rightScore == bestScore) { + queryIdxLeftAlignment = queryLength - 1; + queryIdxLeftAlignmentFound = true; + } + } + + free(scoresLeft); + free(scoresRightOriginalStart); + + if (queryIdxLeftAlignmentFound == false) { + // If there was no move that is part of optimal alignment, then there is no such alignment + // or given bestScore is not correct! + return EDLIB_STATUS_ERROR; + } + //----------------------------------------------------------// + + // Calculate alignments for upper half of left half (upper left - ul) + // and lower half of right half (lower right - lr). 
+ const int ulHeight = queryIdxLeftAlignment + 1; + const int lrHeight = queryLength - ulHeight; + const int ulWidth = leftHalfWidth; + const int lrWidth = rightHalfWidth; + unsigned char* ulAlignment = NULL; int ulAlignmentLength; + int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight, + target, rTarget + lrWidth, ulWidth, + equalityDefinition, alphabetLength, leftScore, + &ulAlignment, &ulAlignmentLength); + unsigned char* lrAlignment = NULL; int lrAlignmentLength; + int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight, + target + ulWidth, rTarget, lrWidth, + equalityDefinition, alphabetLength, rightScore, + &lrAlignment, &lrAlignmentLength); + if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) { + if (ulAlignment) free(ulAlignment); + if (lrAlignment) free(lrAlignment); + return EDLIB_STATUS_ERROR; + } + + // Build alignment by concatenating upper left alignment with lower right alignment. + *alignmentLength = ulAlignmentLength + lrAlignmentLength; + *alignment = malloc((*alignmentLength) * sizeof(unsigned char)); + memcpy(*alignment, ulAlignment, ulAlignmentLength); + memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength); + + free(ulAlignment); + free(lrAlignment); + return EDLIB_STATUS_OK; +} +#endif + +/** + * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences + * where elements in sequences are not any more letters of alphabet, but their index in alphabet. + * Most of internal edlib functions expect such transformed sequences. + * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done. + * Example: + * Original sequences: "ACT" and "CGT". + * Alphabet would be recognized as "ACTG". Alphabet length = 4. + * Transformed sequences: [0, 1, 2] and [1, 3, 2]. 
+ * @param [in] queryOriginal + * @param [in] queryLength + * @param [in] targetOriginal + * @param [in] targetLength + * @param [out] queryTransformed It will contain values in range [0, alphabet length - 1]. + * @param [out] targetTransformed It will contain values in range [0, alphabet length - 1]. + * @return Alphabet as a string of unique characters, where index of each character is its value in transformed + * sequences. + */ +static char *transformSequences(const char* const queryOriginal, const int queryLength, + const char* const targetOriginal, const int targetLength, + unsigned char** const queryTransformed, + unsigned char** const targetTransformed, + int *alphabet_size) { + // Alphabet is constructed from letters that are present in sequences. + // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1, + // and new query and target are created in which letters are replaced with their ordinal numbers. + // This query and target are used in all the calculations later. + *queryTransformed = malloc(sizeof(unsigned char) * queryLength); + *targetTransformed = malloc(sizeof(unsigned char) * targetLength); + + char *alphabet = malloc(MAX_UCHAR+1), *alphabet_cp = alphabet; + + // Alphabet information, it is constructed on fly while transforming sequences. + // letterIdx[c] is index of letter c in alphabet. 
+ unsigned char letterIdx[MAX_UCHAR + 1]; + bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet + for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false; + + for (int i = 0; i < queryLength; i++) { + unsigned char c = queryOriginal[i]; + if (!inAlphabet[c]) { + inAlphabet[c] = true; + letterIdx[c] = alphabet_cp - alphabet; + *alphabet_cp++ = queryOriginal[i]; + } + (*queryTransformed)[i] = letterIdx[c]; + } + for (int i = 0; i < targetLength; i++) { + unsigned char c = targetOriginal[i]; + if (!inAlphabet[c]) { + inAlphabet[c] = true; + letterIdx[c] = alphabet_cp - alphabet; + *alphabet_cp++ = targetOriginal[i]; + } + (*targetTransformed)[i] = letterIdx[c]; + } + + *alphabet_size = alphabet_cp - alphabet; + return alphabet; +} + + +EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task, + const EdlibEqualityPair* additionalEqualities, + int additionalEqualitiesLength) { + EdlibAlignConfig config; + config.k = k; + config.mode = mode; + config.task = task; + config.additionalEqualities = additionalEqualities; + config.additionalEqualitiesLength = additionalEqualitiesLength; + return config; +} + +EdlibAlignConfig edlibDefaultAlignConfig(void) { + return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0); +} + +void edlibFreeAlignResult(EdlibAlignResult result) { + if (result.endLocations) free(result.endLocations); + if (result.startLocations) free(result.startLocations); + if (result.alignment) free(result.alignment); +} diff --git a/edlib.h b/edlib.h new file mode 100644 index 000000000..acce6b20e --- /dev/null +++ b/edlib.h @@ -0,0 +1,277 @@ +#ifndef EDLIB_H +#define EDLIB_H + +/** + * @file + * @author Martin Sosic + * @brief Main header file, containing all public functions and structures. 
+ */ + +// Define EDLIB_API macro to properly export symbols +#ifdef EDLIB_SHARED +# ifdef _WIN32 +# ifdef EDLIB_BUILD +# define EDLIB_API __declspec(dllexport) +# else +# define EDLIB_API __declspec(dllimport) +# endif +# else +# define EDLIB_API __attribute__ ((visibility ("default"))) +# endif +#else +# define EDLIB_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Status codes +#define EDLIB_STATUS_OK 0 +#define EDLIB_STATUS_ERROR 1 + + /** + * Alignment methods - how should Edlib treat gaps before and after query? + */ + typedef enum { + /** + * Global method. This is the standard method. + * Useful when you want to find out how similar is first sequence to second sequence. + */ + EDLIB_MODE_NW, + /** + * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized. + * What that means is that deleting elements from the end of second sequence is "free"! + * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end + * of second sequence is "free" and does not count into total edit distance. This method is appropriate + * when you want to find out how well first sequence fits at the beginning of second sequence. + */ + EDLIB_MODE_SHW, + /** + * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are + * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"! + * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start + * and GAC from the end of second sequence is "free" and does not count into total edit distance. + * This method is appropriate when you want to find out how well first sequence fits at any part of + * second sequence. 
+ * For example, if your second sequence was a long text and your first sequence was a sentence from that text, + * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in + * that text. In bioinformatics, this method is appropriate for aligning read to a sequence. + */ + EDLIB_MODE_HW + } EdlibAlignMode; + + /** + * Alignment tasks - what do you want Edlib to do? + */ + typedef enum { + EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations. + EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations. + EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path. + } EdlibAlignTask; + + /** + * Describes cigar format. + * @see http://samtools.github.io/hts-specs/SAMv1.pdf + * @see http://drive5.com/usearch/manual/cigar.html + */ + typedef enum { + EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'. + EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'. + } EdlibCigarFormat; + +// Edit operations. +#define EDLIB_EDOP_MATCH 0 //!< Match. +#define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query. +#define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query. +#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch. + + /** + * @brief Defines two given characters as equal. + */ + typedef struct { + char first; + char second; + } EdlibEqualityPair; + + /** + * @brief Configuration object for edlibAlign() function. + */ + typedef struct { + /** + * Set k to non-negative value to tell edlib that edit distance is not larger than k. + * Smaller k can significantly improve speed of computation. + * If edit distance is larger than k, edlib will set edit distance to -1. + * Set k to negative value and edlib will internally auto-adjust k until score is found. + */ + int k; + + /** + * Alignment method. + * EDLIB_MODE_NW: global (Needleman-Wunsch) + * EDLIB_MODE_SHW: prefix. 
Gap after query is not penalized. + * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized. + */ + EdlibAlignMode mode; + + /** + * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is. + * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target. + * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target. + * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target). + */ + EdlibAlignTask task; + + /** + * List of pairs of characters, where each pair defines two characters as equal. + * This way you can extend edlib's definition of equality (which is that each character is equal only + * to itself). + * This can be useful if you have some wildcard characters that should match multiple other characters, + * or e.g. if you want edlib to be case insensitive. + * Can be set to NULL if there are none. + */ + const EdlibEqualityPair* additionalEqualities; + + /** + * Number of additional equalities, which is non-negative number. + * 0 if there are none. + */ + int additionalEqualitiesLength; + } EdlibAlignConfig; + + /** + * Helper method for easy construction of configuration object. + * @return Configuration object filled with given parameters. + */ + EDLIB_API EdlibAlignConfig edlibNewAlignConfig( + int k, EdlibAlignMode mode, EdlibAlignTask task, + const EdlibEqualityPair* additionalEqualities, + int additionalEqualitiesLength + ); + + /** + * @return Default configuration object, with following defaults: + * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities. + */ + EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void); + + + /** + * Container for results of alignment done by edlibAlign() function. + */ + typedef struct { + /** + * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values. 
+ */ + int status; + + /** + * -1 if k is non-negative and edit distance is larger than k. + */ + int editDistance; + + /** + * Array of zero-based positions in target where optimal alignment paths end. + * If gap after query is penalized, gap counts as part of query (NW), otherwise not. + * Set to NULL if edit distance is larger than k. + * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). + */ + int* endLocations; + + /** + * Array of zero-based positions in target where optimal alignment paths start, + * they correspond to endLocations. + * If gap before query is penalized, gap counts as part of query (NW), otherwise not. + * Set to NULL if not calculated or if edit distance is larger than k. + * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). + */ + int* startLocations; + + /** + * Number of end (and start) locations. + */ + int numLocations; + + /** + * Alignment is found for first pair of start and end locations. + * Set to NULL if not calculated. + * Alignment is sequence of numbers: 0, 1, 2, 3. + * 0 stands for match. + * 1 stands for insertion to target. + * 2 stands for insertion to query. + * 3 stands for mismatch. + * Alignment aligns query to target from beginning of query till end of query. + * If gaps are not penalized, they are not in alignment. + * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). + */ + unsigned char* alignment; + + /** + * Length of alignment. + */ + int alignmentLength; + + /** + * Number of different characters in query and target together. + */ + int alphabetLength; + } EdlibAlignResult; + + /** + * Frees memory in EdlibAlignResult that was allocated by edlib. + * If you do not use it, make sure to free needed members manually using free(). 
+ */ + EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result); + + + /** + * Aligns two sequences (query and target) using edit distance (levenshtein distance). + * Through config parameter, this function supports different alignment methods (global, prefix, infix), + * as well as different modes of search (tasks). + * It always returns edit distance and end locations of optimal alignment in target. + * It optionally returns start locations of optimal alignment in target and alignment path, + * if you choose appropriate tasks. + * @param [in] query First sequence. + * @param [in] queryLength Number of characters in first sequence. + * @param [in] target Second sequence. + * @param [in] targetLength Number of characters in second sequence. + * @param [in] config Additional alignment parameters, like alignment method and wanted results. + * @return Result of alignment, which can contain edit distance, start and end locations and alignment path. + * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members. + */ + EDLIB_API EdlibAlignResult edlibAlign( + const char* query, int queryLength, + const char* target, int targetLength, + const EdlibAlignConfig config + ); + + + /** + * Builds cigar string from given alignment sequence. + * @param [in] alignment Alignment sequence. + * 0 stands for match. + * 1 stands for insertion to target. + * 2 stands for insertion to query. + * 3 stands for mismatch. + * @param [in] alignmentLength + * @param [in] cigarFormat Cigar will be returned in specified format. + * @return Cigar string. + * I stands for insertion. + * D stands for deletion. + * X stands for mismatch. (used only in extended format) + * = stands for match. (used only in extended format) + * M stands for (mis)match. (used only in standard format) + * String is null terminated. + * Needed memory is allocated and given pointer is set to it. + * Do not forget to free it later using free()! 
+ */ + EDLIB_API char* edlibAlignmentToCigar( + const unsigned char* alignment, int alignmentLength, + EdlibCigarFormat cigarFormat + ); + +#ifdef __cplusplus +} +#endif + +#endif // EDLIB_H diff --git a/mpileup.c b/mpileup.c index d42a6a360..f681945a7 100644 --- a/mpileup.c +++ b/mpileup.c @@ -74,6 +74,7 @@ typedef struct { int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; + double del_bias; // compensate for diff deletion vs insertion error rates char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; faidx_t *fai; @@ -99,6 +100,7 @@ typedef struct { htsFile *bcf_fp; bcf_hdr_t *bcf_hdr; int indels_v20; + int edlib; int argc; char **argv; int write_index; @@ -584,12 +586,14 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them - if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth ) + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth) { bcf_callaux_clean(conf->bca, &conf->bc); conf->bca->chr = tid>=0 ? 
hdr->target_name[tid] : NULL; int iret; - if ( conf->indels_v20 ) + if (conf->edlib) + iret = bcf_edlib_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref, ref_len); + else if ( conf->indels_v20 ) iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref); else iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref); @@ -864,13 +868,14 @@ static int mpileup(mplp_conf_t *conf) conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; conf->bca->indel_bias = conf->indel_bias; + conf->bca->del_bias = conf->del_bias; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; conf->bca->indel_win_size = conf->indel_win_size; - conf->bca->indels_v20 = conf->indels_v20; + conf->bca->edlib = conf->edlib; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -1269,17 +1274,31 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); fprintf(fp, " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); + fprintf(fp, + " --del-bias FLOAT Relative likelihood of insertion to deletion [%.2f]\n", mplp->del_bias); fprintf(fp, " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp, - " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n"); + " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n" + " --edlib New EXPERIMENTAL indel calling model with edlib\n"); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" " 1.12: -Q13 -h100 -m1 
-F0.002\n" - " illumina: [ default values ]\n" + " illumina-1.18: --indel-size 110\n" + " illumina or illumina-1.20: [ default values ]\n" " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" - " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" + " ont-sup or ont-sup-1.20:\n" + " -B -Q1 --max-BQ 99 -F0.20 -o15 -e1 -h80 --delta-BQ 60 \\\n" + " --del-bias 0.4\n" + " pacbio-ccs-1.18: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n" + " -M99999 --indel-size 110\n" + " pacbio-ccs or pacbio-ccs-1.20:\n" + " -B -Q5 --max-BQ 50 -F0.10 -o25 -e1 -h300 --delta-BQ 10 \\\n" + " --del-bias 0.4\n" + " ultima or ultima-1.20:\n" + " -B -Q4 --max-BQ 40 -F0.15 -o20 -e15 -h250 --delta-BQ 99 \\\n" + " --del-bias 0.3\n" "\n" "Notes: Assuming diploid individuals.\n" "\n" @@ -1322,8 +1341,9 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; - mplp.indel_win_size = 110; + mplp.indel_win_size = 80; mplp.clevel = -1; + mplp.del_bias = 0; // even insertion and deletion likelhoods. 
hts_srand48(0); static const struct option lopts[] = @@ -1382,6 +1402,7 @@ int main_mpileup(int argc, char *argv[]) {"indel-bias", required_argument, NULL, 10}, {"indel-size", required_argument, NULL, 15}, {"indels-2.0", no_argument, NULL, 20}, + {"edlib", no_argument, NULL, 22}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1395,6 +1416,7 @@ int main_mpileup(int argc, char *argv[]) {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, {"write-index",no_argument,NULL,21}, + {"del-bias", required_argument, NULL, 23}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1509,15 +1531,17 @@ int main_mpileup(int argc, char *argv[]) char *tmp; mplp.indel_win_size = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); - if ( mplp.indel_win_size < 110 ) + if ( mplp.indel_win_size < 20 ) { - mplp.indel_win_size = 110; + mplp.indel_win_size = 20; fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size); } } break; case 20: mplp.indels_v20 = 1; break; case 21: mplp.write_index = 1; break; + case 22: mplp.edlib = 1; break; + case 23: mplp.del_bias = atof(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1532,7 +1556,7 @@ int main_mpileup(int argc, char *argv[]) break; case 'M': mplp.max_read_len = atoi(optarg); break; case 'X': - if (strcasecmp(optarg, "pacbio-ccs") == 0) { + if (strcasecmp(optarg, "pacbio-ccs-1.18") == 0) { mplp.min_frac = 0.1; mplp.min_baseQ = 5; mplp.max_baseQ = 50; @@ -1541,13 +1565,48 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag |= MPLP_REALN_PARTIAL; mplp.max_read_len = 99999; + mplp.indel_win_size = 110; + } else if 
(strcasecmp(optarg, "pacbio-ccs") == 0 || + strcasecmp(optarg, "pacbio-ccs-1.20") == 0) { + mplp.min_frac = 0.1; + mplp.min_baseQ = 5; + mplp.max_baseQ = 50; + mplp.delta_baseQ = 10; + mplp.tandemQ = 300; + mplp.openQ = 25; + mplp.extQ = 1; + mplp.flag &= ~MPLP_REALN; + mplp.del_bias = 0.4; } else if (strcasecmp(optarg, "ont") == 0) { - fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " + fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); mplp.min_baseQ = 5; mplp.max_baseQ = 30; mplp.flag &= ~MPLP_REALN; mplp.flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "ont-sup") == 0 || + strcasecmp(optarg, "ont-sup-1.20") == 0) { + mplp.min_frac = 0.2; + mplp.min_baseQ = 1; + mplp.max_baseQ = 35; + mplp.delta_baseQ = 99; + mplp.openQ = 15; + mplp.extQ = 1; + mplp.tandemQ = 80; + mplp.flag &= ~MPLP_REALN; + mplp.max_read_len = 9999999; + mplp.del_bias = 0.4; + } else if (strcasecmp(optarg, "ultima") == 0 || + strcasecmp(optarg, "ultima-1.20") == 0) { + mplp.min_frac = 0.15; + mplp.min_baseQ = 3; + mplp.max_baseQ = 40; + mplp.delta_baseQ = 99; + mplp.openQ = 20; + mplp.extQ = 15; + mplp.tandemQ = 250; + mplp.flag &= ~MPLP_REALN; + mplp.del_bias = 0.3; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier mplp.min_frac = 0.002; @@ -1556,7 +1615,11 @@ int main_mpileup(int argc, char *argv[]) mplp.tandemQ = 100; mplp.flag &= ~MPLP_REALN_PARTIAL; mplp.flag |= MPLP_REALN; - } else if (strcasecmp(optarg, "illumina") == 0) { + } else if (strcasecmp(optarg, "illumina-1.18") == 0) { + mplp.indel_win_size = 110; + mplp.flag |= MPLP_REALN_PARTIAL; + } else if (strcasecmp(optarg, "illumina") == 0 || + strcasecmp(optarg, "illumina-1.20") == 0) { mplp.flag |= MPLP_REALN_PARTIAL; } else { fprintf(stderr, "Unknown configuration name '%s'\n" diff --git a/str_finder.c b/str_finder.c index 800cbfef9..a850fb529 100644 --- a/str_finder.c +++ b/str_finder.c @@ -1,7 +1,7 @@ /* 
str_finder.c -- Short Tandem Repeat finder. Originally from Crumble (https://github.com/jkbonfield/crumble) - Copyright (C) 2015-2016, 2021 Genome Research Ltd. + Copyright (C) 2015-2016, 2021-2022 Genome Research Ltd. Author: James Bonfield @@ -137,6 +137,86 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, * Returns a list of rep_ele structs holding the start,end tuples of repeats; * NULL on failure. */ +rep_ele *find_STR64(char *cons, int len, int lower_only) { + int i, j; + uint64_t w = 0; + rep_ele *reps = NULL; + + for (i = j = 0; i < len && j < 26; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + if (j>=15 && (w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + if (j>=17 && (w&0x003ffff) == ((w>>18)&0x003ffff)) + add_rep(&reps, cons, len, i, 9, lower_only, w); + if (j>=19 && (w&0x00fffff) == ((w>>20)&0x00fffff)) + add_rep(&reps, cons, len, i,10, lower_only, w); + if (j>=21 && (w&0x03fffff) == ((w>>22)&0x03fffff)) + add_rep(&reps, cons, len, i,11, lower_only, w); + if (j>=23 && (w&0x0ffffff) == ((w>>24)&0x0ffffff)) + add_rep(&reps, cons, len, i,12, lower_only, w); + if (j>=24 && (w&0x3ffffff) == ((w>>26)&0x3ffffff)) + add_rep(&reps, cons, len, i,13, lower_only, w); + 
+ j++; + } + + for (; i < len; i++) { + if (cons[i] == '*') continue; + + w <<= 2; + w |= cons[i]; + //printf("%3d %c w=%08x\n", i, cons[i], w); + if ((w&0xfffffff) == ((w>>28)&0xfffffff)) + add_rep(&reps, cons, len, i, 14, lower_only, w); + else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff)) + add_rep(&reps, cons, len, i, 13, lower_only, w); + else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff)) + add_rep(&reps, cons, len, i, 12, lower_only, w); + else if ((w&0x03fffff) == ((w>>22)&0x03fffff)) + add_rep(&reps, cons, len, i, 11, lower_only, w); + else if ((w&0x00fffff) == ((w>>20)&0x00fffff)) + add_rep(&reps, cons, len, i, 10, lower_only, w); + else if ((w&0x003ffff) == ((w>>18)&0x003ffff)) + add_rep(&reps, cons, len, i, 9, lower_only, w); + else if ((w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + else if ((w&0x3fff) == ((w>>14)&0x3fff)) + add_rep(&reps, cons, len, i, 7, lower_only, w); + else if ((w&0x0fff) == ((w>>12)&0x0fff)) + add_rep(&reps, cons, len, i, 6, lower_only, w); + else if ((w&0x03ff) == ((w>>10)&0x03ff)) + add_rep(&reps, cons, len, i, 5, lower_only, w); + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + add_rep(&reps, cons, len, i, 4, lower_only, w); + else if ((w&0x003f) == ((w>> 6)&0x003f)) + add_rep(&reps, cons, len, i, 3, lower_only, w); + else if ((w&0x000f) == ((w>> 4)&0x000f)) + add_rep(&reps, cons, len, i, 2, lower_only, w); + else if ((w&0x0003) == ((w>> 2)&0x0003)) + add_rep(&reps, cons, len, i, 1, lower_only, w); + } + + return reps; +} + rep_ele *find_STR(char *cons, int len, int lower_only) { int i, j; uint32_t w = 0; @@ -172,21 +252,21 @@ rep_ele *find_STR(char *cons, int len, int lower_only) { w <<= 2; w |= cons[i]; //printf("%3d %c w=%08x\n", i, cons[i], w); - if ((w&0xffff) == ((w>>16)&0xffff)) + if ((w&0xffff) == ((w>>16)&0xffff)) add_rep(&reps, cons, len, i, 8, lower_only, w); - else if ((w&0x3fff) == ((w>>14)&0x3fff)) + else if ((w&0x3fff) == ((w>>14)&0x3fff)) add_rep(&reps, cons, len, i, 7, lower_only, w); - 
else if ((w&0x0fff) == ((w>>12)&0x0fff)) + else if ((w&0x0fff) == ((w>>12)&0x0fff)) add_rep(&reps, cons, len, i, 6, lower_only, w); - else if ((w&0x03ff) == ((w>>10)&0x03ff)) + else if ((w&0x03ff) == ((w>>10)&0x03ff)) add_rep(&reps, cons, len, i, 5, lower_only, w); - else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) add_rep(&reps, cons, len, i, 4, lower_only, w); - else if ((w&0x003f) == ((w>> 6)&0x003f)) + else if ((w&0x003f) == ((w>> 6)&0x003f)) add_rep(&reps, cons, len, i, 3, lower_only, w); - else if ((w&0x000f) == ((w>> 4)&0x000f)) + else if ((w&0x000f) == ((w>> 4)&0x000f)) add_rep(&reps, cons, len, i, 2, lower_only, w); - else if ((w&0x0003) == ((w>> 2)&0x0003)) + else if ((w&0x0003) == ((w>> 2)&0x0003)) add_rep(&reps, cons, len, i, 1, lower_only, w); } diff --git a/str_finder.h b/str_finder.h index 242f59ec1..22f9f5941 100644 --- a/str_finder.h +++ b/str_finder.h @@ -1,7 +1,7 @@ /* str_finder.c -- Short Tandem Repeat finder. Originally from Crumble (https://github.com/jkbonfield/crumble) - Copyright (C) 2015-2016, 2021 Genome Research Ltd. + Copyright (C) 2015-2016, 2021, 2023 Genome Research Ltd. Author: James Bonfield @@ -48,6 +48,9 @@ typedef struct rep_ele { */ rep_ele *find_STR(char *cons, int len, int lower_only); +/* As above, but use a longer hash with longer STR elements found */ +rep_ele *find_STR64(char *cons, int len, int lower_only); + /* * Returns an array of STR vs no-STR values. * 0 => non repetitive. From 5eca9a02d76922677d3f066a105d94b85a0d1f13 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 22 Nov 2023 16:27:58 +0000 Subject: [PATCH 2/8] Add an mpileup --poly-mqual option for the "edlib" mode. (TODO: rename edlib to something better) This enables the homopolymer scan-left/right for minimum quality for adjusting seqQ and indelQ. This is good for machines with mainly indel errors, and detrimental for clocked instruments such as Illumina. 
--- bam2bcf.h | 2 +- bam2bcf_edlib.c | 37 ++++++++++++++++++++++++------------- mpileup.c | 18 +++++++++++++----- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/bam2bcf.h b/bam2bcf.h index 11413ee83..9d464d608 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -123,7 +123,7 @@ typedef struct __bcf_callaux_t { int max_bases; int indel_types[4]; // indel lengths int indel_win_size, indels_v20, edlib; - int maxins, indelreg; + int maxins, indelreg, poly_mqual; int read_len; char *inscns; uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types) diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index b39310ddc..ccbe6ff63 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -1289,21 +1289,31 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1); } - if (1) { + // Skew SeqQ and IndelQ based on a portion of the minimum quality + // found within a homopolymer. This is useful where the quality + // values are a bit mutable and move around in such data, but less + // so on clocked sequencing technologies. + // + // Enabling this causes lots of GT errors on Illumina. + // However on PacBio it's key to removal of false positives. + // ONT and UG seem somewhere in between. + if (bca->poly_mqual) { int qpos = p->qpos, l; uint8_t *seq = bam_get_seq(p->b); uint8_t *qual = bam_get_qual(p->b); int min_q = qual[qpos]; -// // scan left -// char base = bam_seqi(seq, qpos); -// for (l = qpos; l >= 0; l--) { -// if (bam_seqi(seq, l) != base) -// break; -// if (min_q > qual[l]) -// min_q = qual[l]; -// } - - // scan right (including site of indel) + + // scan homopolymer left + char baseL = bam_seqi(seq, qpos+1 < p->b->core.l_qseq + ? 
qpos+1 : qpos); + for (l = qpos; l >= 0; l--) { + if (bam_seqi(seq, l) != baseL) + break; + if (min_q > qual[l]) + min_q = qual[l]; + } + + // scan homo-polymer right (including site of indel) char base = bam_seqi(seq, qpos+1); for (l = qpos+1; l < p->b->core.l_qseq; l++) { if (min_q > qual[l]) @@ -1312,12 +1322,13 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, break; } - // seqQ mod needed for PacBio. - // We reduce -h so homopolymers get reduced likelihood of being // called, but then optionally increase or decrease from there // based on base quality. Hence lack of low quality bases in // homopolymer will rescue the score back again, reducing FNs. + + // The score factors here may also be machine specific, but for + // now these work well (tuned on PB HiFi). seqQ += MIN(qavg/20, min_q - qavg/10); indelQ += MIN(qavg/20, min_q - qavg/5); diff --git a/mpileup.c b/mpileup.c index f681945a7..a2c062dd7 100644 --- a/mpileup.c +++ b/mpileup.c @@ -73,7 +73,7 @@ typedef struct { int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels - double indel_bias; + double indel_bias, poly_mqual; double del_bias; // compensate for diff deletion vs insertion error rates char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; @@ -876,6 +876,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->ambig_reads = conf->ambig_reads; conf->bca->indel_win_size = conf->indel_win_size; conf->bca->edlib = conf->edlib; + conf->bca->poly_mqual = conf->poly_mqual; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -1280,7 +1281,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp, " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference 
consensus)\n" - " --edlib New EXPERIMENTAL indel calling model with edlib\n"); + " --edlib New EXPERIMENTAL indel calling model with edlib\n" + " --poly-mqual (Edlib mode) Use minimum quality within homopolymers\n"); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -1290,15 +1292,15 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" " ont-sup or ont-sup-1.20:\n" " -B -Q1 --max-BQ 99 -F0.20 -o15 -e1 -h80 --delta-BQ 60 \\\n" - " --del-bias 0.4\n" + " --del-bias 0.4 --poly-mqual\n" " pacbio-ccs-1.18: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n" " -M99999 --indel-size 110\n" " pacbio-ccs or pacbio-ccs-1.20:\n" " -B -Q5 --max-BQ 50 -F0.10 -o25 -e1 -h300 --delta-BQ 10 \\\n" - " --del-bias 0.4\n" + " --del-bias 0.4 --poly-mqual\n" " ultima or ultima-1.20:\n" " -B -Q4 --max-BQ 40 -F0.15 -o20 -e15 -h250 --delta-BQ 99 \\\n" - " --del-bias 0.3\n" + " --del-bias 0.3 --poly-mqual\n" "\n" "Notes: Assuming diploid individuals.\n" "\n" @@ -1342,6 +1344,7 @@ int main_mpileup(int argc, char *argv[]) mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; mplp.indel_win_size = 80; + mplp.poly_mqual = 0; mplp.clevel = -1; mplp.del_bias = 0; // even insertion and deletion likelhoods. 
hts_srand48(0); @@ -1417,6 +1420,7 @@ int main_mpileup(int argc, char *argv[]) {"ar", required_argument, NULL, 14}, {"write-index",no_argument,NULL,21}, {"del-bias", required_argument, NULL, 23}, + {"poly-mqual", no_argument, NULL, 24}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1542,6 +1546,7 @@ int main_mpileup(int argc, char *argv[]) case 21: mplp.write_index = 1; break; case 22: mplp.edlib = 1; break; case 23: mplp.del_bias = atof(optarg); break; + case 24: mplp.poly_mqual = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1577,6 +1582,7 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.4; + mplp.poly_mqual = 1; } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); @@ -1596,6 +1602,7 @@ int main_mpileup(int argc, char *argv[]) mplp.flag &= ~MPLP_REALN; mplp.max_read_len = 9999999; mplp.del_bias = 0.4; + mplp.poly_mqual = 1; } else if (strcasecmp(optarg, "ultima") == 0 || strcasecmp(optarg, "ultima-1.20") == 0) { mplp.min_frac = 0.15; @@ -1607,6 +1614,7 @@ int main_mpileup(int argc, char *argv[]) mplp.tandemQ = 250; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.3; + mplp.poly_mqual = 1; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier mplp.min_frac = 0.002; From 11d193d890ca122227c579874f636928d0afc888 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 22 Nov 2023 16:40:33 +0000 Subject: [PATCH 3/8] - Enable the "edlib mode" for the more recent mpileup -X profiles. It's faster and more accurate than the old mode. The speed is due to BAQ vs Edlib, but the accuracy comes from the use of allele consensus generation and better heuristics. 
- Cull lots of ifdefed out code, plus reset indel-bias back. Earlier experiments had increased this score by doing indel_bias/10 when it's used (it's 1/score for effect). The difference isn't great, and it's marginally better FN/FP ratio with it in the original usage. We can still manually change this score to get better sensitivity if desired, but it's not dialed up so high by default. - Remove an indel caller hang. Our consensus can sometimes fail and give us a short alignment that doesn't span pos. We were checking the wrong end coord. --- bam2bcf_edlib.c | 197 ++---------------------------------------------- mpileup.c | 15 +++- 2 files changed, 18 insertions(+), 194 deletions(-) diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index ccbe6ff63..c513914d5 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -808,7 +808,6 @@ int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, -1, // k; use small positive for faster alignment EDLIB_MODE_HW, // mode EDLIB_TASK_LOC, // task - //EDLIB_TASK_PATH, // for manual alignment scoring NULL, // additionalEqualities 0); // additionalEqualitiesLength EdlibAlignResult r = @@ -820,108 +819,6 @@ int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, return INT_MAX; } - int score; -// score = m*r.editDistance; // Illumina: ie -0*(glen - l_query) - -#if 0 - // Alignment based score, scaled by average sequence quality - int i, indel=0; - for (i = score = 0; i < r.alignmentLength; i++) { - switch(r.alignment[i]) { - case 0: indel=0; break; // match - case 3: score++; indel=0; break; // mismatch - case 1: case 2: // indel - score+=indel?4:2; - indel=1; - break; - } - } - score *= m/2; -#elif 0 - // Alignment based score, using per-base sequence quality - int i, indel=0, qpos = 0; - for (i = score = 0; i < r.alignmentLength; i++) { - switch(r.alignment[i]) { - case 0: indel=0; qpos++; break; // match - case 3: score+=qq[qpos]/2; indel=0; qpos++; break; // mismatch - case 1: // ins - case 2: // del - 
score+=(indel?2:1)*qq[qpos]; - indel=1; - qpos += r.alignment[i]==2; - break; - } - } -#elif 0 - // BEST for PB - // - // Alignment based score, using per-base sequence quality. - // Eg params for PacBio CCS. - // This is *marginally* better than the naive t_len-l_query below, but it's - // 34% slower mpileup for CCS. Perhaps not worth the trade off? - int i; - double fscore = 0; - for (i = score = 0; i < r.alignmentLength; i++) { - switch(r.alignment[i]) { - case 0: break; // match -// case 3: fscore+=0.5; break; // mismatch -// case 1: fscore+=1.0; break; // ins; higher qual -// case 2: fscore+=0.6; break; // del; more often an error - case 3: score+= 5; break; // mismatch - case 1: score+=10; break; // ins; higher qual - case 2: score+= 6; break; // del; more often an error - } - } - score *= m/10; -// score = m*fscore; -#elif 0 - // As above, but accounting for minimum quality in STR region instead. - // BAD - int i; - double fscore = 0; - for (i = score = 0; i < r.alignmentLength; i++) { - switch(r.alignment[i]) { - case 0: break; // match - case 3: fscore++; break; // mismatch - case 1: fscore+=m2min/m; break; // ins; higher qual - case 2: fscore+=0.6*m2min/m; break; // del; more often an error - } - } - score = fscore*m; -#elif 0 - // As above, but factoring in quality. 
- // BAD - int i, qpos = 0; - double fscore = 0; - for (i = score = 0; i < r.alignmentLength; i++) { - switch(r.alignment[i]) { - case 0: qpos++; break; // match - case 3: fscore+=qq[qpos++]; break; // mismatch - case 1: fscore+=qq[qpos++]; break; // ins; higher qual - //case 2: fscore+=0.6*m; break; // del; more often an error - case 2: fscore+=.6*qq[qpos];break; // del; more often an error - } - } - score = fscore; -#endif - - -// int nins = 0, ndel = 0, nmis = 0; -// for (i = score = 0; i < r.alignmentLength; i++) { -// switch (r.alignment[i]) { -// case 1: nins++; break; -// case 2: ndel++; break; -// case 3: nmis++; break; -// } -// } -// assert((*r.endLocations - *r.startLocations + 1) - l_query == ndel-nins); -// -// Then score = f(nins,ndel,nmis). -// Could also track nis_o,nins_e,ndel_o,ndel_e for open/extend. - -#if 1 - int t_len = *r.endLocations - *r.startLocations + 1; - // Aligned target length minus query length is an indication of the number // of insertions and/or deletions. // @@ -945,39 +842,8 @@ int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, // Given editDistance is +1 for every mismatch, insertion and deletion, // provided the t_len-l_query multiplier < 1 then this is always +ve. 
- score = m*(r.editDistance - del_bias*(t_len - l_query)); -#endif - -#if 0 - // DEBUG: dump out the sequence alignment - { - char rseq[1024], *rcp = rseq, qseq[1024], *qcp = qseq; - - int i, rpos = 0, qpos = 0; - for (i = 0; i < r.alignmentLength; i++) { - switch(r.alignment[i]) { - case 0: // match - case 3: // mismath - *rcp++ = "ACGTN"[ref[rpos++]]; - *qcp++ = "ACGTN"[query[qpos++]]; - break; - case 1: // ins - *rcp++ = '-'; - *qcp++ = "ACGTN"[query[qpos++]]; - break; - case 2: // del - *rcp++ = "ACGTN"[ref[rpos++]]; - *qcp++ = '-'; - break; - } - } - *rcp = 0; - *qcp = 0; - fprintf(stderr, "Ref %s\n", rseq); - fprintf(stderr, "Seq %s\n", qseq); - fprintf(stderr, "Score %d t-l %d\n", score, t_len - l_query); - } -#endif + int t_len = *r.endLocations - *r.startLocations + 1; + int score = m*(r.editDistance - del_bias*(t_len - l_query)); edlibFreeAlignResult(r); return score; @@ -1135,9 +1001,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, //m = MIN(30, (m2+m2min)/2); // best so far m = MIN(30, m2min); -#if 1 - // edlib - + // Alternatives to experiment on. 
//double mm = (m+m2)/2; //double mm = m2min; double mm = m; @@ -1151,36 +1015,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, query, qend - qbeg, mm, del_bias); else sc1 = INT_MAX; // skip -#endif - -#if 0 - // BAQ - - int SC1, SC2; - probaln_par_t apf = { 1e-4, 1e-2, 10 }; - if (long_read) { - apf.d = 1e-3; - apf.e = 1e-1; - } - - if (band > (qend-qbeg)/2-3) - band = (qend-qbeg)/2-3; - apf.bw = band + 3; // or abs(l_ref - l_query), so we want to keep similar - - SC2 = probaln_glocal(ref2 + tbeg - left, tend2 - tbeg, - query, qend - qbeg, qq, &apf, 0, 0); - - if (tend1 != tend2 || - memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, - tend1 - tbeg) != 0) - SC1 = probaln_glocal(ref1 + tbeg - left, tend1 - tbeg, - query, qend - qbeg, qq, &apf, 0, 0); - else - SC1 = INT_MAX; // skip - - sc1 = SC1; - sc2 = SC2; -#endif // Find the best of the two alignments if (sc1 < 0 && sc2 < 0) { @@ -1198,16 +1032,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, sc2 = sc1; } -#if 0 - // Old indel-tweak-jkb1:bam2bcf_indel.c code - l = (int)((100. * sc2 / (qend - qbeg) + .499) * bca->indel_bias); - *score = sc2<<8 | MIN(255, l); - l = (*score&0xff)*.8 + iscore*2; - *score = (*score & ~0xff) | MIN(255, l); - free(qq); - return 0; -#endif - // Sc is overall alignment score, in top 24 bits (SeqQ). It's based // purely on the scores for the whole alignment. // We also have a separate indel score in bottom 8 bits (IndelQ). @@ -1222,10 +1046,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, l = .5*(100. * sc2 / (qend - qbeg) + .499); l += iscore*(qavg/(m2min+1.0) + qavg/m2); - *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias/10); - - // NOTE: indel_bias now seems to have a very minimal impact on scoring. - // Why is this so? 
+ *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias); free(qq); @@ -1462,18 +1283,14 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, goto err; - // calculate left and right boundary -#if 0 - left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; - right = pos + bca->indel_win_size; -#else + // calculate left and right boundary, based on type size for a bit more + // speed. int max_indel = 20*MAX(ABS(types[0]), ABS(types[n_types-1])) + bca->indel_win_size/4; if (max_indel > bca->indel_win_size) max_indel = bca->indel_win_size; left = pos > max_indel ? pos - max_indel : 0; right = pos + max_indel; -#endif int del_size = types[0]<0 ? -types[0] : 0; right += del_size; @@ -1736,7 +1553,7 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // do realignment; this is the bottleneck. // // Note low score = good, high score = bad. - if (tend > tbeg) { + if (tend1 > tbeg && tend2 > tbeg) { //fprintf(stderr, "Num %d\n", i); if (bcf_cgp_align_score(p, bca, types[t], band, (uint8_t *)tcons[0] + left2-left, diff --git a/mpileup.c b/mpileup.c index a2c062dd7..c023a9b26 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1282,25 +1282,26 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n" " --edlib New EXPERIMENTAL indel calling model with edlib\n" + " --no-edlib Disable edlib mode, to use after a -X profile\n" " --poly-mqual (Edlib mode) Use minimum quality within homopolymers\n"); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" " 1.12: -Q13 -h100 -m1 -F0.002\n" " illumina-1.18: --indel-size 110\n" - " illumina or illumina-1.20: [ default values ]\n" + " illumina or illumina-1.20: --edlib\n" " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" " ont-sup or ont-sup-1.20:\n" " -B -Q1 --max-BQ 99 -F0.20 -o15 -e1 -h80 --delta-BQ 60 \\\n" - " --del-bias 0.4 --poly-mqual\n" 
+ " --del-bias 0.4 --poly-mqual --edlib\n" " pacbio-ccs-1.18: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n" " -M99999 --indel-size 110\n" " pacbio-ccs or pacbio-ccs-1.20:\n" " -B -Q5 --max-BQ 50 -F0.10 -o25 -e1 -h300 --delta-BQ 10 \\\n" - " --del-bias 0.4 --poly-mqual\n" + " --del-bias 0.4 --poly-mqual --edlib\n" " ultima or ultima-1.20:\n" " -B -Q4 --max-BQ 40 -F0.15 -o20 -e15 -h250 --delta-BQ 99 \\\n" - " --del-bias 0.3 --poly-mqual\n" + " --del-bias 0.3 --poly-mqual --edlib\n" "\n" "Notes: Assuming diploid individuals.\n" "\n" @@ -1406,6 +1407,7 @@ int main_mpileup(int argc, char *argv[]) {"indel-size", required_argument, NULL, 15}, {"indels-2.0", no_argument, NULL, 20}, {"edlib", no_argument, NULL, 22}, + {"no-edlib", no_argument, NULL, 25}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1545,6 +1547,7 @@ int main_mpileup(int argc, char *argv[]) case 20: mplp.indels_v20 = 1; break; case 21: mplp.write_index = 1; break; case 22: mplp.edlib = 1; break; + case 25: mplp.edlib = 0; break; case 23: mplp.del_bias = atof(optarg); break; case 24: mplp.poly_mqual = 1; break; case 'A': use_orphan = 1; break; @@ -1583,6 +1586,7 @@ int main_mpileup(int argc, char *argv[]) mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.4; mplp.poly_mqual = 1; + mplp.edlib = 1; } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); @@ -1603,6 +1607,7 @@ int main_mpileup(int argc, char *argv[]) mplp.max_read_len = 9999999; mplp.del_bias = 0.4; mplp.poly_mqual = 1; + mplp.edlib = 1; } else if (strcasecmp(optarg, "ultima") == 0 || strcasecmp(optarg, "ultima-1.20") == 0) { mplp.min_frac = 0.15; @@ -1615,6 +1620,7 @@ int main_mpileup(int argc, char *argv[]) mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.3; mplp.poly_mqual = 1; + mplp.edlib = 1; } else if (strcasecmp(optarg, "1.12") == 0) { // 
1.12 and earlier mplp.min_frac = 0.002; @@ -1629,6 +1635,7 @@ int main_mpileup(int argc, char *argv[]) } else if (strcasecmp(optarg, "illumina") == 0 || strcasecmp(optarg, "illumina-1.20") == 0) { mplp.flag |= MPLP_REALN_PARTIAL; + mplp.edlib = 1; } else { fprintf(stderr, "Unknown configuration name '%s'\n" "Please choose from 1.12, illumina, pacbio-ccs or ont\n", From 0a8006df501735307a46084240b28d9bac71146b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 30 Nov 2023 10:21:20 +0000 Subject: [PATCH 4/8] Recompute IDV and IMF if mpileup -a AD is set. The current IDV and IMF come from the initial "types" assignment, before realigning reads back to the consensus alleles. If we're outputting AD then we have more accurate re-aligned counts of each type, so we report those instead. --- bam2bcf.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index 402d42687..d9e55904b 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -1197,10 +1197,26 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( bc->ori_ref < 0 ) { bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); + uint32_t idv = bca->max_support; + if ( fmt_flag&B2B_INFO_IMF ) { + float max_frac; + if (bc->ADF && bc->ADR) { + int max_ad = 0, tot_ad = bc->ADF[0] + bc->ADR[0]; + for (int k = 1; k < rec->n_allele; k++) { + if (max_ad < bc->ADF[k] + bc->ADR[k]) + max_ad = bc->ADF[k] + bc->ADR[k]; + tot_ad += bc->ADF[k] + bc->ADR[k]; + } + max_frac = (double)(max_ad) / bc->ori_depth; + //max_frac = (double)(max_ad) / tot_ad; + idv = max_ad; + } else { + max_frac = bca->max_frac; + } + bcf_update_info_float(hdr, rec, "IMF", &max_frac, 1); + } if ( fmt_flag&B2B_INFO_IDV ) - bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); - if ( fmt_flag&B2B_INFO_IMF ) - bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); + bcf_update_info_int32(hdr, rec, "IDV", &idv, 1); } bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); 
if ( fmt_flag&B2B_INFO_ADF ) From b4f3965b27e1b8185d1652328d1ce6e5766a8dbd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 30 Nov 2023 10:22:41 +0000 Subject: [PATCH 5/8] Fix the indelQ assignment for multi-allelic indel sites. When we have eg REF C and ALT CT,CTT then we're saying it's +1 or +2 Ts. We align all reads against the 3 candidate alleles and score them, sorting these scores to find the best so we can allocate reads to the alleles. The score for those allele assignments was REF minus best-indel for REF assignments, and best-indel minus REF for indel assignments. This change turns the latter into best-indel vs next-best-allele, regardless of whether that next best is REF or not. Consider the case of +1 score 30, +2 score 30, +0 (REF) score 60. Previously we'd have recorded the relative quality of 60-30, but now we record 30-30. The consequence of this is reads that align equally well against +1 and +2 get zero confidence in their correct allele assignment. This considerably reduces the chance of recording GT 1/2 for variable homopolymers caused purely by the vagaries of sorting a bunch of equal numbers. (The equal scores often arrive due to reads that don't span the homopolymer or STR.) We may miss some true variants as the multi-allelic possibilities all cancel each other out, but typically that means there are very few reads spanning the site and this data may well be false-positive caused by sequencing artifacts. So overall it's a considerable benefit to accuracy. A consequence of this is also that AD values now more accurately reflect evidence, rather than incorrectly distributing the uninformative reads to alleles. However, AD will now also be lower, and similarly PL. So a global boost to indelQ helps recover some of this lost sensitivity. This commit also boosts indelQ by the --indel-bias param to make this more impactful. 
(Previously it only boosted seqQ) --- bam2bcf.c | 4 + bam2bcf.h | 1 + bam2bcf_edlib.c | 220 +++++++++++++++++++++++++++++++++++++++++++++--- mpileup.c | 35 ++++++-- 4 files changed, 245 insertions(+), 15 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index d9e55904b..07fcd3539 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -382,6 +382,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // Note baseQ changes some output fields such as I16, but has no // significant affect on "call". baseQ = p->aux>>8&0xff; + + // Can we reuse baseQ as indelQ1 instead of indelQ? + // So we can distinguish between likelihood of any indel vs + // likelihood of this specific indel? } else { diff --git a/bam2bcf.h b/bam2bcf.h index 9d464d608..e4ed6628e 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -131,6 +131,7 @@ typedef struct __bcf_callaux_t { void *rghash; float indel_bias; // adjusts indel score threshold; lower => call more. float del_bias; // (-.9 < x < .9) error profile; >0 => more del, <0 => more ins + float vs_ref; // 0 to 1. 0: score vs next-best. 1: score vs ref int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm} unsigned int nnm[2]; // number of nm observations float nm[2]; // cumulative count of mismatches in ref and alt reads diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index c513914d5..34c686803 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -1069,6 +1069,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // FIXME: n_types has a maximum; no need to alloc - use a #define? int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp; memset(sumq, 0, n_types * sizeof(int)); + int sum_indelQ1[100] = {0}; // n + int sum_indelQ2[100] = {0}; // n // Confusing variable naming and bit usage. // @@ -1085,7 +1087,7 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // sct is short for score. // sc is score + t(type) // Why aren't these variable names reversed? 
- int *sct = &score[K*n_types], seqQ, indelQ; + int *sct = &score[K*n_types], seqQ, indelQ1=0, indelQ2=0, indelQ=0; for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; for (t = 1; t < n_types; ++t) // insertion sort for (j = t; j > 0 && sc[j] < sc[j-1]; --j) @@ -1100,16 +1102,154 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if ((sc[0]&0x3f) == ref_type) { // sc >> 14 is the total score. It's been shifted by 8 // from normalised score and 6 from type. - indelQ = (sc[1]>>14) - (sc[0]>>14); // &0x3f is type number + + // Best call is REF. Compare vs best indel + indelQ = (sc[1]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run, str_len1); } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ = (sc[t]>>14) - (sc[0]>>14); +#if 0 + // Simplest code in edlib10f outputs + indelQ = (sc[1]>>14) - (sc[0]>>14); +#endif + +// Maybe just an option of x*indelQ1 + (1-x)*indelQ2 so we can adjust +// based on indel vs genotype accuracy, per instrument / dataset? + + +#if 1 +// Orig code; indelQ1 +// Good on HG002.GRCh38.PacBio_CCS_15Kb.bam + // look for the reference type + for (t = 0; t < n_types; ++t) { + if ((sc[t]&0x3f) == ref_type) + break; + } + indelQ = indelQ1 = (sc[t]>>14) - (sc[0]>>14); +// fprintf(stderr, "IndelQ = %d: %d-%d", +// indelQ, (sc[t]>>14), (sc[0]>>14)); +#endif + +#if 1 +// 10e; indelQ + + // Revised code in edlib10e outputs + // Good on most other data sets, including the 53x CCS SequelII data. + + // Best call is non-ref, compare vs next best non-ref, + // or ref if it's just 2 choices (most common case). + for (t = 1; t < n_types; t++) + if ((sc[t]&0x3f) == ref_type) + continue; + else break; + if (t == n_types) + t--; // it's ref, but it'll do as next best. 
+ indelQ2 = (sc[t]>>14) - (sc[0]>>14); +// fprintf(stderr, "\tNEW %d: %d-%d\n", +// indelQ, (sc[t]>>14), (sc[0]>>14)); +#endif + +#if 0 + // Best call is non-ref, get the ref score and (if different) + // the next best non-ref. Average the two and then compare vs + // this call. It means we assign ADs better, but don't + // overly lose variants either when we have many choices. + int ref_t = -1, next_t = -1; + for (t = 1; t < n_types; t++) { + if ((sc[t]&0x3f) == ref_type) { + ref_t = t; + } else { + if (next_t <= 0) + next_t = t; + } + } + if (next_t < 0) + next_t = ref_t; + assert(MIN(ref_t, next_t) == 1); + + // Could also try avg(ref,1) too rather than avg(ref,next)? + // Tried - no better + + indelQ = (((sc[ref_t]>>14)+(sc[next_t]>>14))>>1) - (sc[0]>>14); +#endif + +#if 0 + int ref_t = -1, next_t = -1; + for (t = 1; t < n_types; t++) { + if ((sc[t]&0x3f) == ref_type) { + ref_t = t; + } else { + if (next_t <= 0) + next_t = t; + } + } + if (next_t < 0) + next_t = ref_t; + assert(MIN(ref_t, next_t) == 1); + + if (ref_t <= next_t) + indelQ = (sc[ref_t]>>14) - (sc[0]>>14); + else + //indelQ = (sc[next_t]>>14) - (sc[0]>>14); + indelQ = (((sc[ref_t]>>14)+(sc[next_t]>>14))>>1) - (sc[0]>>14); +#endif + +#if 0 + // Best call is non-ref, get the ref score and (if different) + // the next best non-ref. Average the two and then compare vs + // this call. It means we assign ADs better, but don't + // overly lose variants either when we have many choices. + int ref_t = -1, next_t = -1; + for (t = 1; t < n_types; t++) { + if ((sc[t]&0x3f) == ref_type) { + ref_t = t; + } else { + if (next_t <= 0) + next_t = t; + } + } + if (next_t < 0) + next_t = ref_t; + assert(MIN(ref_t, next_t) == 1); + + indelQ = (sc[next_t]>>14) - (sc[0]>>14); + indelQ += (sc[ref_t]>>14) > (sc[next_t]>>14); +// indelQ >>= (sc[ref_t]>>14) > (sc[next_t]>>14); // VBAD +#endif + + // Maybe boost seqQ if ref_t != next_t and sc[ref_t]-sc[0] is high + // while sc[next_t]-sc[0] is not? 
So we're saying the indel is good + // but this allele is not? Not sure how we explore that. + // It's the difference between making an indel-exists call vs making + // a call on this specific genotype. + + // ORIG seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1); + + // TO TEST + //seqQ += MAX(0, (indelQ1 - indelQ)/4); + //indelQ += MAX(0, (indelQ1 - indelQ2)/4); + + indelQ = bca->vs_ref*indelQ1 + (1-bca->vs_ref)*indelQ2; + +#if 0 + // If ref_t scores higher than next_t then we have more + // than 1 ALT allele. This requires a slightly higher + // threshold of confidence to call it as it's more likely + // down to sequencing error. + if (ref_t >= 2) + seqQ *= pow(0.9, ref_t - 1); +#endif } + // So we lower qual in some, but raise the average to keep FN/FP + // ratios up. + indelQ /= bca->indel_bias; + indelQ1 /= bca->indel_bias; + + // Or maybe just *2 if bca->poly_mqual and be done with it? + // Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ? + // Skew SeqQ and IndelQ based on a portion of the minimum quality // found within a homopolymer. This is useful where the quality // values are a bit mutable and move around in such data, but less @@ -1123,6 +1263,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, uint8_t *seq = bam_get_seq(p->b); uint8_t *qual = bam_get_qual(p->b); int min_q = qual[qpos]; + int nbase = 0; + int sumq = 0; // scan homopolymer left char baseL = bam_seqi(seq, qpos+1 < p->b->core.l_qseq @@ -1141,8 +1283,19 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, min_q = qual[l]; if (bam_seqi(seq, l) != base) break; + nbase++; + sumq += qual[l]; } +// if (nbase) { +// sumq = sumq / (double)nbase; +// if (min_q < sumq / 4) +// min_q = sumq / 4; +// } + +// if (min_q < 10) +// min_q = 10; + // We reduce -h so homopolymers get reduced likelihood of being // called, but then optionally increase or decrease from there // based on base quality. 
Hence lack of low quality bases in @@ -1152,33 +1305,64 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // now these work well (tuned on PB HiFi). seqQ += MIN(qavg/20, min_q - qavg/10); indelQ += MIN(qavg/20, min_q - qavg/5); + indelQ1+= MIN(qavg/20, min_q - qavg/5); if (seqQ < 0) seqQ = 0; if (indelQ < 0) indelQ = 0; + if (indelQ1< 0) indelQ1= 0; } // This is the length-normalised score from bcf_cgp_align_score tmp = sc[0]>>6 & 0xff; // reduce indelQ - // high score = bad, low score = good. + // high score = bad, low score = good; flip for indelQ // low normalised scores leave indelQ unmodified // high normalised scores set indelQ to 0 // inbetween scores have a linear scale from indelQ to 0 indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + indelQ1= tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1+ .499); // Doesn't really help accuracy, but permits -h to take // affect still. if (indelQ > seqQ) indelQ = seqQ; if (indelQ > 255) indelQ = 255; + if (indelQ1> 255) indelQ1= 255; if (seqQ > 255) seqQ = 255; - // use 22 bits in total + // Use 22 bits in total. + // 0-7 IndelQ + // 8-15 SeqQ + // 16-22 Score-per-base p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; sumq[sc[0]&0x3f] += indelQ; + + // Experiment in p->aux vs sumq. + // One gives likelihood of an indel being here, while the other + // is likelihood of a specific genotype? But which is which? + + sum_indelQ1[s] += indelQ1; + sum_indelQ2[s] += indelQ; } + +// double m = (double)sum_indelQ1[s] / sum_indelQ2[s]; +// if (m > 1) { +// m = sqrt(m); +// fprintf(stderr, "Sum %d %d m=%f\n", sum_indelQ1, sum_indelQ, m); +// for (i = 0; i < n_plp[s]; ++i) { +// bam_pileup1_t *p = plp[s] + i; +// int indelQ = p->aux & 0xff; +// indelQ *= m; +// if (indelQ > 255) +// indelQ = 255; +// p->aux = (p->aux & ~0xff) | indelQ; +// } +// } } - // determine bca->indel_types[] and bca->inscns + + // Determine bca->indel_types[] and bca->inscns. + // Sumq[0] is always reference. 
+ // Sumq[1] is best non-ref (and maybe better than ref) bca->maxins = max_ins; bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); if (bca->maxins && !bca->inscns) @@ -1190,20 +1374,27 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; for (t = 0; t < n_types; ++t) // look for the reference type if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first tmp = sumq[t]; for (; t > 0; --t) sumq[t] = sumq[t-1]; sumq[0] = tmp; } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; for (t = 0; t < 4 && t < n_types; ++t) { bca->indel_types[t] = types[sumq[t]&0x3f]; - if (bca->maxins) + if (bca->maxins) // potentially an insertion memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); } - // update p->aux + + // Update p->aux. + // If per-alignment type isn't found, then indelQ/seqQ is 0, + // otherwise unchanged. for (s = n_alt = 0; s < n; ++s) { +// double m = sqrt((double)sum_indelQ1[s] / sum_indelQ2[s]); +// if (m < 1) m = 1; for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; int x = types[p->aux>>16&0x3f]; @@ -1211,6 +1402,15 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (x == bca->indel_types[j]) break; p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); if ((p->aux>>16&0x3f) > 0) ++n_alt; + +// Poor +// // We recorded indelQ based on this allele, but +// // scale now by quality of any non-ref allele. +// // This reduces FN while keeping GT accurate (maybe!) 
+// int indelQ = (p->aux & 0xff) * m; +// if (indelQ > 255) indelQ = 255; +// p->aux = (p->aux & ~0xff) | indelQ; + //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } diff --git a/mpileup.c b/mpileup.c index c023a9b26..b5e02c3d6 100644 --- a/mpileup.c +++ b/mpileup.c @@ -75,6 +75,7 @@ typedef struct { double min_frac; // for indels double indel_bias, poly_mqual; double del_bias; // compensate for diff deletion vs insertion error rates + double vs_ref; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; faidx_t *fai; @@ -877,6 +878,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->indel_win_size = conf->indel_win_size; conf->bca->edlib = conf->edlib; conf->bca->poly_mqual = conf->poly_mqual; + conf->bca->vs_ref = conf->vs_ref; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -1277,6 +1279,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); fprintf(fp, " --del-bias FLOAT Relative likelihood of insertion to deletion [%.2f]\n", mplp->del_bias); + fprintf(fp, + " --score-vs-ref FLOAT\n" + " Ratio of score vs ref (1) or 2nd-best allele (0) [%.2f]\n", mplp->vs_ref); fprintf(fp, " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp, @@ -1300,7 +1305,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -B -Q5 --max-BQ 50 -F0.10 -o25 -e1 -h300 --delta-BQ 10 \\\n" " --del-bias 0.4 --poly-mqual --edlib\n" " ultima or ultima-1.20:\n" - " -B -Q4 --max-BQ 40 -F0.15 -o20 -e15 -h250 --delta-BQ 99 \\\n" + " -B --max-BQ 30 -F0.15 -o20 -e15 -h250 --delta-BQ 10 \\\n" " --del-bias 0.3 --poly-mqual --edlib\n" "\n" "Notes: Assuming diploid individuals.\n" @@ -1331,6 +1336,7 @@ int main_mpileup(int argc, 
char *argv[]) mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.vs_ref = 0; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; @@ -1423,6 +1429,8 @@ int main_mpileup(int argc, char *argv[]) {"write-index",no_argument,NULL,21}, {"del-bias", required_argument, NULL, 23}, {"poly-mqual", no_argument, NULL, 24}, + {"no-poly-mqual", no_argument, NULL, 26}, + {"score-vs-ref",required_argument, NULL, 27}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1533,6 +1541,11 @@ int main_mpileup(int argc, char *argv[]) else mplp.indel_bias = 1/atof(optarg); break; + case 27: + mplp.vs_ref = atof(optarg); + //if (mplp.vs_ref < 0) mplp.vs_ref = 0; + if (mplp.vs_ref > 1) mplp.vs_ref = 1; + break; case 15: { char *tmp; mplp.indel_win_size = strtol(optarg,&tmp,10); @@ -1550,6 +1563,7 @@ int main_mpileup(int argc, char *argv[]) case 25: mplp.edlib = 0; break; case 23: mplp.del_bias = atof(optarg); break; case 24: mplp.poly_mqual = 1; break; + case 26: mplp.poly_mqual = 0; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1585,8 +1599,10 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.4; + mplp.indel_bias = 1.2; mplp.poly_mqual = 1; mplp.edlib = 1; + mplp.vs_ref = 0.7; } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); @@ -1611,16 +1627,17 @@ int main_mpileup(int argc, char *argv[]) } else if (strcasecmp(optarg, "ultima") == 0 || strcasecmp(optarg, "ultima-1.20") == 0) { mplp.min_frac = 0.15; - mplp.min_baseQ = 3; - mplp.max_baseQ = 40; - 
mplp.delta_baseQ = 99; + mplp.min_baseQ = 1; + mplp.max_baseQ = 30; + mplp.delta_baseQ = 10; mplp.openQ = 20; - mplp.extQ = 15; + mplp.extQ = 10; mplp.tandemQ = 250; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.3; mplp.poly_mqual = 1; mplp.edlib = 1; + mplp.vs_ref = 0.3; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier mplp.min_frac = 0.002; @@ -1634,8 +1651,16 @@ int main_mpileup(int argc, char *argv[]) mplp.flag |= MPLP_REALN_PARTIAL; } else if (strcasecmp(optarg, "illumina") == 0 || strcasecmp(optarg, "illumina-1.20") == 0) { + mplp.edlib = 1; + mplp.indel_win_size = 110; mplp.flag |= MPLP_REALN_PARTIAL; + } else if (strcasecmp(optarg, "bgi") == 0 || + strcasecmp(optarg, "bgi-1.20") == 0) { + // Largely as per Illumina mplp.edlib = 1; + mplp.indel_win_size = 110; + mplp.indel_bias = 0.9; + mplp.flag |= MPLP_REALN_PARTIAL; } else { fprintf(stderr, "Unknown configuration name '%s'\n" "Please choose from 1.12, illumina, pacbio-ccs or ont\n", From a7484e1c60255db4b353864406001609cf586558 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 7 Dec 2023 15:01:40 +0000 Subject: [PATCH 6/8] - Rename mpileup --edlib to --indels-cns The main benefit of the new indel caller is the use of alignment against diploid consensus generation instead of simply patching the reference with candidate indel types. This greatly reduces false positives and incorrect allele alignment (leading to wrong genotype calls). This was the earlier PR #1679, but has since acquired edlib as an alternative to BAQ for the indel alignment. However this is primarily a speed benefit (with some minor removal of false-negatives due to quality smashing), and isn't the main thing users should think about when choosing an indel caller. Also tidied up the usage statement and added an explicit "-X list" to print the profile parameters. - Add extra debugging defines. GLF_DEBUG reports GLF calculation in bam2bcf.c. ALIGN_DEBUG uses edlib EDLIB_TASK_PATH to report sequence alignment. 
NB to use this you need to link against edlib itself rather than the cutdown version in this repository. Also fix the edlib heuristics used in bcf_call_glfgen. We don't want to change the call (b=5) as this affects AD. Instead we change the quality so poor calls get filtered by QUAL rather than simply being removed. - Tweak edlib tuning for SeqQ/qual. Add quality value assessment into soft-clip recovery. Use /500 instead of /111 in indelQ assignment, and skew indel-bias accordingly. This gives better separation of FP/GT/FN generally. - Added --seqq-offset parameter so we can use it in tunables per profile. This is used as a limit on the seqQ reduction in the "VAL-5*MIN(20,depth)" formula, used for favouring data over seqQ scores when depth is sufficient. Experimentation showed no single value that worked for all platforms, but the default is in the middle. - Tidy up to cull ifdefed and commented out code. - Add test for indels-cns. It's minimal, but the whole indel calling has minimal testing. I think we have under 10 indels in total with develop (all short read and mostly duplications of each other), and no testing of indels-2.0. This tests 4 indels with indels-cns. - Added documentation for the new --indels-2.0 options - Cull more unused parts of edlib.c. This avoids clang warnings (which become errors with -Werror). We're only including the bits we need here for mpileup. If you want the whole thing, link against the upstream source library instead. --- bam2bcf.c | 86 +++- bam2bcf.h | 1 + bam2bcf_edlib.c | 353 ++++--------- doc/bcftools.txt | 78 ++- edlib.c | 887 +-------------------------------- mpileup.c | 128 +++-- test/mpileup/indel-AD.1cns.out | 322 ++++++++++++ test/test.pl | 1 + 8 files changed, 655 insertions(+), 1201 deletions(-) create mode 100644 test/mpileup/indel-AD.1cns.out diff --git a/bam2bcf.c b/bam2bcf.c index 07fcd3539..85ae34bae 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -1,7 +1,7 @@ /* bam2bcf.c -- variant calling. 
Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2024 Genome Research Ltd. Author: Heng Li @@ -249,6 +249,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t { int i, n, ref4, is_indel, ori_depth = 0; +#ifdef GLF_DEBUG + fprintf(stderr, "Call GLFGEN\n"); +#endif + // clean from previous run r->ori_depth = 0; r->mq0 = 0; @@ -351,6 +355,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } } +#ifdef GLF_DEBUG + fprintf(stderr, "GLF %s\t%d\t%d\n", bam_get_qname(p->b), + bca->indel_types[b], q); +#endif if (q < bca->min_baseQ) { if (!p->indel && b < 4) // not an indel read @@ -363,29 +371,50 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t continue; } - // FIXME: CHECK if this is still needed with edlib mode - // It's a slight variant on the one above guarded by --indels-2.0 +#ifndef MIN +#define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef MAX +#define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +#if 1 // TEST 6 if (bca->edlib) { - if (indel_in_sample && p->indel == 0 && (q < _n/2 || _n > 20)) { - // high quality indel calls without p->indel set aren't - // particularly indicative of being a good REF match either, - // at least not in low coverage. So require solid coverage - // before we start utilising such quals. - if (b != 0) - b = 5; - q = (int)bam_get_qual(p->b)[p->qpos]; - seqQ = (3*seqQ + 2*q)/8; + // Deeper data should rely more heavily on counts of data + // than quality, as quality can be unreliable and prone to + // miscalculations through BAQ, STR analysis, etc. + // So we put a cap on how good seqQ can be. + // + // Is it simply the equivalent of increasing -F filter? + // Not quite, as the latter removes many real variants upfront. + // This calls them and then post-adjusts quality, potentially + // dropping it later or changing genotype. So we still get + // calls, but lower qual. 
+ seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5)); + + if (indel_in_sample && p->indel == 0 && b != 0) { + // This read doesn't contain an indel in CIGAR, but it + // is assigned to an indel now (b != 0), These are + // reads we've corrected with realignment, but they're + // also enriched for FPs so at high depth we reduce their + // confidence and let the depth do the talking. If it's + // real and deep, then we don't need every read aligning. + // We also reduce base quality too to reflect the + // chance of our realignment being incorrect. + + seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5 + + // Finally reduce indel quality. + // This is a blend of indelQ and base QUAL. + q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1); } - if (_n > 20 && seqQ > 40) seqQ = 40; } +#endif // Note baseQ changes some output fields such as I16, but has no // significant affect on "call". baseQ = p->aux>>8&0xff; - - // Can we reuse baseQ as indelQ1 instead of indelQ? - // So we can distinguish between likelihood of any indel vs - // likelihood of this specific indel? } else { @@ -419,6 +448,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 if ( !mapQ ) r->mq0++; +#ifdef GLF_DEBUG + fprintf(stderr, "GLF2 %s\t%d\t%d\t%d,%d\n", + bam_get_qname(p->b), b, q, + seqQ, mapQ); +#endif if (q > seqQ) q = seqQ; mapQ = mapQ < bca->capQ? mapQ : bca->capQ; if (q > mapQ) q = mapQ; @@ -1202,25 +1236,29 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, { bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); uint32_t idv = bca->max_support; - if ( fmt_flag&B2B_INFO_IMF ) { + if ( fmt_flag&B2B_INFO_IMF) { float max_frac; - if (bc->ADF && bc->ADR) { - int max_ad = 0, tot_ad = bc->ADF[0] + bc->ADR[0]; + // Recompute IDV and IMF based on alignment results for more + // accurate counts, but only when in new "--indels-cns" mode. 
+ if (bc->ADF && bc->ADR && bca->edlib) { + int max_ad = 0; for (int k = 1; k < rec->n_allele; k++) { if (max_ad < bc->ADF[k] + bc->ADR[k]) max_ad = bc->ADF[k] + bc->ADR[k]; - tot_ad += bc->ADF[k] + bc->ADR[k]; } max_frac = (double)(max_ad) / bc->ori_depth; - //max_frac = (double)(max_ad) / tot_ad; idv = max_ad; } else { max_frac = bca->max_frac; } + // Copied here to maintain order for consistency of "make check" + if ( fmt_flag&B2B_INFO_IDV ) + bcf_update_info_int32(hdr, rec, "IDV", &idv, 1); bcf_update_info_float(hdr, rec, "IMF", &max_frac, 1); + } else { + if ( fmt_flag&B2B_INFO_IDV ) + bcf_update_info_int32(hdr, rec, "IDV", &idv, 1); } - if ( fmt_flag&B2B_INFO_IDV ) - bcf_update_info_int32(hdr, rec, "IDV", &idv, 1); } bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); if ( fmt_flag&B2B_INFO_ADF ) diff --git a/bam2bcf.h b/bam2bcf.h index e4ed6628e..8f8f8db5a 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -123,6 +123,7 @@ typedef struct __bcf_callaux_t { int max_bases; int indel_types[4]; // indel lengths int indel_win_size, indels_v20, edlib; + int seqQ_offset; // edlib mode, seqQ=MIN(seqQ, offset - MIN(20,depth)*5); int maxins, indelreg, poly_mqual; int read_len; char *inscns; diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index 34c686803..1cee6bfb7 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -1,7 +1,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014,2016-2017, 2021-2023 Genome Research Ltd. + Copyright (C) 2012-2014,2016-2017, 2021-2024 Genome Research Ltd. Author: Heng Li Petr Danecek @@ -25,8 +25,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +// Show consensus //#define CONS_DEBUG +// Show alignments to consensus +//#define ALIGN_DEBUG + #include #include #include @@ -472,7 +476,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Expand cons_base to include depth from ref_base/ref_ins // Caveat: except at pos itself, where true ref is used if type != 0 - // Note this harms PB-CCS test at chr1:10171880. +#if 1 // TEST 1 // We could retest this heuristic further maybe. for (i = 0; i < right-left; i++) { // Total observed depth @@ -503,6 +507,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (rfract < 1.01 / (r+1e-10)) rfract = 1.01 / (r+1e-10); // low depth compensation +// if (rfract > 0.2) +// rfract = 0.2; // TODO: consider limiting rfract so we never drown out the // signal. We want to use the remaining data only to correct @@ -535,6 +541,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, goto err; } } +#endif //-------------------------------------------------- // Allocate consensus buffer, to worst case length @@ -807,7 +814,11 @@ int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, //ABS(type)+ABS(l_ref-l_query)+10, -1, // k; use small positive for faster alignment EDLIB_MODE_HW, // mode - EDLIB_TASK_LOC, // task +#ifdef ALIGN_DEBUG + EDLIB_TASK_PATH, +#else + EDLIB_TASK_LOC, +#endif NULL, // additionalEqualities 0); // additionalEqualitiesLength EdlibAlignResult r = @@ -819,6 +830,50 @@ int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, return INT_MAX; } +#ifdef ALIGN_DEBUG + // NB: Needs linking against the C++ libedlib.a as our cut-down C + // implementation misses the alignment generation code. 
+ { + int i, j = 0, pt = r.startLocations[0], pq = 0; + char line1[80]; + char line2[80]; + char line3[80]; + for (i = 0; i < r.alignmentLength && pt < r.endLocations[0]; i++) { + int n; + switch (n = r.alignment[i]) { + case 0: // match + case 3: // mismatch + line1[j] = "ACGTN"[ref[pt++]]; + line2[j] = "ACGTN"[query[pq++]]; + line3[j] = " x"[n==3]; + break; + case 2: // insertion to ref + line1[j] = "ACGTN"[ref[pt++]]; + line2[j] = '-'; + line3[j] = '-'; + break; + case 1: // insertion to query + line1[j] = '-'; + line2[j] = "ACGTN"[query[pq++]]; + line3[j] = '+'; + break; + } + + if (++j == sizeof(line1)) { + fprintf(stderr, "%.*s\n", j, line1); + fprintf(stderr, "%.*s\n", j, line2); + fprintf(stderr, "%.*s\n", j, line3); + j = 0; + } + } + if (j) { + fprintf(stderr, "%.*s\n", j, line1); + fprintf(stderr, "%.*s\n", j, line2); + fprintf(stderr, "%.*s\n", j, line3); + } + } +#endif + // Aligned target length minus query length is an indication of the number // of insertions and/or deletions. // @@ -891,7 +946,7 @@ int edlib_glocal(uint8_t *ref, int l_ref, uint8_t *query, int l_query, static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, int band, uint8_t *ref1, uint8_t *ref2, uint8_t *query, - int r_start, int r_end, int long_read, + int r_start, int r_end, int tbeg, int tend1, int tend2, int left, int right, int qbeg, int qend, @@ -900,8 +955,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int *str_len1_p, int *str_len2_p) { int atype = abs(type); int l, sc1, sc2; - const uint8_t *qual = bam_get_qual(p->b), *bq = NULL; - uint8_t *qq; // Trim poly_Ns at ends of ref. // This helps to keep len(ref) and len(query) similar, to reduce @@ -927,84 +980,14 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, tend2 -= l-atype; } - // Get segment of quality, either ZQ tag or if absent QUAL. 
- if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) - return -1; - //bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - //if (bq) ++bq; // skip type - double m = 0; - for (l = qbeg; l < qend; ++l) { - int qval = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qval > 30) - qval = 30; - if (qval < 7) - qval = 7; - qq[l - qbeg] = qval; - m += qval; - } - m /= (qend - qbeg); // avg qual - - // Identify STRs in ref covering the indel up to - // (or close to) the end of the sequence. - // Those having an indel and right at the sequence - // end do not confirm the total length of indel - // size. Specifically a *lack* of indel at the - // end, where we know indels occur in other - // sequences, is a possible reference bias. - // - // This is emphasised further if the sequence ends with - // soft clipping. - // FIXME: need to make this work on IUPAC? - rep_ele *reps, *elt, *tmp; - uint8_t *seg = ref2 + tbeg - left; - int seg_len = tend2 - tbeg; - reps = find_STR((char *)seg, seg_len, 0); - int iscore = 0; - double m2 = 0; - int mn = 0, m2min = INT_MAX; - int str_len1 = *str_len1_p, str_len2 = *str_len2_p; - DL_FOREACH_SAFE(reps, elt, tmp) { - if (elt->start <= qpos && elt->end >= qpos) { - iscore += (elt->end-elt->start) / elt->rep_len; // c - if (str_len1 < elt->end-elt->start) - str_len1 = elt->end-elt->start; - if (str_len2 < (elt->end-elt->start) / elt->rep_len) - str_len2 = (elt->end-elt->start) / elt->rep_len; - for (l = MAX(qbeg, elt->start); - l < MIN(qend, elt->end); - l++, mn++) { - m2 += qq[l-qbeg]; - if (m2min > qq[l-qbeg]) - m2min = qq[l-qbeg]; - } - if (elt->start+tbeg <= r_start || - elt->end+tbeg >= r_end) - iscore += 2*(elt->end-elt->start); - } - - DL_DELETE(reps, elt); - free(elt); - } - *str_len1_p = str_len1; - *str_len2_p = str_len2; - if (mn) - m2 /= mn; - else - m2 = m2min = qavg; - // The bottom 8 bits are length-normalised score while // the top bits are unnormalised. // // Try original cons and new cons and pick best. 
// This doesn't reduce FN much (infact maybe adds very slightly), // but it does reduce GT errors and is a slight reduction to FP. - //m = MIN(30, (m2+m2min)/2); // best so far - m = MIN(30, m2min); - // Alternatives to experiment on. - //double mm = (m+m2)/2; - //double mm = m2min; - double mm = m; + double mm = 30; // a const average qual for now. Could tune sc2 = edlib_glocal(ref2 + tbeg - left, tend2 - tbeg, query, qend - qbeg, mm, del_bias); @@ -1019,7 +1002,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // Find the best of the two alignments if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; - free(qq); return 0; } if (sc1 < 0) { @@ -1044,11 +1026,8 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // complexity / quality. l = .5*(100. * sc2 / (qend - qbeg) + .499); - l += iscore*(qavg/(m2min+1.0) + qavg/m2); - - *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias); - free(qq); + *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias * .5); return 0; } @@ -1093,6 +1072,13 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, for (j = t; j > 0 && sc[j] < sc[j-1]; --j) tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; +#ifdef ALIGN_DEBUG + fprintf(stderr, "READ %s\tscores ", bam_get_qname(p->b)); + for (t = 0; t < n_types; ++t) { + fprintf(stderr, "%+2d/%-3d ", types[sc[t]&0x3f], sc[t]>>14); + } +#endif + /* errmod_cal() assumes that if the call is wrong, the * likelihoods of other events are equal. This is about * right for substitutions, but is not desired for @@ -1108,18 +1094,6 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, indelQ = (sc[1]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run, str_len1); } else { -#if 0 - // Simplest code in edlib10f outputs - indelQ = (sc[1]>>14) - (sc[0]>>14); -#endif - -// Maybe just an option of x*indelQ1 + (1-x)*indelQ2 so we can adjust -// based on indel vs genotype accuracy, per instrument / dataset? 
- - -#if 1 -// Orig code; indelQ1 -// Good on HG002.GRCh38.PacBio_CCS_15Kb.bam // look for the reference type for (t = 0; t < n_types; ++t) { if ((sc[t]&0x3f) == ref_type) @@ -1128,13 +1102,6 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, indelQ = indelQ1 = (sc[t]>>14) - (sc[0]>>14); // fprintf(stderr, "IndelQ = %d: %d-%d", // indelQ, (sc[t]>>14), (sc[0]>>14)); -#endif - -#if 1 -// 10e; indelQ - - // Revised code in edlib10e outputs - // Good on most other data sets, including the 53x CCS SequelII data. // Best call is non-ref, compare vs next best non-ref, // or ref if it's just 2 choices (most common case). @@ -1145,107 +1112,18 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (t == n_types) t--; // it's ref, but it'll do as next best. indelQ2 = (sc[t]>>14) - (sc[0]>>14); -// fprintf(stderr, "\tNEW %d: %d-%d\n", -// indelQ, (sc[t]>>14), (sc[0]>>14)); -#endif - -#if 0 - // Best call is non-ref, get the ref score and (if different) - // the next best non-ref. Average the two and then compare vs - // this call. It means we assign ADs better, but don't - // overly lose variants either when we have many choices. - int ref_t = -1, next_t = -1; - for (t = 1; t < n_types; t++) { - if ((sc[t]&0x3f) == ref_type) { - ref_t = t; - } else { - if (next_t <= 0) - next_t = t; - } - } - if (next_t < 0) - next_t = ref_t; - assert(MIN(ref_t, next_t) == 1); - - // Could also try avg(ref,1) too rather than avg(ref,next)? 
- // Tried - no better - - indelQ = (((sc[ref_t]>>14)+(sc[next_t]>>14))>>1) - (sc[0]>>14); -#endif - -#if 0 - int ref_t = -1, next_t = -1; - for (t = 1; t < n_types; t++) { - if ((sc[t]&0x3f) == ref_type) { - ref_t = t; - } else { - if (next_t <= 0) - next_t = t; - } - } - if (next_t < 0) - next_t = ref_t; - assert(MIN(ref_t, next_t) == 1); - - if (ref_t <= next_t) - indelQ = (sc[ref_t]>>14) - (sc[0]>>14); - else - //indelQ = (sc[next_t]>>14) - (sc[0]>>14); - indelQ = (((sc[ref_t]>>14)+(sc[next_t]>>14))>>1) - (sc[0]>>14); -#endif - -#if 0 - // Best call is non-ref, get the ref score and (if different) - // the next best non-ref. Average the two and then compare vs - // this call. It means we assign ADs better, but don't - // overly lose variants either when we have many choices. - int ref_t = -1, next_t = -1; - for (t = 1; t < n_types; t++) { - if ((sc[t]&0x3f) == ref_type) { - ref_t = t; - } else { - if (next_t <= 0) - next_t = t; - } - } - if (next_t < 0) - next_t = ref_t; - assert(MIN(ref_t, next_t) == 1); - - indelQ = (sc[next_t]>>14) - (sc[0]>>14); - indelQ += (sc[ref_t]>>14) > (sc[next_t]>>14); -// indelQ >>= (sc[ref_t]>>14) > (sc[next_t]>>14); // VBAD -#endif - - // Maybe boost seqQ if ref_t != next_t and sc[ref_t]-sc[0] is high - // while sc[next_t]-sc[0] is not? So we're saying the indel is good - // but this allele is not? Not sure how we explore that. - // It's the difference between making an indel-exists call vs making - // a call on this specific genotype. - - // ORIG seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run, str_len1); - // TO TEST - //seqQ += MAX(0, (indelQ1 - indelQ)/4); - //indelQ += MAX(0, (indelQ1 - indelQ2)/4); - +#if 1 // TEST 3 indelQ = bca->vs_ref*indelQ1 + (1-bca->vs_ref)*indelQ2; - -#if 0 - // If ref_t scores higher than next_t then we have more - // than 1 ALT allele. This requires a slightly higher - // threshold of confidence to call it as it's more likely - // down to sequencing error. 
- if (ref_t >= 2) - seqQ *= pow(0.9, ref_t - 1); #endif } // So we lower qual in some, but raise the average to keep FN/FP // ratios up. - indelQ /= bca->indel_bias; - indelQ1 /= bca->indel_bias; + // Is this key diff for PacBio old vs new HiFi? + indelQ /= bca->indel_bias*0.5; + indelQ1 /= bca->indel_bias*0.5; // Or maybe just *2 if bca->poly_mqual and be done with it? // Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ? @@ -1258,13 +1136,11 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // Enabling this causes lots of GT errors on Illumina. // However on PacBio it's key to removal of false positives. // ONT and UG seem somewhere inbetween. - if (bca->poly_mqual) { + if (bca->poly_mqual) { // TEST 4 int qpos = p->qpos, l; uint8_t *seq = bam_get_seq(p->b); uint8_t *qual = bam_get_qual(p->b); int min_q = qual[qpos]; - int nbase = 0; - int sumq = 0; // scan homopolymer left char baseL = bam_seqi(seq, qpos+1 < p->b->core.l_qseq @@ -1283,19 +1159,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, min_q = qual[l]; if (bam_seqi(seq, l) != base) break; - nbase++; - sumq += qual[l]; } -// if (nbase) { -// sumq = sumq / (double)nbase; -// if (min_q < sumq / 4) -// min_q = sumq / 4; -// } - -// if (min_q < 10) -// min_q = 10; - // We reduce -h so homopolymers get reduced likelihood of being // called, but then optionally increase or decrease from there // based on base quality. Hence lack of low quality bases in @@ -1320,8 +1185,15 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // low normalised scores leave indelQ unmodified // high normalised scores set indelQ to 0 // inbetween scores have a linear scale from indelQ to 0 - indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); - indelQ1= tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1+ .499); +// Altering the MAGIC value below (originally 111, but chosen for unknown +// reasons) is comparable to altering --indel-bias. 
+#define TMP_MAGIC 255.0 + + indelQ = tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ + .499); + indelQ1= tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ1+ .499); + + indelQ = MIN(indelQ, 255); + indelQ1 = MIN(indelQ1, 255); // Doesn't really help accuracy, but permits -h to take // affect still. @@ -1337,6 +1209,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; sumq[sc[0]&0x3f] += indelQ; +#ifdef ALIGN_DEBUG + fprintf(stderr, "\t%d\t%d\n", indelQ, seqQ); +#endif + // Experiment in p->aux vs sumq. // One gives likelihood of an indel being here, while the other // is likelihood of a specific genotype? But which is which? @@ -1344,20 +1220,6 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, sum_indelQ1[s] += indelQ1; sum_indelQ2[s] += indelQ; } - -// double m = (double)sum_indelQ1[s] / sum_indelQ2[s]; -// if (m > 1) { -// m = sqrt(m); -// fprintf(stderr, "Sum %d %d m=%f\n", sum_indelQ1, sum_indelQ, m); -// for (i = 0; i < n_plp[s]; ++i) { -// bam_pileup1_t *p = plp[s] + i; -// int indelQ = p->aux & 0xff; -// indelQ *= m; -// if (indelQ > 255) -// indelQ = 255; -// p->aux = (p->aux & ~0xff) | indelQ; -// } -// } } // Determine bca->indel_types[] and bca->inscns. @@ -1384,6 +1246,9 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; for (t = 0; t < 4 && t < n_types; ++t) { bca->indel_types[t] = types[sumq[t]&0x3f]; +#ifdef ALIGN_DEBUG + fprintf(stderr, "TYPE %+2d %d\n", types[t], sumq[t]>>6); +#endif if (bca->maxins) // potentially an insertion memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); @@ -1393,8 +1258,6 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, // If per-alignment type isn't found, then indelQ/seqQ is 0, // otherwise unchanged. 
for (s = n_alt = 0; s < n; ++s) { -// double m = sqrt((double)sum_indelQ1[s] / sum_indelQ2[s]); -// if (m < 1) m = 1; for (i = 0; i < n_plp[s]; ++i) { bam_pileup1_t *p = plp[s] + i; int x = types[p->aux>>16&0x3f]; @@ -1402,16 +1265,11 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (x == bca->indel_types[j]) break; p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); if ((p->aux>>16&0x3f) > 0) ++n_alt; - -// Poor -// // We recorded indelQ based on this allele, but -// // scale now by quality of any non-ref allele. -// // This reduces FN while keeping GT accurate (maybe!) -// int indelQ = (p->aux & 0xff) * m; -// if (indelQ > 255) indelQ = 255; -// p->aux = (p->aux & ~0xff) | indelQ; - - //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); +#ifdef ALIGN_DEBUG + fprintf(stderr, "FIN %s\t%d\t%d\t%d\n", + bam_get_qname(p->b), (p->aux>>16)&0x3f, + bca->indel_types[(p->aux>>16)&0x3f], p->aux&0xff); +#endif } } @@ -1474,7 +1332,6 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } } qavg = (qsum+1) / (qcount+1); - //qavg = (qavg + qmax)/2; // bias avg toward maximum observed. // find out how many types of indels are present types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, @@ -1693,7 +1550,9 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, min_win_size += tot_str; } min_win_size += 10; - if (p->b->core.l_qseq > 1000) { // ||1 for 7f-long + +// TEST 8 + if (p->b->core.l_qseq > 1000) { // long read data needs less context. It also tends to // have many more candidate indels to investigate so // speed here matters more. @@ -1709,8 +1568,9 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // of STRs with the end of the query alignment. 
int r_start = p->b->core.pos; int r_end = bam_cigar2rlen(p->b->core.n_cigar, - bam_get_cigar(p->b)) - -1 + r_start; + bam_get_cigar(p->b)); + r_end += -1 + r_start; + // Map left2/right2 genomic coordinates to qbeg/qend // query coordinates. The query may not span the @@ -1730,10 +1590,6 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, for (l = qbeg; l < qend; ++l) query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; - // A fudge for now. Consider checking SAM header for - // RG platform field. - int long_read = p->b->core.l_qseq > 1000; - // tbeg and tend are the genomic locations equivalent // to qbeg and qend on the sequence. // These may being entirely within our left/right @@ -1759,7 +1615,7 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, (uint8_t *)tcons[0] + left2-left, (uint8_t *)tcons[1] + left2-left, (uint8_t *)query, - r_start, r_end, long_read, + r_start, r_end, tbeg, tend1, tend2, left2, left + tcon_len[0], qbeg, qend, pos,qpos, -biggest_del, @@ -1768,6 +1624,13 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, &str_len1, &str_len2) < 0) { goto err; } +#ifdef ALIGN_DEBUG + fprintf(stderr, "type %d %x / %x\t%s\n", + types[t], + score[K*n_types + t] >> 8, + score[K*n_types + t] & 0xff, + bam_get_qname(p->b)); +#endif } else { // place holder large cost for reads that cover the // region entirely within a deletion (thus tend < tbeg). diff --git a/doc/bcftools.txt b/doc/bcftools.txt index cf49c82d1..f474ede7b 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -2182,7 +2182,21 @@ those scenarios. A new EXPERIMENTAL indel calling model which aims to address some known deficiencies of the current indel calling algorithm. Specifically, it uses diploid reference consensus sequence. Note that in the current version it has the potential to increase sensitivity - but at the cost of decreased specificity + but at the cost of decreased specificity. 
+ Only works with short-read sequencing technologies. + +*--indels-cns*:: + Another EXPERIMENTAL indel calling method, predating indels-2.0 in + PR form, but merged more recently. It also uses a diploid + reference consensus, but with added parameters and heuristics to + optimise for a variety of sequencing platforms. This is usually + faster and more accurate than the default caller and --indels-2.0, + but has not been tested on non-diploid samples or on samples + without approximately even allele frequencies. + +*--no-indels-cns*:: + May be used to turn off --indels-cns mode when using one of the + newer profiles that have this enabled by default. *-q, -min-MQ* 'INT':: Minimum mapping quality for an alignment to be used [0] @@ -2323,14 +2337,45 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for ==== Options for SNP/INDEL genotype likelihood computation *-X, --config* 'STR':: - Specify a platform specific configuration profile. The profile - should be one of '1.12', 'illumina', 'ont' or 'pacbio-ccs'. - Settings applied are as follows: + Specify a platform-specific configuration profile. Specifying the + profile as "list" will list the available profile names and the + parameters they change. There are profiles named after a release, + which should be used if you wish to ensure forward compatibility + of results. The non-versioned names (e.g. "illumina") will always + point to the most recent set of parameters for that instrument type.
+ The current values are: - 1.12 -Q13 -h100 -m1 - illumina [ default values ] - ont -B -Q5 --max-BQ 30 -I - pacbio-ccs -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 -M99999 + 1.12 -Q13 -h100 -m1 + + bgi + bgi-1.20 --indels-cns -B --indel-size 80 -F0.1 --indel-bias 0.9 + --seqq-offset 120 + + illumina-1.18 [ default values ] + + illumina + illumina-1.20 --indels-cns --seqq-offset 125 + + ont -B -Q5 --max-BQ 30 -I + + ont-sup + ont-sup-1.20 --indels-cns -B -Q1 --max-BQ 35 --delta-BQ 99 -F0.2 + -o15 -e1 -h110 --del-bias 0.4 --indel-bias 0.7 + --poly-mqual --seqq-offset 130 --indel-size 80 + + pacbio-ccs-1.18 -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 -M99999 + + pacbio-ccs + pacbio-ccs-1.20 --indels-cns -B -Q5 --max-BQ 50 -F0.1 -o25 -e1 -h300 + --delta-BQ 10 --del-bias 0.4 --poly-mqual + --indel-bias 0.9 --seqq-offset 118 --indel-size 80 + --score-vs-ref 0.7 + + ultima + ultima-1.20 --indels-cns -B -Q1 --max-BQ 30 --delta-BQ 10 -F0.15 + -o20 -e10 -h250 --del-bias 0.3 --indel-bias 0.7 + --poly-mqual --seqq-offset 140 --score-vs-ref 0.3 + --indel-size 80 *--ar, --ambig-reads* 'drop'|'incAD'|'incAD0':: What to do with ambiguous indel reads that do not span an entire @@ -2369,11 +2414,28 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for 0.75) while higher depth samples or where you favour recall rates over precision may work better with a higher value such as 2.0. +*--del-bias* 'FLOAT':: + Skews the likelihood of deletions over insertions. Defaults to an + even distribution value of 1.0. Lower values imply a higher rate + of false positive deletions (meaning candidate deletions are less + likely to be real). + *--indel-size* 'INT':: Indel window size to use when assessing the quality of candidate indels. Note that although the window size approximately corresponds to the maximum indel size considered, it is not an exact threshold [110] +*--seqq-offset* 'INT':: + Tunes the importance of indel sequence quality per depth. 
The + final "seqQ" quality is capped at "offset - 5*MIN(depth,20)". [120] + +*--poly-mqual*:: + Use the lowest quality value within a homopolymer run, instead of + the quality immediately adjacent to the indel. This may be + important for unclocked instruments, particularly ones with a flow + chemistry where runs of bases of identical type are incorporated + together. + *-I, --skip-indels*:: Do not perform INDEL calling diff --git a/edlib.c b/edlib.c index 23ed8f9e9..5421fee48 100644 --- a/edlib.c +++ b/edlib.c @@ -21,40 +21,6 @@ static const Word HIGH_BIT_MASK = 1LL << 63; // 100..00 #define MAX(a,b) ((a)>(b)?(a):(b)) #endif -#if 0 -// Data needed to find alignment. -typedef struct AlignmentData { - Word* Ps; - Word* Ms; - int* scores; - int* firstBlocks; - int* lastBlocks; -} AlignmentData; - -static AlignmentData *CreateAlignmentData(int maxNumBlocks, int targetLength) { - AlignmentData *d = malloc(sizeof(*d)); - - // We build a complete table and mark first and last block for each column - // (because algorithm is banded so only part of each columns is used). - // TODO: do not build a whole table, but just enough blocks for each column.
- d->Ps = malloc(maxNumBlocks * targetLength * sizeof(*d->Ps)); - d->Ms = malloc(maxNumBlocks * targetLength * sizeof(*d->Ms)); - d->scores = malloc(maxNumBlocks * targetLength * sizeof(*d->scores)); - d->firstBlocks = malloc(targetLength * sizeof(*d->firstBlocks)); - d->lastBlocks = malloc(targetLength * sizeof(*d->lastBlocks)); - - return d; -} - -static void DestroyAlignmentData(AlignmentData *d) { - free(d->Ps); - free(d->Ms); - free(d->scores); - free(d->firstBlocks); - free(d->lastBlocks); -} -#endif - typedef struct Block { Word P; // Pvin Word M; // Mvin @@ -112,33 +78,6 @@ static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlo int k, EdlibAlignMode mode, int* bestScore_, int** positions_, int* numPositions_); -#if 0 -static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks, - int queryLength, - const unsigned char* target, int targetLength, - int k, int* bestScore_, - int* position_, bool findAlignment, - AlignmentData** alignData, int targetStopPosition); -#endif - -#if 0 -static int obtainAlignment( - const unsigned char* query, const unsigned char* rQuery, int queryLength, - const unsigned char* target, const unsigned char* rTarget, int targetLength, - const EqualityDefinition* equalityDefinition, int alphabetLength, int bestScore, - unsigned char** alignment, int* alignmentLength); - -static int obtainAlignmentHirschberg( - const unsigned char* query, const unsigned char* rQuery, int queryLength, - const unsigned char* target, const unsigned char* rTarget, int targetLength, - const EqualityDefinition* equalityDefinition, int alphabetLength, int bestScore, - unsigned char** alignment, int* alignmentLength); - -static int obtainAlignmentTraceback(int queryLength, int targetLength, - int bestScore, const AlignmentData* alignData, - unsigned char** alignment, int* alignmentLength); -#endif - static char *transformSequences(const char* queryOriginal, int queryLength, const char* targetOriginal, int targetLength, 
unsigned char** queryTransformed, @@ -288,25 +227,6 @@ EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLeng } } } - -#if 0 - // Find alignment -> all comes down to finding alignment for NW. - // Currently we return alignment only for first pair of locations. - if (config.task == EDLIB_TASK_PATH) { - int alnStartLocation = result.startLocations[0]; - int alnEndLocation = result.endLocations[0]; - const unsigned char* alnTarget = target + alnStartLocation; - const int alnTargetLength = alnEndLocation - alnStartLocation + 1; - const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength); - const unsigned char* rQuery = createReverseCopy(query, queryLength); - obtainAlignment(query, rQuery, queryLength, - alnTarget, rAlnTarget, alnTargetLength, - equalityDefinition, alphabet_size, result.editDistance, - &(result.alignment), &(result.alignmentLength)); - free((void *)rAlnTarget); - free((void *)rQuery); - } -#endif } /*-------------------------------------------------------*/ @@ -322,59 +242,6 @@ EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLeng return result; } -#if 0 -char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength, - const EdlibCigarFormat cigarFormat) { - if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) { - return 0; - } - - // Maps move code from alignment to char in cigar. - // 0 1 2 3 - char moveCodeToChar[] = {'=', 'I', 'D', 'X'}; - if (cigarFormat == EDLIB_CIGAR_STANDARD) { - moveCodeToChar[0] = moveCodeToChar[3] = 'M'; - } - - vector* cigar = new vector(); - char lastMove = 0; // Char of last move. 0 if there was no previous move. - int numOfSameMoves = 0; - for (int i = 0; i <= alignmentLength; i++) { - // if new sequence of same moves started - if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) { - // Write number of moves to cigar string. 
- int numDigits = 0; - for (; numOfSameMoves; numOfSameMoves /= 10) { - cigar->push_back('0' + numOfSameMoves % 10); - numDigits++; - } - reverse(cigar->end() - numDigits, cigar->end()); - // Write code of move to cigar string. - cigar->push_back(lastMove); - // If not at the end, start new sequence of moves. - if (i < alignmentLength) { - // Check if alignment has valid values. - if (alignment[i] > 3) { - delete cigar; - return 0; - } - numOfSameMoves = 0; - } - } - if (i < alignmentLength) { - lastMove = moveCodeToChar[alignment[i]]; - numOfSameMoves++; - } - } - cigar->push_back(0); // Null character termination. - char* cigar_ = malloc(cigar->size() * sizeof(char)); - memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char)); - delete cigar; - - return cigar_; -} -#endif - /** * Build Peq table for given query and alphabet. * Peq is table of dimensions alphabetLength+1 x maxNumBlocks. @@ -390,23 +257,6 @@ static inline Word* buildPeq(const int alphabetLength, Word* Peq = malloc((alphabetLength + 1) * maxNumBlocks * sizeof(*Peq)); // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s -#if 0 - for (int symbol = 0; symbol <= alphabetLength; symbol++) { - for (int b = 0; b < maxNumBlocks; b++) { - if (symbol < alphabetLength) { - Peq[symbol * maxNumBlocks + b] = 0; - for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { - Peq[symbol * maxNumBlocks + b] <<= 1; - // NOTE: We pretend like query is padded at the end with W wildcard symbols - if (r >= queryLength || equalityDefinition_areEqual(equalityDefinition, query[r], symbol)) - Peq[symbol * maxNumBlocks + b] += 1; - } - } else { // Last symbol is wildcard, so it is all 1s - Peq[symbol * maxNumBlocks + b] = (Word)-1; - } - } - } -#else // Optimised Peq building avoiding branching. 
for (int symbol = 0; symbol < alphabetLength; symbol++) { for (int b = 0; b < maxNumBlocks; b++) { @@ -427,7 +277,7 @@ static inline Word* buildPeq(const int alphabetLength, Peq[symbol * maxNumBlocks + b] = (Word)-1; } } -#endif + return Peq; } @@ -506,10 +356,6 @@ static inline int min(const int x, const int y) { return x < y ? x : y; } -static inline int max(const int x, const int y) { - return x > y ? x : y; -} - /** * @param [in] block @@ -529,40 +375,6 @@ static inline int *getBlockCellValues(const Block block) { return scores; } -/** - * Writes values of cells in block into given array, starting with first/top cell. - * @param [in] block - * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. - */ -static inline void readBlock(const Block block, int* const dest) { - int score = block.score; - Word mask = HIGH_BIT_MASK; - for (int i = 0; i < WORD_SIZE - 1; i++) { - dest[WORD_SIZE - 1 - i] = score; - if (block.P & mask) score--; - if (block.M & mask) score++; - mask >>= 1; - } - dest[0] = score; -} - -/** - * Writes values of cells in block into given array, starting with last/bottom cell. - * @param [in] block - * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. - */ -static inline void readBlockReverse(const Block block, int* const dest) { - int score = block.score; - Word mask = HIGH_BIT_MASK; - for (int i = 0; i < WORD_SIZE - 1; i++) { - dest[i] = score; - if (block.P & mask) score--; - if (block.M & mask) score++; - mask >>= 1; - } - dest[WORD_SIZE - 1] = score; -} - /** * @param [in] block * @param [in] k @@ -765,703 +577,6 @@ static int myersCalcEditDistanceSemiGlobal( } -/** - * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method. - * @param [in] Peq Query profile. - * @param [in] W Size of padding in last block. - * TODO: Calculate this directly from query, instead of passing it. 
- * @param [in] maxNumBlocks Number of blocks needed to cover the whole query. - * TODO: Calculate this directly from query, instead of passing it. - * @param [in] queryLength - * @param [in] target - * @param [in] targetLength - * @param [in] k - * @param [out] bestScore_ Edit distance. - * @param [out] position_ 0-indexed position in target at which best score was found. - * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned. - * Quadratic amount of memory is consumed. - * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment). - * Set only if findAlignment is set to true, otherwise it is NULL. - * Make sure to free this array using free(). - * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected. - * If set to p, calculation is performed up to position p in target (inclusive) - * and column p is returned as the only column in alignData. - * @return Status. - */ -#if 0 -static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks, - const int queryLength, - const unsigned char* const target, const int targetLength, - int k, int* const bestScore_, - int* const position_, const bool findAlignment, - AlignmentData** const alignData, const int targetStopPosition) { - if (targetStopPosition > -1 && findAlignment) { - // They can not be both set at the same time! - return EDLIB_STATUS_ERROR; - } - - // Each STRONG_REDUCE_NUM column is reduced in more expensive way. - const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation - - if (k < abs(targetLength - queryLength)) { - *bestScore_ = *position_ = -1; - return EDLIB_STATUS_OK; - } - - k = min(k, max(queryLength, targetLength)); // Upper bound for k - - // firstBlock is 0-based index of first block in Ukkonen band. 
- // lastBlock is 0-based index of last block in Ukkonen band. - int firstBlock = 0; - // This is optimal now, by my formula. - int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1; - Block* bl; // Current block - - Block* blocks = malloc(maxNumBlocks * sizeof(*blocks)); - - // Initialize P, M and score - bl = blocks; - for (int b = 0; b <= lastBlock; b++) { - bl->score = (b + 1) * WORD_SIZE; - bl->P = (Word)(-1); // All 1s - bl->M = (Word)(0); - bl++; - } - - // If we want to find alignment, we have to store needed data. - if (findAlignment) - *alignData = new AlignmentData(maxNumBlocks, targetLength); - else if (targetStopPosition > -1) - *alignData = new AlignmentData(maxNumBlocks, 1); - else - *alignData = NULL; - - const unsigned char* targetChar = target; - for (int c = 0; c < targetLength; c++) { // for each column - const Word* Peq_c = Peq + *targetChar * maxNumBlocks; - - //----------------------- Calculate column -------------------------// - int hout = 1; - bl = blocks + firstBlock; - for (int b = firstBlock; b <= lastBlock; b++) { - hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M); - bl->score += hout; - bl++; - } - bl--; - //------------------------------------------------------------------// - // bl now points to last block - - // Update k. I do it only on end of column because it would slow calculation too much otherwise. - // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up. - k = min(k, bl->score - + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1) - + (lastBlock == maxNumBlocks - 1 ? W : 0)); - - //---------- Adjust number of blocks according to Ukkonen ----------// - //--- Adjust last block ---// - // If block is not beneath band, calculate next block. Only next because others are certainly beneath band. 
- if (lastBlock + 1 < maxNumBlocks - && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also! - ((lastBlock + 1) * WORD_SIZE - 1 - > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) { - lastBlock++; bl++; - bl->P = (Word)(-1); // All 1s - bl->M = (Word)(0); - int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M); - bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout; - hout = newHout; - } - - // While block is out of band, move one block up. - // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it. - // I could consider adding that max part, for optimal performance. - while (lastBlock >= firstBlock - && (bl->score >= k + WORD_SIZE - || ((lastBlock + 1) * WORD_SIZE - 1 > - // TODO: Does not work if do not put +1! Why??? - k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) { - lastBlock--; bl--; - } - //-------------------------// - - //--- Adjust first block ---// - // While outside of band, advance block - while (firstBlock <= lastBlock - && (blocks[firstBlock].score >= k + WORD_SIZE - || ((firstBlock + 1) * WORD_SIZE - 1 < - blocks[firstBlock].score - k - targetLength + queryLength + c))) { - firstBlock++; - } - //--------------------------/ - - - // TODO: consider if this part is useful, it does not seem to help much - if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction - while (lastBlock >= firstBlock) { - // If all cells outside of band, remove block - vector scores = getBlockCellValues(*bl); - int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; - int r = lastBlock * WORD_SIZE + numCells - 1; - bool reduce = true; - for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { - // TODO: Does not work if do not put +1! Why??? 
- if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) { - reduce = false; - break; - } - r--; - } - if (!reduce) break; - lastBlock--; bl--; - } - - while (firstBlock <= lastBlock) { - // If all cells outside of band, remove block - vector scores = getBlockCellValues(blocks[firstBlock]); - int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; - int r = firstBlock * WORD_SIZE + numCells - 1; - bool reduce = true; - for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { - if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) { - reduce = false; - break; - } - r--; - } - if (!reduce) break; - firstBlock++; - } - } - - - // If band stops to exist finish - if (lastBlock < firstBlock) { - *bestScore_ = *position_ = -1; - free(blocks); - return EDLIB_STATUS_OK; - } - //------------------------------------------------------------------// - - - //---- Save column so it can be used for reconstruction ----// - if (findAlignment && c < targetLength) { - bl = blocks + firstBlock; - for (int b = firstBlock; b <= lastBlock; b++) { - (*alignData)->Ps[maxNumBlocks * c + b] = bl->P; - (*alignData)->Ms[maxNumBlocks * c + b] = bl->M; - (*alignData)->scores[maxNumBlocks * c + b] = bl->score; - bl++; - } - (*alignData)->firstBlocks[c] = firstBlock; - (*alignData)->lastBlocks[c] = lastBlock; - } - //----------------------------------------------------------// - //---- If this is stop column, save it and finish ----// - if (c == targetStopPosition) { - for (int b = firstBlock; b <= lastBlock; b++) { - (*alignData)->Ps[b] = (blocks + b)->P; - (*alignData)->Ms[b] = (blocks + b)->M; - (*alignData)->scores[b] = (blocks + b)->score; - } - (*alignData)->firstBlocks[0] = firstBlock; - (*alignData)->lastBlocks[0] = lastBlock; - *bestScore_ = -1; - *position_ = targetStopPosition; - free(blocks); - return EDLIB_STATUS_OK; - } - //----------------------------------------------------// - - targetChar++; - } - - if (lastBlock == 
maxNumBlocks - 1) { // If last block of last column was calculated - // Obtain best score from block -> it is complicated because query is padded with W cells - int bestScore = getBlockCellValues(blocks[lastBlock])[W]; - if (bestScore <= k) { - *bestScore_ = bestScore; - *position_ = targetLength - 1; - free(blocks); - return EDLIB_STATUS_OK; - } - } - - *bestScore_ = *position_ = -1; - free(blocks); - return EDLIB_STATUS_OK; -} -#endif - -#if 0 -/** - * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix, - * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength). - * @param [in] queryLength Normal length, without W. - * @param [in] targetLength Normal length, without W. - * @param [in] bestScore Best score. - * @param [in] alignData Data obtained during finding best score that is useful for finding alignment. - * @param [out] alignment Alignment. - * @param [out] alignmentLength Length of alignment. - * @return Status code. 
- */ -static int obtainAlignmentTraceback(const int queryLength, const int targetLength, - const int bestScore, const AlignmentData* const alignData, - unsigned char** const alignment, int* const alignmentLength) { - const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); - const int W = maxNumBlocks * WORD_SIZE - queryLength; - - *alignment = malloc((queryLength + targetLength - 1) * sizeof(unsigned char)); - *alignmentLength = 0; - int c = targetLength - 1; // index of column - int b = maxNumBlocks - 1; // index of block in column - int currScore = bestScore; // Score of current cell - int lScore = -1; // Score of left cell - int uScore = -1; // Score of upper cell - int ulScore = -1; // Score of upper left cell - Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block - Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block - // True if block to left exists and is in band - bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]; - // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the - // calculation as both lP and lM should be initialized at some moment later (but compiler can not - // detect it since this initialization is guaranteed by "business" logic). - Word lP = 0, lM = 0; - if (thereIsLeftBlock) { - lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left - lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left - } - currP <<= W; - currM <<= W; - int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos - - // TODO(martin): refactor this whole piece of code. There are too many if-else statements, - // it is too easy for a bug to hide and to hard to effectively cover all the edge-cases. - // We need better separation of logic and responsibilities. 
- while (true) { - if (c == 0) { - thereIsLeftBlock = true; - lScore = b * WORD_SIZE + blockPos + 1; - ulScore = lScore - 1; - } - - // TODO: improvement: calculate only those cells that are needed, - // for example if I calculate upper cell and can move up, - // there is no need to calculate left and upper left cell - //---------- Calculate scores ---------// - if (lScore == -1 && thereIsLeftBlock) { - lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left - for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) { - if (lP & HIGH_BIT_MASK) lScore--; - if (lM & HIGH_BIT_MASK) lScore++; - lP <<= 1; - lM <<= 1; - } - } - if (ulScore == -1) { - if (lScore != -1) { - ulScore = lScore; - if (lP & HIGH_BIT_MASK) ulScore--; - if (lM & HIGH_BIT_MASK) ulScore++; - } - else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) { - // This is the case when upper left cell is last cell in block, - // and block to left is not in band so lScore is -1. - ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1]; - } - } - if (uScore == -1) { - uScore = currScore; - if (currP & HIGH_BIT_MASK) uScore--; - if (currM & HIGH_BIT_MASK) uScore++; - currP <<= 1; - currM <<= 1; - } - //-------------------------------------// - - // TODO: should I check if there is upper block? 
- - //-------------- Move --------------// - // Move up - insertion to target - deletion from query - if (uScore != -1 && uScore + 1 == currScore) { - currScore = uScore; - lScore = ulScore; - uScore = ulScore = -1; - if (blockPos == 0) { // If entering new (upper) block - if (b == 0) { // If there are no cells above (only boundary cells) - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up - for (int i = 0; i < c + 1; i++) // Move left until end - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; - break; - } else { - blockPos = WORD_SIZE - 1; - b--; - currP = alignData->Ps[c * maxNumBlocks + b]; - currM = alignData->Ms[c * maxNumBlocks + b]; - if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { - thereIsLeftBlock = true; - lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations - lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; - } else { - thereIsLeftBlock = false; - // TODO(martin): There may not be left block, but there can be left boundary - do we - // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this. 
- } - } - } else { - blockPos--; - lP <<= 1; - lM <<= 1; - } - // Mark move - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; - } - // Move left - deletion from target - insertion to query - else if (lScore != -1 && lScore + 1 == currScore) { - currScore = lScore; - uScore = ulScore; - lScore = ulScore = -1; - c--; - if (c == -1) { // If there are no cells to the left (only boundary cells) - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left - int numUp = b * WORD_SIZE + blockPos + 1; - for (int i = 0; i < numUp; i++) // Move up until end - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; - break; - } - currP = lP; - currM = lM; - if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { - thereIsLeftBlock = true; - lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; - lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; - } else { - if (c == 0) { // If there are no cells to the left (only boundary cells) - thereIsLeftBlock = true; - lScore = b * WORD_SIZE + blockPos + 1; - ulScore = lScore - 1; - } else { - thereIsLeftBlock = false; - } - } - // Mark move - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; - } - // Move up left - (mis)match - else if (ulScore != -1) { - unsigned char moveCode = ulScore == currScore ? 
EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH; - currScore = ulScore; - uScore = lScore = ulScore = -1; - c--; - if (c == -1) { // If there are no cells to the left (only boundary cells) - (*alignment)[(*alignmentLength)++] = moveCode; // Move left - int numUp = b * WORD_SIZE + blockPos; - for (int i = 0; i < numUp; i++) // Move up until end - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; - break; - } - if (blockPos == 0) { // If entering upper left block - if (b == 0) { // If there are no more cells above (only boundary cells) - (*alignment)[(*alignmentLength)++] = moveCode; // Move up left - for (int i = 0; i < c + 1; i++) // Move left until end - (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; - break; - } - blockPos = WORD_SIZE - 1; - b--; - currP = alignData->Ps[c * maxNumBlocks + b]; - currM = alignData->Ms[c * maxNumBlocks + b]; - } else { // If entering left block - blockPos--; - currP = lP; - currM = lM; - currP <<= 1; - currM <<= 1; - } - // Set new left block - if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { - thereIsLeftBlock = true; - lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; - lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; - } else { - if (c == 0) { // If there are no cells to the left (only boundary cells) - thereIsLeftBlock = true; - lScore = b * WORD_SIZE + blockPos + 1; - ulScore = lScore - 1; - } else { - thereIsLeftBlock = false; - } - } - // Mark move - (*alignment)[(*alignmentLength)++] = moveCode; - } else { - // Reached end - finished! - break; - } - //----------------------------------// - } - - *alignment = realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)); - reverse(*alignment, *alignment + (*alignmentLength)); - return EDLIB_STATUS_OK; -} - - -/** - * Finds one possible alignment that gives optimal score (bestScore). 
- * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough, - * it will solve them using traceback algorithm. - * @param [in] query - * @param [in] rQuery Reversed query. - * @param [in] queryLength - * @param [in] target - * @param [in] rTarget Reversed target. - * @param [in] targetLength - * @param [in] equalityDefinition - * @param [in] alphabetLength - * @param [in] bestScore Best(optimal) score. - * @param [out] alignment Sequence of edit operations that make target equal to query. - * @param [out] alignmentLength Length of alignment. - * @return Status code. - */ -static int obtainAlignment( - const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, - const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, - const EqualityDefinition* equalityDefinition, const int alphabetLength, const int bestScore, - unsigned char** const alignment, int* const alignmentLength) { - - // Handle special case when one of sequences has length of 0. - if (queryLength == 0 || targetLength == 0) { - *alignmentLength = targetLength + queryLength; - *alignment = malloc((*alignmentLength) * sizeof(unsigned char)); - for (int i = 0; i < *alignmentLength; i++) { - (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT; - } - return EDLIB_STATUS_OK; - } - - const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); - const int W = maxNumBlocks * WORD_SIZE - queryLength; - int statusCode; - - // TODO: think about reducing number of memory allocations in alignment functions, probably - // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg, - // and it could also be done for alignments - we could have one big array for alignment that would be - // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results. 
- - // If estimated memory consumption for traceback algorithm is smaller than 1MB use it, - // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal. - long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength - + 2ll * sizeof(int) * targetLength; - if (alignmentDataSize < 1024 * 1024) { - int score_, endLocation_; // Used only to call function. - AlignmentData* alignData = NULL; - Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); - myersCalcEditDistanceNW(Peq, W, maxNumBlocks, - queryLength, - target, targetLength, - bestScore, - &score_, &endLocation_, true, &alignData, -1); - //assert(score_ == bestScore); - //assert(endLocation_ == targetLength - 1); - - statusCode = obtainAlignmentTraceback(queryLength, targetLength, - bestScore, alignData, alignment, alignmentLength); - free(alignData); - free(Peq); - } else { - statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength, - target, rTarget, targetLength, - equalityDefinition, alphabetLength, bestScore, - alignment, alignmentLength); - } - return statusCode; -} - - -/** - * Finds one possible alignment that gives optimal score (bestScore). - * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together. - * @param [in] query - * @param [in] rQuery Reversed query. - * @param [in] queryLength - * @param [in] target - * @param [in] rTarget Reversed target. - * @param [in] targetLength - * @param [in] alphabetLength - * @param [in] bestScore Best(optimal) score. - * @param [out] alignment Sequence of edit operations that make target equal to query. - * @param [out] alignmentLength Length of alignment. - * @return Status code. 
- */ -static int obtainAlignmentHirschberg( - const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, - const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, - const EqualityDefinition* equalityDefinition, const int alphabetLength, const int bestScore, - unsigned char** const alignment, int* const alignmentLength) { - - const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); - const int W = maxNumBlocks * WORD_SIZE - queryLength; - - Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); - Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition); - - // Used only to call functions. - int score_, endLocation_; - - // Divide dynamic matrix into two halfs, left and right. - const int leftHalfWidth = targetLength / 2; - const int rightHalfWidth = targetLength - leftHalfWidth; - - // Calculate left half. - AlignmentData* alignDataLeftHalf = NULL; - int leftHalfCalcStatus = myersCalcEditDistanceNW( - Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore, - &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1); - - // Calculate right half. - AlignmentData* alignDataRightHalf = NULL; - int rightHalfCalcStatus = myersCalcEditDistanceNW( - rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore, - &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1); - - free(Peq); - free(rPeq); - - if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) { - if (alignDataLeftHalf) free(alignDataLeftHalf); - if (alignDataRightHalf) free(alignDataRightHalf); - return EDLIB_STATUS_ERROR; - } - - // Unwrap the left half. - int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0]; - int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0]; - // TODO: avoid this allocation by using some shared array? 
- // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index) - // and ending with scoresLeftEndIdx row (0-indexed). - int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE; - int* scoresLeft = malloc(scoresLeftLength * sizeof(int)); - for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) { - Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx], - alignDataLeftHalf->scores[blockIdx]); - readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE); - } - int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE; - // If last block contains padding, shorten the length of scores for the length of padding. - if (lastBlockIdxLeft == maxNumBlocks - 1) { - scoresLeftLength -= W; - } - - // Unwrap the right half (I also reverse it while unwraping). - int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0]; - int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0]; - int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE; - int* scoresRight = malloc(scoresRightLength * sizeof(int)); - int* scoresRightOriginalStart = scoresRight; - for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) { - Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx], - alignDataRightHalf->scores[blockIdx]); - readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE); - } - int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE; - // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do), - // move pointer forward to remove the padding (that is why we remember originalStart). 
- if (scoresRightStartIdx < 0) { - //assert(scoresRightStartIdx == -1 * W); - scoresRight += W; - scoresRightStartIdx += W; - scoresRightLength -= W; - } - - free(alignDataLeftHalf); - free(alignDataRightHalf); - - //--------------------- Find the best move ----------------// - // Find the query/row index of cell in left column which together with its lower right neighbour - // from right column gives the best score (when summed). We also have to consider boundary cells - // (those cells at -1 indexes). - // x| - // -+- - // |x - int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1); - int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1, - scoresRightStartIdx + scoresRightLength - 2); - int leftScore = -1, rightScore = -1; - int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through. - bool queryIdxLeftAlignmentFound = false; - for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) { - leftScore = scoresLeft[queryIdx - scoresLeftStartIdx]; - rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx]; - if (leftScore + rightScore == bestScore) { - queryIdxLeftAlignment = queryIdx; - queryIdxLeftAlignmentFound = true; - break; - } - } - // Check boundary cells. 
- if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) { - leftScore = leftHalfWidth; - rightScore = scoresRight[0]; - if (leftScore + rightScore == bestScore) { - queryIdxLeftAlignment = -1; - queryIdxLeftAlignmentFound = true; - } - } - if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength - && scoresRightStartIdx + scoresRightLength == queryLength) { - leftScore = scoresLeft[scoresLeftLength - 1]; - rightScore = rightHalfWidth; - if (leftScore + rightScore == bestScore) { - queryIdxLeftAlignment = queryLength - 1; - queryIdxLeftAlignmentFound = true; - } - } - - free(scoresLeft); - free(scoresRightOriginalStart); - - if (queryIdxLeftAlignmentFound == false) { - // If there was no move that is part of optimal alignment, then there is no such alignment - // or given bestScore is not correct! - return EDLIB_STATUS_ERROR; - } - //----------------------------------------------------------// - - // Calculate alignments for upper half of left half (upper left - ul) - // and lower half of right half (lower right - lr). 
- const int ulHeight = queryIdxLeftAlignment + 1; - const int lrHeight = queryLength - ulHeight; - const int ulWidth = leftHalfWidth; - const int lrWidth = rightHalfWidth; - unsigned char* ulAlignment = NULL; int ulAlignmentLength; - int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight, - target, rTarget + lrWidth, ulWidth, - equalityDefinition, alphabetLength, leftScore, - &ulAlignment, &ulAlignmentLength); - unsigned char* lrAlignment = NULL; int lrAlignmentLength; - int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight, - target + ulWidth, rTarget, lrWidth, - equalityDefinition, alphabetLength, rightScore, - &lrAlignment, &lrAlignmentLength); - if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) { - if (ulAlignment) free(ulAlignment); - if (lrAlignment) free(lrAlignment); - return EDLIB_STATUS_ERROR; - } - - // Build alignment by concatenating upper left alignment with lower right alignment. - *alignmentLength = ulAlignmentLength + lrAlignmentLength; - *alignment = malloc((*alignmentLength) * sizeof(unsigned char)); - memcpy(*alignment, ulAlignment, ulAlignmentLength); - memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength); - - free(ulAlignment); - free(lrAlignment); - return EDLIB_STATUS_OK; -} -#endif - /** * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences * where elements in sequences are not any more letters of alphabet, but their index in alphabet. 
diff --git a/mpileup.c b/mpileup.c index b5e02c3d6..1d167d2d5 100644 --- a/mpileup.c +++ b/mpileup.c @@ -72,6 +72,7 @@ typedef struct { uint32_t fmt_flag; int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels + int seqQ_offset; double min_frac; // for indels double indel_bias, poly_mqual; double del_bias; // compensate for diff deletion vs insertion error rates @@ -876,7 +877,9 @@ static int mpileup(mplp_conf_t *conf) conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; conf->bca->indel_win_size = conf->indel_win_size; + conf->bca->indels_v20 = conf->indels_v20; conf->bca->edlib = conf->edlib; + conf->bca->seqQ_offset = conf->seqQ_offset; conf->bca->poly_mqual = conf->poly_mqual; conf->bca->vs_ref = conf->vs_ref; @@ -1256,7 +1259,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --write-index Automatically index the output files [off]\n" "\n" "SNP/INDEL genotype likelihoods options:\n" - " -X, --config STR Specify platform specific profiles (see below)\n" + " -X, --config STR Specify platform profile (use \"-X list\" for details)\n" " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); fprintf(fp, " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac); @@ -1286,34 +1289,17 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); fprintf(fp, " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n" - " --edlib New EXPERIMENTAL indel calling model with edlib\n" - " --no-edlib Disable edlib mode, to use after a -X profile\n" + " --indels-cns New EXPERIMENTAL indel calling model with edlib\n" + " --seqq-offset Indel-cns tuning for indel seq-qual scores [120]\n" + " --no-indels-cns Disable CNS mode, to use after a -X 
profile\n" " --poly-mqual (Edlib mode) Use minimum quality within homopolymers\n"); fprintf(fp,"\n"); fprintf(fp, - "Configuration profiles activated with -X, --config:\n" - " 1.12: -Q13 -h100 -m1 -F0.002\n" - " illumina-1.18: --indel-size 110\n" - " illumina or illumina-1.20: --edlib\n" - " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" - " ont-sup or ont-sup-1.20:\n" - " -B -Q1 --max-BQ 99 -F0.20 -o15 -e1 -h80 --delta-BQ 60 \\\n" - " --del-bias 0.4 --poly-mqual --edlib\n" - " pacbio-ccs-1.18: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n" - " -M99999 --indel-size 110\n" - " pacbio-ccs or pacbio-ccs-1.20:\n" - " -B -Q5 --max-BQ 50 -F0.10 -o25 -e1 -h300 --delta-BQ 10 \\\n" - " --del-bias 0.4 --poly-mqual --edlib\n" - " ultima or ultima-1.20:\n" - " -B --max-BQ 30 -F0.15 -o20 -e15 -h250 --delta-BQ 10 \\\n" - " --del-bias 0.3 --poly-mqual --edlib\n" - "\n" - "Notes: Assuming diploid individuals.\n" - "\n" - "Example:\n" - " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" - " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" - "\n"); + "Notes: Assuming diploid individuals.\n\n" + "Example:\n" + " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" + " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" + "\n"); free(tmp_skip_all_set); free(tmp_skip_any_unset); @@ -1321,6 +1307,38 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) free(tmp_skip_any_set); } +static void print_profiles(void) { + printf( +"Configuration profiles activated with -X, --config:\n\n" +"1.12\n" +" -Q13 -h100 -m1 -F0.002\n\n" +"bgi, bgi-1.20\n" +" --indels-cns -B --indel-size 80 -F0.1 --indel-bias 0.9 --seqq-offset 120\n\n" +"illumina-1.18\n" +" --indel-size 110\n\n" +"illumina\n" +"illumina-1.20\n" +" --indels-cns --indel-size 110\n\n" +"ont\n" +" -B -Q5 --max-BQ 30 -I\n\n" +"ont-sup, ont-sup-1.20\n" +" --indels-cns -B -Q1 
--max-BQ 35 -F0.2 -o15 -e1 -h110 --delta-BQ 99\\\n" +" --del-bias 0.4 --indel-bias 0.7 --poly-mqual --seqq-offset 130\\\n" +" --indel-size 80\n\n" +"pacbio-ccs-1.18\n" +" -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 \\\n" +" -M99999 --indel-size 110\n\n" +"pacbio-ccs, pacbio-ccs-1.20\n" +" --indels-cns -B -Q5 --max-BQ 50 -F0.1 -o25 -e1 -h300 --delta-BQ 10 \\\n" +" --del-bias 0.4 --poly-mqual --indel-bias 0.9 --seqq-offset 118\\\n" +" --indel-size 80 --score-vs-ref 0.7\n\n" +"ultima, ultima-1.20\n" +" --indels-cns -B -Q1 --max-BQ 30 -F0.15 -o20 -e10 -h250 --delta-BQ 10 \\\n" +" --del-bias 0.3 --indel-bias 0.7 --poly-mqual --seqq-offset 140 \\\n" +" --indel-size 80 --score-vs-ref 0.3\n\n" +"\n"); +} + int main_mpileup(int argc, char *argv[]) { int c; @@ -1350,8 +1368,9 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; - mplp.indel_win_size = 80; + mplp.indel_win_size = 110; mplp.poly_mqual = 0; + mplp.seqQ_offset = 120; mplp.clevel = -1; mplp.del_bias = 0; // even insertion and deletion likelhoods. 
hts_srand48(0); @@ -1412,8 +1431,8 @@ int main_mpileup(int argc, char *argv[]) {"indel-bias", required_argument, NULL, 10}, {"indel-size", required_argument, NULL, 15}, {"indels-2.0", no_argument, NULL, 20}, - {"edlib", no_argument, NULL, 22}, - {"no-edlib", no_argument, NULL, 25}, + {"indels-cns", no_argument, NULL, 22}, + {"no-indels-cns", no_argument, NULL, 25}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1431,6 +1450,7 @@ int main_mpileup(int argc, char *argv[]) {"poly-mqual", no_argument, NULL, 24}, {"no-poly-mqual", no_argument, NULL, 26}, {"score-vs-ref",required_argument, NULL, 27}, + {"seqq-offset", required_argument, NULL, 28}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { @@ -1557,10 +1577,17 @@ int main_mpileup(int argc, char *argv[]) } } break; - case 20: mplp.indels_v20 = 1; break; + case 20: mplp.indels_v20 = 1; mplp.edlib = 0; break; case 21: mplp.write_index = 1; break; - case 22: mplp.edlib = 1; break; + case 22: mplp.edlib = 1; mplp.indels_v20 = 0; break; case 25: mplp.edlib = 0; break; + case 28: + mplp.seqQ_offset = atoi(optarg); + if (mplp.seqQ_offset < 100) + mplp.seqQ_offset = 100; + if (mplp.seqQ_offset > 200) + mplp.seqQ_offset = 200; + break; case 23: mplp.del_bias = atof(optarg); break; case 24: mplp.poly_mqual = 1; break; case 26: mplp.poly_mqual = 0; break; @@ -1587,7 +1614,7 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag |= MPLP_REALN_PARTIAL; mplp.max_read_len = 99999; - mplp.indel_win_size = 110; + } else if (strcasecmp(optarg, "pacbio-ccs") == 0 || strcasecmp(optarg, "pacbio-ccs-1.20") == 0) { mplp.min_frac = 0.1; @@ -1599,10 +1626,13 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag &= ~MPLP_REALN; mplp.del_bias = 0.4; - mplp.indel_bias = 1.2; + mplp.indel_bias = 1/.9; + mplp.seqQ_offset = 118; 
mplp.poly_mqual = 1; mplp.edlib = 1; mplp.vs_ref = 0.7; + mplp.indel_win_size = 80; + } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "With old ONT data may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); @@ -1610,6 +1640,7 @@ int main_mpileup(int argc, char *argv[]) mplp.max_baseQ = 30; mplp.flag &= ~MPLP_REALN; mplp.flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "ont-sup") == 0 || strcasecmp(optarg, "ont-sup-1.20") == 0) { mplp.min_frac = 0.2; @@ -1618,12 +1649,17 @@ int main_mpileup(int argc, char *argv[]) mplp.delta_baseQ = 99; mplp.openQ = 15; mplp.extQ = 1; - mplp.tandemQ = 80; mplp.flag &= ~MPLP_REALN; mplp.max_read_len = 9999999; mplp.del_bias = 0.4; mplp.poly_mqual = 1; mplp.edlib = 1; + // If we increase -h then we can increase bias denominator too + mplp.tandemQ = 110; + mplp.indel_bias = 1/0.7; + mplp.seqQ_offset = 130; + mplp.indel_win_size = 80; + } else if (strcasecmp(optarg, "ultima") == 0 || strcasecmp(optarg, "ultima-1.20") == 0) { mplp.min_frac = 0.15; @@ -1637,7 +1673,11 @@ int main_mpileup(int argc, char *argv[]) mplp.del_bias = 0.3; mplp.poly_mqual = 1; mplp.edlib = 1; + mplp.indel_bias = 1/0.7; + mplp.seqQ_offset = 140; mplp.vs_ref = 0.3; + mplp.indel_win_size = 80; + } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier mplp.min_frac = 0.002; @@ -1646,24 +1686,36 @@ int main_mpileup(int argc, char *argv[]) mplp.tandemQ = 100; mplp.flag &= ~MPLP_REALN_PARTIAL; mplp.flag |= MPLP_REALN; + } else if (strcasecmp(optarg, "illumina-1.18") == 0) { mplp.indel_win_size = 110; mplp.flag |= MPLP_REALN_PARTIAL; + } else if (strcasecmp(optarg, "illumina") == 0 || strcasecmp(optarg, "illumina-1.20") == 0) { mplp.edlib = 1; mplp.indel_win_size = 110; mplp.flag |= MPLP_REALN_PARTIAL; + mplp.indel_bias = 1; + mplp.seqQ_offset = 125; + //mplp.indel_win_size = 80; TEST? 
+ } else if (strcasecmp(optarg, "bgi") == 0 || strcasecmp(optarg, "bgi-1.20") == 0) { - // Largely as per Illumina + mplp.min_frac = 0.1; mplp.edlib = 1; - mplp.indel_win_size = 110; - mplp.indel_bias = 0.9; + mplp.indel_bias = 1; + mplp.seqQ_offset = 120; mplp.flag |= MPLP_REALN_PARTIAL; + mplp.indel_win_size = 80; + + } else if (strcasecmp(optarg, "list") == 0 || + strcasecmp(optarg, "help") == 0) { + print_profiles(); + return 1; } else { fprintf(stderr, "Unknown configuration name '%s'\n" - "Please choose from 1.12, illumina, pacbio-ccs or ont\n", + "Please use '-X list' to show available choices.\n", optarg); return 1; } diff --git a/test/mpileup/indel-AD.1cns.out b/test/mpileup/indel-AD.1cns.out new file mode 100644 index 000000000..21c38bddd --- /dev/null +++ b/test/mpileup/indel-AD.1cns.out @@ -0,0 +1,322 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +000000F 392 . C <*> 0 . DP=1;I16=1,0,0,0,32,1024,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,32:1,0 +000000F 393 . A <*> 0 . DP=1;I16=1,0,0,0,32,1024,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,32:1,0 +000000F 394 . T <*> 0 . DP=1;I16=1,0,0,0,32,1024,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,32:1,0 +000000F 395 . G <*> 0 . DP=3;I16=1,2,0,0,76,2242,0,0,180,10800,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,73:3,0 +000000F 396 . T <*> 0 . DP=3;I16=1,2,0,0,86,2882,0,0,180,10800,0,0,6,18,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,83:3,0 +000000F 397 . A <*> 0 . DP=3;I16=1,2,0,0,80,2554,0,0,180,10800,0,0,9,33,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,77:3,0 +000000F 398 . C <*> 0 . DP=3;I16=1,2,0,0,75,2309,0,0,180,10800,0,0,12,54,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,72:3,0 +000000F 399 . T <*> 0 . DP=3;I16=1,2,0,0,86,2582,0,0,180,10800,0,0,15,81,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,82:3,0 +000000F 400 . C T,<*> 0 . 
DP=3;I16=1,1,0,1,63,2165,12,144,120,7200,60,3600,13,89,5,25;QS=0.84,0.16,0;SGB=-0.379885;RPBZ=-0.707107;MQBZ=0;MQSBZ=0;BQBZ=-1.22474;SCBZ=0;MQ0F=0 PL:AD 3,0,54,9,57,61:2,1,0 +000000F 401 . G <*> 0 . DP=3;I16=1,2,0,0,65,1969,0,0,180,10800,0,0,21,153,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,62:3,0 +000000F 402 . C <*> 0 . DP=5;I16=1,3,0,0,83,2041,0,0,240,14400,0,0,24,198,0,0;QS=1,0;MQ0F=0 PL:AD 0,12,75:4,0 +000000F 403 . A <*> 0 . DP=5;I16=2,3,0,0,102,2818,0,0,300,18000,0,0,29,251,0,0;QS=1,0;MQ0F=0 PL:AD 0,15,91:5,0 +000000F 404 . T <*> 0 . DP=5;I16=2,3,0,0,141,4875,0,0,300,18000,0,0,34,314,0,0;QS=1,0;MQ0F=0 PL:AD 0,15,127:5,0 +000000F 405 . G <*> 0 . DP=6;I16=3,3,0,0,175,6043,0,0,360,21600,0,0,39,387,0,0;QS=1,0;MQ0F=0 PL:AD 0,18,150:6,0 +000000F 406 . A T,<*> 0 . DP=6;I16=3,2,0,1,143,5019,12,144,300,18000,60,3600,34,350,11,121;QS=0.922581,0.0774194,0;SGB=-0.379885;RPBZ=0.603023;MQBZ=0;MQSBZ=0;BQBZ=-0.948683;SCBZ=-0.447214;MQ0F=0 PL:AD 0,5,117,15,120,122:5,1,0 +000000F 407 . G <*> 0 . DP=6;I16=3,3,0,0,170,5748,0,0,360,21600,0,0,51,567,0,0;QS=1,0;MQ0F=0 PL:AD 0,18,148:6,0 +000000F 408 . A <*> 0 . DP=6;I16=3,3,0,0,150,4818,0,0,360,21600,0,0,57,675,0,0;QS=1,0;MQ0F=0 PL:AD 0,18,130:6,0 +000000F 409 . G <*> 0 . DP=6;I16=3,3,0,0,194,6940,0,0,360,21600,0,0,63,795,0,0;QS=1,0;MQ0F=0 PL:AD 0,18,166:6,0 +000000F 410 . T <*> 0 . DP=7;I16=4,3,0,0,157,4587,0,0,420,25200,0,0,88,1288,0,0;QS=1,0;MQ0F=0 PL:AD 0,21,132:7,0 +000000F 411 . T <*> 0 . DP=8;I16=5,3,0,0,255,8901,0,0,480,28800,0,0,115,1871,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,196:8,0 +000000F 412 . A G,<*> 0 . DP=8;I16=5,2,0,1,235,8721,8,64,420,25200,60,3600,113,2009,10,100;QS=0.967078,0.0329218,0;SGB=-0.379885;RPBZ=0;MQBZ=0;MQSBZ=0;BQBZ=-1.62747;SCBZ=-0.75;MQ0F=0 PL:AD 0,14,175,21,178,176:7,1,0 +000000F 413 . T <*> 0 . DP=9;I16=5,4,0,0,288,10546,0,0,540,32400,0,0,131,2363,0,0;QS=1,0;MQ0F=0 PL:AD 0,27,214:9,0 +000000F 414 . G <*> 0 . DP=9;I16=5,4,0,0,317,12083,0,0,540,32400,0,0,140,2634,0,0;QS=1,0;MQ0F=0 PL:AD 0,27,236:9,0 +000000F 415 . 
T G,<*> 0 . DP=9;I16=5,3,0,1,291,10937,22,484,480,28800,60,3600,136,2754,13,169;QS=0.929712,0.0702875,0;SGB=-0.379885;RPBZ=0.195283;MQBZ=0;MQSBZ=0;BQBZ=-1.5;SCBZ=-0.690269;MQ0F=0 PL:AD 0,5,202,24,205,214:8,1,0 +000000F 416 . G C,<*> 0 . DP=10;I16=6,3,0,1,324,11944,12,144,540,32400,60,3600,144,3034,14,196;QS=0.964286,0.0357143,0;SGB=-0.379885;RPBZ=0.350285;MQBZ=0;MQSBZ=0;BQBZ=-1.62698;SCBZ=-0.642529;MQ0F=0 PL:AD 0,17,225,27,228,228:9,1,0 +000000F 417 . T <*> 0 . DP=10;I16=6,4,0,0,364,13992,0,0,600,36000,0,0,166,3454,0,0;QS=1,0;MQ0F=0 PL:AD 0,30,255:10,0 +000000F 418 . A <*> 0 . DP=10;I16=6,4,0,0,360,13680,0,0,600,36000,0,0,173,3643,0,0;QS=1,0;MQ0F=0 PL:AD 0,30,254:10,0 +000000F 419 . A <*> 0 . DP=10;I16=6,4,0,0,344,13112,0,0,600,36000,0,0,180,3846,0,0;QS=1,0;MQ0F=0 PL:AD 0,30,242:10,0 +000000F 420 . A C,<*> 0 . DP=11;I16=7,3,0,1,306,11440,12,144,600,36000,60,3600,190,4180,18,324;QS=0.962382,0.0376176,0;SGB=-0.379885;RPBZ=0.476513;MQBZ=0;MQSBZ=0;BQBZ=-1.00766;SCBZ=-0.73252;MQ0F=0 PL:AD 0,20,212,30,215,215:10,1,0 +000000F 421 . A <*> 0 . DP=12;I16=8,4,0,0,388,13950,0,0,720,43200,0,0,236,5160,0,0;QS=1,0;MQ0F=0 PL:AD 0,36,255:12,0 +000000F 422 . C <*> 0 . DP=13;I16=8,5,0,0,484,18266,0,0,780,46800,0,0,243,5389,0,0;QS=1,0;MQ0F=0 PL:AD 0,39,255:13,0 +000000F 423 . A <*> 0 . DP=15;I16=9,6,0,0,509,18579,0,0,900,54000,0,0,266,5858,0,0;QS=1,0;MQ0F=0 PL:AD 0,45,255:15,0 +000000F 424 . G <*> 0 . DP=15;I16=9,6,0,0,516,19368,0,0,900,54000,0,0,276,6150,0,0;QS=1,0;MQ0F=0 PL:AD 0,45,255:15,0 +000000F 425 . A <*> 0 . DP=15;I16=9,6,0,0,516,19668,0,0,900,54000,0,0,284,6360,0,0;QS=1,0;MQ0F=0 PL:AD 0,45,255:15,0 +000000F 426 . G T,<*> 0 . DP=15;I16=9,5,0,1,519,20109,12,144,840,50400,60,3600,288,6570,4,16;QS=0.977401,0.0225989,0;SGB=-0.379885;RPBZ=-1.28103;MQBZ=0;MQSBZ=0;BQBZ=-1.70698;SCBZ=-0.879593;MQ0F=0 PL:AD 0,31,255,42,255,255:14,1,0 +000000F 427 . C <*> 0 . DP=15;I16=9,6,0,0,550,21100,0,0,900,54000,0,0,300,6828,0,0;QS=1,0;MQ0F=0 PL:AD 0,45,255:15,0 +000000F 428 . T <*> 0 . 
DP=15;I16=9,6,0,0,521,19363,0,0,900,54000,0,0,306,6984,0,0;QS=1,0;MQ0F=0 PL:AD 0,45,255:15,0 +000000F 429 . C A,<*> 0 . DP=17;I16=10,6,0,1,533,19507,12,144,960,57600,60,3600,287,6527,25,625;QS=0.977982,0.0220183,0;SGB=-0.379885;RPBZ=1.33584;MQBZ=0;MQSBZ=0;BQBZ=-1.55274;SCBZ=-0.887279;MQ0F=0 PL:AD 0,37,255,48,255,255:16,1,0 +000000F 430 . A <*> 0 . DP=18;I16=10,8,0,0,583,20803,0,0,1080,64800,0,0,320,7334,0,0;QS=1,0;MQ0F=0 PL:AD 0,54,255:18,0 +000000F 431 . A <*> 0 . DP=19;I16=10,8,0,0,616,23012,0,0,1080,64800,0,0,328,7482,0,0;QS=1,0;MQ0F=0 PL:AD 0,54,255:18,0 +000000F 432 . T <*> 0 . DP=19;I16=11,8,0,0,682,26264,0,0,1140,68400,0,0,337,7647,0,0;QS=1,0;MQ0F=0 PL:AD 0,57,255:19,0 +000000F 433 . T <*> 0 . DP=19;I16=11,8,0,0,696,26452,0,0,1140,68400,0,0,346,7830,0,0;QS=1,0;MQ0F=0 PL:AD 0,57,255:19,0 +000000F 434 . T <*> 0 . DP=19;I16=11,8,0,0,678,25728,0,0,1140,68400,0,0,354,7980,0,0;QS=1,0;MQ0F=0 PL:AD 0,57,255:19,0 +000000F 435 . T <*> 0 . DP=20;I16=11,9,0,0,717,26931,0,0,1200,72000,0,0,362,8146,0,0;QS=1,0;MQ0F=0 PL:AD 0,60,255:20,0 +000000F 436 . A C,<*> 0 . DP=21;I16=11,9,0,1,728,28034,12,144,1200,72000,60,3600,346,7704,25,625;QS=0.983784,0.0162162,0;SGB=-0.379885;RPBZ=0.909477;MQBZ=0;MQSBZ=0;BQBZ=-1.66309;SCBZ=-0.82685;MQ0F=0 PL:AD 0,49,255,60,255,255:20,1,0 +000000F 437 . T <*> 0 . DP=21;I16=11,10,0,0,770,29698,0,0,1260,75600,0,0,381,8531,0,0;QS=1,0;MQ0F=0 PL:AD 0,63,255:21,0 +000000F 438 . T <*> 0 . DP=22;I16=11,10,0,0,726,26976,0,0,1260,75600,0,0,391,8753,0,0;QS=1,0;MQ0F=0 PL:AD 0,63,255:21,0 +000000F 439 . T <*> 0 . DP=25;I16=14,10,0,0,804,30038,0,0,1440,86400,0,0,444,9912,0,0;QS=1,0;MQ0F=0 PL:AD 0,72,255:24,0 +000000F 440 . T A,<*> 0 . DP=27;I16=15,11,1,0,896,33856,12,144,1560,93600,60,3600,455,10163,25,625;QS=0.986784,0.0132159,0;SGB=-0.379885;RPBZ=1.28585;MQBZ=0;MQSBZ=0;BQBZ=-1.41134;SCBZ=0.48519;MQ0F=0 PL:AD 0,66,255,78,255,255:26,1,0 +000000F 441 . G <*> 0 . 
DP=29;I16=17,11,0,0,1033,39137,0,0,1680,100800,0,0,495,11163,0,0;QS=1,0;MQ0F=0 PL:AD 0,84,255:28,0 +000000F 442 . T <*> 0 . DP=29;I16=18,11,0,0,1067,41223,0,0,1740,104400,0,0,531,11951,0,0;QS=1,0;MQ0F=0 PL:AD 0,87,255:29,0 +000000F 443 . A <*> 0 . DP=31;I16=20,11,0,0,1078,40562,0,0,1860,111600,0,0,544,12226,0,0;QS=1,0;MQ0F=0 PL:AD 0,93,255:31,0 +000000F 444 . T <*> 0 . DP=31;I16=20,11,0,0,1124,43000,0,0,1860,111600,0,0,559,12529,0,0;QS=1,0;MQ0F=0 PL:AD 0,93,255:31,0 +000000F 445 . T <*> 0 . DP=32;I16=20,11,0,0,1050,38808,0,0,1860,111600,0,0,574,12862,0,0;QS=1,0;MQ0F=0 PL:AD 0,93,255:31,0 +000000F 446 . T <*> 0 . DP=34;I16=22,11,0,0,1205,45921,0,0,1980,118800,0,0,611,13703,0,0;QS=1,0;MQ0F=0 PL:AD 0,99,255:33,0 +000000F 447 . A <*> 0 . DP=35;I16=23,12,0,0,1177,43259,0,0,2100,126000,0,0,650,14620,0,0;QS=1,0;MQ0F=0 PL:AD 0,105,255:35,0 +000000F 448 . T <*> 0 . DP=35;I16=23,12,0,0,1220,45736,0,0,2100,126000,0,0,667,15037,0,0;QS=1,0;MQ0F=0 PL:AD 0,105,255:35,0 +000000F 449 . T <*> 0 . DP=35;I16=23,12,0,0,1249,47703,0,0,2100,126000,0,0,679,15245,0,0;QS=1,0;MQ0F=0 PL:AD 0,105,255:35,0 +000000F 450 . G <*> 0 . DP=38;I16=25,13,0,0,1449,56453,0,0,2280,136800,0,0,715,16059,0,0;QS=1,0;MQ0F=0 PL:AD 0,114,255:38,0 +000000F 451 . T A,<*> 0 . DP=43;I16=24,16,0,1,1485,57079,12,144,2400,144000,60,3600,708,15882,0,0;QS=0.991984,0.00801603,0;SGB=-0.379885;RPBZ=-1.56545;MQBZ=0;MQSBZ=0;BQBZ=-1.91213;SCBZ=-0.839032;MQ0F=0 PL:AD 0,107,255,120,255,255:40,1,0 +000000F 452 . G T,<*> 0 . DP=45;I16=24,18,0,1,1541,58723,12,144,2520,151200,60,3600,707,15841,23,529;QS=0.992273,0.00772698,0;SGB=-0.379885;RPBZ=0.645561;MQBZ=0;MQSBZ=0;BQBZ=-1.75577;SCBZ=0.629801;MQ0F=0 PL:AD 0,113,255,126,255,255:42,1,0 +000000F 453 . C <*> 0 . DP=45;I16=24,19,0,0,1623,65335,0,0,2580,154800,0,0,753,16853,0,0;QS=1,0;MQ0F=0 PL:AD 0,129,255:43,0 +000000F 454 . A C,<*> 0 . 
DP=47;I16=24,20,1,0,1699,68295,12,144,2640,158400,60,3600,774,17286,25,625;QS=0.992987,0.00701344,0;SGB=-0.379885;RPBZ=1.46482;MQBZ=0;MQSBZ=0;BQBZ=-1.87433;SCBZ=0.550575;MQ0F=0 PL:AD 0,119,255,132,255,255:44,1,0 +000000F 455 . G <*> 0 . DP=48;I16=26,20,0,0,1807,74553,0,0,2760,165600,0,0,823,18385,0,0;QS=1,0;MQ0F=0 PL:AD 0,138,255:46,0 +000000F 456 . T <*> 0 . DP=49;I16=26,21,0,0,1757,69583,0,0,2820,169200,0,0,845,18853,0,0;QS=1,0;MQ0F=0 PL:AD 0,141,255:47,0 +000000F 457 . T <*> 0 . DP=49;I16=26,21,0,0,1836,73528,0,0,2820,169200,0,0,866,19264,0,0;QS=1,0;MQ0F=0 PL:AD 0,141,255:47,0 +000000F 458 . A <*> 0 . DP=50;I16=26,22,0,0,1817,72337,0,0,2880,172800,0,0,887,19717,0,0;QS=1,0;MQ0F=0 PL:AD 0,144,255:48,0 +000000F 459 . G <*> 0 . DP=50;I16=26,22,0,0,1791,70171,0,0,2880,172800,0,0,909,20213,0,0;QS=1,0;MQ0F=0 PL:AD 0,144,255:48,0 +000000F 460 . A <*> 0 . DP=49;I16=26,22,0,0,1675,63873,0,0,2880,172800,0,0,923,20689,0,0;QS=1,0;MQ0F=0 PL:AD 0,144,255:48,0 +000000F 461 . A T,C,<*> 0 . DP=52;I16=25,22,1,1,1650,63298,24,288,2820,169200,120,7200,927,21061,17,145;QS=0.985663,0.00716846,0.00716846,0;VDB=0.06;SGB=-0.453602;RPBZ=-1.92231;MQBZ=0;MQSBZ=0;BQBZ=-2.2578;SCBZ=-0.055422;MQ0F=0 PL:AD 0,128,255,128,255,255,141,255,255,255:47,1,1,0 +000000F 462 . A T,<*> 0 . DP=52;I16=26,22,0,1,1705,65163,12,144,2880,172800,60,3600,955,21615,10,100;QS=0.993011,0.00698893,0;SGB=-0.379885;RPBZ=-1.09702;MQBZ=0;MQSBZ=0;BQBZ=-1.74927;SCBZ=-0.853241;MQ0F=0 PL:AD 0,130,255,144,255,255:48,1,0 +000000F 463 . A <*> 0 . DP=52;I16=26,24,0,0,1836,71956,0,0,3000,180000,0,0,996,22366,0,0;QS=1,0;MQ0F=0 PL:AD 0,151,255:50,0 +000000F 464 . T G,<*> 0 . DP=52;I16=26,23,0,1,1873,73813,12,144,2940,176400,60,3600,993,22355,25,625;QS=0.993634,0.00636605,0;SGB=-0.379885;RPBZ=0.173386;MQBZ=0;MQSBZ=0;BQBZ=-1.93908;SCBZ=-0.840331;MQ0F=0 PL:AD 0,133,255,148,255,255:49,1,0 +000000F 465 . A <*> 0 . DP=52;I16=26,24,0,0,1950,78232,0,0,3000,180000,0,0,1039,23587,0,0;QS=1,0;MQ0F=0 PL:AD 0,151,255:50,0 +000000F 466 . 
A C,<*> 0 . DP=52;I16=25,24,1,0,1858,74740,12,144,2940,176400,60,3600,1033,23509,25,625;QS=0.993583,0.00641711,0;SGB=-0.379885;RPBZ=-1.24865;MQBZ=0;MQSBZ=0;BQBZ=-1.79438;SCBZ=1.87165;MQ0F=0 PL:AD 0,133,255,148,255,255:49,1,0 +000000F 467 . T <*> 0 . DP=53;I16=26,25,0,0,1890,74962,0,0,3060,183600,0,0,1076,24668,0,0;QS=1,0;MQ0F=0 PL:AD 0,154,255:51,0 +000000F 468 . A <*> 0 . DP=54;I16=27,25,0,0,1967,78281,0,0,3120,187200,0,0,1095,25239,0,0;QS=1,0;MQ0F=0 PL:AD 0,157,255:52,0 +000000F 469 . T <*> 0 . DP=54;I16=27,25,0,0,2046,83348,0,0,3120,187200,0,0,1113,25747,0,0;QS=1,0;MQ0F=0 PL:AD 0,157,255:52,0 +000000F 470 . G <*> 0 . DP=55;I16=28,25,0,0,2047,81939,0,0,3180,190800,0,0,1131,26291,0,0;QS=1,0;MQ0F=0 PL:AD 0,160,255:53,0 +000000F 471 . A <*> 0 . DP=56;I16=28,26,0,0,2028,80100,0,0,3240,194400,0,0,1150,26872,0,0;QS=1,0;MQ0F=0 PL:AD 0,163,255:54,0 +000000F 472 . T <*> 0 . DP=59;I16=29,28,0,0,2122,83726,0,0,3420,205200,0,0,1191,27925,0,0;QS=1,0;MQ0F=0 PL:AD 0,172,255:57,0 +000000F 473 . C A,G,<*> 0 . DP=63;I16=19,15,12,13,1263,49285,963,39221,2040,122400,1500,90000,705,16449,548,12960;QS=0.567385,0.427224,0.00539084,0;VDB=1.64281e-06;SGB=-0.692914;RPBZ=1.05908;MQBZ=0;MQSBZ=0;BQBZ=1.40915;SCBZ=0.597801;MQ0F=0 PL:AD 255,0,255,255,255,255,255,255,255,255:34,24,1,0 +000000F 474 . A <*> 0 . DP=64;I16=33,28,0,0,2172,84276,0,0,3660,219600,0,0,1296,30488,0,0;QS=1,0;MQ0F=0 PL:AD 0,184,255:61,0 +000000F 475 . A <*> 0 . DP=64;I16=34,28,0,0,2092,79478,0,0,3720,223200,0,0,1322,31258,0,0;QS=1,0;MQ0F=0 PL:AD 0,187,255:62,0 +000000F 476 . A G,<*> 0 . DP=65;I16=34,28,0,1,2176,83678,12,144,3720,223200,60,3600,1319,31299,25,625;QS=0.994516,0.00548446,0;SGB=-0.379885;RPBZ=1.04564;MQBZ=0;MQSBZ=0;BQBZ=-1.59705;SCBZ=0.713967;MQ0F=0 PL:AD 0,172,255,187,255,255:62,1,0 +000000F 477 . T G,<*> 0 . 
DP=65;I16=34,28,0,1,2352,93436,12,144,3720,223200,60,3600,1338,31806,25,625;QS=0.994924,0.00507614,0;SGB=-0.379885;RPBZ=-0.687896;MQBZ=0;MQSBZ=0;BQBZ=-1.84472;SCBZ=-0.892396;MQ0F=0 PL:AD 0,172,255,187,255,255:62,1,0 +000000F 478 . C <*> 0 . DP=65;I16=34,29,0,0,2376,94566,0,0,3780,226800,0,0,1381,32925,0,0;QS=1,0;MQ0F=0 PL:AD 0,190,255:63,0 +000000F 479 . T G,<*> 0 . DP=68;I16=36,29,0,1,2467,98357,12,144,3900,234000,60,3600,1416,33712,21,441;QS=0.995159,0.00484066,0;SGB=-0.379885;RPBZ=-0.787962;MQBZ=0;MQSBZ=0;BQBZ=-1.86106;SCBZ=-0.927153;MQ0F=0 PL:AD 0,180,255,196,255,255:65,1,0 +000000F 480 . G <*> 0 . DP=70;I16=38,30,0,0,2563,101527,0,0,4080,244800,0,0,1495,35435,0,0;QS=1,0;MQ0F=0 PL:AD 0,205,255:68,0 +000000F 481 . T <*> 0 . DP=70;I16=38,30,0,0,2521,98551,0,0,4080,244800,0,0,1514,35994,0,0;QS=1,0;MQ0F=0 PL:AD 0,205,255:68,0 +000000F 482 . T G,<*> 0 . DP=70;I16=37,30,1,0,2575,103005,12,144,4020,241200,60,3600,1518,36344,14,196;QS=0.995361,0.00463858,0;SGB=-0.379885;RPBZ=-0.892198;MQBZ=0;MQSBZ=0;BQBZ=-1.93937;SCBZ=-0.948507;MQ0F=0 PL:AD 0,186,255,202,255,255:67,1,0 +000000F 483 . T <*> 0 . DP=70;I16=38,30,0,0,2646,106206,0,0,4080,244800,0,0,1549,37071,0,0;QS=1,0;MQ0F=0 PL:AD 0,205,255:68,0 +000000F 484 . G <*> 0 . DP=70;I16=38,30,0,0,2659,107225,0,0,4080,244800,0,0,1565,37585,0,0;QS=1,0;MQ0F=0 PL:AD 0,205,255:68,0 +000000F 485 . T <*> 0 . DP=70;I16=38,30,0,0,2618,104932,0,0,4080,244800,0,0,1578,37978,0,0;QS=1,0;MQ0F=0 PL:AD 0,205,255:68,0 +000000F 486 . T <*> 0 . DP=70;I16=38,30,0,0,2589,103545,0,0,4080,244800,0,0,1589,38295,0,0;QS=1,0;MQ0F=0 PL:AD 0,205,255:68,0 +000000F 487 . T <*> 0 . DP=73;I16=38,33,0,0,2694,107284,0,0,4260,255600,0,0,1599,38583,0,0;QS=1,0;MQ0F=0 PL:AD 0,214,255:71,0 +000000F 488 . A G,<*> 0 . DP=76;I16=41,32,0,1,2640,102296,8,64,4380,262800,60,3600,1679,40447,1,1;QS=0.996979,0.00302115,0;SGB=-0.379885;RPBZ=-1.52228;MQBZ=0;MQSBZ=0;BQBZ=-1.89405;SCBZ=-0.946449;MQ0F=0 PL:AD 0,204,255,220,255,255:73,1,0 +000000F 489 . C A,<*> 0 . 
DP=79;I16=43,33,0,1,2694,103126,12,144,4560,273600,60,3600,1715,41273,25,625;QS=0.995565,0.00443459,0;SGB=-0.379885;RPBZ=-0.495167;MQBZ=0;MQSBZ=0;BQBZ=-1.58784;SCBZ=-0.974263;MQ0F=0 PL:AD 0,213,255,229,255,255:76,1,0 +000000F 490 . C <*> 0 . DP=80;I16=43,35,0,0,2796,107040,0,0,4680,280800,0,0,1782,43020,0,0;QS=1,0;MQ0F=0 PL:AD 0,235,255:78,0 +000000F 491 . T <*> 0 . DP=80;I16=43,35,0,0,2910,113818,0,0,4680,280800,0,0,1798,43500,0,0;QS=1,0;MQ0F=0 PL:AD 0,235,255:78,0 +000000F 492 . G <*> 0 . DP=81;I16=43,35,0,0,3089,125687,0,0,4680,280800,0,0,1814,44012,0,0;QS=1,0;MQ0F=0 PL:AD 0,235,255:78,0 +000000F 493 . T <*> 0 . DP=81;I16=43,35,0,0,2986,119178,0,0,4680,280800,0,0,1829,44505,0,0;QS=1,0;MQ0F=0 PL:AD 0,235,255:78,0 +000000F 494 . G <*> 0 . DP=81;I16=43,35,0,0,3060,125030,0,0,4680,280800,0,0,1842,44926,0,0;QS=1,0;MQ0F=0 PL:AD 0,235,255:78,0 +000000F 495 . T <*> 0 . DP=84;I16=45,36,0,0,3053,121389,0,0,4860,291600,0,0,1898,46292,0,0;QS=1,0;MQ0F=0 PL:AD 0,244,255:81,0 +000000F 496 . T <*> 0 . DP=87;I16=47,37,0,0,3238,128760,0,0,5040,302400,0,0,1917,46777,0,0;QS=1,0;MQ0F=0 PL:AD 0,253,255:84,0 +000000F 497 . G <*> 0 . DP=87;I16=47,37,0,0,3280,132640,0,0,5040,302400,0,0,1933,47227,0,0;QS=1,0;MQ0F=0 PL:AD 0,253,255:84,0 +000000F 498 . T <*> 0 . DP=88;I16=48,37,0,0,3081,120187,0,0,5100,306000,0,0,1971,48181,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:85,0 +000000F 499 . T C,<*> 0 . DP=89;I16=48,36,0,1,3190,126958,8,64,5040,302400,60,3600,1957,47809,25,625;QS=0.997498,0.00250156,0;SGB=-0.379885;RPBZ=-0.305833;MQBZ=0;MQSBZ=0;BQBZ=-1.95559;SCBZ=-1.05778;MQ0F=0 PL:AD 0,237,255,253,255,255:84,1,0 +000000F 500 . T G,A,<*> 0 . DP=92;I16=50,37,1,1,3308,133104,24,288,5220,313200,120,7200,1986,48376,50,1250;QS=0.992797,0.00360144,0.00360144,0;VDB=0.32;SGB=-0.453602;RPBZ=0.41539;MQBZ=0;MQSBZ=0;BQBZ=-2.60206;SCBZ=-1.0009;MQ0F=0 PL:AD 0,245,255,245,255,255,255,255,255,255:87,1,1,0 +000000F 501 . G <*> 0 . 
DP=97;I16=53,40,0,0,3485,138049,0,0,5580,334800,0,0,2050,49962,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:93,0 +000000F 502 . T A,<*> 0 . DP=98;I16=53,40,1,0,3391,132771,12,144,5580,334800,60,3600,2065,50183,25,625;QS=0.996476,0.00352423,0;SGB=-0.379885;RPBZ=1.67751;MQBZ=0;MQSBZ=0;BQBZ=-1.7355;SCBZ=0.856937;MQ0F=0 PL:AD 0,255,255,255,255,255:93,1,0 +000000F 503 . G <*> 0 . DP=98;I16=54,40,0,0,3540,140610,0,0,5640,338400,0,0,2108,51206,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:94,0 +000000F 504 . T C,A,<*> 0 . DP=102;I16=53,42,1,1,3464,137680,20,208,5700,342000,120,7200,2099,50913,25,625;QS=0.994259,0.00344432,0.00229621,0;VDB=0.92;SGB=-0.453602;RPBZ=0.0126977;MQBZ=0;MQSBZ=0;BQBZ=-2.43319;SCBZ=0.512446;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:95,1,1,0 +000000F 505 . G <*> 0 . DP=102;I16=54,43,0,0,3640,144678,0,0,5820,349200,0,0,2141,51803,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:97,0 +000000F 506 . C G,<*> 0 . DP=103;I16=54,43,0,1,3621,145273,12,144,5820,349200,60,3600,2138,51502,25,625;QS=0.996697,0.00330306,0;SGB=-0.379885;RPBZ=0.194473;MQBZ=0;MQSBZ=0;BQBZ=-1.75415;SCBZ=-1.07827;MQ0F=0 PL:AD 0,255,255,255,255,255:97,1,0 +000000F 507 . T <*> 0 . DP=103;I16=54,44,0,0,3520,139202,0,0,5880,352800,0,0,2181,52471,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:98,0 +000000F 508 . C T,<*> 0 . DP=103;I16=54,43,0,1,3476,137498,12,144,5820,349200,60,3600,2174,52226,25,625;QS=0.99656,0.00344037,0;SGB=-0.379885;RPBZ=0.265217;MQBZ=0;MQSBZ=0;BQBZ=-1.63996;SCBZ=-1.07818;MQ0F=0 PL:AD 0,255,255,255,255,255:97,1,0 +000000F 509 . C <*> 0 . DP=103;I16=54,44,0,0,3580,142206,0,0,5880,352800,0,0,2217,53267,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:98,0 +000000F 510 . C A,G,<*> 0 . DP=106;I16=54,43,1,2,3646,145190,42,692,5820,349200,180,10800,2189,52645,67,1515;QS=0.988612,0.00921909,0.0021692,0;VDB=0.401962;SGB=-0.511536;RPBZ=-0.707412;MQBZ=0;MQSBZ=0;BQBZ=-3.02794;SCBZ=-0.99293;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:97,2,1,0 +000000F 511 . T A,<*> 0 . 
DP=107;I16=55,45,0,1,3663,144413,12,144,6000,360000,60,3600,2251,54067,25,625;QS=0.996735,0.00326531,0;SGB=-0.379885;RPBZ=-0.0343102;MQBZ=0;MQSBZ=0;BQBZ=-1.71065;SCBZ=-1.08137;MQ0F=0 PL:AD 0,255,255,255,255,255:100,1,0 +000000F 512 . C A,G,<*> 0 . DP=107;I16=55,43,1,3,3657,146609,54,836,5880,352800,240,14400,2238,53902,83,1939;QS=0.985449,0.0123956,0.00215575,0;VDB=0.681156;SGB=-0.556411;RPBZ=-0.465574;MQBZ=0;MQSBZ=0;BQBZ=-3.45917;SCBZ=-0.667699;MQ0F=0 PL:AD 0,248,255,255,255,255,255,255,255,255:98,3,1,0 +000000F 513 . T G,<*> 0 . DP=108;I16=56,46,0,1,3774,150038,7,49,6120,367200,60,3600,2340,56352,0,0;QS=0.998149,0.00185136,0;SGB=-0.379885;RPBZ=-1.71577;MQBZ=0;MQSBZ=0;BQBZ=-1.90666;SCBZ=-1.07835;MQ0F=0 PL:AD 0,255,255,255,255,255:102,1,0 +000000F 514 . G T,<*> 0 . DP=110;I16=56,47,1,1,3763,148539,20,208,6180,370800,120,7200,2348,56422,25,625;QS=0.994713,0.00528681,0;VDB=0.44;SGB=-0.453602;RPBZ=-1.58286;MQBZ=0;MQSBZ=0;BQBZ=-2.48498;SCBZ=0.0121282;MQ0F=0 PL:AD 0,255,255,255,255,255:103,2,0 +000000F 515 . C G,<*> 0 . DP=110;I16=57,47,0,1,3848,153002,12,144,6240,374400,60,3600,2367,56887,25,625;QS=0.996891,0.00310881,0;SGB=-0.379885;RPBZ=0.263997;MQBZ=0;MQSBZ=0;BQBZ=-1.75323;SCBZ=-1.07538;MQ0F=0 PL:AD 0,255,255,255,255,255:104,1,0 +000000F 516 . G T,<*> 0 . DP=111;I16=57,48,0,1,3969,158573,12,144,6300,378000,60,3600,2411,58015,0,0;QS=0.996986,0.00301432,0;SGB=-0.379885;RPBZ=-1.71626;MQBZ=0;MQSBZ=0;BQBZ=-1.8341;SCBZ=-1.06697;MQ0F=0 PL:AD 0,255,255,255,255,255:105,1,0 +000000F 517 . T C,<*> 0 . DP=112;I16=56,50,1,0,3850,151946,12,144,6360,381600,60,3600,2408,58016,21,441;QS=0.996893,0.0031072,0;SGB=-0.379885;RPBZ=-0.7772;MQBZ=0;MQSBZ=0;BQBZ=-1.67048;SCBZ=0.856997;MQ0F=0 PL:AD 0,255,255,255,255,255:106,1,0 +000000F 518 . G <*> 0 . DP=115;I16=59,50,0,0,3977,155237,0,0,6540,392400,0,0,2447,58891,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:109,0 +000000F 519 . C A,<*> 0 . 
DP=116;I16=60,49,0,1,4018,159736,12,144,6540,392400,60,3600,2443,58747,25,625;QS=0.997022,0.00297767,0;SGB=-0.379885;RPBZ=0.110257;MQBZ=0;MQSBZ=0;BQBZ=-1.72465;SCBZ=-1.03419;MQ0F=0 PL:AD 0,255,255,255,255,255:109,1,0 +000000F 520 . T G,C,A 0 . DP=116;I16=59,48,1,2,4046,162680,48,1152,6420,385200,180,10800,2436,58644,53,1259;QS=0.988276,0.00781632,0.00195408,0.00195408;VDB=0.913952;SGB=-0.511536;RPBZ=0.578224;MQBZ=0;MQSBZ=0;BQBZ=-2.88822;SCBZ=-0.373153;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:107,1,1,1 +000000F 521 . G A,T,<*> 0 . DP=118;I16=61,48,1,2,4131,166701,36,432,6540,392400,180,10800,2454,59246,53,1089;QS=0.991361,0.00575954,0.00287977,0;VDB=0.251321;SGB=-0.511536;RPBZ=-1.60419;MQBZ=0;MQSBZ=0;BQBZ=-3.05428;SCBZ=-0.836529;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:109,2,1,0 +000000F 522 . G <*> 0 . DP=118;I16=62,50,0,0,4076,161128,0,0,6720,403200,0,0,2525,60719,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:112,0 +000000F 523 . C A,<*> 0 . DP=119;I16=63,49,0,1,4174,166860,12,144,6720,403200,60,3600,2518,60530,25,625;QS=0.997133,0.0028667,0;SGB=-0.379885;RPBZ=-0.13799;MQBZ=0;MQSBZ=0;BQBZ=-1.80506;SCBZ=-1.02482;MQ0F=0 PL:AD 0,255,255,255,255,255:112,1,0 +000000F 524 . T G,<*> 0 . DP=119;I16=62,50,1,0,4191,166163,27,729,6720,403200,60,3600,2537,61019,25,625;QS=0.993599,0.00640114,0;SGB=-0.379885;RPBZ=-0.82791;MQBZ=0;MQSBZ=0;BQBZ=-1.39926;SCBZ=0.0480382;MQ0F=0 PL:AD 0,255,255,255,255,255:112,1,0 +000000F 525 . A C,G,<*> 0 . DP=119;I16=62,48,1,2,4098,163296,36,432,6600,396000,180,10800,2509,60447,71,1691;QS=0.991292,0.00580552,0.00290276,0;VDB=0.199299;SGB=-0.511536;RPBZ=-1.60781;MQBZ=0;MQSBZ=0;BQBZ=-2.97339;SCBZ=1.45142;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:110,2,1,0 +000000F 526 . T A,<*> 0 . DP=120;I16=63,49,0,1,4180,166064,12,144,6720,403200,60,3600,2570,61910,25,625;QS=0.997137,0.0028626,0;SGB=-0.379885;RPBZ=-1.2266;MQBZ=0;MQSBZ=0;BQBZ=-1.74833;SCBZ=0.367327;MQ0F=0 PL:AD 0,255,255,255,255,255:112,1,0 +000000F 527 . C G,A,<*> 0 . 
DP=120;I16=62,49,1,1,4246,172572,24,288,6660,399600,120,7200,2557,61573,48,1154;QS=0.994379,0.0028103,0.0028103,0;VDB=0.18;SGB=-0.453602;RPBZ=-1.28504;MQBZ=0;MQSBZ=0;BQBZ=-2.51884;SCBZ=1.10043;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:111,1,1,0 +000000F 528 . A G,<*> 0 . DP=121;I16=62,51,1,0,4295,173733,12,144,6780,406800,60,3600,2588,62236,25,625;QS=0.997214,0.00278616,0;SGB=-0.379885;RPBZ=-0.805477;MQBZ=0;MQSBZ=0;BQBZ=-1.82642;SCBZ=0.0634184;MQ0F=0 PL:AD 0,255,255,255,255,255:113,1,0 +000000F 529 . T C,<*> 0 . DP=123;I16=65,50,0,1,4363,176845,8,64,6900,414000,60,3600,2639,63305,25,625;QS=0.99817,0.00183024,0;SGB=-0.379885;RPBZ=-1.04546;MQBZ=0;MQSBZ=0;BQBZ=-1.9592;SCBZ=-1.04156;MQ0F=0 PL:AD 0,255,255,255,255,255:115,1,0 +000000F 530 . G <*> 0 . DP=123;I16=65,51,0,0,4357,176035,0,0,6960,417600,0,0,2672,64088,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:116,0 +000000F 531 . T A,G,<*> 0 . DP=123;I16=63,49,2,2,4199,169729,48,576,6720,403200,240,14400,2579,61741,100,2500;QS=0.988698,0.00847657,0.00282552,0;VDB=0.0550934;SGB=-0.556411;RPBZ=-0.650816;MQBZ=0;MQSBZ=0;BQBZ=-3.46139;SCBZ=-0.463641;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:112,3,1,0 +000000F 532 . C A,G,<*> 0 . DP=124;I16=65,49,0,3,4352,176182,36,432,6840,410400,180,10800,2640,63492,45,897;QS=0.991796,0.00546946,0.00273473,0;VDB=0.588406;SGB=-0.511536;RPBZ=-1.58686;MQBZ=0;MQSBZ=0;BQBZ=-3.10955;SCBZ=-1.82952;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:114,2,1,0 +000000F 533 . A <*> 0 . DP=124;I16=65,52,0,0,4235,168199,0,0,7020,421200,0,0,2692,64582,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:117,0 +000000F 534 . G A,<*> 0 . DP=124;I16=65,51,0,1,4319,173543,12,144,6960,417600,60,3600,2677,64331,21,441;QS=0.997229,0.00277072,0;SGB=-0.379885;RPBZ=-1.3327;MQBZ=0;MQSBZ=0;BQBZ=-1.72023;SCBZ=-1.04702;MQ0F=0 PL:AD 0,255,255,255,255,255:116,1,0 +000000F 535 . G A,<*> 0 . 
DP=125;I16=65,52,0,1,4309,171791,12,144,7020,421200,60,3600,2679,64385,25,625;QS=0.997223,0.00277713,0;SGB=-0.379885;RPBZ=-1.10121;MQBZ=0;MQSBZ=0;BQBZ=-1.74874;SCBZ=0.0762649;MQ0F=0 PL:AD 0,255,255,255,255,255:117,1,0 +000000F 536 . T G,A,<*> 0 . DP=125;I16=65,51,0,2,4274,171298,24,288,6960,417600,120,7200,2661,64041,48,1154;QS=0.994416,0.002792,0.002792,0;VDB=0.1;SGB=-0.453602;RPBZ=-1.74128;MQBZ=0;MQSBZ=0;BQBZ=-2.39373;SCBZ=-0.693187;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:116,1,1,0 +000000F 537 . A <*> 0 . DP=125;I16=65,53,0,0,4390,175290,0,0,7080,424800,0,0,2713,65375,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:118,0 +000000F 537 . AC A 0 . INDEL;IDV=58;IMF=0.464;DP=125;I16=38,28,31,27,2640,105600,2320,92800,3960,237600,3480,208800,1498,35936,1313,31375;QS=0.532258,0.467742;VDB=0.457405;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;BQBZ=-2.39373;SCBZ=0.641853;MQ0F=0 PL:AD 133,0,139:66,58 +000000F 538 . C <*> 0 . DP=65;I16=36,26,0,0,2195,86349,0,0,3720,223200,0,0,1432,34600,0,0;QS=1,0;MQ0F=0 PL:AD 0,187,255:62,0 +000000F 538 . CT C 0 . INDEL;IDV=65;IMF=0.52;DP=125;I16=31,27,37,28,2320,92800,2600,104000,3480,208800,3900,234000,1318,31556,1480,35714;QS=0.473469,0.526531;VDB=0.00206877;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;BQBZ=-2.39373;SCBZ=-0.994262;MQ0F=0 PL:AD 138,0,133:58,65 +000000F 539 . T <*> 0 . DP=60;I16=29,26,0,0,2120,86238,0,0,3300,198000,0,0,1260,30374,0,0;QS=1,0;MQ0F=0 PL:AD 0,166,255:55,0 +000000F 540 . G <*> 0 . DP=124;I16=64,53,0,0,4130,161310,0,0,7020,421200,0,0,2703,65511,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:117,0 +000000F 541 . G <*> 0 . DP=124;I16=64,53,0,0,4143,160525,0,0,7020,421200,0,0,2705,65703,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:117,0 +000000F 542 . T G,A,<*> 0 . DP=124;I16=63,52,1,1,4035,156631,24,288,6900,414000,120,7200,2657,64685,50,1250;QS=0.994087,0.00295639,0.00295639,0;VDB=0.26;SGB=-0.453602;RPBZ=-1.44078;MQBZ=0;MQSBZ=0;BQBZ=-2.25702;SCBZ=-0.906022;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:115,1,1,0 +000000F 543 . 
C G,A,<*> 0 . DP=122;I16=62,53,1,1,4006,154170,24,288,6900,414000,120,7200,2673,65063,50,1250;QS=0.994045,0.00297767,0.00297767,0;VDB=0.22;SGB=-0.453602;RPBZ=-0.483734;MQBZ=0;MQSBZ=0;BQBZ=-2.21926;SCBZ=0.31726;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:115,1,1,0 +000000F 544 . T A,<*> 0 . DP=121;I16=62,53,0,1,4151,162625,12,144,6900,414000,60,3600,2697,65733,25,625;QS=0.997117,0.00288254,0;SGB=-0.379885;RPBZ=0.238977;MQBZ=0;MQSBZ=0;BQBZ=-1.67238;SCBZ=-1.05505;MQ0F=0 PL:AD 0,255,255,255,255,255:115,1,0 +000000F 545 . G A,<*> 0 . DP=121;I16=62,53,0,1,4078,158586,12,144,6900,414000,60,3600,2710,66228,8,64;QS=0.997066,0.00293399,0;SGB=-0.379885;RPBZ=1.6131;MQBZ=0;MQSBZ=0;BQBZ=-1.61213;SCBZ=-1.05503;MQ0F=0 PL:AD 0,255,255,255,255,255:115,1,0 +000000F 546 . G A,<*> 0 . DP=121;I16=62,53,0,1,4230,164790,12,144,6900,414000,60,3600,2689,65637,25,625;QS=0.997171,0.00282885,0;SGB=-0.379885;RPBZ=-0.97077;MQBZ=0;MQSBZ=0;BQBZ=-1.77555;SCBZ=-1.0549;MQ0F=0 PL:AD 0,255,255,255,255,255:115,1,0 +000000F 547 . A C,<*> 0 . DP=119;I16=62,51,0,1,4105,158737,12,144,6780,406800,60,3600,2686,65592,25,625;QS=0.997085,0.00291474,0;SGB=-0.379885;RPBZ=-0.896646;MQBZ=0;MQSBZ=0;BQBZ=-1.70779;SCBZ=0.0314879;MQ0F=0 PL:AD 0,255,255,255,255,255:113,1,0 +000000F 548 . C G,<*> 0 . DP=119;I16=62,51,0,1,4014,154686,12,144,6780,406800,60,3600,2681,65479,25,625;QS=0.997019,0.00298063,0;SGB=-0.379885;RPBZ=-0.896728;MQBZ=0;MQSBZ=0;BQBZ=-1.67128;SCBZ=0.0314876;MQ0F=0 PL:AD 0,255,255,255,255,255:113,1,0 +000000F 549 . C A,<*> 0 . DP=117;I16=33,29,29,22,2214,83510,1893,75363,3720,223200,3060,183600,1463,35977,1212,29370;QS=0.53908,0.46092,0;VDB=0.0978768;SGB=-0.693147;RPBZ=-0.600202;MQBZ=0;MQSBZ=0;BQBZ=1.17837;SCBZ=0.928426;MQ0F=0 PL:AD 255,0,255,255,255,255:62,51,0 +000000F 550 . G T,<*> 0 . 
DP=117;I16=62,50,0,1,4113,161301,22,484,6720,403200,60,3600,2641,64473,25,625;QS=0.99468,0.00532044,0;SGB=-0.379885;RPBZ=-0.950536;MQBZ=0;MQSBZ=0;BQBZ=-1.55326;SCBZ=-1.07862;MQ0F=0 PL:AD 0,255,255,255,255,255:112,1,0 +000000F 551 . G T,A,<*> 0 . DP=116;I16=60,50,1,1,4131,164089,24,288,6600,396000,120,7200,2607,63583,50,1250;QS=0.994224,0.00288809,0.00288809,0;VDB=0.32;SGB=-0.453602;RPBZ=-1.01088;MQBZ=0;MQSBZ=0;BQBZ=-2.59034;SCBZ=0.329971;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:110,1,1,0 +000000F 552 . A C,<*> 0 . DP=116;I16=60,50,1,1,3927,151331,24,288,6600,396000,120,7200,2598,63352,50,1250;QS=0.993926,0.00607441,0;VDB=0.1;SGB=-0.453602;RPBZ=-1.12079;MQBZ=0;MQSBZ=0;BQBZ=-2.37929;SCBZ=-0.136538;MQ0F=0 PL:AD 0,255,255,255,255,255:110,2,0 +000000F 553 . G T,<*> 0 . DP=116;I16=60,51,1,0,4046,158108,12,144,6660,399600,60,3600,2613,63729,25,625;QS=0.997043,0.00295712,0;SGB=-0.379885;RPBZ=-0.35579;MQBZ=0;MQSBZ=0;BQBZ=-1.77886;SCBZ=0.688792;MQ0F=0 PL:AD 0,255,255,255,255,255:111,1,0 +000000F 554 . A <*> 0 . DP=113;I16=60,50,0,0,4046,159278,0,0,6600,396000,0,0,2629,64087,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:110,0 +000000F 555 . G <*> 0 . DP=112;I16=60,49,0,0,4016,156988,0,0,6540,392400,0,0,2600,63438,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:109,0 +000000F 556 . A <*> 0 . DP=112;I16=60,49,0,0,3915,151463,0,0,6540,392400,0,0,2588,63068,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:109,0 +000000F 557 . A <*> 0 . DP=112;I16=60,49,0,0,3957,151889,0,0,6540,392400,0,0,2575,62681,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:109,0 +000000F 558 . A G,<*> 0 . DP=112;I16=60,47,0,2,3726,140280,24,288,6420,385200,120,7200,2510,60980,50,1250;QS=0.9936,0.0064,0;VDB=0.22;SGB=-0.453602;RPBZ=-1.14043;MQBZ=0;MQSBZ=0;BQBZ=-2.29525;SCBZ=-0.444037;MQ0F=0 PL:AD 0,255,255,255,255,255:107,2,0 +000000F 559 . C T,<*> 0 . 
DP=112;I16=59,49,1,0,3985,155617,12,144,6480,388800,60,3600,2518,61092,25,625;QS=0.996998,0.00300225,0;SGB=-0.379885;RPBZ=0.731168;MQBZ=0;MQSBZ=0;BQBZ=-1.78272;SCBZ=-1.08527;MQ0F=0 PL:AD 0,255,255,255,255,255:108,1,0 +000000F 560 . T <*> 0 . DP=110;I16=59,48,0,0,3896,150870,0,0,6420,385200,0,0,2507,60841,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:107,0 +000000F 561 . G <*> 0 . DP=110;I16=59,48,0,0,3921,153367,0,0,6420,385200,0,0,2492,60440,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:107,0 +000000F 562 . T C,A,<*> 0 . DP=110;I16=58,47,1,1,3748,143542,24,288,6300,378000,120,7200,2425,58723,50,1250;QS=0.993637,0.00318134,0.00318134,0;VDB=0.32;SGB=-0.453602;RPBZ=1.03535;MQBZ=0;MQSBZ=0;BQBZ=-2.35872;SCBZ=-0.452888;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:105,1,1,0 +000000F 563 . G A,<*> 0 . DP=109;I16=59,45,0,2,3881,153011,24,288,6240,374400,120,7200,2384,57712,50,1250;QS=0.993854,0.00614597,0;VDB=0.06;SGB=-0.453602;RPBZ=-1.81161;MQBZ=0;MQSBZ=0;BQBZ=-2.59069;SCBZ=-0.927311;MQ0F=0 PL:AD 0,255,255,255,255,255:104,2,0 +000000F 564 . G <*> 0 . DP=109;I16=59,47,0,0,3954,155218,0,0,6360,381600,0,0,2417,58561,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:106,0 +000000F 565 . A G,<*> 0 . DP=108;I16=59,44,0,2,3818,150006,24,288,6180,370800,120,7200,2351,56943,50,1250;QS=0.993753,0.00624675,0;VDB=0.52;SGB=-0.453602;RPBZ=-0.0703536;MQBZ=0;MQSBZ=0;BQBZ=-2.43017;SCBZ=-0.752998;MQ0F=0 PL:AD 0,255,255,255,255,255:103,2,0 +000000F 566 . T <*> 0 . DP=108;I16=59,46,0,0,3981,157549,0,0,6300,378000,0,0,2384,57808,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:105,0 +000000F 567 . G T,C,<*> 0 . DP=108;I16=57,46,2,0,3847,151871,20,208,6180,370800,120,7200,2318,56254,48,1154;QS=0.994828,0.00310318,0.00206879,0;VDB=0.7;SGB=-0.453602;RPBZ=0.128964;MQBZ=0;MQSBZ=0;BQBZ=-2.74966;SCBZ=-0.146119;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:103,1,1,0 +000000F 568 . T C,<*> 0 . 
DP=106;I16=56,47,1,0,3763,145559,8,64,6180,370800,60,3600,2349,56993,25,625;QS=0.997879,0.00212145,0;SGB=-0.379885;RPBZ=0.983018;MQBZ=0;MQSBZ=0;BQBZ=-1.93576;SCBZ=-1.05631;MQ0F=0 PL:AD 0,255,255,255,255,255:103,1,0 +000000F 569 . C G,A,<*> 0 . DP=104;I16=55,45,0,2,3734,146948,24,288,6000,360000,120,7200,2309,55985,50,1250;QS=0.993614,0.00319319,0.00319319,0;VDB=0.58;SGB=-0.453602;RPBZ=-0.543243;MQBZ=0;MQSBZ=0;BQBZ=-2.54989;SCBZ=-1.48352;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:100,1,1,0 +000000F 570 . T <*> 0 . DP=104;I16=55,47,0,0,3862,151566,0,0,6120,367200,0,0,2342,56784,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:102,0 +000000F 571 . G <*> 0 . DP=104;I16=55,47,0,0,3733,145003,0,0,6120,367200,0,0,2325,56367,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:102,0 +000000F 572 . T A,G,<*> 0 . DP=103;I16=53,46,1,1,3664,142548,34,628,5940,356400,120,7200,2259,54733,50,1250;QS=0.990806,0.00594916,0.003245,0;VDB=0.22;SGB=-0.453602;RPBZ=0.219457;MQBZ=0;MQSBZ=0;BQBZ=-2.44659;SCBZ=0.216127;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:99,1,1,0 +000000F 573 . C G,<*> 0 . DP=103;I16=54,46,0,1,3749,146645,12,144,6000,360000,60,3600,2267,54957,25,625;QS=0.996809,0.00319064,0;SGB=-0.379885;RPBZ=-0.428856;MQBZ=0;MQSBZ=0;BQBZ=-1.85387;SCBZ=-1.03794;MQ0F=0 PL:AD 0,255,255,255,255,255:100,1,0 +000000F 574 . A <*> 0 . DP=102;I16=54,46,0,0,3671,141999,0,0,6000,360000,0,0,2275,55165,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:100,0 +000000F 575 . A <*> 0 . DP=101;I16=53,46,0,0,3653,142115,0,0,5940,356400,0,0,2259,54781,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:99,0 +000000F 576 . C <*> 0 . DP=99;I16=52,45,0,0,3665,144129,0,0,5820,349200,0,0,2229,54203,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:97,0 +000000F 577 . A <*> 0 . DP=99;I16=52,45,0,0,3533,136769,0,0,5820,349200,0,0,2212,53762,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:97,0 +000000F 578 . C <*> 0 . DP=99;I16=52,45,0,0,3593,140361,0,0,5820,349200,0,0,2189,53061,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:97,0 +000000F 579 . A <*> 0 . 
DP=98;I16=51,45,0,0,3516,137036,0,0,5760,345600,0,0,2166,52356,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:96,0 +000000F 580 . A <*> 0 . DP=98;I16=51,45,0,0,3569,139629,0,0,5760,345600,0,0,2141,51599,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:96,0 +000000F 581 . C <*> 0 . DP=95;I16=49,44,0,0,3512,137782,0,0,5580,334800,0,0,2096,50392,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:93,0 +000000F 582 . A <*> 0 . DP=94;I16=49,43,0,0,3411,135315,0,0,5520,331200,0,0,2073,49673,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:92,0 +000000F 583 . G C,<*> 0 . DP=90;I16=46,41,1,0,3292,129550,12,144,5220,313200,60,3600,1985,47479,25,625;QS=0.996368,0.00363196,0;SGB=-0.379885;RPBZ=0.334737;MQBZ=0;MQSBZ=0;BQBZ=-1.89321;SCBZ=-1.01118;MQ0F=0 PL:AD 0,245,255,255,255,255:87,1,0 +000000F 584 . A <*> 0 . DP=87;I16=45,40,0,0,3152,123280,0,0,5100,306000,0,0,1940,46204,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:85,0 +000000F 585 . A <*> 0 . DP=88;I16=44,40,0,0,3202,127432,0,0,5040,302400,0,0,1914,45520,0,0;QS=1,0;MQ0F=0 PL:AD 0,253,255:84,0 +000000F 586 . A G,<*> 0 . DP=88;I16=43,40,1,0,2964,114852,12,144,4980,298800,60,3600,1875,44509,16,256;QS=0.995968,0.00403226,0;SGB=-0.379885;RPBZ=0.866495;MQBZ=0;MQSBZ=0;BQBZ=-1.73561;SCBZ=1.14098;MQ0F=0 PL:AD 0,234,255,250,255,255:83,1,0 +000000F 587 . A C,<*> 0 . DP=88;I16=45,40,0,1,2997,115249,12,144,5100,306000,60,3600,1891,44583,25,625;QS=0.996012,0.00398804,0;SGB=-0.379885;RPBZ=-0.584321;MQBZ=0;MQSBZ=0;BQBZ=-1.64438;SCBZ=0.548806;MQ0F=0 PL:AD 0,240,255,255,255,255:85,1,0 +000000F 588 . A <*> 0 . DP=87;I16=45,40,0,0,2971,113817,0,0,5100,306000,0,0,1891,44401,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:85,0 +000000F 589 . G A,<*> 0 . DP=87;I16=44,40,1,0,3059,117323,12,144,5040,302400,60,3600,1840,42970,25,625;QS=0.996092,0.00390752,0;SGB=-0.379885;RPBZ=0.550542;MQBZ=0;MQSBZ=0;BQBZ=-1.79452;SCBZ=1.68471;MQ0F=0 PL:AD 0,237,255,253,255,255:84,1,0 +000000F 590 . T <*> 0 . DP=87;I16=45,40,0,0,3019,114421,0,0,5100,306000,0,0,1839,42841,0,0;QS=1,0;MQ0F=0 PL:AD 0,255,255:85,0 +000000F 591 . G C,<*> 0 . 
DP=86;I16=43,40,1,0,3033,117015,12,144,4980,298800,60,3600,1807,42121,3,9;QS=0.996059,0.00394089,0;SGB=-0.379885;RPBZ=1.67089;MQBZ=0;MQSBZ=0;BQBZ=-1.80181;SCBZ=-1.01597;MQ0F=0 PL:AD 0,234,255,250,255,255:83,1,0 +000000F 592 . A C,<*> 0 . DP=85;I16=44,38,0,2,2821,106231,24,288,4920,295200,120,7200,1748,40690,36,746;QS=0.991564,0.00843585,0;VDB=0.5;SGB=-0.453602;RPBZ=0.484389;MQBZ=0;MQSBZ=0;BQBZ=-2.1998;SCBZ=-0.0922779;MQ0F=0 PL:AD 0,217,255,247,255,255:82,2,0 +000000F 593 . C G,A,<*> 0 . DP=84;I16=42,39,1,1,2912,110366,24,288,4860,291600,120,7200,1709,39543,50,1250;QS=0.991826,0.00408719,0.00408719,0;VDB=0.36;SGB=-0.453602;RPBZ=-0.0891287;MQBZ=0;MQSBZ=0;BQBZ=-2.57106;SCBZ=-0.108692;MQ0F=0 PL:AD 0,228,255,228,255,255,244,255,255,255:81,1,1,0 +000000F 594 . T <*> 0 . DP=84;I16=43,40,0,0,2911,110919,0,0,4980,298800,0,0,1733,40151,0,0;QS=1,0;MQ0F=0 PL:AD 0,250,255:83,0 +000000F 595 . G <*> 0 . DP=82;I16=42,39,0,0,2849,108157,0,0,4860,291600,0,0,1681,38837,0,0;QS=1,0;MQ0F=0 PL:AD 0,244,255:81,0 +000000F 596 . A C,<*> 0 . DP=81;I16=42,37,0,1,2846,108902,12,144,4740,284400,60,3600,1613,37303,25,625;QS=0.995801,0.00419874,0;SGB=-0.379885;RPBZ=-0.9748;MQBZ=0;MQSBZ=0;BQBZ=-1.73819;SCBZ=0.250526;MQ0F=0 PL:AD 0,222,255,238,255,255:79,1,0 +000000F 597 . T G,<*> 0 . DP=80;I16=42,36,0,1,2742,102116,12,144,4680,280800,60,3600,1574,36614,15,225;QS=0.995643,0.0043573,0;SGB=-0.379885;RPBZ=0.833621;MQBZ=0;MQSBZ=0;BQBZ=-1.70797;SCBZ=0.254296;MQ0F=0 PL:AD 0,219,255,235,255,255:78,1,0 +000000F 598 . T <*> 0 . DP=79;I16=41,37,0,0,2839,109553,0,0,4680,280800,0,0,1562,36238,0,0;QS=1,0;MQ0F=0 PL:AD 0,235,255:78,0 +000000F 599 . A <*> 0 . DP=77;I16=40,36,0,0,2770,106710,0,0,4560,273600,0,0,1528,35518,0,0;QS=1,0;MQ0F=0 PL:AD 0,229,255:76,0 +000000F 600 . G <*> 0 . DP=77;I16=40,36,0,0,2714,103152,0,0,4560,273600,0,0,1498,34792,0,0;QS=1,0;MQ0F=0 PL:AD 0,229,255:76,0 +000000F 601 . T G,C,<*> 0 . 
DP=77;I16=39,35,1,1,2583,97801,24,288,4440,266400,120,7200,1417,32827,50,1250;QS=0.990798,0.00460123,0.00460123,0;VDB=0.1;SGB=-0.453602;RPBZ=-0.795612;MQBZ=0;MQSBZ=0;BQBZ=-2.28816;SCBZ=0.0680273;MQ0F=0 PL:AD 0,207,255,207,255,255,223,255,255,255:74,1,1,0 +000000F 602 . T A,C,<*> 0 . DP=74;I16=37,33,1,2,2416,90472,61,1657,4200,252000,180,10800,1357,31631,60,1350;QS=0.975373,0.019782,0.00484457,0;VDB=0.568985;SGB=-0.511536;RPBZ=0.333732;MQBZ=0;MQSBZ=0;BQBZ=-2.13242;SCBZ=0.845379;MQ0F=0 PL:AD 0,164,255,195,255,255,211,255,255,255:70,2,1,0 +000000F 603 . A C,<*> 0 . DP=68;I16=35,30,1,0,2312,88408,12,144,3900,234000,60,3600,1347,31297,25,625;QS=0.994837,0.00516351,0;SGB=-0.379885;RPBZ=-0.131301;MQBZ=0;MQSBZ=0;BQBZ=-1.66952;SCBZ=-1.04349;MQ0F=0 PL:AD 0,180,255,196,255,255:65,1,0 +000000F 604 . G <*> 0 . DP=67;I16=36,30,0,0,2403,93039,0,0,3960,237600,0,0,1349,31351,0,0;QS=1,0;MQ0F=0 PL:AD 0,199,255:66,0 +000000F 605 . A G,<*> 0 . DP=66;I16=35,28,1,1,2254,85780,24,288,3780,226800,120,7200,1292,30038,33,689;QS=0.989464,0.0105356,0;VDB=0.36;SGB=-0.453602;RPBZ=0.589131;MQBZ=0;MQSBZ=0;BQBZ=-2.37279;SCBZ=1.62638;MQ0F=0 PL:AD 0,162,255,190,255,255:63,2,0 +000000F 606 . T G,<*> 0 . DP=65;I16=34,29,1,0,2247,87073,22,484,3780,226800,60,3600,1287,29883,13,169;QS=0.990304,0.0096959,0;SGB=-0.379885;RPBZ=1.00206;MQBZ=0;MQSBZ=0;BQBZ=-1.38823;SCBZ=-1.04683;MQ0F=0 PL:AD 0,170,255,190,255,255:63,1,0 +000000F 607 . G <*> 0 . DP=64;I16=35,28,0,0,2255,86451,0,0,3780,226800,0,0,1255,29027,0,0;QS=1,0;MQ0F=0 PL:AD 0,190,255:63,0 +000000F 608 . T G,<*> 0 . DP=62;I16=33,27,1,0,2177,84023,8,64,3600,216000,60,3600,1182,27240,25,625;QS=0.996339,0.00366133,0;SGB=-0.379885;RPBZ=0.0852552;MQBZ=0;MQSBZ=0;BQBZ=-1.89659;SCBZ=-1.04111;MQ0F=0 PL:AD 0,166,255,181,255,255:60,1,0 +000000F 609 . A G,<*> 0 . 
DP=60;I16=33,25,0,1,2036,77468,12,144,3480,208800,60,3600,1139,26307,25,625;QS=0.994141,0.00585938,0;SGB=-0.379885;RPBZ=-0.0881423;MQBZ=0;MQSBZ=0;BQBZ=-1.68674;SCBZ=1.23654;MQ0F=0 PL:AD 0,160,255,175,255,255:58,1,0 +000000F 610 . A <*> 0 . DP=59;I16=33,25,0,0,2025,77505,0,0,3480,208800,0,0,1144,26524,0,0;QS=1,0;MQ0F=0 PL:AD 0,175,255:58,0 +000000F 611 . C T,<*> 0 . DP=55;I16=14,12,16,12,990,39320,1009,38281,1560,93600,1680,100800,576,13946,550,12206;QS=0.495248,0.504752,0;VDB=1.84196e-06;SGB=-0.693054;RPBZ=1.75927;MQBZ=0;MQSBZ=0;BQBZ=-0.270305;SCBZ=-0.614376;MQ0F=0 PL:AD 255,0,255,255,255,255:26,28,0 +000000F 612 . T <*> 0 . DP=54;I16=29,24,0,0,1898,72790,0,0,3180,190800,0,0,1111,25815,0,0;QS=1,0;MQ0F=0 PL:AD 0,160,255:53,0 +000000F 613 . A G,C,<*> 0 . DP=54;I16=27,24,2,0,1946,77058,24,288,3060,183600,120,7200,1046,24258,50,1250;QS=0.987817,0.00609137,0.00609137,0;VDB=0.06;SGB=-0.453602;RPBZ=0.537207;MQBZ=0;MQSBZ=0;BQBZ=-2.67554;SCBZ=0.495445;MQ0F=0 PL:AD 0,139,255,139,255,255,154,255,255,255:51,1,1,0 +000000F 614 . A C,<*> 0 . DP=53;I16=28,23,0,1,1870,73000,12,144,3060,183600,60,3600,1054,24458,25,625;QS=0.993624,0.0063762,0;SGB=-0.379885;RPBZ=0.033359;MQBZ=0;MQSBZ=0;BQBZ=-1.88;SCBZ=1.24225;MQ0F=0 PL:AD 0,139,255,154,255,255:51,1,0 +000000F 615 . T <*> 0 . DP=53;I16=28,24,0,0,1924,74686,0,0,3120,187200,0,0,1061,24643,0,0;QS=1,0;MQ0F=0 PL:AD 0,157,255:52,0 +000000F 616 . T G,<*> 0 . DP=53;I16=28,23,0,1,1815,69225,12,144,3060,183600,60,3600,1017,23565,25,625;QS=0.993432,0.00656814,0;SGB=-0.379885;RPBZ=-0.833529;MQBZ=0;MQSBZ=0;BQBZ=-1.66586;SCBZ=0.352496;MQ0F=0 PL:AD 0,139,255,154,255,255:51,1,0 +000000F 617 . T <*> 0 . DP=52;I16=27,24,0,0,1844,70962,0,0,3060,183600,0,0,1024,23774,0,0;QS=1,0;MQ0F=0 PL:AD 0,154,255:51,0 +000000F 618 . C <*> 0 . DP=52;I16=27,24,0,0,1782,67390,0,0,3060,183600,0,0,1005,23345,0,0;QS=1,0;MQ0F=0 PL:AD 0,154,255:51,0 +000000F 619 . A C,<*> 0 . 
DP=52;I16=26,24,1,0,1756,66842,12,144,3000,180000,60,3600,961,22329,25,625;QS=0.993213,0.00678733,0;SGB=-0.379885;RPBZ=0.612053;MQBZ=0;MQSBZ=0;BQBZ=-1.58836;SCBZ=1.65684;MQ0F=0 PL:AD 0,136,255,151,255,255:50,1,0 +000000F 620 . A <*> 0 . DP=50;I16=25,24,0,0,1670,63616,0,0,2940,176400,0,0,969,22599,0,0;QS=1,0;MQ0F=0 PL:AD 0,148,255:49,0 +000000F 621 . C <*> 0 . DP=49;I16=24,24,0,0,1738,67580,0,0,2880,172800,0,0,953,22277,0,0;QS=1,0;MQ0F=0 PL:AD 0,144,255:48,0 +000000F 622 . T <*> 0 . DP=47;I16=23,23,0,0,1642,63060,0,0,2760,165600,0,0,921,21681,0,0;QS=1,0;MQ0F=0 PL:AD 0,138,255:46,0 +000000F 623 . A G,C,<*> 0 . DP=46;I16=22,21,1,1,1593,61369,24,288,2580,154800,120,7200,862,20368,44,986;QS=0.985158,0.00742115,0.00742115,0;VDB=0.28;SGB=-0.453602;RPBZ=0.110272;MQBZ=0;MQSBZ=0;BQBZ=-2.46441;SCBZ=-0.699329;MQ0F=0 PL:AD 0,116,255,116,255,255,129,255,255,255:43,1,1,0 +000000F 624 . T <*> 0 . DP=45;I16=22,22,0,0,1522,57304,0,0,2640,158400,0,0,892,21056,0,0;QS=1,0;MQ0F=0 PL:AD 0,132,255:44,0 +000000F 625 . C <*> 0 . DP=43;I16=20,23,0,0,1462,54230,0,0,2580,154800,0,0,905,21409,0,0;QS=1,0;MQ0F=0 PL:AD 0,129,255:43,0 +000000F 626 . T A,<*> 0 . DP=42;I16=19,22,0,1,1407,52739,12,144,2460,147600,60,3600,869,20535,25,625;QS=0.991543,0.00845666,0;SGB=-0.379885;RPBZ=0.0826192;MQBZ=0;MQSBZ=0;BQBZ=-1.63079;SCBZ=-0.998652;MQ0F=0 PL:AD 0,110,255,123,255,255:41,1,0 +000000F 627 . A <*> 0 . DP=42;I16=19,23,0,0,1458,54660,0,0,2520,151200,0,0,880,20786,0,0;QS=1,0;MQ0F=0 PL:AD 0,126,255:42,0 +000000F 628 . G C,<*> 0 . DP=41;I16=17,23,1,0,1491,57863,12,144,2400,144000,60,3600,838,19618,24,576;QS=0.992016,0.00798403,0;SGB=-0.379885;RPBZ=0.296217;MQBZ=0;MQSBZ=0;BQBZ=-1.99912;SCBZ=-0.981384;MQ0F=0 PL:AD 0,107,255,120,255,255:40,1,0 +000000F 629 . T <*> 0 . DP=40;I16=18,22,0,0,1452,55272,0,0,2400,144000,0,0,827,19349,0,0;QS=1,0;MQ0F=0 PL:AD 0,120,255:40,0 +000000F 630 . C <*> 0 . DP=38;I16=16,22,0,0,1338,50174,0,0,2280,136800,0,0,812,18860,0,0;QS=1,0;MQ0F=0 PL:AD 0,114,255:38,0 +000000F 631 . 
T G,<*> 0 . DP=37;I16=14,22,1,0,1280,48224,32,1024,2160,129600,60,3600,783,18083,11,121;QS=0.97561,0.0243902,0;SGB=-0.379885;RPBZ=1.36088;MQBZ=0;MQSBZ=0;BQBZ=-1.06419;SCBZ=-0.906765;MQ0F=0 PL:AD 0,79,255,108,255,255:36,1,0 +000000F 632 . T <*> 0 . DP=36;I16=15,21,0,0,1320,50548,0,0,2160,129600,0,0,761,17359,0,0;QS=1,0;MQ0F=0 PL:AD 0,108,255:36,0 +000000F 633 . C A,<*> 0 . DP=35;I16=14,19,0,2,1183,44803,24,288,1980,118800,120,7200,678,15322,47,1109;QS=0.980116,0.019884,0;VDB=0.26;SGB=-0.453602;RPBZ=0.463397;MQBZ=0;MQSBZ=0;BQBZ=-2.3085;SCBZ=1.47371;MQ0F=0 PL:AD 0,76,255,99,255,255:33,2,0 +000000F 634 . C <*> 0 . DP=35;I16=14,21,0,0,1290,49572,0,0,2100,126000,0,0,708,15898,0,0;QS=1,0;MQ0F=0 PL:AD 0,105,255:35,0 +000000F 635 . T <*> 0 . DP=35;I16=14,21,0,0,1242,47888,0,0,2100,126000,0,0,691,15399,0,0;QS=1,0;MQ0F=0 PL:AD 0,105,255:35,0 +000000F 636 . C T,<*> 0 . DP=34;I16=13,20,1,0,1163,44105,12,144,1980,118800,60,3600,643,14453,16,256;QS=0.989787,0.0102128,0;SGB=-0.379885;RPBZ=0.66524;MQBZ=0;MQSBZ=0;BQBZ=-1.57615;SCBZ=-0.84147;MQ0F=0 PL:AD 0,87,255,99,255,255:33,1,0 +000000F 637 . T <*> 0 . DP=33;I16=14,19,0,0,1151,43717,0,0,1980,118800,0,0,627,14033,0,0;QS=1,0;MQ0F=0 PL:AD 0,99,255:33,0 +000000F 638 . A <*> 0 . DP=32;I16=13,19,0,0,1113,41153,0,0,1920,115200,0,0,605,13531,0,0;QS=1,0;MQ0F=0 PL:AD 0,96,255:32,0 +000000F 639 . A T,<*> 0 . DP=30;I16=13,16,0,1,965,34529,12,144,1740,104400,60,3600,577,12917,13,169;QS=0.987718,0.0122825,0;SGB=-0.379885;RPBZ=0.988773;MQBZ=0;MQSBZ=0;BQBZ=-1.63451;SCBZ=-0.878553;MQ0F=0 PL:AD 0,75,255,87,255,255:29,1,0 +000000F 640 . A <*> 0 . DP=29;I16=12,17,0,0,994,36924,0,0,1740,104400,0,0,575,12621,0,0;QS=1,0;MQ0F=0 PL:AD 0,87,255:29,0 +000000F 641 . G T,<*> 0 . DP=27;I16=11,15,0,1,869,31995,12,144,1560,93600,60,3600,529,11623,11,121;QS=0.986379,0.0136209,0;SGB=-0.379885;RPBZ=1.35554;MQBZ=0;MQSBZ=0;BQBZ=-1.5086;SCBZ=-0.793551;MQ0F=0 PL:AD 0,66,255,78,255,255:26,1,0 +000000F 642 . A C,<*> 0 . 
DP=26;I16=11,14,0,1,850,31010,22,484,1500,90000,60,3600,504,11078,9,81;QS=0.974771,0.0252294,0;SGB=-0.379885;RPBZ=1.41458;MQBZ=0;MQSBZ=0;BQBZ=-1.18957;SCBZ=0.532911;MQ0F=0 PL:AD 0,56,255,75,255,255:25,1,0 +000000F 643 . C <*> 0 . DP=26;I16=11,15,0,0,912,34334,0,0,1560,93600,0,0,499,10747,0,0;QS=1,0;MQ0F=0 PL:AD 0,78,255:26,0 +000000F 644 . C A,<*> 0 . DP=26;I16=11,14,0,1,869,31707,12,144,1500,90000,60,3600,464,9914,20,400;QS=0.986379,0.0136209,0;SGB=-0.379885;RPBZ=0.268045;MQBZ=0;MQSBZ=0;BQBZ=-1.6731;SCBZ=0.989693;MQ0F=0 PL:AD 0,63,255,75,255,255:25,1,0 +000000F 645 . C <*> 0 . DP=26;I16=11,15,0,0,911,33421,0,0,1560,93600,0,0,466,9764,0,0;QS=1,0;MQ0F=0 PL:AD 0,78,255:26,0 +000000F 646 . C <*> 0 . DP=26;I16=11,15,0,0,850,29718,0,0,1560,93600,0,0,447,9201,0,0;QS=1,0;MQ0F=0 PL:AD 0,78,255:26,0 +000000F 647 . T C,<*> 0 . DP=26;I16=11,14,0,1,898,34076,12,144,1500,90000,60,3600,423,8651,5,25;QS=0.986813,0.0131868,0;SGB=-0.379885;RPBZ=1.14097;MQBZ=0;MQSBZ=0;BQBZ=-1.57982;SCBZ=-0.760963;MQ0F=0 PL:AD 0,63,255,75,255,255:25,1,0 +000000F 648 . A <*> 0 . DP=26;I16=11,15,0,0,870,32110,0,0,1560,93600,0,0,407,8091,0,0;QS=1,0;MQ0F=0 PL:AD 0,78,255:26,0 +000000F 649 . C <*> 0 . DP=26;I16=11,15,0,0,928,35102,0,0,1560,93600,0,0,386,7548,0,0;QS=1,0;MQ0F=0 PL:AD 0,78,255:26,0 +000000F 650 . T C,<*> 0 . DP=26;I16=11,14,0,1,897,33867,12,144,1500,90000,60,3600,359,6973,5,25;QS=0.986799,0.0132013,0;SGB=-0.379885;RPBZ=0.874779;MQBZ=0;MQSBZ=0;BQBZ=-1.67266;SCBZ=-0.760963;MQ0F=0 PL:AD 0,63,255,75,255,255:25,1,0 +000000F 651 . T <*> 0 . DP=26;I16=11,15,0,0,905,33255,0,0,1560,93600,0,0,342,6492,0,0;QS=1,0;MQ0F=0 PL:AD 0,78,255:26,0 +000000F 652 . T <*> 0 . DP=25;I16=11,14,0,0,891,32693,0,0,1500,90000,0,0,321,6029,0,0;QS=1,0;MQ0F=0 PL:AD 0,75,255:25,0 +000000F 653 . A <*> 0 . DP=21;I16=9,12,0,0,804,31130,0,0,1260,75600,0,0,303,5555,0,0;QS=1,0;MQ0F=0 PL:AD 0,63,255:21,0 +000000F 654 . A <*> 0 . 
DP=21;I16=9,12,0,0,659,22061,0,0,1260,75600,0,0,285,5117,0,0;QS=1,0;MQ0F=0 PL:AD 0,63,255:21,0 +000000F 655 . C <*> 0 . DP=21;I16=9,12,0,0,664,22342,0,0,1260,75600,0,0,266,4666,0,0;QS=1,0;MQ0F=0 PL:AD 0,63,255:21,0 +000000F 655 . CACAATACAA CACAA 0 . INDEL;IDV=6;IMF=0.285714;DP=21;I16=0,2,5,1,240,28800,720,86400,120,7200,360,21600,46,1060,100,1788;QS=0.25,0.75;VDB=0.00189453;SGB=-0.616816;RPBZ=-2.81289;MQBZ=0;MQSBZ=0;BQBZ=-1.67266;SCBZ=-2.52262;MQ0F=0 PL:AD 67,0,14:2,6 +000000F 656 . A <*> 0 . DP=11;I16=4,7,0,0,404,15690,0,0,660,39600,0,0,141,2411,0,0;QS=1,0;MQ0F=0 PL:AD 0,33,255:11,0 +000000F 657 . C <*> 0 . DP=11;I16=4,7,0,0,413,15607,0,0,660,39600,0,0,131,2189,0,0;QS=1,0;MQ0F=0 PL:AD 0,33,255:11,0 +000000F 658 . A <*> 0 . DP=10;I16=3,7,0,0,121,1651,0,0,600,36000,0,0,122,1986,0,0;QS=1,0;MQ0F=0 PL:AD 0,30,79:10,0 +000000F 658 . AA AAATTA 0 . INDEL;IDV=7;IMF=0.4375;DP=16;I16=0,1,5,2,100,10000,700,70000,60,3600,420,25200,25,625,110,1820;QS=0.235294,0.764706;VDB=0.000272497;SGB=-0.636426;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;BQBZ=-1.67266;SCBZ=-1.35678;MQ0F=0 PL:AD 81,0,16:1,7 +000000F 659 . A <*> 0 . DP=10;I16=3,5,0,0,86,1088,0,0,480,28800,0,0,75,1077,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,63:8,0 +000000F 660 . T <*> 0 . DP=2;I16=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;QS=0,0;MQ0F=0 PL:AD 0,0,0:0,0 +000000F 661 . A <*> 0 . DP=8;I16=0,2,0,0,8,32,0,0,120,7200,0,0,26,340,0,0;QS=1,0;MQ0F=0 PL:AD 0,6,7:2,0 +000000F 662 . C <*> 0 . DP=8;I16=0,2,0,0,10,50,0,0,120,7200,0,0,24,290,0,0;QS=1,0;MQ0F=0 PL:AD 0,6,9:2,0 +000000F 663 . A <*> 0 . DP=8;I16=5,3,0,0,40,224,0,0,480,28800,0,0,107,1627,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,24:8,0 +000000F 664 . A <*> 0 . DP=8;I16=5,3,0,0,94,1646,0,0,480,28800,0,0,99,1421,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,69:8,0 +000000F 665 . A <*> 0 . DP=8;I16=5,3,0,0,165,3551,0,0,480,28800,0,0,91,1231,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,124:8,0 +000000F 666 . T <*> 0 . DP=8;I16=5,3,0,0,269,9361,0,0,480,28800,0,0,83,1057,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,207:8,0 +000000F 667 . T <*> 0 . 
DP=8;I16=5,3,0,0,279,10001,0,0,480,28800,0,0,75,899,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,214:8,0 +000000F 668 . A <*> 0 . DP=8;I16=5,3,0,0,278,10018,0,0,480,28800,0,0,67,757,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,214:8,0 +000000F 669 . C <*> 0 . DP=8;I16=5,3,0,0,279,10051,0,0,480,28800,0,0,59,631,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,214:8,0 +000000F 670 . A <*> 0 . DP=8;I16=5,3,0,0,287,10675,0,0,480,28800,0,0,51,521,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,220:8,0 +000000F 671 . T <*> 0 . DP=8;I16=5,3,0,0,264,9116,0,0,480,28800,0,0,43,427,0,0;QS=1,0;MQ0F=0 PL:AD 0,24,205:8,0 +000000F 672 . C <*> 0 . DP=7;I16=4,3,0,0,237,8337,0,0,420,25200,0,0,36,348,0,0;QS=1,0;MQ0F=0 PL:AD 0,21,193:7,0 +000000F 673 . T <*> 0 . DP=7;I16=4,3,0,0,223,7385,0,0,420,25200,0,0,29,283,0,0;QS=1,0;MQ0F=0 PL:AD 0,21,181:7,0 +000000F 674 . A <*> 0 . DP=6;I16=4,2,0,0,199,6757,0,0,360,21600,0,0,23,231,0,0;QS=1,0;MQ0F=0 PL:AD 0,18,164:6,0 +000000F 675 . G <*> 0 . DP=4;I16=2,2,0,0,147,5443,0,0,240,14400,0,0,19,189,0,0;QS=1,0;MQ0F=0 PL:AD 0,12,134:4,0 +000000F 676 . A <*> 0 . DP=3;I16=2,1,0,0,100,3434,0,0,180,10800,0,0,16,154,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,95:3,0 +000000F 677 . T <*> 0 . DP=3;I16=2,1,0,0,104,3846,0,0,180,10800,0,0,13,125,0,0;QS=1,0;MQ0F=0 PL:AD 0,9,100:3,0 +000000F 678 . A <*> 0 . DP=2;I16=1,1,0,0,53,1825,0,0,120,7200,0,0,11,101,0,0;QS=1,0;MQ0F=0 PL:AD 0,6,53:2,0 +000000F 679 . T <*> 0 . DP=2;I16=1,1,0,0,68,2410,0,0,120,7200,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL:AD 0,6,68:2,0 +000000F 680 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,41:1,0 +000000F 681 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,41:1,0 +000000F 682 . G <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,41:1,0 +000000F 683 . T <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,35:1,0 +000000F 684 . T <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,31:1,0 +000000F 685 . A <*> 0 . 
DP=1;I16=0,1,0,0,29,841,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,29:1,0 +000000F 686 . T <*> 0 . DP=1;I16=0,1,0,0,3,9,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL:AD 0,3,4:1,0 +000000F 687 . N <*> 0 . DP=1;I16=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;QS=0,0;MQ0F=0 PL:AD 0,0,0:0,0 +000000F 688 . N <*> 0 . DP=1;I16=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;QS=0,0;MQ0F=0 PL:AD 0,0,0:0,0 diff --git a/test/test.pl b/test/test.pl index c998b6d80..af43e637a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -899,6 +899,7 @@ run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-G {PATH}/mplp.11.rgs]); run_test(\&test_mpileup,$opts,in=>[qw(mpileup.3 mpileup.4)],out=>'mpileup/mpileup.11.out',args=>q[-G {PATH}/mplp.11.rgs]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.1)],out=>'mpileup/indel-AD.1.out',ref=>'indel-AD.1.fa',args=>q[-a AD]); +run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.1)],out=>'mpileup/indel-AD.1cns.out',ref=>'indel-AD.1.fa',args=>q[-a AD --indels-cns]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.2.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.3.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75 --ambig-reads incAD]); run_test(\&test_mpileup,$opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.4.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75 --ambig-reads incAD0]); From fbbfbeac6fbdc7b8170e97a8eadc386d89f2c21c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 29 Jan 2024 15:25:35 +0000 Subject: [PATCH 7/8] Fix a clang16 warning on bit-field overflow. 
vcfbuf.c:249:32: error: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Werror,-Wsingle-bit-bitfield-constant-conversion] buf->vcf[i].af_set = 1; ^ ~ --- vcfbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcfbuf.c b/vcfbuf.c index 9d60c493c..3d822948b 100644 --- a/vcfbuf.c +++ b/vcfbuf.c @@ -44,7 +44,7 @@ typedef struct { bcf1_t *rec; double af; - int af_set:1, filter:1, idx:30; + unsigned int af_set:1, filter:1, idx:30; } vcfrec_t; From 8ac35578f91083957c57271ef6f87ff3e4c06200 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 1 Feb 2024 16:55:27 +0000 Subject: [PATCH 8/8] Add some scripts for evaluating bcftools mpileup. These use VT's "decompose_blocksub" command, which since writing may now be possible within bcftools natively. If so I haven't evaluated how it differs. (These scripts originally come from Crumble evaluation many years ago so I could see what the impact was on reducing quality in the SynDip data set.) --- mpileup_bench/README | 103 +++++++++ mpileup_bench/compare_vcf_simple.sh | 321 ++++++++++++++++++++++++++++ mpileup_bench/get_data.sh | 51 +++++ mpileup_bench/plot_isec.pl | 150 +++++++++++++ mpileup_bench/run_mpileup.sh | 58 +++++ 5 files changed, 683 insertions(+) create mode 100644 mpileup_bench/README create mode 100755 mpileup_bench/compare_vcf_simple.sh create mode 100755 mpileup_bench/get_data.sh create mode 100755 mpileup_bench/plot_isec.pl create mode 100755 mpileup_bench/run_mpileup.sh diff --git a/mpileup_bench/README b/mpileup_bench/README new file mode 100644 index 000000000..41f2d4aec --- /dev/null +++ b/mpileup_bench/README @@ -0,0 +1,103 @@ +1. First up, there is a get_data.sh script which downloads a bunch of +files including truth sets, BED files and some BAMs. These may be +very large. Just edit the script accordingly or look at the URLs and +have a browse around manually. + +Once it's run you'll never need to download them again.
You should +also obtain your own copy of GRCh38 as I couldn't work out which of +the many ones at GIAB are appropriate. This didn't seem to matter for +chr1 or chr20 though and I expect it's mainly the patches which are +different. + + +2. The two evaluation scripts are run_mpileup_HG002.sh and +run_mpileup_HG005.sh. For consistency, rapid turnaround tuning, +setting of parameters and general code improvement should be done with +HG002. Avoid any chromosomes you may wish to evaluate on later. + +When finally finished tuning, use another chromosome (eg chr20) of +HG005 to avoid over-fitting to specific regions or specific samples +and instrument runs. + +I usually name my temp outputs starting in underscore for ease of +removal, but that's just personal preference. So _pb here is the +output from the caller and evaluation. + +An example of running a small region for training. + + ./run_mpileup_HG002.sh pb_50x.bam chr1:10000000-20000000 _pb -X pacbio-ccs + +For final evaluation. + + BCFTOOLS=bcftools.devel ./run_mpileup_HG005.sh illumina_300x.bam chr20:20000000-21000000 _i_dev -L999 -X illumina + ./run_mpileup_HG005.sh illumina_300x.bam chr20:20000000-21000000 _i_new -L999 -X illumina + +Note: we cannot be quite so pure for all instrument types as HG002 is +by far the most widely studied sample and some data sets are only +available on that sample. + + +3. After running the mpileup scripts and getting a bunch of output +directories you can plot them in gnuplot. 
+ + set ylabel "FNr" + set yrange [0:10] + set title "PacBio 50x indels" + + set xlabel "FPr" + a=8;b=10;t="ALL";plot " 0003.vcf.gz; tabix -f 0003.vcf.gz; + bgzip < 0000.vcf > 0000.vcf.gz; tabix -f 0000.vcf.gz; + bgzip < 0001.vcf > 0001.vcf.gz;tabix -f 0001.vcf.gz) + + d=_new/bcftools.isec + (cd $d; + bgzip < 0003.vcf > 0003.vcf.gz; tabix -f 0003.vcf.gz; + bgzip < 0000.vcf > 0000.vcf.gz; tabix -f 0000.vcf.gz; + bgzip < 0001.vcf > 0001.vcf.gz;tabix -f 0001.vcf.gz) + + # Produce a new isec output + bcftools isec -p _fp _{old,new}/0001.vcf.gz + +_{old,new}/0001.vcf are the false positives our two runs produce. +After intersecting those, we now have: + +- _fp/0000.vcf: old false positives we have removed +- _fp/0001.vcf: new false positives we have acquired + +Do the same with _{old,new}/0000.vcf to identify cured / caused +false-negatives. This provides a way to data mine things much more +carefully. We can further drill down on these cured/caused variants +by subdividing into deletions vs insertions, or filtering to high +quality only. diff --git a/mpileup_bench/compare_vcf_simple.sh b/mpileup_bench/compare_vcf_simple.sh new file mode 100755 index 000000000..f7994eec1 --- /dev/null +++ b/mpileup_bench/compare_vcf_simple.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# +# Usage compare_vcf A.vcf B.vcf [region-skip.bed] + +bcftools=${BCFTOOLS:-bcftools} +vt=${VT:-/nfs/users/nfs_j/jkb/lustre/vt/vt} + +if [ $# -lt 2 ] +then + echo Usage: compare_vcf A.vcf B.vcf [region-skip.bed] [region-include.bed] [region] + exit 1 +fi + +v1=$1 +v2=$2 +exclude=$3 +include=$4 +region=$5 + +qual=${QUAL:-30} + +#href=${HREF:-/nfs/srpipe_references/references/Human/1000Genomes_hs37d5/all/fasta/hs37d5.fa} +href=${REF:-$HREF38} + +pp() { + awk "END {printf(\"%f\", 100 *$1 / $2)}" < /dev/null +} + +norm() { + # Also consider norm "-m +both" without a "-d both" step. + # This produces more differences, but is also more correct.
+ + v=$1 + if [ x"$region" != x ] + then + #$bcftools norm -t $region -f $href $v 2>/dev/null | $bcftools norm -d both -N | $bcftools view -T ^$exclude | $bcftools view -T $include > $v.norm.vcf + #$bcftools norm -m -both -t $region -f $href $v 2>/dev/null | $bcftools view -T ^$exclude | $bcftools view -T $include > $v.norm.vcf + if [ x"$exclude" != x ] + then + $bcftools norm -m -both -t $region -f $href $v 2>/dev/null | $vt decompose_blocksub - 2>/dev/null | $bcftools view -T ^$exclude | $bcftools view -T $include > $v.norm.vcf + else + $bcftools norm -m -both -t $region -f $href $v 2>/dev/null | $vt decompose_blocksub - 2>/dev/null | $bcftools view -T $include > $v.norm.vcf + fi + elif [ x"$include" != x ] + then + $bcftools norm -m -both -f $href $v 2>/dev/null | $vt decompose_blocksub - 2>/dev/null | $bcftools view ${exclude:+-T ^$exclude} - | $bcftools view -T $include > $v.norm.vcf # exclude BED is optional: only pass -T ^file when one was given + elif [ x"$exclude" != x ] + then + $bcftools norm -m -both -f $href $v 2>/dev/null | $vt decompose_blocksub - 2>/dev/null | $bcftools view -T ^$exclude > $v.norm.vcf + else + $bcftools norm -m -both -f $href $v 2>/dev/null | $vt decompose_blocksub - 2>/dev/null > $v.norm.vcf + fi + rm $v.norm.vcf.gz 2>/dev/null + bgzip $v.norm.vcf + $bcftools index $v.norm.vcf.gz +} + +if [ "$NORM" != "" ] +then + norm $v1 +fi +norm $v2 + +# NB: consider CHM13_1 chr20:2241427 +# The Illumina data is all mapped with high mqual and shows homozygous deletion +# of AAAC in an AAAC STR. The truth set claims AAAC del is heterozygous. +# +# $Bcftools isec therefore claims it is shared between both sets (in 0002 and 0003.vcf) +# but also only in the query set (0001) due to the extra allele. + +#$bcftools isec -p $v1.isec $v1.norm.vcf.gz $v2.norm.vcf.gz +$bcftools isec -c both -p $v2.isec $v1.norm.vcf.gz $v2.norm.vcf.gz +# Or "-c any"?
Works better possibly + + +# 0000 is private to v1 => FN +# 0001 is private to v2 => FP +# 0002/3 are records common to v1/v2 (from v1 or v2 respectively => TP) + +# Depth filtering is beneficial to all tools, so we use it in this evaluation. +case $v2 in + *15x*) dp=15;DP=30;; + *30x*) dp=30;DP=60;; + *53X*) dp=53;DP=106;; + *60x*) dp=60;DP=120;; + *100x*) dp=100;DP=200;; + *150x*) dp=150;DP=300;; + *300x*) dp=300;DP=600;; + *) dp=90;DP=120;; +# *) DP=90;; +esac + +case $v2 in + *gatk*) + prog=GATK + #gatk: https://software.broadinstitute.org/gatk/documentation/article.php?id=3225 + # s_filt_exp="QUAL < $qual || QD < 2 || FS > 60 || MQ < 40 || SOR > 3 || MQRankSum < -12.5 || ReadPosRankSum < -8" + # i_filt_exp="QUAL < $qual || QD < 2 || FS > 200 || ReadPosRankSum < -20" + s_filt_exp="QUAL < $qual || QD < 2 || FS > 60 || MQ < 40 || SOR > 3 || MQRankSum < -2.5 || ReadPosRankSum < -8 || INFO/DP>$DP" + i_filt_exp="QUAL < $qual || QD < 2 || FS > 60 || ReadPosRankSum < -8 || INFO/DP>$DP || SOR > 3" + + # better on GIAB. Also on others? Unknown. 
SOR 3->6 and MQ 40->20 + s_filt_exp="QUAL < $qual || QD < 2 || FS > 60 || MQ < 20 || SOR > 6 || MQRankSum < -4 || ReadPosRankSum < -8 || INFO/DP>$DP" + + # Depth aware variant of above + s_filt_exp="QUAL < $qual || QD < 2 || FS > 55+INFO/DP/6 || MQ < 20 || SOR > 6+INFO/DP/25 || MQRankSum < -(3+INFO/DP/50) || ReadPosRankSum < -(3.5+INFO/DP/50) || INFO/DP>$DP" + + ;; + *freebayes*) + prog=Freebayes + #freebayes: https://wiki.uiowa.edu/download/attachments/145192256/erik%20garrison%20-%20iowa%20talk%202.pdf?api=v2 + #s_filt_exp="QUAL < $qual || SAF <= 0 || SAR <= 0 || RPR <= 1 || RPL <= 1 || INFO/DP > $DP" + s_filt_exp="QUAL < $qual || SAF <= 0 || SAR <= 0 || RPR <= 0 || RPL <= 0 || INFO/DP > $DP" + + #i_filt_exp=$s_filt_exp + #i_filt_exp="QUAL < $qual || INFO/DP > $DP" + i_filt_exp="QUAL < $qual || RPR <= 0 || RPL <= 0 || INFO/DP > $DP" + ;; + *bcftools*) + prog=Bcftools + + # Simple filters; QUAL, DP and for indels IDV/IMF + s_filt_exp="QUAL < $qual || DP>$DP" + i_filt_exp="IDV < 3 || IMF < 0.1 || DP>$DP || QUAL < $qual" + + # Suggest: + #s_filt_exp="QUAL < $qual || DP>$dp*2 || MQBZ < -(4+$dp/30) || RPBZ > (3.5+$dp/60) || RPBZ < -(3.5+$dp/60) || FORMAT/SP > 30+$dp/2 || SCBZ > 3+$dp/50" + #q_filt_exp="IDV<(2+DP*$qual/2000) || IMF < 0.02+(($qual+1)/($qual+31))*(($qual+1)/($qual+31))/3 || DP>$dp*2 || MQBZ < -(4+$dp/30) || FORMAT/SP > 30+$dp/2 || RPBZ+SCBZ > 5.5" + + # I_FILT_EXP='IDV<(2+DP*$qual/2000) || IMF < 0.02+(($qual+1)/($qual+31))*(($qual+1)/($qual+31))/3 || DP>$dp*2 || MQBZ < -(4+$dp/30) || FORMAT/SP > 30+$dp/2 || RPBZ+SCBZ > 5.5' S_FILT_EXP='QUAL < $qual || DP>$dp*2 || MQBZ < -(4+$dp/30) || RPBZ > (3.5+$dp/60) || RPBZ < -(3.5+$dp/60) || FORMAT/SP > 30+$dp/2 || SCBZ > 3+$dp/50' ./compare_vcf_simple.sh + + ;; + *octopus*) + prog=octopus + s_filt_exp="FILTER!=\"PASS\" || QUAL < $qual || INFO/DP>$DP" + i_filt_exp="FILTER!=\"PASS\" || QUAL < $qual || INFO/DP>$DP" + ;; + *) + prog=Unknown + echo Unrecognised type, no specific filters + s_filt_exp="QUAL<$qual || 
INFO/DP>$DP" + i_filt_exp="QUAL<$qual || INFO/DP>$DP" + ;; +esac + +S_FILT_EXP=`eval echo \"${S_FILT_EXP:-$s_filt_exp}\"` +I_FILT_EXP=`eval echo \"${I_FILT_EXP:-$i_filt_exp}\"` + +#echo "SNP: $S_FILT_EXP" 1>&2 +#echo "Indel: $I_FILT_EXP" 1>&2 + +# We classify wrong genotypes as both a true and untrue call. We don't +# label it as a false negative as the call exists and it's not as bad +# a situation as missing it entirely. + +# Hacky addition of GTFAIL filter if 0002.vcf and 0003.vcf have a different +# GT call. +perl -e 'BEGIN {$"="\t"} open(F1,"<'$v2'.isec/0002.vcf");open(F2,"<'$v2'.isec/0003.vcf");$_=;print;print "##FILTER=\n";while() {next if /^#/;chomp($_);@F=split("\t",$_);$F[-1]=~/^(\d+)[|\/](\d+)/;$gt=($1<$2)?"$1/$2":"$2/$1";while () {if (/^#/) {print;next} else {last}};chomp($_);@G=split("\t",$_);$G[-1]=~/^(\d+)[|\/](\d+)/;$gt2=($1<$2)?"$1/$2":"$2/$1";if ($gt ne $gt2) {$G[6]="GTFAIL"} print "@G\n"}' > $v2.isec/0003b.vcf + +# Produce isec/filtered.vcf as a filtered copy of the call set, so +# we can pass this to rtg vcfeval for an alternative way of evaluating +# data sets. +$bcftools view -e "QUAL < $qual || (TYPE='snp' && ($S_FILT_EXP)) || (TYPE='indel' && ($I_FILT_EXP))" $v2 > $v2.isec/filtered.vcf + +# QUAL 1 is recommended minimum for freebayes to remove detritus. 
+v1_snp=` $bcftools view -H -i "TYPE='snp'" $v2.isec/0000.vcf|wc -l` +v2_snp=` $bcftools view -H -i "TYPE='snp' && QUAL >= 1" $v2.isec/0001.vcf|wc -l` +v2_snp_hq=` $bcftools view -H -i "TYPE='snp' && QUAL >= $qual" $v2.isec/0001.vcf|wc -l` +v2_snp_fi=` $bcftools view -i "TYPE='snp'" $v2.isec/0001.vcf | bcftools view -H -e "$S_FILT_EXP" -|wc -l` +v12_snp=` $bcftools view -H -i "TYPE='snp'" $v2.isec/0002.vcf|wc -l` +v12_snp_hq=`$bcftools view -H -i "TYPE='snp' && QUAL >= $qual" $v2.isec/0003.vcf|wc -l` +v12_snp_fi=`$bcftools view -i "TYPE='snp'" $v2.isec/0003.vcf | bcftools view -H -e "$S_FILT_EXP" -|wc -l` + +v12_snp_gt=` $bcftools view -H -f GTFAIL -i "TYPE='snp' && QUAL >= 1" $v2.isec/0003b.vcf|wc -l` +v12_snp_hq_gt=`$bcftools view -H -f GTFAIL -i "TYPE='snp' && QUAL >= $qual" $v2.isec/0003b.vcf|wc -l` +v12_snp_fi_gt=`$bcftools view -f GTFAIL -i "TYPE='snp'" $v2.isec/0003b.vcf | bcftools view -H -e "$S_FILT_EXP" -|wc -l` # SNP records must be filtered with the SNP expression + +#v2_snp=`expr $v2_snp + $v12_snp_gt` +#v2_snp_hq=`expr $v2_snp_hq + $v12_snp_hq_gt` +#v2_snp_fi=`expr $v2_snp_fi + $v12_snp_fi_gt` + + +v1_indel=` $bcftools view -H -i "TYPE='indel'" $v2.isec/0000.vcf|wc -l` +v2_indel=` $bcftools view -H -i "TYPE='indel' && QUAL >= 1" $v2.isec/0001.vcf|wc -l` +v2_indel_hq=` $bcftools view -H -i "TYPE='indel' && QUAL >= $qual" $v2.isec/0001.vcf|wc -l` +v2_indel_fi=` $bcftools view -i "TYPE='indel'" $v2.isec/0001.vcf | bcftools view -H -e "$I_FILT_EXP" -|wc -l` +v12_indel=` $bcftools view -H -i "TYPE='indel'" $v2.isec/0002.vcf|wc -l` +v12_indel_hq=`$bcftools view -H -i "TYPE='indel' && QUAL >= $qual" $v2.isec/0003.vcf|wc -l` +v12_indel_fi=`$bcftools view -i "TYPE='indel'" $v2.isec/0003.vcf | bcftools view -H -e "$I_FILT_EXP" -|wc -l` + +v12_indel_gt=` $bcftools view -H -f GTFAIL -i "TYPE='indel'" $v2.isec/0003b.vcf|wc -l` +v12_indel_hq_gt=`$bcftools view -H -f GTFAIL -i "TYPE='indel' && QUAL >= $qual" $v2.isec/0003b.vcf|wc -l` +v12_indel_fi_gt=`$bcftools view -f GTFAIL -i "TYPE='indel'" $v2.isec/0003b.vcf |
bcftools view -H -e "$I_FILT_EXP" -|wc -l` + +#v2_indel=`expr $v2_indel + $v12_indel_gt` +#v2_indel_hq=`expr $v2_indel_hq + $v12_indel_hq_gt` +#v2_indel_fi=`expr $v2_indel_fi + $v12_indel_fi_gt` + +# quality trimmed FN aren't the records private to v1 above QUAL, but the +# total number of records not in v12 after filtering. Thus as we increase +# acceptance threshold to reduce FP we increase FN. +v1_snp_hq=`expr $v1_snp + $v12_snp - $v12_snp_hq` +v1_snp_fi=`expr $v1_snp + $v12_snp - $v12_snp_fi` +v1_indel_hq=`expr $v1_indel + $v12_indel - $v12_indel_hq` +v1_indel_fi=`expr $v1_indel + $v12_indel - $v12_indel_fi` + +# Assumption A.vcf is truth set and B.vcf is test set +if [ "$FORMAT" = "tex" ] +then + printf '\\bigskip\n' + printf '\\begin{minipage}{\\linewidth}\n' + printf '\\centering\n' + printf '\\captionof{table}{%s: FIXME}\n' $prog + printf '{\\begin{tabular}{ll|r|rr}\n' + printf 'Variants & & \\textbf{Q>0} & \\textbf{Q>=%d} & \\textbf{Filtered} \\\\ \\midrule\n' $qual + printf 'SNP & TP & %7d & %7d & %7d \\\\\n' $v12_snp $v12_snp_hq $v12_snp_fi + printf 'SNP & FP & %7d & %7d & %7d \\\\\n' $v2_snp $v2_snp_hq $v2_snp_fi + printf 'SNP & GT & %7d & %7d & %7d \\\\\n' $v12_snp_gt $v12_snp_hq_gt $v12_snp_fi_gt + printf 'SNP & FN & %7d & %7d & %7d \\\\\n' $v1_snp $v1_snp_hq $v1_snp_fi + printf '\\midrule\n'; + printf 'InDel & TP & %7d & %7d & %7d \\\\\n' $v12_indel $v12_indel_hq $v12_indel_fi + printf 'InDel & FP & %7d & %7d & %7d \\\\\n' $v2_indel $v2_indel_hq $v2_indel_fi + printf 'InDel & GT & %7d & %7d & %7d \\\\\n' $v12_indel_gt $v12_indel_hq_gt $v12_indel_fi_gt + printf 'InDel & FN & %7d & %7d & %7d \\\\\n' $v1_indel $v1_indel_hq $v1_indel_fi + printf '\\end{tabular}}\n' + printf '\\par\n' + printf '\\textbf{CRAM qual size x,x}\n' + printf '\\bigskip\n' + printf '\\end{minipage}\n' + +elif [ "$FORMAT" = "gnuplot" ] +then + # v1_snp = total number of FN SNPs (at any qual; from truth set) + # v2_snp = total number of FP SNPs (at any qual; our extra calls) + # 
v12_snp = total number of TP SNPs + # => v12_snp + v1_snp = total true SNPs + # + # Columns: + # Filename + # Qual $q + # No. TP SNP in this 10-qual bin + # No. FP SNP in this 10-qual bin + # No. TP Indel in this 10-qual bin + # No. FP Indel in this 10-qual bin + # Total no. SNPs + # Number FN SNP at QUAL >= $q + # Number FP SNP at QUAL >= $q + # Total no. INDELs + # Number FN INDEL at QUAL >= $q + # Number FP INDEL at QUAL >= $q + # //Number GT SNP errs at QUAL >= $q + tot_snp=`expr $v1_snp + $v12_snp` + tot_indel=`expr $v1_indel + $v12_indel` + + # For gnuplot + for Q in `seq 0 10 250` + do + qual=$Q + qual_max=`expr $Q + 10` + v2_snp_hq=` $bcftools view -H -i "TYPE='snp' && QUAL >= $qual && QUAL < $qual_max" $v2.isec/0001.vcf|wc -l` + v2_snp_hq2=` $bcftools view -H -i "TYPE='snp' && QUAL >= $qual" $v2.isec/0001.vcf|wc -l` + v12_snp_hq=`$bcftools view -H -i "TYPE='snp' && QUAL >= $qual && QUAL < $qual_max" $v2.isec/0003.vcf|wc -l` + v12_snp_hq2=`$bcftools view -H -i "TYPE='snp' && QUAL >= $qual" $v2.isec/0003.vcf|wc -l` + v2_indel_hq=` $bcftools view -H -i "TYPE='indel' && QUAL >= $qual && QUAL < $qual_max" $v2.isec/0001.vcf|wc -l` + v2_indel_hq2=` $bcftools view -H -i "TYPE='indel' && QUAL >= $qual" $v2.isec/0001.vcf|wc -l` + v12_indel_hq=`$bcftools view -H -i "TYPE='indel' && QUAL >= $qual && QUAL < $qual_max" $v2.isec/0003.vcf|wc -l` + v12_indel_hq2=`$bcftools view -H -i "TYPE='indel' && QUAL >= $qual" $v2.isec/0003.vcf|wc -l` + + # Total number of SNPs/indels minus the true ones we call at QUAL >= $qual, i.e. the FN count at this threshold + v12_snp_hq2=`expr $tot_snp - $v12_snp_hq2` + v12_indel_hq2=`expr $tot_indel - $v12_indel_hq2` + printf "$v2\t$qual\t$v12_snp_hq $v2_snp_hq\t$v12_indel_hq $v2_indel_hq\t$tot_snp $v12_snp_hq2 $v2_snp_hq2\t$tot_indel $v12_indel_hq2 $v2_indel_hq2\n" + done + +elif [ "$FORMAT" = "percent" ] +then + printf "SNP Q>0 / Q>=$qual / Filtered\n" + x=`expr $v12_snp + $v1_snp` + printf "SNP TP %7.2f / %7.2f / %7.2f\n" \ + `pp $v12_snp $x` `pp $v12_snp_hq $x` `pp $v12_snp_fi $x` + x=`expr $v12_snp
+ $v2_snp` + printf "SNP FP %7.2f / %7.2f / %7.2f\n" \ + `pp $v2_snp $x` `pp $v2_snp_hq $x` `pp $v2_snp_fi $x` + x=`expr $v12_snp` + printf 'SNP GT %7.2f / %7.2f / %7.2f\n' \ + `pp $v12_snp_gt $x` `pp $v12_snp_hq_gt $x` `pp $v12_snp_fi_gt $x` + x=`expr $v12_snp + $v1_snp` + printf "SNP FN %7.2f / %7.2f / %7.2f\n" \ + `pp $v1_snp $x` `pp $v1_snp_hq $x` `pp $v1_snp_fi $x` +#printf "SNP %4.1f%% prec, %4.1f%% rec\n" 100.0*$v12_snp_hq/($v12_snp_hq+$v2_snp_hq) 100.0*$v12_snp_hq/($v12_snp_hq+$v1_snp_hq); + printf "\n"; + x=`expr $v12_indel + $v1_indel` + printf "InDel TP %7.2f / %7.2f / %7.2f\n" \ + `pp $v12_indel $x` `pp $v12_indel_hq $x` `pp $v12_indel_fi $x` + x=`expr $v12_indel + $v2_indel` + printf "InDel FP %7.2f / %7.2f / %7.2f\n" \ + `pp $v2_indel $x` `pp $v2_indel_hq $x` `pp $v2_indel_fi $x` + x=`expr $v12_indel` + printf 'InDel GT %7.2f / %7.2f / %7.2f\n' \ + `pp $v12_indel_gt $x` `pp $v12_indel_hq_gt $x` `pp $v12_indel_fi_gt $x` + x=`expr $v12_indel + $v1_indel` + printf "InDel FN %7.2f / %7.2f / %7.2f\n" \ + `pp $v1_indel $x` `pp $v1_indel_hq $x` `pp $v1_indel_fi $x` + +else + printf "SNP Q>0 / Q>=$qual / Filtered\n" + printf "SNP TP %7d / %7d / %7d\n" $v12_snp $v12_snp_hq $v12_snp_fi + printf "SNP FP %7d / %7d / %7d\n" $v2_snp $v2_snp_hq $v2_snp_fi + printf 'SNP GT %7d / %7d / %7d\n' $v12_snp_gt $v12_snp_hq_gt $v12_snp_fi_gt + printf "SNP FN %7d / %7d / %7d\n" $v1_snp $v1_snp_hq $v1_snp_fi + #printf "SNP %4.1f%% prec, %4.1f%% rec\n" 100.0*$v12_snp_hq/($v12_snp_hq+$v2_snp_hq) 100.0*$v12_snp_hq/($v12_snp_hq+$v1_snp_hq); + printf "\n"; + printf "InDel TP %7d / %7d / %7d\n" $v12_indel $v12_indel_hq $v12_indel_fi + printf "InDel FP %7d / %7d / %7d\n" $v2_indel $v2_indel_hq $v2_indel_fi + printf 'InDel GT %7d / %7d / %7d\n' $v12_indel_gt $v12_indel_hq_gt $v12_indel_fi_gt + printf "InDel FN %7d / %7d / %7d\n" $v1_indel $v1_indel_hq $v1_indel_fi + #printf "InDel %4.1f%% prec, %4.1f%% rec\n" 100.0*$v12_indel_hq/($v12_indel_hq+$v2_indel_hq) 
100.0*$v12_indel_hq/($v12_indel_hq+$v1_indel_hq); +fi + +#rm $v1.norm* $v2.norm* +#rm -rf $v2.isec diff --git a/mpileup_bench/get_data.sh b/mpileup_bench/get_data.sh new file mode 100755 index 000000000..4581f0b52 --- /dev/null +++ b/mpileup_bench/get_data.sh @@ -0,0 +1,51 @@ +# ---------------------------------------------------------------------- +# Human reference +# I have this locally: +# /nfs/srpipe_references/references/Human/GRCh38_full_analysis_set_plus_decoy_hla/all/fasta/Homo_sapiens.GRCh38_full_analysis_set_plus_decoy_hla.fa + +# This has 3366 sequences. +# The BAMs appear to be aligned against 2580 sequences. +# The GIAB reference directory appears to be a few hundred max. +# I'm unsure where their reference used was, but if we stick to the +# already aligned files and stick to the primary chromosomes then +# frankly any GRCh38 should be fine for evaluation purposes. +# (NB: unsure if true for chr21 due to changes, but that may be in +# patch form only) + +# ---------------------------------------------------------------------- +# HG005 for final evaluation only +# chr20 only + +# ---- Truth set +echo "Fetching truth set" +wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/ChineseTrio/HG005_NA24631_son/NISTv4.2.1/GRCh38/HG005_GRCh38_1_22_v4.2.1_benchmark.bed +wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/ChineseTrio/HG005_NA24631_son/NISTv4.2.1/GRCh38/HG005_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/ChineseTrio/HG005_NA24631_son/NISTv4.2.1/GRCh38/HG005_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi + +# ---- Data files +reg=chr20:20000000-21000000 +echo "Getting Illumina $reg" +samtools view -o illumina_300x.bam --write-index -@8 
https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/ChineseTrio/HG005_NA24631_son/HG005_NA24631_son_HiSeq_300x/NHGRI_Illumina300X_Chinesetrio_novoalign_bams/HG005.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.300x.bam $reg + +echo "Getting PacBio $reg" +samtools view -o pacbio_50x.bam --write-index -@8 https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/ChineseTrio/HG005_NA24631_son/PacBio_CCS_15kb_20kb_chemistry2/GRCh38/GIAB_5mC_CpG/HG005.GRCh38.deepvariant.haplotagged.bam $reg + +echo "Getting BGI $reg" +samtools view -o bgi_100x.bam --write-index -@8 https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/ChineseTrio/HG005_NA24631_son/NIST_BGIseq_2x150bp_100x/GRCh38/HG005_GRCh38_BGIseq-2x150-100x_NIST_20211126.bam $reg + +# ---------------------------------------------------------------------- +# HG002 for code modification, tuning, tweaking and round-trips. +# chr1 only + +# ---- Truth set +echo "Fetching truth set" +wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi +wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.noinconsistent.bed + +# ---- Data files +# Data is same locations as above, but more data available.
Have a browse +# basically and decide what to test with +reg=chr1 +echo "Browse https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/" +# eg https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/Element_AVITI_20231018/HG002_GRCh37_Element-StdInsert_2X150_81x_20231018.bam diff --git a/mpileup_bench/plot_isec.pl b/mpileup_bench/plot_isec.pl new file mode 100755 index 000000000..6e6d28652 --- /dev/null +++ b/mpileup_bench/plot_isec.pl @@ -0,0 +1,150 @@ +#!/usr/bin/perl -w + +# Reports the cumulative total of false positives, genotype assignment errors, +# and false negatives. These are using an assumed filter of QUAL>=x, so as we +# increase QUAL filtering we're removing errors (FP and GT) and increasing +# missing calls (FN). +# +# We also report these in percentage terms. +# FP is fraction of called variants that are false +# GT is fraction of true variants with the wrong genotype +# FN is fraction of true variants not called + +use strict; + +my $dir=shift(@ARGV); +my $type=shift(@ARGV); + +my ($fn_count, $I_fn_count, $D_fn_count) = (0,0,0); +my ($fp_count, $I_fp_count, $D_fp_count) = (0,0,0); +my ($gt_count, $I_gt_count, $D_gt_count) = (0,0,0); +my (@fp, @I_fp, @D_fp); +my (@tp, @I_tp, @D_tp); +my (@gt, @I_gt, @D_gt); +my ($total_true, $I_total_true, $D_total_true) = (0,0,0); + +# False negatives. 
No score, so just count them as base number +open(my $fn, "bcftools query -i 'TYPE=\"$type\"' -f '%QUAL %REF %ALT\n' $dir/0000.vcf|"); +while (<$fn>) { + my ($q,$r,$a) = split(/\s+/,$_); + $fn_count++; + $I_fn_count++ if (length($r) < length($a)); + $D_fn_count++ if (length($r) > length($a)); +} +close($fn); + +# False positives, bin by QUAL +open(my $fp, "bcftools query -i 'TYPE=\"$type\"' -f '%QUAL %REF %ALT\n' $dir/0001.vcf|"); +while (<$fp>) { + my ($q,$r,$a) = split(/\s+/,$_); + $fp[int($q)]++; + $fp_count++; + if (length($r) < length($a)) { + $I_fp[int($q)]++; + $I_fp_count++; + } + if (length($r) > length($a)) { + $D_fp[int($q)]++; + $D_fp_count++; + } +} +close($fp); + +# True positives, bin by QUAL +my $total_tp_call = 0; +my $I_total_tp_call = 0; +my $D_total_tp_call = 0; +open(my $tp, "bcftools query -i 'TYPE=\"$type\"' -f '%QUAL %REF %ALT %FILTER\n' $dir/0003b.vcf|"); +while (<$tp>) { + my ($q,$r,$a,$f) = split(/\s+/,$_); + $total_tp_call++; + $tp[int($q)]++; + if (length($r) < length($a)) { + $I_total_tp_call++; + $I_tp[int($q)]++; + } + if (length($r) > length($a)) { + $D_total_tp_call++; + $D_tp[int($q)]++; + } + + if (/GTFAIL/) { + $gt[int($q)]++; + $gt_count++; + if (length($r) < length($a)) { + $I_gt[int($q)]++; + $I_gt_count++; + } + if (length($r) > length($a)) { + $D_gt[int($q)]++; + $D_gt_count++; + } + } +} +close($tp); +$total_true = $fn_count + $total_tp_call; +$I_total_true = $I_fn_count + $I_total_tp_call; +$D_total_true = $D_fn_count + $D_total_tp_call; + +print STDERR "Total true indel: $total_true\n"; +print STDERR "Total true ins: $I_total_true\n"; +print STDERR "Total true del:: $D_total_true\n"; + +print STDERR " QUAL : FP GT FN\n"; + +for (my $qual = 0; $qual < 5000; $qual++) { + $fp_count -= $fp[$qual] if (defined($fp[$qual])); + $fn_count += $tp[$qual] if (defined($tp[$qual])); + $gt_count -= $gt[$qual] if (defined($gt[$qual])); + + last if $fp_count == 0; + + $I_fp_count -= $I_fp[$qual] if (defined($I_fp[$qual])); + $I_fn_count
+= $I_tp[$qual] if (defined($I_tp[$qual])); + $I_gt_count -= $I_gt[$qual] if (defined($I_gt[$qual])); + + $D_fp_count -= $D_fp[$qual] if (defined($D_fp[$qual])); + $D_fn_count += $D_tp[$qual] if (defined($D_tp[$qual])); + $D_gt_count -= $D_gt[$qual] if (defined($D_gt[$qual])); + + my $total_call = $total_true - $fn_count + $fp_count; + my $I_total_call = $I_total_true - $I_fn_count + $I_fp_count; + my $D_total_call = $D_total_true - $D_fn_count + $D_fp_count; + + printf("ALL %4d : %5d %5d %5d", $qual, $fp_count, $gt_count, $fn_count); + printf(" : %7.4f %7.4f %7.4f $total_call\n", + 100*$fp_count/($total_call||1), + 100*$gt_count/($total_true||1), + 100*$fn_count/($total_true||1)); # "||1": an empty class reports 0% rather than dying on division by zero + + printf("INS %4d : %5d %5d %5d", + $qual, $I_fp_count, $I_gt_count, $I_fn_count); + printf(" : %7.4f %7.4f %7.4f\n", + 100*$I_fp_count/($I_total_call||1), + 100*$I_gt_count/($I_total_true||1), + 100*$I_fn_count/($I_total_true||1)); + + printf("DEL %4d : %5d %5d %5d", + $qual, $D_fp_count, $D_gt_count, $D_fn_count); + printf(" : %7.4f %7.4f %7.4f\n", + 100*$D_fp_count/($D_total_call||1), + 100*$D_gt_count/($D_total_true||1), + 100*$D_fn_count/($D_total_true||1)); +} + +__END__ + +/nfs/users/nfs_j/jkb/work/samtools_master/bcftools/plot_isec.pl HG002.GRCh38.60x.RG.bcftools-10h.vcf.isec indel > _2 +Total true indel: 45604 +Total true ins: 22194 +Total true del:: 23410 + +# Plot FP (x) vs FN (y) +a=8;b=10;plot "$dir/bcftools.mpileup.out | $bcftools call -vm - > $dir/bcftools.vcf 2>$dir/bcftools.call.out + +# A primary evaluation. +# The key thing here is it leaves behind the .isec directory with the +# intersection of the truth and call sets. +echo "=== ./compare_vcf_simple.sh $TRUTH $dir/bcftools.vcf "" $BED $region" +QUAL=30 NORM=1 ./compare_vcf_simple.sh $TRUTH $dir/bcftools.vcf "" $BED $region + +# Produce a .plot file for use in gnuplot, along with a basic summary too.
+echo "=== ./plot_isec.pl $dir/bcftools.vcf.isec indel > $dir/plot++" +./plot_isec.pl $dir/bcftools.vcf.isec indel > $dir/plot++ +grep ALL $dir/plot++ > $dir/plot +grep INS $dir/plot++ > $dir/plot.ins +grep DEL $dir/plot++ > $dir/plot.del + +awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/plot | cut -c 1-28|head -20 + + +# Example gnuplot: +# set xlabel "FPr" +# set ylabel "FNr" +# set yrange [0:10] +# set title "PacBio 50x indels" + +# FP vs FN +# a=8;b=10;t="ALL";plot "