From 210155b32002ea9e9707a53b6d6335b129588927 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 13:48:54 +0200 Subject: [PATCH 01/38] Prototype planned VCF FORMAT parser --- test/test_view.c | 25 ++ vcf.c | 688 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 713 insertions(+) diff --git a/test/test_view.c b/test/test_view.c index c899ff995..b24ba9b46 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -37,6 +37,12 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/vcf.h" #include "../htslib/hts_log.h" +extern void hts_vcf_simd_probe_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, uint64_t *tabs); +extern void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, + uint64_t *parsed_samples); + struct opts { char *fn_ref; int flag; @@ -431,6 +437,25 @@ int main(int argc, char *argv[]) if (p.pool) hts_tpool_destroy(p.pool); + if (getenv("HTS_VCF_SIMD_STATS")) { + uint64_t attempts = 0, hits = 0, fallback = 0, tabs = 0; + hts_vcf_simd_probe_stats(&attempts, &hits, &fallback, &tabs); + fprintf(stderr, + "vcf-simd-tabs attempts=%llu hits=%llu fallback=%llu tabs=%llu\n", + (unsigned long long) attempts, (unsigned long long) hits, + (unsigned long long) fallback, (unsigned long long) tabs); + } + + if (getenv("HTS_VCF_FORMAT_PLAN_STATS")) { + uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; + hts_vcf_format_plan_stats(&attempts, &hits, &fallback, &parsed_samples); + fprintf(stderr, + "vcf-format-plan attempts=%llu hits=%llu fallback=%llu parsed_samples=%llu\n", + (unsigned long long) attempts, (unsigned long long) hits, + (unsigned long long) fallback, + (unsigned long long) parsed_samples); + } + if (fclose(stdout) != 0 && errno != EBADF) { fprintf(stderr, "Error closing standard output.\n"); exit_code = EXIT_FAILURE; diff --git a/vcf.c b/vcf.c index 544fe8c01..ab0532f7c 100644 --- a/vcf.c +++ b/vcf.c @@ -36,11 +36,20 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION #include "fuzz_settings.h" #endif +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#endif + +#if defined(__AVX2__) +#include +#endif + #include "htslib/vcf.h" #include "htslib/bgzf.h" #include "htslib/tbx.h" @@ -3133,6 +3142,450 @@ static inline int align_mem(kstring_t *s) #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ +typedef struct { + uint64_t attempts; + uint64_t hits; + uint64_t fallback; + uint64_t parsed_samples; +} vcf_format_plan_stats_t; + +static vcf_format_plan_stats_t vcf_format_plan_stats; + +void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, uint64_t *parsed_samples) +{ + if (attempts) *attempts = vcf_format_plan_stats.attempts; + if (hits) *hits = vcf_format_plan_stats.hits; + if (fallback) *fallback = vcf_format_plan_stats.fallback; + if (parsed_samples) *parsed_samples = vcf_format_plan_stats.parsed_samples; +} + +static int vcf_format_plan_enabled(void) +{ + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("HTS_VCF_FORMAT_PLAN"); + enabled = env && env[0] && strcmp(env, "0") != 0; + } + return enabled; +} + +typedef struct { + char format[64]; + int supported; + int has_ab; + int has_phase; + int key_gt; + int key_ab; + int key_ad; + int key_dp; + int key_gq; + int key_pgt; + int key_pid; + int key_pl; +} vcf_format_plan_t; + +static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, + vcf_format_plan_t *plan) +{ + memset(plan, 0, sizeof(*plan)); + if (strlen(format) >= sizeof(plan->format)) + return 0; + strcpy(plan->format, format); + + if (strcmp(format, "GT:AB:AD:DP:GQ:PL") == 0) { + plan->supported = 1; + plan->has_ab = 1; + } else if (strcmp(format, "GT:AD:DP:GQ:PL") == 0) { + plan->supported = 1; + } else if (strcmp(format, "GT:AB:AD:DP:GQ:PGT:PID:PL") == 0) { + plan->supported = 1; + plan->has_ab = 1; + plan->has_phase = 1; + } else if 
(strcmp(format, "GT:AD:DP:GQ:PGT:PID:PL") == 0) { + plan->supported = 1; + plan->has_phase = 1; + } else { + return 0; + } + + plan->key_gt = bcf_hdr_id2int(h, BCF_DT_ID, "GT"); + plan->key_ad = bcf_hdr_id2int(h, BCF_DT_ID, "AD"); + plan->key_dp = bcf_hdr_id2int(h, BCF_DT_ID, "DP"); + plan->key_gq = bcf_hdr_id2int(h, BCF_DT_ID, "GQ"); + plan->key_pl = bcf_hdr_id2int(h, BCF_DT_ID, "PL"); + plan->key_ab = plan->has_ab ? bcf_hdr_id2int(h, BCF_DT_ID, "AB") : -1; + plan->key_pgt = plan->has_phase ? bcf_hdr_id2int(h, BCF_DT_ID, "PGT") : -1; + plan->key_pid = plan->has_phase ? bcf_hdr_id2int(h, BCF_DT_ID, "PID") : -1; + if (plan->key_gt < 0 || plan->key_ad < 0 || plan->key_dp < 0 || + plan->key_gq < 0 || plan->key_pl < 0 || + (plan->has_ab && plan->key_ab < 0) || + (plan->has_phase && (plan->key_pgt < 0 || plan->key_pid < 0))) + plan->supported = 0; + + return plan->supported; +} + +static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *format) +{ + enum { N_PLAN_CACHE = 8 }; + static vcf_format_plan_t cache[N_PLAN_CACHE]; + static int ncache = 0; + int i; + + for (i = 0; i < ncache; i++) + if (strcmp(cache[i].format, format) == 0) + return cache[i].supported ? &cache[i] : NULL; + + if (ncache == N_PLAN_CACHE) + return NULL; + vcf_format_plan_compile(h, format, &cache[ncache]); + return cache[ncache++].supported ? &cache[ncache-1] : NULL; +} + +static inline int vcf_plan_gt2(const char **sp, int32_t out[2]) +{ + const char *s = *sp; + int a0, a1, phased; + + if (s[0] == '.' 
&& (s[1] == '/' || s[1] == '|') && s[2] == '.') { + out[0] = 0; + out[1] = 0; + *sp = s + 3; + return 0; + } + if (!(s[0] >= '0' && s[0] <= '9') || (s[1] != '/' && s[1] != '|') || + !(s[2] >= '0' && s[2] <= '9')) + return -1; + + a0 = s[0] - '0'; + a1 = s[2] - '0'; + phased = s[1] == '|'; + out[0] = ((a0 + 1) << 1) | phased; + out[1] = ((a1 + 1) << 1) | phased; + *sp = s + 3; + return 0; +} + +static inline int vcf_plan_int_value(const char **sp, int32_t *out) +{ + const char *s = *sp; + int sign = 1, val = 0; + + if (*s == '.') { + *out = bcf_int32_missing; + *sp = s + 1; + return 0; + } + if (*s == '-') { + sign = -1; + s++; + } + if (!(*s >= '0' && *s <= '9')) + return -1; + while (*s >= '0' && *s <= '9') { + val = val * 10 + (*s - '0'); + s++; + } + *out = sign * val; + *sp = s; + return 0; +} + +static inline int vcf_plan_float_value(const char **sp, float *out) +{ + const char *s = *sp; + char *end = NULL; + int failed = 0; + + if (*s == '.') { + bcf_float_set_missing(*out); + *sp = s + 1; + return 0; + } + *out = hts_str2dbl(s, &end, &failed); + if (failed || end == s) + return -1; + *sp = end; + return 0; +} + +static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) +{ + const char *s = *sp; + int i; + + for (i = 0; i < width; i++) { + if (vcf_plan_int_value(&s, &out[i]) < 0) + return -1; + if (*s != ',') { + i++; + break; + } + s++; + } + for (; i < width; i++) + out[i] = bcf_int32_vector_end; + if (*s == ',') + return -1; + *sp = s; + return 0; +} + +static inline int vcf_plan_expect_sep(const char **sp, int sep) +{ + if (**sp != sep) + return -1; + (*sp)++; + return 0; +} + +static inline int vcf_plan_skip_field(const char **sp, int sep) +{ + const char *s = *sp; + while (*s && *s != sep && *s != '\t') + s++; + if (*s != sep) + return -1; + *sp = s + 1; + return 0; +} + +static inline int vcf_plan_measure_string(const char **sp, int sep, int *max_l) +{ + const char *s = *sp, *t = s; + int l; + + while (*t && *t != sep && *t != 
'\t') + t++; + if (*t != sep) + return -1; + l = t - s; + if (*max_l < l) + *max_l = l; + *sp = t + 1; + return 0; +} + +static inline int vcf_plan_copy_string(const char **sp, char *out, int width) +{ + const char *s = *sp, *t = s; + int l; + + while (*t && *t != ':' && *t != '\t') + t++; + l = t - s; + if (l > width) + return -1; + memcpy(out, s, l); + if (l < width) + memset(out + l, 0, width - l); + *sp = t; + return 0; +} + +static int vcf_plan_phase_widths(const bcf_hdr_t *h, const vcf_format_plan_t *plan, + kstring_t *s, char *q, int *pgt_w, int *pid_w) +{ + const char *cur = q + 1, *end = s->s + s->l; + int sample, nsamples = bcf_hdr_nsamples(h); + + *pgt_w = 0; + *pid_w = 0; + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (vcf_plan_skip_field(&cur, ':') < 0) + return -1; + if (plan->has_ab && vcf_plan_skip_field(&cur, ':') < 0) + return -1; + if (vcf_plan_skip_field(&cur, ':') < 0) + return -1; + if (vcf_plan_skip_field(&cur, ':') < 0) + return -1; + if (vcf_plan_skip_field(&cur, ':') < 0) + return -1; + if (vcf_plan_measure_string(&cur, ':', pgt_w) < 0) + return -1; + if (vcf_plan_measure_string(&cur, ':', pid_w) < 0) + return -1; + while (cur < end && *cur && *cur != '\t') + cur++; + if (*cur == '\t') + cur++; + } + if (sample != nsamples) + return -1; + // The generic FORMAT max-length pass includes the preceding ':' in + // non-GT string widths, leaving one byte of padding per sample. 
+ (*pgt_w)++; + (*pid_w)++; + return 0; +} + +static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) +{ + vcf_format_plan_t *plan; + kstring_t *mem; + int nsamples, ad_w, pl_w, sample, nwords, pgt_w = 0, pid_w = 0; + size_t gt_off, ab_off = 0, ad_off, dp_off, gq_off, pgt_off = 0, pid_off = 0, pl_off, total_bytes; + int32_t *gt, *ad, *dp, *gq, *pl; + float *ab = NULL; + char *pgt = NULL, *pid = NULL; + const char *cur, *end; + + if (!vcf_format_plan_enabled()) + return -3; + vcf_format_plan_stats.attempts++; + if (h->keep_samples) + goto fallback; + + plan = vcf_format_plan_get(h, p); + if (!plan) + goto fallback; + + nsamples = bcf_hdr_nsamples(h); + if (!nsamples) + return 0; + if (v->n_allele < 1 || v->n_allele > 8) + goto fallback; + ad_w = v->n_allele; + pl_w = v->n_allele * (v->n_allele + 1) / 2; + if (pl_w < 1 || pl_w > 36) + goto fallback; + if (plan->has_phase && vcf_plan_phase_widths(h, plan, s, q, &pgt_w, &pid_w) < 0) + goto fallback; + + mem = (kstring_t*)&h->mem; + mem->l = 0; + if (align_mem(mem) < 0) + return -1; + + total_bytes = (size_t) nsamples * (2 + ad_w + 1 + 1 + pl_w + plan->has_ab) * sizeof(int32_t); + total_bytes += (size_t) nsamples * (pgt_w + pid_w); + if (total_bytes > INT_MAX) + return -1; + if (ks_resize(mem, mem->l + total_bytes) < 0) + return -1; + + gt_off = mem->l; mem->l += (size_t) nsamples * 2 * sizeof(int32_t); + if (plan->has_ab) { + ab_off = mem->l; mem->l += (size_t) nsamples * sizeof(float); + } + ad_off = mem->l; mem->l += (size_t) nsamples * ad_w * sizeof(int32_t); + dp_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); + gq_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); + if (plan->has_phase) { + pgt_off = mem->l; mem->l += (size_t) nsamples * pgt_w; + pid_off = mem->l; mem->l += (size_t) nsamples * pid_w; + } + pl_off = mem->l; mem->l += (size_t) nsamples * pl_w * sizeof(int32_t); + + gt = (int32_t *) (mem->s + gt_off); + if (plan->has_ab) + ab = 
(float *) (mem->s + ab_off); + ad = (int32_t *) (mem->s + ad_off); + dp = (int32_t *) (mem->s + dp_off); + gq = (int32_t *) (mem->s + gq_off); + if (plan->has_phase) { + pgt = mem->s + pgt_off; + pid = mem->s + pid_off; + } + pl = (int32_t *) (mem->s + pl_off); + + cur = q + 1; + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (vcf_plan_gt2(&cur, >[sample * 2]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (plan->has_ab) { + if (vcf_plan_float_value(&cur, &ab[sample]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + } + if (vcf_plan_parse_int_vector(&cur, &ad[sample * ad_w], ad_w) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_int_value(&cur, &dp[sample]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_int_value(&cur, &gq[sample]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (plan->has_phase) { + if (vcf_plan_copy_string(&cur, &pgt[sample * pgt_w], pgt_w) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_copy_string(&cur, &pid[sample * pid_w], pid_w) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + } + if (vcf_plan_parse_int_vector(&cur, &pl[sample * pl_w], pl_w) < 0) + goto fallback; + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + goto fallback; + } + if (sample != nsamples) + goto fallback; + + v->n_fmt = plan->has_phase ? (plan->has_ab ? 8 : 7) : (plan->has_ab ? 
6 : 5); + v->n_sample = nsamples; + bcf_enc_int1(&v->indiv, plan->key_gt); + if (bcf_enc_vint(&v->indiv, nsamples * 2, gt, 2) < 0) + return -1; + if (plan->has_ab) { + bcf_enc_int1(&v->indiv, plan->key_ab); + bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT); + if (serialize_float_array(&v->indiv, nsamples, ab) < 0) + return -1; + } + bcf_enc_int1(&v->indiv, plan->key_ad); + nwords = nsamples * ad_w; + if (bcf_enc_vint(&v->indiv, nwords, ad, ad_w) < 0) + return -1; + bcf_enc_int1(&v->indiv, plan->key_dp); + if (bcf_enc_vint(&v->indiv, nsamples, dp, 1) < 0) + return -1; + bcf_enc_int1(&v->indiv, plan->key_gq); + if (bcf_enc_vint(&v->indiv, nsamples, gq, 1) < 0) + return -1; + if (plan->has_phase) { + bcf_enc_int1(&v->indiv, plan->key_pgt); + if (bcf_enc_size(&v->indiv, pgt_w, BCF_BT_CHAR) < 0) + return -1; + if (kputsn(pgt, (size_t) nsamples * pgt_w, &v->indiv) < 0) + return -1; + bcf_enc_int1(&v->indiv, plan->key_pid); + if (bcf_enc_size(&v->indiv, pid_w, BCF_BT_CHAR) < 0) + return -1; + if (kputsn(pid, (size_t) nsamples * pid_w, &v->indiv) < 0) + return -1; + } + bcf_enc_int1(&v->indiv, plan->key_pl); + nwords = nsamples * pl_w; + if (bcf_enc_vint(&v->indiv, nwords, pl, pl_w) < 0) + return -1; + + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += nsamples; + return 0; + +fallback: + vcf_format_plan_stats.fallback++; + return -3; +} + // detect FORMAT "." 
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const char *p, const char *q) { @@ -3686,7 +4139,13 @@ static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { + int pret; if ( !bcf_hdr_nsamples(h) ) return 0; + + pret = vcf_parse_format_planned(s, h, v, p, q); + if (pret != -3) + return pret; + kstring_t *mem = (kstring_t*)&h->mem; mem->l = 0; @@ -3984,6 +4443,230 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p return -1; } +typedef struct { + uint64_t attempts; + uint64_t hits; + uint64_t fallback; + uint64_t tabs; +} vcf_simd_probe_stats_t; + +static vcf_simd_probe_stats_t vcf_simd_probe_stats; + +static int vcf_simd_tabs_enabled(void) +{ + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("HTS_VCF_SIMD_TABS"); + enabled = env && env[0] && strcmp(env, "0") != 0; + } + return enabled; +} + +void hts_vcf_simd_probe_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, uint64_t *tabs) +{ + if (attempts) *attempts = vcf_simd_probe_stats.attempts; + if (hits) *hits = vcf_simd_probe_stats.hits; + if (fallback) *fallback = vcf_simd_probe_stats.fallback; + if (tabs) *tabs = vcf_simd_probe_stats.tabs; +} + +static int vcf_find_tabs_scalar(const char *s, size_t len, + size_t *tabs, int max_tabs) +{ + int n = 0; + size_t i; + for (i = 0; i < len && n < max_tabs; i++) { + if (s[i] == '\t') + tabs[n++] = i; + } + return n; +} + +static int vcf_find_tabs_simd(const char *s, size_t len, + size_t *tabs, int max_tabs) +{ + int n = 0; + size_t i = 0; + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) + const uint8x16_t tab = vdupq_n_u8('\t'); + for (; i + 16 <= len && n < max_tabs; i += 16) { + uint8x16_t bytes = vld1q_u8((const uint8_t *) s + i); + uint8x16_t eq = vceqq_u8(bytes, tab); + uint64_t lo = vgetq_lane_u64(vreinterpretq_u64_u8(eq), 0); + uint64_t hi = 
vgetq_lane_u64(vreinterpretq_u64_u8(eq), 1); + uint32_t mask = 0; + int j; + + for (j = 0; j < 8; j++) mask |= ((lo >> (j * 8)) & 0x80) ? 1u << j : 0; + for (j = 0; j < 8; j++) mask |= ((hi >> (j * 8)) & 0x80) ? 1u << (j + 8) : 0; + + while (mask && n < max_tabs) { + unsigned bit = (unsigned) __builtin_ctz(mask); + tabs[n++] = i + bit; + mask &= mask - 1; + } + } +#elif defined(__AVX2__) + const __m256i tab = _mm256_set1_epi8('\t'); + for (; i + 32 <= len && n < max_tabs; i += 32) { + __m256i bytes = _mm256_loadu_si256((const __m256i *) (s + i)); + uint32_t mask = (uint32_t) _mm256_movemask_epi8(_mm256_cmpeq_epi8(bytes, tab)); + while (mask && n < max_tabs) { + unsigned bit = (unsigned) __builtin_ctz(mask); + tabs[n++] = i + bit; + mask &= mask - 1; + } + } +#endif + + for (; i < len && n < max_tabs; i++) { + if (s[i] == '\t') + tabs[n++] = i; + } + + return n; +} + +#define VCF_NOT_DOT_FIELD(p) (memcmp((p), ".\0", 2)) + +static int vcf_parse_simd_tabs(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) +{ + int ret = -2, overflow = 0, ntabs, i; + size_t tabs[9], line_end; + char *base, *p, *q, *r, *t; + kstring_t *str; + khint_t k; + vdict_t *d; + + if (!vcf_simd_tabs_enabled()) + return -3; + + vcf_simd_probe_stats.attempts++; + if (!s || !h || !v || !(s->s)) + return -3; + if (ks_resize(s, s->l + 4) < 0) + return -2; + + base = s->s; + line_end = s->l; + ntabs = vcf_find_tabs_simd(base, line_end, tabs, 9); + vcf_simd_probe_stats.tabs += ntabs; + if (ntabs < 7) { + vcf_simd_probe_stats.fallback++; + return -3; + } + + s->s[s->l + 0] = 0; + s->s[s->l + 1] = 0; + s->s[s->l + 2] = 0; + s->s[s->l + 3] = 0; + + bcf_clear1(v); + str = &v->shared; + for (i = 0; i < 7; i++) + base[tabs[i]] = 0; + + p = base; + d = (vdict_t*)h->dict[BCF_DT_CTG]; + k = kh_get(vdict, d, p); + if (k == kh_end(d)) { + hts_log_warning("Contig '%s' is not defined in the header. 
(Quick workaround: index the file with tabix.)", p); + v->errcode = BCF_ERR_CTG_UNDEF; + if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { + hts_log_error("Could not add dummy header for contig '%s'", p); + v->errcode |= BCF_ERR_CTG_INVALID; + goto err; + } + } + v->rid = kh_val(d, k).id; + + p = base + tabs[0] + 1; + overflow = 0; + t = p; + v->pos = hts_str2uint(p, &p, 62, &overflow); + if (overflow) { + hts_log_error("Position value '%s' is too large", t); + goto err; + } else if (*p) { + hts_log_error("Could not parse the position '%s'", t); + goto err; + } else { + v->pos -= 1; + } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; + + p = base + tabs[1] + 1; + q = base + tabs[2]; + if (VCF_NOT_DOT_FIELD(p)) bcf_enc_vchar(str, q - p, p); + else bcf_enc_size(str, 0, BCF_BT_CHAR); + + p = base + tabs[2] + 1; + q = base + tabs[3]; + bcf_enc_vchar(str, q - p, p); + v->n_allele = 1, v->rlen = q - p; + + p = base + tabs[3] + 1; + q = base + tabs[4]; + if (VCF_NOT_DOT_FIELD(p)) { + for (r = t = p;; ++r) { + if (*r == ',' || *r == 0) { + if (v->n_allele == UINT16_MAX) { + hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, + bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; + goto err; + } + bcf_enc_vchar(str, r - t, t); + t = r + 1; + ++v->n_allele; + } + if (r == q) break; + } + } + + p = base + tabs[4] + 1; + if (VCF_NOT_DOT_FIELD(p)) v->qual = atof(p); + else bcf_float_set_missing(v->qual); + if (v->max_unpack && !(v->max_unpack>>1)) goto end; + + p = base + tabs[5] + 1; + q = base + tabs[6]; + if (VCF_NOT_DOT_FIELD(p)) { + if (vcf_parse_filter(str, h, v, p, q)) + goto err; + } else bcf_enc_vint(str, 0, 0, -1); + if (v->max_unpack && !(v->max_unpack>>2)) goto end; + + p = base + tabs[6] + 1; + q = ntabs > 7 ? 
base + tabs[7] : base + line_end; + if (ntabs > 7) + *q = 0; + if (VCF_NOT_DOT_FIELD(p)) { + if (vcf_parse_info(str, h, v, p, q)) + goto err; + } + if (v->max_unpack && !(v->max_unpack>>3)) goto end; + + if (ntabs > 7) { + p = base + tabs[7] + 1; + q = ntabs > 8 ? base + tabs[8] : base + line_end; + *q = 0; + if (vcf_parse_format(s, h, v, p, q)) + goto err; + } + + end: + v->rlen = get_rlen(h, v); + ret = 0; + vcf_simd_probe_stats.hits++; + + err: + return ret; +} + int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { int ret = -2, overflow = 0; @@ -4001,6 +4684,11 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (!s || !h || !v || !(s->s)) return ret; + ret = vcf_parse_simd_tabs(s, h, v); + if (ret != -3) + return ret; + ret = -2; + // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); From 3ccf1c8a1bb5948e2aba5b460af66c29aafbd585 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 15:32:47 +0200 Subject: [PATCH 02/38] Document CCDG FORMAT parser benchmark --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 201 +++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 docs/CCDG_FORMAT_PLAN_BENCHMARK.md diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md new file mode 100644 index 000000000..cdd8e955a --- /dev/null +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -0,0 +1,201 @@ +# CCDG FORMAT Plan MVP Benchmark + +Date: 2026-04-28 + +Worktree: `/tmp/htslib-vcf-avx-sanity` + +Branch: `codex/vcf-avx-sanity` + +## Goal + +Estimate whether a runtime-planned VCF FORMAT parser can improve end-to-end +compressed VCF/BCF conversion performance on a wide CCDG VCF. + +The MVP implementation is gated by: + +```sh +HTS_VCF_FORMAT_PLAN=1 +HTS_VCF_FORMAT_PLAN_STATS=1 +``` + +It dynamically caches observed FORMAT layouts. 
The current MVP has direct +executors for the four dominant CCDG layouts: + +```text +GT:AB:AD:DP:GQ:PL +GT:AD:DP:GQ:PL +GT:AB:AD:DP:GQ:PGT:PID:PL +GT:AD:DP:GQ:PGT:PID:PL +``` + +Other layouts fall back to the existing generic FORMAT parser. + +## Data + +Source file: + +```text +/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +``` + +Subset used for this benchmark: + +```text +/tmp/ccdg_chr22_10k.vcf +``` + +The subset contains 10,000 variant records plus header lines. It is wide: +3,202 samples and about 866 MiB uncompressed. + +Compressed inputs prepared from the subset: + +```text +/tmp/ccdg_chr22_10k.vcf.gz +/tmp/ccdg_chr22_10k.bcf +``` + +Approximate input sizes: + +```text +ccdg_chr22_10k.vcf.gz 118 MiB by ls, 129 MiB by du +ccdg_chr22_10k.bcf 152 MiB by ls, 160 MiB by du +``` + +## FORMAT Coverage + +On the 10k CCDG subset after adding `PGT:PID` support: + +```text +attempts=10000 +hits=10000 +fallback=0 +parsed_samples=32020000 +``` + +The planned parser therefore handled 100% of records and parsed 32.0 million +sample FORMAT entries directly. + +For comparison, before `PGT:PID` support, coverage was: + +```text +attempts=10000 +hits=5494 +fallback=4506 +parsed_samples=17591788 +``` + +The fallback records were almost entirely the two layouts containing +`PGT:PID`. + +## Four-Cell Compressed Conversion Benchmark + +All cells are compressed input to compressed output. Each timing is a single +wall-clock run using `/usr/bin/time -p`; treat these as directional, not a +statistically rigorous benchmark. 
+ +| Conversion | Baseline real | FORMAT plan real | Change | +|---|---:|---:|---:| +| VCF.gz -> BCF | 9.150 s | 8.266 s | 9.7% faster | +| BCF -> BCF | 7.168 s | 7.221 s | neutral, 0.7% slower | +| BCF -> VCF.gz | 11.367 s | 11.487 s | neutral, 1.1% slower | +| VCF.gz -> VCF.gz | 13.405 s | 12.670 s | 5.5% faster | + +Command shapes: + +```sh +./test/test_view.baseline -b -p /tmp/bench_base_vcf_to_bcf.bcf /tmp/ccdg_chr22_10k.vcf.gz +env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -b -p /tmp/bench_plan_vcf_to_bcf.bcf /tmp/ccdg_chr22_10k.vcf.gz + +./test/test_view.baseline -b -p /tmp/bench_base_bcf_to_bcf.bcf /tmp/ccdg_chr22_10k.bcf +env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -b -p /tmp/bench_plan_bcf_to_bcf.bcf /tmp/ccdg_chr22_10k.bcf + +./test/test_view.baseline -z -p /tmp/bench_base_bcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.bcf +env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -z -p /tmp/bench_plan_bcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.bcf + +./test/test_view.baseline -z -p /tmp/bench_base_vcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.vcf.gz +env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -z -p /tmp/bench_plan_vcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.vcf.gz +``` + +For each cell, the baseline output and planned-parser output were compared with +`cmp` and matched byte-for-byte. The BCF-input cells have +`attempts=0 hits=0 fallback=0` because they never enter the VCF text FORMAT +parser. + +## Compressed VCF to Uncompressed BCF + +This additional case keeps compressed VCF input but removes output compression +by writing BCF at compression level 0. 
+ +| Conversion | Baseline real | FORMAT plan real | Change | +|---|---:|---:|---:| +| VCF.gz -> uncompressed BCF | 2.817 s | 1.930 s | 31.5% faster | + +Command shape: + +```sh +./test/test_view.baseline -b -l 0 -p /tmp/bench_base_vcfgz_to_ubcf.bcf /tmp/ccdg_chr22_10k.vcf.gz +env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -b -l 0 -p /tmp/bench_plan_vcfgz_to_ubcf.bcf /tmp/ccdg_chr22_10k.vcf.gz +``` + +The baseline and planned-parser outputs were compared with `cmp` and matched +byte-for-byte. + +## Parse-Only Reference Timings + +For context, earlier parse-only tests on the same subsets showed a much larger +effect because output compression was removed from the critical path: + +| Dataset | Baseline parse-only | FORMAT plan parse-only | Change | +|---|---:|---:|---:| +| 10k CCDG subset, pre-`PGT:PID` executor | about 2.30 s | about 1.64 s | about 29% faster | +| 100k CCDG subset, pre-`PGT:PID` executor | 23.94 s | 17.71 s | about 26% faster | +| 100k CCDG subset, all-hit executor | 24.22 s | 14.95 s | about 38% faster | +| 100k CCDG VCF.gz -> uncompressed BCF, all-hit executor | 26.65 s | 18.12 s | about 32% faster | + +The all-hit executor was byte-identical against baseline on the 10k BCF output +and on a targeted one-record phased-layout test. + +## Profiling Notes + +After `PGT:PID` support, the generic FORMAT fallback is no longer a meaningful +cost for the CCDG benchmark. A macOS `sample` profile of +`VCF.gz -> uncompressed BCF` on the 100k subset showed the next hot areas: + +```text +vcf_plan_parse_int_vector 189 samples +libdeflate input decompress 158 samples +vcf_parse_format 154 samples +bcf_enc_vint 83 samples +vcf_plan_int_value 42 samples +vcf_plan_copy_string 33 samples +vcf_plan_gt2 27 samples +vcf_plan_float_value 24 samples +read 16 samples +``` + +This is a statistical sample, not exact cycle accounting, but it is useful +directionally. 
The next parser-side targets are direct integer-vector parsing +for AD/PL and reducing repeated `bcf_enc_vint` work in the planned path. + +## Findings + +The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, +parse-heavy VCF to uncompressed BCF conversion improves by about 30-40% on the +100k subset. + +For fully compressed-to-compressed conversion, output/input compression and VCF +formatting absorb much of the parser win. The MVP still improved VCF-input +conversions by about 5-10%, while BCF-input conversions were unchanged as +expected. When output compression is removed, VCF.gz to uncompressed BCF improves +by about 32%, much closer to the parse-only gain. + +The practical takeaway is that FORMAT planning is a better optimization target +than top-level VCF delimiter SIMD scanning. The earlier delimiter-only probe had +100% record coverage but was essentially neutral, while FORMAT planning moved +the parse-heavy workload substantially. + +The next highest-value extension is not more FORMAT layout coverage for this +CCDG benchmark, because coverage is already 100%. It is reducing the cost inside +the planned path: AD/PL integer-vector parsing, BCF integer encoding, and then +possibly pipelining decompression/parse/encode once the single-threaded parser +work has been squeezed further. From 1e46cb8e08592e280f2ea6d8bf598fa5a28a695c Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 15:40:35 +0200 Subject: [PATCH 03/38] Optimize planned AD and PL parsing --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 40 ++++++++- vcf.c | 139 +++++++++++++++++++++-------- 2 files changed, 138 insertions(+), 41 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index cdd8e955a..0f15fd33d 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -177,6 +177,39 @@ This is a statistical sample, not exact cycle accounting, but it is useful directionally. 
The next parser-side targets are direct integer-vector parsing for AD/PL and reducing repeated `bcf_enc_vint` work in the planned path. +## Follow-Up: Fixed-Width AD/PL Parsing + +The first follow-up optimization added fixed-width planned parsers for the most +common biallelic case: + +```text +AD width = 2 +PL width = 3 +``` + +On the 10k subset, about 82% of records are biallelic, so this removes a large +number of generic integer-vector loop iterations and helper calls while leaving +multi-allelic rows on the generic planned-vector parser. + +Correctness checks remained byte-identical against baseline for: + +```text +/tmp/ccdg_one_phase.vcf -> uncompressed BCF +/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF +``` + +Directional timings after the fixed-width parser change: + +| Dataset | Previous all-hit plan | Fixed-width AD/PL plan | Change | +|---|---:|---:|---:| +| 100k CCDG VCF -> uncompressed BCF | 14.95 s | 13.1-13.6 s | about 9-12% faster | +| 100k CCDG VCF.gz -> uncompressed BCF | 18.12 s | 15.6-16.5 s | about 9-14% faster | + +An attempted range-tracked replacement for `bcf_enc_vint` was also tested. It +preserved byte identity, but it slowed these same parse-heavy cases, so it was +not kept. The likely issue is that tracking ranges during parse adds enough +per-value work to outweigh skipping `bcf_enc_vint`'s later range scan. + ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, @@ -196,6 +229,7 @@ the parse-heavy workload substantially. The next highest-value extension is not more FORMAT layout coverage for this CCDG benchmark, because coverage is already 100%. It is reducing the cost inside -the planned path: AD/PL integer-vector parsing, BCF integer encoding, and then -possibly pipelining decompression/parse/encode once the single-threaded parser -work has been squeezed further. 
+the planned path and then possibly pipelining decompression/parse/encode once +the single-threaded parser work has been squeezed further. After the fixed-width +AD/PL parser, `bcf_enc_vint` and input decompression remain the most obvious +next bottlenecks. diff --git a/vcf.c b/vcf.c index ab0532f7c..1381db4df 100644 --- a/vcf.c +++ b/vcf.c @@ -3185,6 +3185,12 @@ typedef struct { int key_pl; } vcf_format_plan_t; +#if defined(__GNUC__) +#define VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) +#else +#define VCF_PLAN_ALWAYS_INLINE static inline +#endif + static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, vcf_format_plan_t *plan) { @@ -3243,7 +3249,7 @@ static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *fo return cache[ncache++].supported ? &cache[ncache-1] : NULL; } -static inline int vcf_plan_gt2(const char **sp, int32_t out[2]) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2(const char **sp, int32_t out[2]) { const char *s = *sp; int a0, a1, phased; @@ -3267,7 +3273,7 @@ static inline int vcf_plan_gt2(const char **sp, int32_t out[2]) return 0; } -static inline int vcf_plan_int_value(const char **sp, int32_t *out) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) { const char *s = *sp; int sign = 1, val = 0; @@ -3292,7 +3298,7 @@ static inline int vcf_plan_int_value(const char **sp, int32_t *out) return 0; } -static inline int vcf_plan_float_value(const char **sp, float *out) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) { const char *s = *sp; char *end = NULL; @@ -3332,7 +3338,56 @@ static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) return 0; } -static inline int vcf_plan_expect_sep(const char **sp, int sep) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t *out) +{ + const char *s = *sp; + + if (vcf_plan_int_value(&s, &out[0]) < 0) + return -1; + if (*s != ',') { + out[1] = 
bcf_int32_vector_end; + *sp = s; + return 0; + } + s++; + if (vcf_plan_int_value(&s, &out[1]) < 0) + return -1; + if (*s == ',') + return -1; + *sp = s; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t *out) +{ + const char *s = *sp; + + if (vcf_plan_int_value(&s, &out[0]) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + *sp = s; + return 0; + } + s++; + if (vcf_plan_int_value(&s, &out[1]) < 0) + return -1; + if (*s != ',') { + out[2] = bcf_int32_vector_end; + *sp = s; + return 0; + } + s++; + if (vcf_plan_int_value(&s, &out[2]) < 0) + return -1; + if (*s == ',') + return -1; + *sp = s; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_expect_sep(const char **sp, int sep) { if (**sp != sep) return -1; @@ -3340,7 +3395,7 @@ static inline int vcf_plan_expect_sep(const char **sp, int sep) return 0; } -static inline int vcf_plan_skip_field(const char **sp, int sep) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_skip_field(const char **sp, int sep) { const char *s = *sp; while (*s && *s != sep && *s != '\t') @@ -3351,7 +3406,7 @@ static inline int vcf_plan_skip_field(const char **sp, int sep) return 0; } -static inline int vcf_plan_measure_string(const char **sp, int sep, int *max_l) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_measure_string(const char **sp, int sep, int *max_l) { const char *s = *sp, *t = s; int l; @@ -3367,7 +3422,7 @@ static inline int vcf_plan_measure_string(const char **sp, int sep, int *max_l) return 0; } -static inline int vcf_plan_copy_string(const char **sp, char *out, int width) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int width) { const char *s = *sp, *t = s; int l; @@ -3494,29 +3549,33 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, cur = q + 1; end = s->s + s->l; - for (sample = 0; sample < nsamples && cur < end; sample++) { - if (vcf_plan_gt2(&cur, &gt[sample * 2]) < 0) - goto
fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (vcf_plan_gt2(&cur, &gt[sample * 2]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; if (plan->has_ab) { if (vcf_plan_float_value(&cur, &ab[sample]) < 0) goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; } - if (vcf_plan_parse_int_vector(&cur, &ad[sample * ad_w], ad_w) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_int_value(&cur, &dp[sample]) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_int_value(&cur, &gq[sample]) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) + if (ad_w == 2) { + if (vcf_plan_parse_int_vector2(&cur, &ad[sample * 2]) < 0) + goto fallback; + } else if (vcf_plan_parse_int_vector(&cur, &ad[sample * ad_w], ad_w) < 0) { goto fallback; + } + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_int_value(&cur, &dp[sample]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_int_value(&cur, &gq[sample]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; if (plan->has_phase) { if (vcf_plan_copy_string(&cur, &pgt[sample * pgt_w], pgt_w) < 0) goto fallback; @@ -3527,10 +3586,14 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; } - if (vcf_plan_parse_int_vector(&cur, &pl[sample * pl_w], pl_w) < 0) - goto fallback; - if (*cur == '\t') - cur++; + if (pl_w == 3) { + if (vcf_plan_parse_int_vector3(&cur, &pl[sample * 3]) < 0) + goto fallback; + } else if (vcf_plan_parse_int_vector(&cur, &pl[sample * pl_w], pl_w) < 0) { + goto fallback; + } + if (*cur == '\t') + cur++; else if (*cur == '\0' || cur >= end) ; else @@ -3543,20 +3606,20 @@ static int vcf_parse_format_planned(kstring_t *s,
const bcf_hdr_t *h, bcf1_t *v, v->n_sample = nsamples; bcf_enc_int1(&v->indiv, plan->key_gt); if (bcf_enc_vint(&v->indiv, nsamples * 2, gt, 2) < 0) - return -1; + return -1; if (plan->has_ab) { bcf_enc_int1(&v->indiv, plan->key_ab); bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT); if (serialize_float_array(&v->indiv, nsamples, ab) < 0) return -1; } - bcf_enc_int1(&v->indiv, plan->key_ad); - nwords = nsamples * ad_w; - if (bcf_enc_vint(&v->indiv, nwords, ad, ad_w) < 0) - return -1; - bcf_enc_int1(&v->indiv, plan->key_dp); - if (bcf_enc_vint(&v->indiv, nsamples, dp, 1) < 0) - return -1; + bcf_enc_int1(&v->indiv, plan->key_ad); + nwords = nsamples * ad_w; + if (bcf_enc_vint(&v->indiv, nwords, ad, ad_w) < 0) + return -1; + bcf_enc_int1(&v->indiv, plan->key_dp); + if (bcf_enc_vint(&v->indiv, nsamples, dp, 1) < 0) + return -1; bcf_enc_int1(&v->indiv, plan->key_gq); if (bcf_enc_vint(&v->indiv, nsamples, gq, 1) < 0) return -1; @@ -3573,9 +3636,9 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return -1; } bcf_enc_int1(&v->indiv, plan->key_pl); - nwords = nsamples * pl_w; - if (bcf_enc_vint(&v->indiv, nwords, pl, pl_w) < 0) - return -1; + nwords = nsamples * pl_w; + if (bcf_enc_vint(&v->indiv, nwords, pl, pl_w) < 0) + return -1; vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; From ccffd35524e0c7312c08aa34765a4c98c9d99f25 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 16:37:32 +0200 Subject: [PATCH 04/38] Add general VCF FORMAT plan prototype --- docs/FORMAT_PLAN_SPEC.md | 103 +++++++++++ test/format-plan-edge.vcf | 23 +++ test/test_format_plan.sh | 16 ++ vcf.c | 354 +++++++++++++++++++++++++++++++++++++- 4 files changed, 494 insertions(+), 2 deletions(-) create mode 100644 docs/FORMAT_PLAN_SPEC.md create mode 100644 test/format-plan-edge.vcf create mode 100755 test/test_format_plan.sh diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md new file mode 100644 index 000000000..11730c53a --- 
/dev/null +++ b/docs/FORMAT_PLAN_SPEC.md @@ -0,0 +1,103 @@ +# FORMAT Plan Parser Spec + +This document describes the intended direction for the experimental +`HTS_VCF_FORMAT_PLAN=1` VCF FORMAT parser. + +## Goal + +Keep the existing parser as the source of truth, but add a runtime-compiled +fast path for common FORMAT layouts. The fast path should be opportunistic: +compile a plan for repeated FORMAT strings, execute known-safe operations +directly, and fall back to the generic parser whenever the record leaves the +supported subset. + +## Architecture + +The parser is tiered: + +1. Exact kernels for dominant production layouts. The current CCDG kernels cover + `GT:AB:AD:DP:GQ:PL`, `GT:AD:DP:GQ:PL`, + `GT:AB:AD:DP:GQ:PGT:PID:PL`, and `GT:AD:DP:GQ:PGT:PID:PL`. +2. A compiled op-list interpreter for regular FORMAT layouts. It caches the + FORMAT string, resolves header IDs once, then executes per-field operations + for GT, integer vectors, float vectors, and strings. +3. Generic htslib parsing for everything else, including sample subsetting, + duplicate FORMAT tags, undefined tags that require dummy header insertion, + unsupported header types, malformed values, or future VCF constructs. + +The cache key is the literal FORMAT column. Record-specific widths are still +computed per row because BCF stores each FORMAT field as a rectangular +sample-by-value array, and the width depends on observed ploidy, vector length, +string length, and allele count. + +## Correctness Rules + +The planned parser must produce byte-identical BCF to the generic parser for any +record it claims. If it cannot prove that, it must return `-3` so the existing +parser handles the record. + +Required invariants: + +- No planned parsing while `h->keep_samples` is active. +- Header IDs and types are resolved before execution. +- Duplicate tags use the generic parser. +- Undefined tags use the generic parser, preserving current dummy-header + behavior and warnings. 
+- GT encoding must match generic htslib phasing semantics, including haploid + genotypes and VCF 4.4 prefix phasing. +- Numeric vectors use observed row width and pad shorter samples with vector-end + sentinels. +- Strings use observed maximum byte length and zero-pad shorter samples. +- Integer and float overflow/error behavior should either match generic htslib + or force fallback. + +## Current MVP + +The current implementation keeps the CCDG exact kernels as the first tier and +adds a general compiled op-list tier for defined FORMAT fields with type +`String`, `Integer`, or `Float`. The op-list tier handles: + +- arbitrary field order, +- haploid, diploid, multidigit, missing, and phased GT values, +- integer and float vectors with row-local observed widths, +- string fields with row-local observed widths, +- multiallelic `Number=R` and `Number=G` rows by using observed vector width. + +The MVP intentionally falls back for sample subsetting, duplicate tags, +undefined tags, unsupported header types, and malformed values. + +## Edge Fixture + +`test/format-plan-edge.vcf` is CCDG-shaped but includes records that exercise +common awkward cases: + +- the exact CCDG layouts, +- reordered fields, +- multiallelic AD/PL and GL, +- haploid GT, +- multidigit allele indexes, +- fixed integer vectors, +- string FORMAT fields, +- exact-kernel fallbacks such as haploid GT and multidigit allele indexes. + +Run: + +```sh +./test/test_format_plan.sh +``` + +The script writes BCF through the generic parser and through +`HTS_VCF_FORMAT_PLAN=1`, compares them with `cmp`, and prints plan hit/fallback +statistics. + +## Next Work + +- Add more exact kernels only after coverage data shows that they dominate real + inputs. +- Split the op-list interpreter into smaller specialized op handlers so common + shapes like scalar int, fixed-width int vector, biallelic AD, and biallelic PL + can avoid generic vector loops. 
+- Add overflow-compatible numeric parsing or force fallback before committing to + the plan on extreme integer/float values. +- Integrate the edge fixture into the standard htslib test runner once the + experimental flag graduates beyond local benchmarking. diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf new file mode 100644 index 000000000..2b7f05373 --- /dev/null +++ b/test/format-plan-edge.vcf @@ -0,0 +1,23 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10510061 . A T 64.12 PASS . GT:AB:AD:DP:GQ:PL 0/0:.:3,0:3:9:0,9,104 0/1:0.5:5,4:9:99:99,0,123 ./.:.:0,0:0:.:. +chr22 10510352 . AT A 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 1/1:0,5:5:15:1|1:10510352_AT_A:225,15,0 0/1:3,2:5:20:0|1:10510352_AT_A:20,0,200 ./.:0,0:0:.:.:.:. +chr22 10520000 . A C,G 50 PASS . GT:AD:DP:GQ:PL 1/2:1,4,5:10:60:100,80,70,60,0,20 0/2:3,0,2:5:30:80,70,60,50,40,0 ./.:0,0,0:0:.:. +chr22 10530000 . G A 50 PASS . GT:DP:AD:GQ:PL 0/1:7:3,4:42:99,0,120 0/0:5:5,0:15:0,15,200 ./.:0:0,0:.:. +chr22 10540000 . C T 50 PASS . GT:HQ:DP:GQ 0/1:10,20:7:40 0/0:.,.:5:50 ./.:.:0:. +chr22 10550000 . C T 50 PASS . GT:FT:DP:GQ 0/1:PASS:7:40 0/0:LowQual:5:50 ./.:.:0:. +chr22 10560000 . A C,G 50 PASS . GT:GL:DP:GQ 0/1:-0.1,-1.2,-9.9,-2.0,-3.0,-4.0:7:40 1/2:-9.9,-8.8,-7.7,-6.6,-5.5,-4.4:5:50 ./.:.:0:. +chr22 10570000 . A T 50 PASS . GT:AD:DP:GQ:PL 0:3,0:3:10:0,10,100 1:0,3:3:20:100,10,0 .:0,0:0:.:. +chr22 10580000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . 
GT:AD:DP:GQ:PL 10/10:0,0,0,0,0,0,0,0,0,0,7:7:20:200,190,180,170,160,150,140,130,120,110,100,90,80,70,60,50,40,30,20,10,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460 0/10:3,0,0,0,0,0,0,0,0,0,2:5:30:0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650 ./.:0,0,0,0,0,0,0,0,0,0,0:0:.:. diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh new file mode 100755 index 000000000..1c5e71ecc --- /dev/null +++ b/test/test_format_plan.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -eu + +test_view=${TEST_VIEW:-./test/test_view} +input=${1:-test/format-plan-edge.vcf} +tmpdir=${TMPDIR:-/tmp} +base=${tmpdir}/hts-format-plan-base.$$ +plan=${tmpdir}/hts-format-plan-plan.$$ +stats=${tmpdir}/hts-format-plan-stats.$$ + +trap 'rm -f "$base" "$plan" "$stats"' EXIT HUP INT TERM + +env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 "$input" > "$base" +env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$plan" 2> "$stats" +cmp "$base" "$plan" +cat "$stats" diff --git a/vcf.c b/vcf.c index 1381db4df..bb3ba9097 100644 --- a/vcf.c +++ b/vcf.c @@ -3172,6 +3172,7 @@ static int vcf_format_plan_enabled(void) typedef struct { char format[64]; + const bcf_hdr_t *hdr; int supported; int has_ab; int has_phase; @@ -3185,6 +3186,20 @@ typedef struct { int key_pl; } vcf_format_plan_t; +typedef struct { + int key; + uint8_t htype; + uint8_t is_gt; +} vcf_format_op_t; + +typedef struct { + char format[256]; + const bcf_hdr_t *hdr; + int supported; + int n_ops; + vcf_format_op_t ops[MAX_N_FMT]; +} vcf_format_general_plan_t; + #if defined(__GNUC__) #define VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) #else @@ -3198,6 +3213,7 @@ 
static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, if (strlen(format) >= sizeof(plan->format)) return 0; strcpy(plan->format, format); + plan->hdr = h; if (strcmp(format, "GT:AB:AD:DP:GQ:PL") == 0) { plan->supported = 1; @@ -3240,7 +3256,7 @@ static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *fo int i; for (i = 0; i < ncache; i++) - if (strcmp(cache[i].format, format) == 0) + if (cache[i].hdr == h && strcmp(cache[i].format, format) == 0) return cache[i].supported ? &cache[i] : NULL; if (ncache == N_PLAN_CACHE) @@ -3249,6 +3265,67 @@ static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *fo return cache[ncache++].supported ? &cache[ncache-1] : NULL; } +static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *format, + vcf_format_general_plan_t *plan) +{ + char tmp[256], *tok, *saveptr = NULL; + int i; + + memset(plan, 0, sizeof(*plan)); + if (strlen(format) >= sizeof(plan->format)) + return 0; + strcpy(plan->format, format); + strcpy(tmp, format); + plan->hdr = h; + + for (tok = strtok_r(tmp, ":", &saveptr); tok; + tok = strtok_r(NULL, ":", &saveptr)) { + int key, htype; + + if (plan->n_ops >= MAX_N_FMT) + return 0; + key = bcf_hdr_id2int(h, BCF_DT_ID, tok); + if (key < 0 || !bcf_hdr_idinfo_exists(h, BCF_HL_FMT, key)) + return 0; + for (i = 0; i < plan->n_ops; i++) + if (plan->ops[i].key == key) + return 0; + + htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); + if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) + return 0; + + plan->ops[plan->n_ops].key = key; + plan->ops[plan->n_ops].htype = htype; + plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 0; + plan->n_ops++; + } + + if (!plan->n_ops) + return 0; + + plan->supported = 1; + return 1; +} + +static vcf_format_general_plan_t *vcf_format_general_plan_get(const bcf_hdr_t *h, + const char *format) +{ + enum { N_GENERAL_PLAN_CACHE = 16 }; + static vcf_format_general_plan_t cache[N_GENERAL_PLAN_CACHE]; + 
static int ncache = 0; + int i; + + for (i = 0; i < ncache; i++) + if (cache[i].hdr == h && strcmp(cache[i].format, format) == 0) + return cache[i].supported ? &cache[i] : NULL; + + if (ncache == N_GENERAL_PLAN_CACHE) + return NULL; + vcf_format_general_plan_compile(h, format, &cache[ncache]); + return cache[ncache++].supported ? &cache[ncache-1] : NULL; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2(const char **sp, int32_t out[2]) { const char *s = *sp; @@ -3439,6 +3516,279 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int return 0; } +static int vcf_plan_measure_general(kstring_t *s, const bcf_hdr_t *h, + const vcf_format_general_plan_t *plan, + char *q, int *widths) +{ + const char *cur = q + 1, *end = s->s + s->l; + int sample, j, nsamples = bcf_hdr_nsamples(h); + + for (j = 0; j < plan->n_ops; j++) + widths[j] = 0; + + for (sample = 0; sample < nsamples && cur < end; sample++) { + for (j = 0; j < plan->n_ops; j++) { + const char *field = cur; + const vcf_format_op_t *op = &plan->ops[j]; + int w = 1; + + while (cur < end && *cur && *cur != ':' && *cur != '\t') { + if (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) { + if (*cur == ',') + w++; + } else if (op->is_gt) { + if (*cur == '/' || *cur == '|') + w++; + } + cur++; + } + + if (op->htype == BCF_HT_STR && !op->is_gt) { + w = cur - field; + if (j > 0) + w++; + } + if (w <= 0) + w = 1; + if (widths[j] < w) + widths[j] = w; + + if (j + 1 < plan->n_ops) { + if (*cur != ':') + return -1; + cur++; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + return -1; + } + } + } + + return sample == nsamples ? 
0 : -1; +} + +static int vcf_plan_parse_gt_dynamic(const char **sp, int32_t *out, int width, int vcf44) +{ + const char *s = *sp; + int l = 0, ploidy = 0, anyunphased = 0, phasingprfx = 0, unknown1 = 0; + int32_t is_phased = 0; + + if (vcf44 && (*s == '|' || *s == '/')) { + is_phased = *s++ == '|'; + phasingprfx = 1; + } + + for (;;) { + uint32_t val = 0; + + if (l >= width) + return -1; + ploidy++; + if (*s == '.') { + s++; + out[l++] = is_phased; + if (l == 1) + unknown1 = 1; + } else if (*s >= '0' && *s <= '9') { + do { + if (val > ((uint32_t)INT32_MAX >> 1) - 1) + return -1; + val = val * 10 + (*s - '0'); + s++; + } while (*s >= '0' && *s <= '9'); + if (val > ((uint32_t)INT32_MAX >> 1) - 1) + return -1; + out[l++] = ((val + 1) << 1) | is_phased; + } else { + return -1; + } + + anyunphased |= (ploidy != 1) && !is_phased; + is_phased = *s == '|'; + if (*s != '|' && *s != '/') + break; + s++; + } + + if (!phasingprfx) { + if (ploidy == 1) { + if (!unknown1) + out[0] |= 1; + } else { + out[0] |= anyunphased ? 
0 : 1; + } + } + for (; l < width; l++) + out[l] = bcf_int32_vector_end; + + *sp = s; + return 0; +} + +static int vcf_plan_parse_int_vector_dynamic(const char **sp, int32_t *out, int width) +{ + const char *s = *sp; + int i = 0; + + if (*s == ':' || *s == '\t' || *s == '\0') { + out[i++] = bcf_int32_missing; + } else { + for (;;) { + if (i >= width || vcf_plan_int_value(&s, &out[i]) < 0) + return -1; + i++; + if (*s != ',') + break; + s++; + } + } + for (; i < width; i++) + out[i] = bcf_int32_vector_end; + *sp = s; + return 0; +} + +static int vcf_plan_parse_float_vector_dynamic(const char **sp, float *out, int width) +{ + const char *s = *sp; + int i = 0; + + if (*s == ':' || *s == '\t' || *s == '\0') { + bcf_float_set_missing(out[i++]); + } else { + for (;;) { + if (i >= width || vcf_plan_float_value(&s, &out[i]) < 0) + return -1; + i++; + if (*s != ',') + break; + s++; + } + } + for (; i < width; i++) + bcf_float_set_vector_end(out[i]); + *sp = s; + return 0; +} + +static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, char *p, char *q) +{ + vcf_format_general_plan_t *plan; + kstring_t *mem; + int widths[MAX_N_FMT], sizes[MAX_N_FMT], offsets[MAX_N_FMT]; + int nsamples, sample, j, vcf44; + const char *cur, *end; + + plan = vcf_format_general_plan_get(h, p); + if (!plan) + goto fallback; + + nsamples = bcf_hdr_nsamples(h); + if (!nsamples) + return 0; + if (vcf_plan_measure_general(s, h, plan, q, widths) < 0) + goto fallback; + + mem = (kstring_t*)&h->mem; + mem->l = 0; + for (j = 0; j < plan->n_ops; j++) { + int size; + const vcf_format_op_t *op = &plan->ops[j]; + + if (widths[j] <= 0) + widths[j] = 1; + if (op->htype == BCF_HT_STR && !op->is_gt) + size = widths[j]; + else + size = widths[j] * (int)sizeof(int32_t); + if (size < 0 || (uint64_t) mem->l + nsamples * (uint64_t) size > INT_MAX) + return -1; + if (align_mem(mem) < 0) + return -1; + offsets[j] = mem->l; + sizes[j] = size; + if (ks_resize(mem, mem->l + nsamples * 
(size_t) size) < 0) + return -1; + mem->l += nsamples * (size_t) size; + } + + cur = q + 1; + end = s->s + s->l; + vcf44 = bcf_get_version(h, NULL) >= VCF44; + for (sample = 0; sample < nsamples && cur < end; sample++) { + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + uint8_t *buf = (uint8_t*)mem->s + offsets[j] + sample * (size_t)sizes[j]; + + if (op->is_gt) { + if (vcf_plan_parse_gt_dynamic(&cur, (int32_t *)buf, widths[j], vcf44) < 0) + goto fallback; + } else if (op->htype == BCF_HT_STR) { + if (vcf_plan_copy_string(&cur, (char *)buf, widths[j]) < 0) + goto fallback; + } else if (op->htype == BCF_HT_INT) { + if (vcf_plan_parse_int_vector_dynamic(&cur, (int32_t *)buf, widths[j]) < 0) + goto fallback; + } else if (op->htype == BCF_HT_REAL) { + if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, widths[j]) < 0) + goto fallback; + } else { + goto fallback; + } + + if (j + 1 < plan->n_ops) { + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + goto fallback; + } + } + } + if (sample != nsamples) + goto fallback; + + v->n_fmt = plan->n_ops; + v->n_sample = nsamples; + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + uint8_t *buf = (uint8_t*)mem->s + offsets[j]; + + bcf_enc_int1(&v->indiv, op->key); + if (op->htype == BCF_HT_STR && !op->is_gt) { + if (bcf_enc_size(&v->indiv, widths[j], BCF_BT_CHAR) < 0) + return -1; + if (kputsn((char *)buf, nsamples * (size_t)widths[j], &v->indiv) < 0) + return -1; + } else if (op->htype == BCF_HT_INT || op->is_gt) { + if (bcf_enc_vint(&v->indiv, nsamples * widths[j], (int32_t *)buf, widths[j]) < 0) + return -1; + } else { + if (bcf_enc_size(&v->indiv, widths[j], BCF_BT_FLOAT) < 0) + return -1; + if (serialize_float_array(&v->indiv, nsamples * (size_t)widths[j], (float *)buf) < 0) + return -1; + } + } + + vcf_format_plan_stats.hits++; + 
vcf_format_plan_stats.parsed_samples += nsamples; + return 0; + +fallback: + vcf_format_plan_stats.fallback++; + return -3; +} + static int vcf_plan_phase_widths(const bcf_hdr_t *h, const vcf_format_plan_t *plan, kstring_t *s, char *q, int *pgt_w, int *pid_w) { @@ -3496,7 +3846,7 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, plan = vcf_format_plan_get(h, p); if (!plan) - goto fallback; + return vcf_parse_format_general_planned(s, h, v, p, q); nsamples = bcf_hdr_nsamples(h); if (!nsamples) From 03e0d7f99a838c13aaf5838de6c39e2da8d84293 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 16:43:20 +0200 Subject: [PATCH 05/38] Benchmark compiled FORMAT interpreter mode --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 44 ++++++++++++++++++++++++++++++ docs/FORMAT_PLAN_SPEC.md | 4 ++- vcf.c | 21 ++++++++++---- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 0f15fd33d..45f37ef10 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -210,6 +210,50 @@ preserved byte identity, but it slowed these same parse-heavy cases, so it was not kept. The likely issue is that tracking ranges during parse adds enough per-value work to outweigh skipping `bcf_enc_vint`'s later range scan. +## Follow-Up: Compiled Op Interpreter + +A second planned-parser tier was added to test whether a more general compiled +FORMAT op interpreter can recover much of the exact-kernel benefit while +covering more layouts. The exact CCDG kernels still run for +`HTS_VCF_FORMAT_PLAN=1`; `HTS_VCF_FORMAT_PLAN=interp` skips those kernels and +uses only the compiled op interpreter. 
+ +Correctness checks: + +```text +./test/test_format_plan.sh +HTS_VCF_FORMAT_PLAN=interp ./test/test_view -b -l 0 test/format-plan-edge.vcf +/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF +``` + +All planned outputs were compared against baseline with `cmp` and matched +byte-for-byte. The CCDG 10k VCF-input cases had 10,000 attempts, 10,000 hits, +0 fallback, and 32,020,000 parsed samples for both exact and interpreter modes. + +Single-pass 10k CCDG conversion matrix, real seconds: + +| Conversion | Baseline | Exact kernels | Compiled interp | Exact vs baseline | Interp vs baseline | +|---|---:|---:|---:|---:|---:| +| VCF.gz -> BCF.gz | 9.11 | 7.97 | 8.79 | 12.5% faster | 3.5% faster | +| BCF -> BCF.gz | 7.03 | 7.06 | 7.03 | neutral | neutral | +| BCF -> VCF.gz | 11.20 | 11.32 | 11.21 | neutral | neutral | +| VCF.gz -> VCF.gz | 13.18 | 12.01 | 12.92 | 8.9% faster | 2.0% faster | +| VCF.gz -> uncompressed BCF | 2.79 | 1.64 | 2.61 | 41.2% faster | 6.5% faster | + +Parse-heavy uncompressed reference: + +| Conversion | Baseline | Exact kernels | Compiled interp | Exact vs baseline | Interp vs baseline | +|---|---:|---:|---:|---:|---:| +| VCF -> uncompressed BCF | 2.56 | 1.36 | 2.33 | 46.9% faster | 9.0% faster | + +The compiled interpreter is useful for validating the architecture, but it is +not yet where the performance is. Its per-sample dynamic dispatch, generic +width pass, generic vector loops, and indirect per-op buffer handling leave it +much closer to the baseline parser than to the exact CCDG kernels. This argues +for a hybrid approach: use the interpreter as a safe coverage layer and add +small specialized op handlers for the very common shapes inside it, especially +diploid GT, scalar ints, biallelic AD, biallelic PL, and fixed-width strings. + ## Findings The planned FORMAT parser is viable. 
With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 11730c53a..f57dd2a36 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -88,7 +88,9 @@ Run: The script writes BCF through the generic parser and through `HTS_VCF_FORMAT_PLAN=1`, compares them with `cmp`, and prints plan hit/fallback -statistics. +statistics. `HTS_VCF_FORMAT_PLAN=interp` or `HTS_VCF_FORMAT_PLAN=general` +skips the exact kernels and runs the compiled op-list interpreter directly, +which is useful for isolating interpreter performance. ## Next Work diff --git a/vcf.c b/vcf.c index bb3ba9097..59414c459 100644 --- a/vcf.c +++ b/vcf.c @@ -3160,14 +3160,19 @@ void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, if (parsed_samples) *parsed_samples = vcf_format_plan_stats.parsed_samples; } -static int vcf_format_plan_enabled(void) +static int vcf_format_plan_mode(void) { - static int enabled = -1; - if (enabled < 0) { + static int mode = -1; + if (mode < 0) { const char *env = getenv("HTS_VCF_FORMAT_PLAN"); - enabled = env && env[0] && strcmp(env, "0") != 0; + if (!env || !env[0] || strcmp(env, "0") == 0) + mode = 0; + else if (strcmp(env, "interp") == 0 || strcmp(env, "general") == 0) + mode = 2; + else + mode = 1; } - return enabled; + return mode; } typedef struct { @@ -3837,13 +3842,17 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, float *ab = NULL; char *pgt = NULL, *pid = NULL; const char *cur, *end; + int plan_mode; - if (!vcf_format_plan_enabled()) + plan_mode = vcf_format_plan_mode(); + if (!plan_mode) return -3; vcf_format_plan_stats.attempts++; if (h->keep_samples) goto fallback; + if (plan_mode == 2) + return vcf_parse_format_general_planned(s, h, v, p, q); plan = vcf_format_plan_get(h, p); if (!plan) return vcf_parse_format_general_planned(s, h, v, p, q); From f2d3db145f304ad6e7f48825bcee41bce7515b7c Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 
2026 16:55:24 +0200 Subject: [PATCH 06/38] Specialize compiled FORMAT opcode handlers --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 47 ++++++++ docs/FORMAT_PLAN_SPEC.md | 10 +- test/test_format_plan.sh | 7 +- vcf.c | 184 +++++++++++++++++++++++------ 4 files changed, 208 insertions(+), 40 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 45f37ef10..56b57ae35 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -254,6 +254,53 @@ for a hybrid approach: use the interpreter as a safe coverage layer and add small specialized op handlers for the very common shapes inside it, especially diploid GT, scalar ints, biallelic AD, biallelic PL, and fixed-width strings. +## Follow-Up: Opcode Tape Specialization + +The compiled interpreter was then changed from "inspect each op type while +parsing" to a row-specific opcode tape. The FORMAT string is still cached as a +flexible op list, but after the row width pass each op is resolved to a narrower +handler: + +```text +GT2, GT-dynamic, INT1, INT2, INT3, INTN, FLOAT1, FLOATN, STR +``` + +This preserves the flexible interpreter path for arbitrary defined +String/Integer/Float FORMAT layouts, while avoiding repeated `is_gt` / type +checks and using the same fixed-width integer helpers as the exact CCDG kernel +when the observed row width permits it. 
+ +Correctness checks remained byte-identical against baseline for: + +```text +./test/test_format_plan.sh +HTS_VCF_FORMAT_PLAN=interp ./test/test_view -b -l 0 test/format-plan-edge.vcf +/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF +``` + +Single-pass 10k CCDG conversion matrix after opcode specialization, real +seconds: + +| Conversion | Baseline | Exact kernels | Opcode interp | Exact vs baseline | Interp vs baseline | +|---|---:|---:|---:|---:|---:| +| VCF.gz -> BCF.gz | 9.19 | 7.99 | 9.28 | 13.1% faster | neutral/noisy | +| BCF -> BCF.gz | 8.04 | 8.22 | 8.10 | neutral | neutral | +| BCF -> VCF.gz | 12.71 | 12.04 | 12.99 | neutral/noisy | neutral/noisy | +| VCF.gz -> VCF.gz | 13.76 | 12.33 | 13.88 | 10.4% faster | neutral/noisy | +| VCF.gz -> uncompressed BCF | 2.87 | 1.68 | 2.43 | 41.5% faster | 15.3% faster | + +Parse-heavy uncompressed reference: + +| Conversion | Baseline | Exact kernels | Opcode interp | Exact vs baseline | Interp vs baseline | +|---|---:|---:|---:|---:|---:| +| VCF -> uncompressed BCF | 2.57 | 1.42 | 2.12 | 44.7% faster | 17.5% faster | + +Relative to the first compiled interpreter measurement, opcode specialization +improved the parse-heavy uncompressed case from 2.33 s to 2.12 s and VCF.gz to +uncompressed BCF from 2.61 s to 2.43 s. That is real movement, but the exact +kernels remain substantially faster because they also avoid the generic width +measurement, per-op buffer indirection, and per-sample opcode switch. + ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index f57dd2a36..37a5bfd82 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -66,6 +66,11 @@ adds a general compiled op-list tier for defined FORMAT fields with type The MVP intentionally falls back for sample subsetting, duplicate tags, undefined tags, unsupported header types, and malformed values. 
+After the row width pass, the interpreter resolves each cached FORMAT op to a +row-specific opcode such as `GT2`, `GT`, `INT1`, `INT2`, `INT3`, `INTN`, +`FLOAT1`, `FLOATN`, or `STR`. This keeps layout coverage flexible while +memoizing the common "muscle memory" for repeated shapes. + ## Edge Fixture `test/format-plan-edge.vcf` is CCDG-shaped but includes records that exercise @@ -96,9 +101,8 @@ which is useful for isolating interpreter performance. - Add more exact kernels only after coverage data shows that they dominate real inputs. -- Split the op-list interpreter into smaller specialized op handlers so common - shapes like scalar int, fixed-width int vector, biallelic AD, and biallelic PL - can avoid generic vector loops. +- Add plan- or shape-level executors for dominant opcode sequences so hot rows + can also avoid the per-sample opcode switch. - Add overflow-compatible numeric parsing or force fallback before committing to the plan on extreme integer/float values. - Integrate the edge fixture into the standard htslib test runner once the diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh index 1c5e71ecc..ba3a61836 100755 --- a/test/test_format_plan.sh +++ b/test/test_format_plan.sh @@ -6,11 +6,16 @@ input=${1:-test/format-plan-edge.vcf} tmpdir=${TMPDIR:-/tmp} base=${tmpdir}/hts-format-plan-base.$$ plan=${tmpdir}/hts-format-plan-plan.$$ +interp=${tmpdir}/hts-format-plan-interp.$$ stats=${tmpdir}/hts-format-plan-stats.$$ +interp_stats=${tmpdir}/hts-format-plan-interp-stats.$$ -trap 'rm -f "$base" "$plan" "$stats"' EXIT HUP INT TERM +trap 'rm -f "$base" "$plan" "$interp" "$stats" "$interp_stats"' EXIT HUP INT TERM env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 "$input" > "$base" env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$plan" 2> "$stats" +env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$interp" 2> "$interp_stats" cmp "$base" "$plan" +cmp "$base" "$interp" 
cat "$stats" +cat "$interp_stats" diff --git a/vcf.c b/vcf.c index 59414c459..155daad1a 100644 --- a/vcf.c +++ b/vcf.c @@ -3205,6 +3205,26 @@ typedef struct { vcf_format_op_t ops[MAX_N_FMT]; } vcf_format_general_plan_t; +typedef enum { + VCF_FORMAT_ROW_GT, + VCF_FORMAT_ROW_GT2, + VCF_FORMAT_ROW_INT1, + VCF_FORMAT_ROW_INT2, + VCF_FORMAT_ROW_INT3, + VCF_FORMAT_ROW_INTN, + VCF_FORMAT_ROW_FLOAT1, + VCF_FORMAT_ROW_FLOATN, + VCF_FORMAT_ROW_STR +} vcf_format_row_kind_t; + +typedef struct { + int key; + int width; + int size; + int offset; + vcf_format_row_kind_t kind; +} vcf_format_row_op_t; + #if defined(__GNUC__) #define VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) #else @@ -3680,12 +3700,88 @@ static int vcf_plan_parse_float_vector_dynamic(const char **sp, float *out, int return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible(const char **sp, int32_t *out) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + *out = bcf_int32_missing; + return 0; + } + return vcf_plan_int_value(sp, out); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float *out) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + bcf_float_set_missing(*out); + return 0; + } + return vcf_plan_float_value(sp, out); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible(const char **sp, int32_t *out) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + out[0] = bcf_int32_missing; + out[1] = bcf_int32_vector_end; + return 0; + } + return vcf_plan_parse_int_vector2(sp, out); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible(const char **sp, int32_t *out) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + out[0] = bcf_int32_missing; + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + return 0; + } + return vcf_plan_parse_int_vector3(sp, out); +} + +static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, + bcf1_t *v, int *widths, + 
vcf_format_row_op_t *row_ops) +{ + int j; + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + vcf_format_row_op_t *row = &row_ops[j]; + + row->key = op->key; + row->width = widths[j] > 0 ? widths[j] : 1; + row->offset = 0; + if (op->is_gt) { + row->kind = row->width == 2 && v->n_allele <= 10 ? VCF_FORMAT_ROW_GT2 : VCF_FORMAT_ROW_GT; + row->size = row->width * (int)sizeof(int32_t); + } else if (op->htype == BCF_HT_INT) { + if (row->width == 1) + row->kind = VCF_FORMAT_ROW_INT1; + else if (row->width == 2) + row->kind = VCF_FORMAT_ROW_INT2; + else if (row->width == 3) + row->kind = VCF_FORMAT_ROW_INT3; + else + row->kind = VCF_FORMAT_ROW_INTN; + row->size = row->width * (int)sizeof(int32_t); + } else if (op->htype == BCF_HT_REAL) { + row->kind = row->width == 1 ? VCF_FORMAT_ROW_FLOAT1 : VCF_FORMAT_ROW_FLOATN; + row->size = row->width * (int)sizeof(float); + } else { + row->kind = VCF_FORMAT_ROW_STR; + row->size = row->width; + } + } +} + static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { vcf_format_general_plan_t *plan; kstring_t *mem; - int widths[MAX_N_FMT], sizes[MAX_N_FMT], offsets[MAX_N_FMT]; + int widths[MAX_N_FMT]; + vcf_format_row_op_t row_ops[MAX_N_FMT]; int nsamples, sample, j, vcf44; const char *cur, *end; @@ -3698,28 +3794,21 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, return 0; if (vcf_plan_measure_general(s, h, plan, q, widths) < 0) goto fallback; + vcf_format_general_resolve_ops(plan, v, widths, row_ops); mem = (kstring_t*)&h->mem; mem->l = 0; for (j = 0; j < plan->n_ops; j++) { - int size; - const vcf_format_op_t *op = &plan->ops[j]; + vcf_format_row_op_t *op = &row_ops[j]; - if (widths[j] <= 0) - widths[j] = 1; - if (op->htype == BCF_HT_STR && !op->is_gt) - size = widths[j]; - else - size = widths[j] * (int)sizeof(int32_t); - if (size < 0 || (uint64_t) mem->l + nsamples * (uint64_t) size > INT_MAX) + if (op->size < 0 || 
(uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) return -1; if (align_mem(mem) < 0) return -1; - offsets[j] = mem->l; - sizes[j] = size; - if (ks_resize(mem, mem->l + nsamples * (size_t) size) < 0) + op->offset = mem->l; + if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) return -1; - mem->l += nsamples * (size_t) size; + mem->l += nsamples * (size_t) op->size; } cur = q + 1; @@ -3727,23 +3816,46 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, vcf44 = bcf_get_version(h, NULL) >= VCF44; for (sample = 0; sample < nsamples && cur < end; sample++) { for (j = 0; j < plan->n_ops; j++) { - const vcf_format_op_t *op = &plan->ops[j]; - uint8_t *buf = (uint8_t*)mem->s + offsets[j] + sample * (size_t)sizes[j]; + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset + sample * (size_t)op->size; - if (op->is_gt) { - if (vcf_plan_parse_gt_dynamic(&cur, (int32_t *)buf, widths[j], vcf44) < 0) + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: + if (vcf_plan_gt2(&cur, (int32_t *)buf) < 0) goto fallback; - } else if (op->htype == BCF_HT_STR) { - if (vcf_plan_copy_string(&cur, (char *)buf, widths[j]) < 0) + break; + case VCF_FORMAT_ROW_GT: + if (vcf_plan_parse_gt_dynamic(&cur, (int32_t *)buf, op->width, vcf44) < 0) goto fallback; - } else if (op->htype == BCF_HT_INT) { - if (vcf_plan_parse_int_vector_dynamic(&cur, (int32_t *)buf, widths[j]) < 0) + break; + case VCF_FORMAT_ROW_INT1: + if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)buf) < 0) goto fallback; - } else if (op->htype == BCF_HT_REAL) { - if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, widths[j]) < 0) + break; + case VCF_FORMAT_ROW_INT2: + if (vcf_plan_parse_int_vector2_flexible(&cur, (int32_t *)buf) < 0) goto fallback; - } else { - goto fallback; + break; + case VCF_FORMAT_ROW_INT3: + if (vcf_plan_parse_int_vector3_flexible(&cur, (int32_t *)buf) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_INTN: + if 
(vcf_plan_parse_int_vector_dynamic(&cur, (int32_t *)buf, op->width) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_FLOAT1: + if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_FLOATN: + if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_STR: + if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) + goto fallback; + break; } if (j + 1 < plan->n_ops) { @@ -3765,22 +3877,22 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, v->n_fmt = plan->n_ops; v->n_sample = nsamples; for (j = 0; j < plan->n_ops; j++) { - const vcf_format_op_t *op = &plan->ops[j]; - uint8_t *buf = (uint8_t*)mem->s + offsets[j]; + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset; bcf_enc_int1(&v->indiv, op->key); - if (op->htype == BCF_HT_STR && !op->is_gt) { - if (bcf_enc_size(&v->indiv, widths[j], BCF_BT_CHAR) < 0) + if (op->kind == VCF_FORMAT_ROW_STR) { + if (bcf_enc_size(&v->indiv, op->width, BCF_BT_CHAR) < 0) return -1; - if (kputsn((char *)buf, nsamples * (size_t)widths[j], &v->indiv) < 0) + if (kputsn((char *)buf, nsamples * (size_t)op->width, &v->indiv) < 0) return -1; - } else if (op->htype == BCF_HT_INT || op->is_gt) { - if (bcf_enc_vint(&v->indiv, nsamples * widths[j], (int32_t *)buf, widths[j]) < 0) + } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { + if (bcf_enc_size(&v->indiv, op->width, BCF_BT_FLOAT) < 0) return -1; - } else { - if (bcf_enc_size(&v->indiv, widths[j], BCF_BT_FLOAT) < 0) + if (serialize_float_array(&v->indiv, nsamples * (size_t)op->width, (float *)buf) < 0) return -1; - if (serialize_float_array(&v->indiv, nsamples * (size_t)widths[j], (float *)buf) < 0) + } else { + if (bcf_enc_vint(&v->indiv, nsamples * op->width, (int32_t *)buf, op->width) < 0) return -1; } } From 3d934315edaae830eeb968b10639dfacbb736497 Mon Sep 17 
00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 17:12:09 +0200 Subject: [PATCH 07/38] Harden and specialize FORMAT planning --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 59 ++++ docs/FORMAT_PLAN_SPEC.md | 14 + test/format-plan-edge.vcf | 1 + vcf.c | 449 ++++++++++++++++++++++++++--- 4 files changed, 486 insertions(+), 37 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 56b57ae35..3c2555877 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -301,6 +301,65 @@ uncompressed BCF from 2.61 s to 2.43 s. That is real movement, but the exact kernels remain substantially faster because they also avoid the generic width measurement, per-op buffer indirection, and per-sample opcode switch. +## Follow-Up: Strict Width and Shape Executors + +The next iteration hardened correctness and tested more aggressive FORMAT +planning: + +- planned integer parsing now detects BCF int32 payload overflow and falls back, + avoiding undefined overflow and preserving generic warning/missing behavior; +- exact AD/PL paths validate that the observed max vector width matches the + emitted width, falling back for sparse rows that generic htslib would encode + narrower; +- the general interpreter can skip the observed-width pass for strict + header/allele-count-derived numeric rows; +- common numeric opcode tapes `GT2:INT2:INT1:INT1:INT3` and + `GT2:FLOAT1:INT2:INT1:INT1:INT3` use shape-level executors; +- validated `GT2` rows emit BCF int8 directly instead of calling + `bcf_enc_vint()`. + +`test/format-plan-edge.vcf` now includes an all-missing AD/PL row to verify that +the exact path falls back when its expected vector width would not match generic +observed-width BCF output. 
+ +Correctness checks remained byte-identical: + +```text +./test/test_format_plan.sh +/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF, exact mode +/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF, interpreter mode +``` + +Parse-heavy 10k CCDG reference after these changes: + +| Conversion | Baseline | Exact kernels | Strict/shape interp | +|---|---:|---:|---:| +| VCF -> uncompressed BCF | 2.63 s | 1.61 s | 2.31 s | + +Full 10k CCDG compressed matrix, real seconds: + +| Conversion | Baseline | Exact kernels | Strict/shape interp | +|---|---:|---:|---:| +| VCF.gz -> BCF.gz | 9.26 | 8.22 | 8.94 | +| BCF -> BCF.gz | 7.18 | 7.20 | 7.16 | +| BCF -> VCF.gz | 11.45 | 11.33 | 11.85 | +| VCF.gz -> VCF.gz | 14.37 | 13.55 | 13.52 | +| VCF.gz -> uncompressed BCF | 2.94 | 1.92 | 2.66 | + +On a 3k CCDG subset containing only non-phase FORMAT layouts, the strict/shape +interpreter improved over baseline but still did not approach the exact kernel: + +| Dataset | Baseline | Exact kernels | Strict/shape interp | +|---|---:|---:|---:| +| 3k non-phase VCF -> uncompressed BCF | 0.68 s | 0.34 s | 0.58 s | + +The takeaway is mixed. The hardening is worth keeping, and direct `GT2` +encoding is simple and safe. However, shape-level dispatch alone does not close +the remaining gap. The next high-ROI parser-side experiment should reduce +memory traffic by parsing validated fixed-width fields directly into final BCF +payload buffers, or specialize complete row executors that combine parse, +validation, and encode rather than only replacing the opcode switch. + ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 37a5bfd82..934af71d4 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -71,6 +71,18 @@ row-specific opcode such as `GT2`, `GT`, `INT1`, `INT2`, `INT3`, `INTN`, `FLOAT1`, `FLOATN`, or `STR`. 
This keeps layout coverage flexible while memoizing the common "muscle memory" for repeated shapes. +For rows whose widths can be predicted from the header and allele count, the +interpreter can try a strict path before the observed-width pass. The strict +path validates the observed maximum width while parsing and falls back to the +observed-width interpreter if the row is sparse, malformed, string-bearing, or +otherwise not byte-identical. Common numeric opcode tapes such as +`GT2:INT2:INT1:INT1:INT3` and `GT2:FLOAT1:INT2:INT1:INT1:INT3` have +shape-level executors that avoid the per-op switch. + +Planned integer parsing must be overflow-safe. If a value is outside the BCF +int32 payload range, the planned parser falls back so the generic parser keeps +its warning and missing-value behavior. + ## Edge Fixture `test/format-plan-edge.vcf` is CCDG-shaped but includes records that exercise @@ -103,6 +115,8 @@ which is useful for isolating interpreter performance. inputs. - Add plan- or shape-level executors for dominant opcode sequences so hot rows can also avoid the per-sample opcode switch. +- Explore direct final-buffer output for validated fixed-width fields; this is + likely higher leverage than adding more switch-level shape executors. - Add overflow-compatible numeric parsing or force fallback before committing to the plan on extreme integer/float values. - Integrate the edge fixture into the standard htslib test runner once the diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf index 2b7f05373..aa9b185b9 100644 --- a/test/format-plan-edge.vcf +++ b/test/format-plan-edge.vcf @@ -21,3 +21,4 @@ chr22 10550000 . C T 50 PASS . GT:FT:DP:GQ 0/1:PASS:7:40 0/0:LowQual:5:50 ./.:.: chr22 10560000 . A C,G 50 PASS . GT:GL:DP:GQ 0/1:-0.1,-1.2,-9.9,-2.0,-3.0,-4.0:7:40 1/2:-9.9,-8.8,-7.7,-6.6,-5.5,-4.4:5:50 ./.:.:0:. chr22 10570000 . A T 50 PASS . GT:AD:DP:GQ:PL 0:3,0:3:10:0,10,100 1:0,3:3:20:100,10,0 .:0,0:0:.:. chr22 10580000 . 
A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . GT:AD:DP:GQ:PL 10/10:0,0,0,0,0,0,0,0,0,0,7:7:20:200,190,180,170,160,150,140,130,120,110,100,90,80,70,60,50,40,30,20,10,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460 0/10:3,0,0,0,0,0,0,0,0,0,2:5:30:0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650 ./.:0,0,0,0,0,0,0,0,0,0,0:0:.:. +chr22 10585000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/0:.:3:10:. 0/1:.:5:20:. ./.:.:0:.:. diff --git a/vcf.c b/vcf.c index 155daad1a..b199c7449 100644 --- a/vcf.c +++ b/vcf.c @@ -3193,14 +3193,17 @@ typedef struct { typedef struct { int key; + int number; uint8_t htype; uint8_t is_gt; + uint8_t vl_type; } vcf_format_op_t; typedef struct { char format[256]; const bcf_hdr_t *hdr; int supported; + int strict_supported; int n_ops; vcf_format_op_t ops[MAX_N_FMT]; } vcf_format_general_plan_t; @@ -3217,6 +3220,12 @@ typedef enum { VCF_FORMAT_ROW_STR } vcf_format_row_kind_t; +typedef enum { + VCF_FORMAT_SHAPE_NONE, + VCF_FORMAT_SHAPE_GT2_I2_I1_I1_I3, + VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3 +} vcf_format_shape_kind_t; + typedef struct { int key; int width; @@ -3302,6 +3311,7 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma strcpy(plan->format, format); strcpy(tmp, format); plan->hdr = h; + plan->strict_supported = 1; for (tok = strtok_r(tmp, ":", &saveptr); tok; tok = strtok_r(NULL, ":", &saveptr)) { @@ -3321,8 +3331,16 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma return 0; plan->ops[plan->n_ops].key = key; + plan->ops[plan->n_ops].number = bcf_hdr_id2number(h, BCF_HL_FMT, key); plan->ops[plan->n_ops].htype = htype; plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 
0; + plan->ops[plan->n_ops].vl_type = bcf_hdr_id2length(h, BCF_HL_FMT, key); + if (!plan->ops[plan->n_ops].is_gt) { + int vl = plan->ops[plan->n_ops].vl_type; + if (htype == BCF_HT_STR || + (vl != BCF_VL_FIXED && vl != BCF_VL_A && vl != BCF_VL_R && vl != BCF_VL_G)) + plan->strict_supported = 0; + } plan->n_ops++; } @@ -3378,7 +3396,8 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2(const char **sp, int32_t out[2]) VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) { const char *s = *sp; - int sign = 1, val = 0; + int sign = 1; + uint32_t val = 0, limit, cutoff, cutlim; if (*s == '.') { *out = bcf_int32_missing; @@ -3391,15 +3410,44 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) } if (!(*s >= '0' && *s <= '9')) return -1; + limit = sign < 0 ? (uint32_t)(-(int64_t)BCF_MIN_BT_INT32) : (uint32_t)BCF_MAX_BT_INT32; + cutoff = limit / 10; + cutlim = limit % 10; while (*s >= '0' && *s <= '9') { - val = val * 10 + (*s - '0'); + uint32_t digit = *s - '0'; + if (val > cutoff || (val == cutoff && digit > cutlim)) + return -1; + val = val * 10 + digit; s++; } - *out = sign * val; + if (sign < 0) + *out = -(int32_t)val; + else + *out = (int32_t)val; *sp = s; return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_vector_count(const int32_t *vals, int width) +{ + int i; + + for (i = 0; i < width; i++) + if (vals[i] == bcf_int32_vector_end) + break; + return i; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_vector_count(const float *vals, int width) +{ + int i; + + for (i = 0; i < width; i++) + if (bcf_float_is_vector_end(vals[i])) + break; + return i; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) { const char *s = *sp; @@ -3418,7 +3466,8 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) return 0; } -static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) +static int vcf_plan_parse_int_vector_counted(const char **sp, int32_t *out, + int width, 
int *nread) { const char *s = *sp; int i; @@ -3432,6 +3481,8 @@ static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) } s++; } + if (nread) + *nread = i; for (; i < width; i++) out[i] = bcf_int32_vector_end; if (*s == ',') @@ -3440,7 +3491,12 @@ static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t *out) +static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) +{ + return vcf_plan_parse_int_vector_counted(sp, out, width, NULL); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted(const char **sp, int32_t *out, int *nread) { const char *s = *sp; @@ -3449,6 +3505,8 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t * if (*s != ',') { out[1] = bcf_int32_vector_end; *sp = s; + if (nread) + *nread = 1; return 0; } s++; @@ -3457,10 +3515,17 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t * if (*s == ',') return -1; *sp = s; + if (nread) + *nread = 2; return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t *out) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t *out) +{ + return vcf_plan_parse_int_vector2_counted(sp, out, NULL); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted(const char **sp, int32_t *out, int *nread) { const char *s = *sp; @@ -3470,6 +3535,8 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t * out[1] = bcf_int32_vector_end; out[2] = bcf_int32_vector_end; *sp = s; + if (nread) + *nread = 1; return 0; } s++; @@ -3478,6 +3545,8 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t * if (*s != ',') { out[2] = bcf_int32_vector_end; *sp = s; + if (nread) + *nread = 2; return 0; } s++; @@ -3486,9 +3555,16 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, 
int32_t * if (*s == ',') return -1; *sp = s; + if (nread) + *nread = 3; return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t *out) +{ + return vcf_plan_parse_int_vector3_counted(sp, out, NULL); +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_expect_sep(const char **sp, int sep) { if (**sp != sep) @@ -3718,25 +3794,39 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float return vcf_plan_float_value(sp, out); } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible(const char **sp, int32_t *out) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted(const char **sp, int32_t *out, int *nread) { if (**sp == ':' || **sp == '\t' || **sp == '\0') { out[0] = bcf_int32_missing; out[1] = bcf_int32_vector_end; + if (nread) + *nread = 1; return 0; } - return vcf_plan_parse_int_vector2(sp, out); + return vcf_plan_parse_int_vector2_counted(sp, out, nread); } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible(const char **sp, int32_t *out) +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible(const char **sp, int32_t *out) +{ + return vcf_plan_parse_int_vector2_flexible_counted(sp, out, NULL); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted(const char **sp, int32_t *out, int *nread) { if (**sp == ':' || **sp == '\t' || **sp == '\0') { out[0] = bcf_int32_missing; out[1] = bcf_int32_vector_end; out[2] = bcf_int32_vector_end; + if (nread) + *nread = 1; return 0; } - return vcf_plan_parse_int_vector3(sp, out); + return vcf_plan_parse_int_vector3_counted(sp, out, nread); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible(const char **sp, int32_t *out) +{ + return vcf_plan_parse_int_vector3_flexible_counted(sp, out, NULL); } static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, @@ -3775,6 +3865,296 @@ static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan } } +static int 
vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t *v) +{ + if (op->is_gt) + return 2; + if (op->htype == BCF_HT_STR) + return 0; + + switch (op->vl_type) { + case BCF_VL_FIXED: + return op->number > 0 ? op->number : 0; + case BCF_VL_A: + return v->n_allele > 1 ? v->n_allele - 1 : 0; + case BCF_VL_R: + return v->n_allele; + case BCF_VL_G: + return v->n_allele * (v->n_allele + 1) / 2; + default: + return 0; + } +} + +static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt); + +static int vcf_format_general_encode_row_ops(kstring_t *dst, kstring_t *mem, + int nsamples, int n_ops, + const vcf_format_row_op_t *row_ops) +{ + int j; + + for (j = 0; j < n_ops; j++) { + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset; + + bcf_enc_int1(dst, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (vcf_enc_gt2_int8(dst, nsamples, (int32_t *)buf) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_STR) { + if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) + return -1; + if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { + if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) + return -1; + if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) + return -1; + } else { + if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) + return -1; + } + } + return 0; +} + +static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt) +{ + int i, n = nsamples * 2; + uint8_t *p; + + if (bcf_enc_size(dst, 2, BCF_BT_INT8) < 0 || + ks_resize(dst, dst->l + n) < 0) + return -1; + p = (uint8_t *)dst->s + dst->l; + for (i = 0; i < n; i++) + p[i] = (uint8_t)gt[i]; + dst->l += n; + return 0; +} + +static vcf_format_shape_kind_t vcf_format_general_shape_kind(const vcf_format_row_op_t *ops, + int n_ops) +{ + if (n_ops == 5 && + ops[0].kind == 
VCF_FORMAT_ROW_GT2 && + ops[1].kind == VCF_FORMAT_ROW_INT2 && + ops[2].kind == VCF_FORMAT_ROW_INT1 && + ops[3].kind == VCF_FORMAT_ROW_INT1 && + ops[4].kind == VCF_FORMAT_ROW_INT3) + return VCF_FORMAT_SHAPE_GT2_I2_I1_I1_I3; + if (n_ops == 6 && + ops[0].kind == VCF_FORMAT_ROW_GT2 && + ops[1].kind == VCF_FORMAT_ROW_FLOAT1 && + ops[2].kind == VCF_FORMAT_ROW_INT2 && + ops[3].kind == VCF_FORMAT_ROW_INT1 && + ops[4].kind == VCF_FORMAT_ROW_INT1 && + ops[5].kind == VCF_FORMAT_ROW_INT3) + return VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3; + return VCF_FORMAT_SHAPE_NONE; +} + +static int vcf_parse_format_general_shape(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q, + vcf_format_shape_kind_t shape, + vcf_format_row_op_t *row_ops) +{ + kstring_t *mem = (kstring_t*)&h->mem; + int nsamples = bcf_hdr_nsamples(h), sample, j; + int max_i2 = 0, max_i3 = 0; + const char *cur = q + 1, *end = s->s + s->l; + + mem->l = 0; + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + + if ((uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) + return -1; + if (align_mem(mem) < 0) + return -1; + op->offset = mem->l; + if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) + return -1; + mem->l += nsamples * (size_t) op->size; + } + + for (sample = 0; sample < nsamples && cur < end; sample++) { + int nread; + int32_t *gt = (int32_t *)(mem->s + row_ops[0].offset + sample * (size_t)row_ops[0].size); + int op = 1; + + if (vcf_plan_gt2(&cur, gt) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) + return -4; + + if (shape == VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3) { + float *f1 = (float *)(mem->s + row_ops[1].offset + sample * (size_t)row_ops[1].size); + if (vcf_plan_float_scalar_flexible(&cur, f1) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) + return -4; + op++; + } + + if (vcf_plan_parse_int_vector2_flexible_counted(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size), &nread) < 0 || + 
vcf_plan_expect_sep(&cur, ':') < 0) + return -4; + if (max_i2 < nread) + max_i2 = nread; + op++; + + if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size)) < 0 || + vcf_plan_expect_sep(&cur, ':') < 0) + return -4; + op++; + + if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size)) < 0 || + vcf_plan_expect_sep(&cur, ':') < 0) + return -4; + op++; + + if (vcf_plan_parse_int_vector3_flexible_counted(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size), &nread) < 0) + return -4; + if (max_i3 < nread) + max_i3 = nread; + + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + return -4; + } + if (sample != nsamples || max_i2 != 2 || max_i3 != 3) + return -4; + + v->n_fmt = plan->n_ops; + v->n_sample = nsamples; + if (vcf_format_general_encode_row_ops(&v->indiv, mem, nsamples, plan->n_ops, row_ops) < 0) + return -1; + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += nsamples; + return 0; +} + +static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q) +{ + kstring_t *mem; + int widths[MAX_N_FMT], max_counts[MAX_N_FMT]; + vcf_format_row_op_t row_ops[MAX_N_FMT]; + int nsamples = bcf_hdr_nsamples(h), sample, j, vcf44; + const char *cur, *end; + + for (j = 0; j < plan->n_ops; j++) { + widths[j] = vcf_format_general_expected_width(&plan->ops[j], v); + if (widths[j] <= 0 || widths[j] > 64) + return -4; + max_counts[j] = 0; + } + vcf_format_general_resolve_ops(plan, v, widths, row_ops); + { + vcf_format_shape_kind_t shape = vcf_format_general_shape_kind(row_ops, plan->n_ops); + if (shape != VCF_FORMAT_SHAPE_NONE) + return vcf_parse_format_general_shape(s, h, v, plan, q, shape, row_ops); + } + + mem = (kstring_t*)&h->mem; + mem->l = 0; + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = 
&row_ops[j]; + + if (op->size < 0 || (uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) + return -1; + if (align_mem(mem) < 0) + return -1; + op->offset = mem->l; + if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) + return -1; + mem->l += nsamples * (size_t) op->size; + } + + cur = q + 1; + end = s->s + s->l; + vcf44 = bcf_get_version(h, NULL) >= VCF44; + for (sample = 0; sample < nsamples && cur < end; sample++) { + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset + sample * (size_t)op->size; + int n = op->width; + + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: + if (vcf_plan_gt2(&cur, (int32_t *)buf) < 0) + return -4; + break; + case VCF_FORMAT_ROW_GT: + if (vcf_plan_parse_gt_dynamic(&cur, (int32_t *)buf, op->width, vcf44) < 0) + return -4; + n = vcf_plan_int_vector_count((int32_t *)buf, op->width); + break; + case VCF_FORMAT_ROW_INT1: + if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)buf) < 0) + return -4; + break; + case VCF_FORMAT_ROW_INT2: + if (vcf_plan_parse_int_vector2_flexible_counted(&cur, (int32_t *)buf, &n) < 0) + return -4; + break; + case VCF_FORMAT_ROW_INT3: + if (vcf_plan_parse_int_vector3_flexible_counted(&cur, (int32_t *)buf, &n) < 0) + return -4; + break; + case VCF_FORMAT_ROW_INTN: + if (vcf_plan_parse_int_vector_counted(&cur, (int32_t *)buf, op->width, &n) < 0) + return -4; + break; + case VCF_FORMAT_ROW_FLOAT1: + if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) + return -4; + break; + case VCF_FORMAT_ROW_FLOATN: + if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) + return -4; + n = vcf_plan_float_vector_count((float *)buf, op->width); + break; + case VCF_FORMAT_ROW_STR: + return -4; + } + if (max_counts[j] < n) + max_counts[j] = n; + + if (j + 1 < plan->n_ops) { + if (vcf_plan_expect_sep(&cur, ':') < 0) + return -4; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; 
+ else + return -4; + } + } + } + if (sample != nsamples) + return -4; + for (j = 0; j < plan->n_ops; j++) + if (max_counts[j] != row_ops[j].width) + return -4; + + v->n_fmt = plan->n_ops; + v->n_sample = nsamples; + if (vcf_format_general_encode_row_ops(&v->indiv, mem, nsamples, plan->n_ops, row_ops) < 0) + return -1; + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += nsamples; + return 0; +} + static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { @@ -3782,7 +4162,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, kstring_t *mem; int widths[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; - int nsamples, sample, j, vcf44; + int nsamples, sample, j, vcf44, ret; const char *cur, *end; plan = vcf_format_general_plan_get(h, p); @@ -3792,6 +4172,11 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, nsamples = bcf_hdr_nsamples(h); if (!nsamples) return 0; + if (plan->strict_supported) { + ret = vcf_parse_format_general_strict(s, h, v, plan, q); + if (ret != -4) + return ret; + } if (vcf_plan_measure_general(s, h, plan, q, widths) < 0) goto fallback; vcf_format_general_resolve_ops(plan, v, widths, row_ops); @@ -3876,26 +4261,8 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, v->n_fmt = plan->n_ops; v->n_sample = nsamples; - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = (uint8_t*)mem->s + op->offset; - - bcf_enc_int1(&v->indiv, op->key); - if (op->kind == VCF_FORMAT_ROW_STR) { - if (bcf_enc_size(&v->indiv, op->width, BCF_BT_CHAR) < 0) - return -1; - if (kputsn((char *)buf, nsamples * (size_t)op->width, &v->indiv) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { - if (bcf_enc_size(&v->indiv, op->width, BCF_BT_FLOAT) < 0) - return -1; - if (serialize_float_array(&v->indiv, nsamples * 
(size_t)op->width, (float *)buf) < 0) - return -1; - } else { - if (bcf_enc_vint(&v->indiv, nsamples * op->width, (int32_t *)buf, op->width) < 0) - return -1; - } - } + if (vcf_format_general_encode_row_ops(&v->indiv, mem, nsamples, plan->n_ops, row_ops) < 0) + return -1; vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; @@ -3949,6 +4316,7 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, vcf_format_plan_t *plan; kstring_t *mem; int nsamples, ad_w, pl_w, sample, nwords, pgt_w = 0, pid_w = 0; + int max_ad_count = 0, max_pl_count = 0; size_t gt_off, ab_off = 0, ad_off, dp_off, gq_off, pgt_off = 0, pid_off = 0, pl_off, total_bytes; int32_t *gt, *ad, *dp, *gq, *pl; float *ab = NULL; @@ -4021,6 +4389,7 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, cur = q + 1; end = s->s + s->l; for (sample = 0; sample < nsamples && cur < end; sample++) { + int nread; if (vcf_plan_gt2(&cur, >[sample * 2]) < 0) goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) @@ -4030,13 +4399,15 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; - } + } if (ad_w == 2) { - if (vcf_plan_parse_int_vector2(&cur, &ad[sample * 2]) < 0) + if (vcf_plan_parse_int_vector2_counted(&cur, &ad[sample * 2], &nread) < 0) goto fallback; - } else if (vcf_plan_parse_int_vector(&cur, &ad[sample * ad_w], ad_w) < 0) { + } else if (vcf_plan_parse_int_vector_counted(&cur, &ad[sample * ad_w], ad_w, &nread) < 0) { goto fallback; } + if (max_ad_count < nread) + max_ad_count = nread; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; if (vcf_plan_int_value(&cur, &dp[sample]) < 0) @@ -4058,11 +4429,13 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, goto fallback; } if (pl_w == 3) { - if (vcf_plan_parse_int_vector3(&cur, &pl[sample * 3]) < 0) + if 
(vcf_plan_parse_int_vector3_counted(&cur, &pl[sample * 3], &nread) < 0) goto fallback; - } else if (vcf_plan_parse_int_vector(&cur, &pl[sample * pl_w], pl_w) < 0) { + } else if (vcf_plan_parse_int_vector_counted(&cur, &pl[sample * pl_w], pl_w, &nread) < 0) { goto fallback; } + if (max_pl_count < nread) + max_pl_count = nread; if (*cur == '\t') cur++; else if (*cur == '\0' || cur >= end) @@ -4072,11 +4445,13 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } if (sample != nsamples) goto fallback; + if (max_ad_count != ad_w || max_pl_count != pl_w) + goto fallback; v->n_fmt = plan->has_phase ? (plan->has_ab ? 8 : 7) : (plan->has_ab ? 6 : 5); v->n_sample = nsamples; bcf_enc_int1(&v->indiv, plan->key_gt); - if (bcf_enc_vint(&v->indiv, nsamples * 2, gt, 2) < 0) + if (vcf_enc_gt2_int8(&v->indiv, nsamples, gt) < 0) return -1; if (plan->has_ab) { bcf_enc_int1(&v->indiv, plan->key_ab); From bad7312d8a7eba4d00fef5f272fa717419c7a7a7 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 17:24:22 +0200 Subject: [PATCH 08/38] Direct-write planned FORMAT payloads --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 39 +++ docs/FORMAT_PLAN_SPEC.md | 8 +- vcf.c | 366 +++++++++++++++++++++++++---- 3 files changed, 363 insertions(+), 50 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 3c2555877..58c068d7c 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -360,6 +360,45 @@ memory traffic by parsing validated fixed-width fields directly into final BCF payload buffers, or specialize complete row executors that combine parse, validation, and encode rather than only replacing the opcode switch. 
+## Follow-Up: Direct Payload Sinks + +The next pass tested direct final-buffer output for fields whose BCF +representation is known before parsing: + +- exact `GT2` writes directly to a final `INT8` payload instead of scratch + `int32_t` values plus `bcf_enc_vint()`; +- exact `AB` writes directly to a final float payload; +- strict shape executors direct-write `GT2` and optional leading `FLOAT1` + payloads, with rollback on fallback; +- exact AD/DP/GQ/PL also carry integer range metadata into a known-range encoder + to avoid the range pass in `bcf_enc_vint()`. + +Correctness remained byte-identical for the edge fixture and 10k CCDG exact and +interpreter modes. + +Parse-heavy 10k CCDG reference: + +| Conversion | Baseline | Exact kernels | Direct-sink interp | +|---|---:|---:|---:| +| VCF -> uncompressed BCF | 2.51-2.68 s | 1.57-1.58 s | 2.29-2.39 s | + +Full 10k CCDG compressed matrix, real seconds: + +| Conversion | Baseline | Exact kernels | Direct-sink interp | +|---|---:|---:|---:| +| VCF.gz -> BCF.gz | 9.51 | 8.53 | 9.28 | +| BCF -> BCF.gz | 7.46 | 7.46 | 7.46 | +| BCF -> VCF.gz | 11.95 | 12.00 | 12.02 | +| VCF.gz -> VCF.gz | 14.16 | 12.95 | 13.62 | +| VCF.gz -> uncompressed BCF | 2.95 | 1.92 | 2.67 | + +The direct sinks are safe, but their gain is small on this dataset. The known-range encoder +was also byte-identical but did not produce a clear timing win, suggesting that +range tracking during parse still mostly trades one cost for another. Broader +direct integer output likely needs either a cheap type-prediction/rollback +strategy or complete fused row executors that avoid both scratch traffic and +post-parse encoding for multiple fields at once. + ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 934af71d4..1ee33b6fb 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -83,6 +83,10 @@ Planned integer parsing must be overflow-safe.
If a value is outside the BCF int32 payload range, the planned parser falls back so the generic parser keeps its warning and missing-value behavior. +Validated `GT2` payloads and exact-layout `AB` float payloads can be written +directly into `v->indiv` instead of going through scratch arrays. Any direct +writer must save the entry length and roll back before returning fallback. + ## Edge Fixture `test/format-plan-edge.vcf` is CCDG-shaped but includes records that exercise @@ -115,8 +119,8 @@ which is useful for isolating interpreter performance. inputs. - Add plan- or shape-level executors for dominant opcode sequences so hot rows can also avoid the per-sample opcode switch. -- Explore direct final-buffer output for validated fixed-width fields; this is - likely higher leverage than adding more switch-level shape executors. +- Extend direct final-buffer output only where BCF type selection is + byte-identical, or where the direct writer can cheaply roll back. - Add overflow-compatible numeric parsing or force fallback before committing to the plan on extreme integer/float values. 
- Integrate the edge fixture into the standard htslib test runner once the diff --git a/vcf.c b/vcf.c index b199c7449..f3da83736 100644 --- a/vcf.c +++ b/vcf.c @@ -2930,6 +2930,61 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; } +static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, + int32_t min, int32_t max) +{ + int i; + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_int1(s, a[0]); + } else { + if (wsize <= 0) wsize = n; + + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + s->l += n * sizeof(int16_t); + } else { + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + i32_to_le(a[i], p); + p += sizeof(int32_t); + } + s->l += n * sizeof(int32_t); + } + } + + return 0; +} + #ifdef VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; @@ -3234,6 +3289,11 @@ typedef struct { vcf_format_row_kind_t kind; } vcf_format_row_op_t; +typedef struct { + int32_t min; + int32_t max; +} vcf_plan_int_range_t; + #if defined(__GNUC__) #define 
VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) #else @@ -3393,6 +3453,30 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2(const char **sp, int32_t out[2]) return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2_u8(const char **sp, uint8_t out[2]) +{ + const char *s = *sp; + int a0, a1, phased; + + if (s[0] == '.' && (s[1] == '/' || s[1] == '|') && s[2] == '.') { + out[0] = 0; + out[1] = 0; + *sp = s + 3; + return 0; + } + if (!(s[0] >= '0' && s[0] <= '9') || (s[1] != '/' && s[1] != '|') || + !(s[2] >= '0' && s[2] <= '9')) + return -1; + + a0 = s[0] - '0'; + a1 = s[2] - '0'; + phased = s[1] == '|'; + out[0] = (uint8_t)(((a0 + 1) << 1) | phased); + out[1] = (uint8_t)(((a1 + 1) << 1) | phased); + *sp = s + 3; + return 0; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) { const char *s = *sp; @@ -3438,6 +3522,20 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_vector_count(const int32_t *vals, int wi return i; } +VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_init(vcf_plan_int_range_t *range) +{ + range->min = INT32_MAX; + range->max = INT32_MIN; +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add(vcf_plan_int_range_t *range, int32_t val) +{ + if (range->max < val) + range->max = val; + if (range->min > val && val > INT32_MIN + 1) + range->min = val; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_vector_count(const float *vals, int width) { int i; @@ -3466,6 +3564,15 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value_range(const char **sp, int32_t *out, + vcf_plan_int_range_t *range) +{ + if (vcf_plan_int_value(sp, out) < 0) + return -1; + vcf_plan_int_range_add(range, *out); + return 0; +} + static int vcf_plan_parse_int_vector_counted(const char **sp, int32_t *out, int width, int *nread) { @@ -3491,6 +3598,35 @@ static int vcf_plan_parse_int_vector_counted(const char **sp, int32_t *out, return 0; } +static int 
vcf_plan_parse_int_vector_counted_range(const char **sp, int32_t *out, + int width, int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + int i, nvals; + + for (i = 0; i < width; i++) { + if (vcf_plan_int_value_range(&s, &out[i], range) < 0) + return -1; + if (*s != ',') { + i++; + break; + } + s++; + } + nvals = i; + if (nread) + *nread = nvals; + for (; i < width; i++) { + out[i] = bcf_int32_vector_end; + vcf_plan_int_range_add(range, out[i]); + } + if (*s == ',') + return -1; + *sp = s; + return 0; +} + static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) { return vcf_plan_parse_int_vector_counted(sp, out, width, NULL); @@ -3520,6 +3656,32 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted(const char **sp, i return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted_range(const char **sp, int32_t *out, int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + vcf_plan_int_range_add(range, out[1]); + *sp = s; + if (nread) + *nread = 1; + return 0; + } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s == ',') + return -1; + *sp = s; + if (nread) + *nread = 2; + return 0; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t *out) { return vcf_plan_parse_int_vector2_counted(sp, out, NULL); @@ -3560,6 +3722,45 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted(const char **sp, i return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char **sp, int32_t *out, int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + vcf_plan_int_range_add(range, out[1]); + vcf_plan_int_range_add(range, 
out[2]); + *sp = s; + if (nread) + *nread = 1; + return 0; + } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { + out[2] = bcf_int32_vector_end; + vcf_plan_int_range_add(range, out[2]); + *sp = s; + if (nread) + *nread = 2; + return 0; + } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s == ',') + return -1; + *sp = s; + if (nread) + *nread = 3; + return 0; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t *out) { return vcf_plan_parse_int_vector3_counted(sp, out, NULL); @@ -3920,6 +4121,39 @@ static int vcf_format_general_encode_row_ops(kstring_t *dst, kstring_t *mem, return 0; } +static int vcf_format_general_encode_row_ops_from(kstring_t *dst, kstring_t *mem, + int nsamples, int n_ops, + const vcf_format_row_op_t *row_ops, + int first_op) +{ + int j; + + for (j = first_op; j < n_ops; j++) { + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset; + + bcf_enc_int1(dst, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (vcf_enc_gt2_int8(dst, nsamples, (int32_t *)buf) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_STR) { + if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) + return -1; + if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { + if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) + return -1; + if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) + return -1; + } else { + if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) + return -1; + } + } + return 0; +} + static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt) { int i, n = nsamples * 2; @@ -3966,10 +4200,31 @@ static int vcf_parse_format_general_shape(kstring_t *s, const bcf_hdr_t *h, kstring_t *mem = (kstring_t*)&h->mem; int nsamples = bcf_hdr_nsamples(h), 
sample, j; int max_i2 = 0, max_i3 = 0; + int direct_ops = shape == VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3 ? 2 : 1; + size_t indiv_l0 = v->indiv.l, gt8_off, f1_off = 0; + uint8_t *gt8, *f1_le = NULL; const char *cur = q + 1, *end = s->s + s->l; + bcf_enc_int1(&v->indiv, row_ops[0].key); + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) + return -1; + gt8_off = v->indiv.l; + v->indiv.l += (size_t)nsamples * 2; + if (direct_ops == 2) { + bcf_enc_int1(&v->indiv, row_ops[1].key); + if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) + return -1; + f1_off = v->indiv.l; + v->indiv.l += (size_t)nsamples * sizeof(float); + } + gt8 = (uint8_t *)v->indiv.s + gt8_off; + if (direct_ops == 2) + f1_le = (uint8_t *)v->indiv.s + f1_off; + mem->l = 0; - for (j = 0; j < plan->n_ops; j++) { + for (j = direct_ops; j < plan->n_ops; j++) { vcf_format_row_op_t *op = &row_ops[j]; if ((uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) @@ -3984,38 +4239,37 @@ static int vcf_parse_format_general_shape(kstring_t *s, const bcf_hdr_t *h, for (sample = 0; sample < nsamples && cur < end; sample++) { int nread; - int32_t *gt = (int32_t *)(mem->s + row_ops[0].offset + sample * (size_t)row_ops[0].size); - int op = 1; + int op = direct_ops; - if (vcf_plan_gt2(&cur, gt) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) - return -4; + if (vcf_plan_gt2_u8(&cur, >8[sample * 2]) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; if (shape == VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3) { - float *f1 = (float *)(mem->s + row_ops[1].offset + sample * (size_t)row_ops[1].size); - if (vcf_plan_float_scalar_flexible(&cur, f1) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) - return -4; - op++; + float f1_val; + if (vcf_plan_float_scalar_flexible(&cur, &f1_val) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + float_to_le(f1_val, f1_le + (size_t)sample * 
sizeof(float)); } if (vcf_plan_parse_int_vector2_flexible_counted(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size), &nread) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) - return -4; + goto fallback; if (max_i2 < nread) max_i2 = nread; op++; if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size)) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) - return -4; + goto fallback; op++; if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size)) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) - return -4; + goto fallback; op++; if (vcf_plan_parse_int_vector3_flexible_counted(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size), &nread) < 0) - return -4; + goto fallback; if (max_i3 < nread) max_i3 = nread; @@ -4024,18 +4278,22 @@ static int vcf_parse_format_general_shape(kstring_t *s, const bcf_hdr_t *h, else if (*cur == '\0' || cur >= end) ; else - return -4; + goto fallback; } if (sample != nsamples || max_i2 != 2 || max_i3 != 3) - return -4; + goto fallback; v->n_fmt = plan->n_ops; v->n_sample = nsamples; - if (vcf_format_general_encode_row_ops(&v->indiv, mem, nsamples, plan->n_ops, row_ops) < 0) + if (vcf_format_general_encode_row_ops_from(&v->indiv, mem, nsamples, plan->n_ops, row_ops, direct_ops) < 0) return -1; vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; return 0; + +fallback: + v->indiv.l = indiv_l0; + return -4; } static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, @@ -4317,13 +4575,16 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, kstring_t *mem; int nsamples, ad_w, pl_w, sample, nwords, pgt_w = 0, pid_w = 0; int max_ad_count = 0, max_pl_count = 0; - size_t gt_off, ab_off = 0, ad_off, dp_off, gq_off, pgt_off = 0, pid_off = 0, pl_off, total_bytes; - int32_t *gt, *ad, *dp, *gq, *pl; - float *ab = NULL; + 
vcf_plan_int_range_t ad_range, dp_range, gq_range, pl_range; + size_t ad_off, dp_off, gq_off, pgt_off = 0, pid_off = 0, pl_off, total_bytes; + size_t indiv_l0, gt8_off, ab_le_off = 0; + uint8_t *gt8, *ab_le = NULL; + int32_t *ad, *dp, *gq, *pl; char *pgt = NULL, *pid = NULL; const char *cur, *end; int plan_mode; + indiv_l0 = v->indiv.l; plan_mode = vcf_format_plan_mode(); if (!plan_mode) return -3; @@ -4348,23 +4609,41 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, goto fallback; if (plan->has_phase && vcf_plan_phase_widths(h, plan, s, q, &pgt_w, &pid_w) < 0) goto fallback; + vcf_plan_int_range_init(&ad_range); + vcf_plan_int_range_init(&dp_range); + vcf_plan_int_range_init(&gq_range); + vcf_plan_int_range_init(&pl_range); mem = (kstring_t*)&h->mem; mem->l = 0; if (align_mem(mem) < 0) return -1; - total_bytes = (size_t) nsamples * (2 + ad_w + 1 + 1 + pl_w + plan->has_ab) * sizeof(int32_t); + bcf_enc_int1(&v->indiv, plan->key_gt); + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) + return -1; + gt8_off = v->indiv.l; + v->indiv.l += (size_t)nsamples * 2; + if (plan->has_ab) { + bcf_enc_int1(&v->indiv, plan->key_ab); + if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) + return -1; + ab_le_off = v->indiv.l; + v->indiv.l += (size_t)nsamples * sizeof(float); + } + gt8 = (uint8_t *)v->indiv.s + gt8_off; + if (plan->has_ab) + ab_le = (uint8_t *)v->indiv.s + ab_le_off; + + total_bytes = (size_t) nsamples * (ad_w + 1 + 1 + pl_w) * sizeof(int32_t); total_bytes += (size_t) nsamples * (pgt_w + pid_w); if (total_bytes > INT_MAX) return -1; if (ks_resize(mem, mem->l + total_bytes) < 0) return -1; - gt_off = mem->l; mem->l += (size_t) nsamples * 2 * sizeof(int32_t); - if (plan->has_ab) { - ab_off = mem->l; mem->l += (size_t) nsamples * sizeof(float); - } ad_off = mem->l; mem->l += (size_t) nsamples * 
ad_w * sizeof(int32_t); dp_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); gq_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); @@ -4374,9 +4653,6 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } pl_off = mem->l; mem->l += (size_t) nsamples * pl_w * sizeof(int32_t); - gt = (int32_t *) (mem->s + gt_off); - if (plan->has_ab) - ab = (float *) (mem->s + ab_off); ad = (int32_t *) (mem->s + ad_off); dp = (int32_t *) (mem->s + dp_off); gq = (int32_t *) (mem->s + gq_off); @@ -4390,31 +4666,33 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, end = s->s + s->l; for (sample = 0; sample < nsamples && cur < end; sample++) { int nread; - if (vcf_plan_gt2(&cur, >[sample * 2]) < 0) + if (vcf_plan_gt2_u8(&cur, >8[sample * 2]) < 0) goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; if (plan->has_ab) { - if (vcf_plan_float_value(&cur, &ab[sample]) < 0) + float ab_val; + if (vcf_plan_float_value(&cur, &ab_val) < 0) goto fallback; + float_to_le(ab_val, ab_le + (size_t)sample * sizeof(float)); if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; } if (ad_w == 2) { - if (vcf_plan_parse_int_vector2_counted(&cur, &ad[sample * 2], &nread) < 0) + if (vcf_plan_parse_int_vector2_counted_range(&cur, &ad[sample * 2], &nread, &ad_range) < 0) goto fallback; - } else if (vcf_plan_parse_int_vector_counted(&cur, &ad[sample * ad_w], ad_w, &nread) < 0) { + } else if (vcf_plan_parse_int_vector_counted_range(&cur, &ad[sample * ad_w], ad_w, &nread, &ad_range) < 0) { goto fallback; } if (max_ad_count < nread) max_ad_count = nread; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; - if (vcf_plan_int_value(&cur, &dp[sample]) < 0) + if (vcf_plan_int_value_range(&cur, &dp[sample], &dp_range) < 0) goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; - if (vcf_plan_int_value(&cur, &gq[sample]) < 0) + if (vcf_plan_int_value_range(&cur, &gq[sample], &gq_range) < 0) goto 
fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; @@ -4429,9 +4707,9 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, goto fallback; } if (pl_w == 3) { - if (vcf_plan_parse_int_vector3_counted(&cur, &pl[sample * 3], &nread) < 0) + if (vcf_plan_parse_int_vector3_counted_range(&cur, &pl[sample * 3], &nread, &pl_range) < 0) goto fallback; - } else if (vcf_plan_parse_int_vector_counted(&cur, &pl[sample * pl_w], pl_w, &nread) < 0) { + } else if (vcf_plan_parse_int_vector_counted_range(&cur, &pl[sample * pl_w], pl_w, &nread, &pl_range) < 0) { goto fallback; } if (max_pl_count < nread) @@ -4450,24 +4728,15 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, v->n_fmt = plan->has_phase ? (plan->has_ab ? 8 : 7) : (plan->has_ab ? 6 : 5); v->n_sample = nsamples; - bcf_enc_int1(&v->indiv, plan->key_gt); - if (vcf_enc_gt2_int8(&v->indiv, nsamples, gt) < 0) - return -1; - if (plan->has_ab) { - bcf_enc_int1(&v->indiv, plan->key_ab); - bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT); - if (serialize_float_array(&v->indiv, nsamples, ab) < 0) - return -1; - } bcf_enc_int1(&v->indiv, plan->key_ad); nwords = nsamples * ad_w; - if (bcf_enc_vint(&v->indiv, nwords, ad, ad_w) < 0) + if (bcf_enc_vint_known_range(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max) < 0) return -1; bcf_enc_int1(&v->indiv, plan->key_dp); - if (bcf_enc_vint(&v->indiv, nsamples, dp, 1) < 0) + if (bcf_enc_vint_known_range(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max) < 0) return -1; bcf_enc_int1(&v->indiv, plan->key_gq); - if (bcf_enc_vint(&v->indiv, nsamples, gq, 1) < 0) + if (bcf_enc_vint_known_range(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max) < 0) return -1; if (plan->has_phase) { bcf_enc_int1(&v->indiv, plan->key_pgt); @@ -4483,7 +4752,7 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } bcf_enc_int1(&v->indiv, plan->key_pl); nwords = nsamples * pl_w; - if 
(bcf_enc_vint(&v->indiv, nwords, pl, pl_w) < 0) + if (bcf_enc_vint_known_range(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max) < 0) return -1; vcf_format_plan_stats.hits++; @@ -4491,6 +4760,7 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return 0; fallback: + v->indiv.l = indiv_l0; vcf_format_plan_stats.fallback++; return -3; } From dfef785af81c6b03265be2f37aa03056070447e0 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 18:30:39 +0200 Subject: [PATCH 09/38] Guard planned FORMAT fast paths --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 32 +++++++++++ docs/FORMAT_PLAN_SPEC.md | 24 ++++++++ vcf.c | 88 +++++++++++++++++++++++++++++- 3 files changed, 142 insertions(+), 2 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 58c068d7c..554ad6312 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -399,6 +399,38 @@ direct integer output likely needs either a cheap type-prediction/rollback strategy or complete fused row executors that avoid both scratch traffic and post-parse encoding for multiple fields at once. +## Follow-Up: Optimistic Guards + +The fast paths now have a small circuit breaker in the cached plan state. This +is tuned for the practical expectation that files are piecewise fixed-format, +with occasional weird rows rather than uniformly weird records. + +The fast parser still validates as it parses and immediately rolls back on any +mismatch. The new guard only decides whether to keep trying that fast parser on +later records: + +- a success resets the consecutive-miss streak; +- isolated weird rows fall back once and do not disable the fast path; +- eight consecutive misses pause the fast path; +- after 128 attempts, more than 10% fallbacks also pauses it; +- paused paths cool down for 256 skipped records, then re-probe so later + fixed-format regions can recover. 
+ +The clean CCDG path is unchanged: on the 10k subset, exact mode still reports +`10000 hits / 0 fallbacks`. The edge fixture remains byte-identical and keeps +the expected mixed behavior: + +```text +./test/test_format_plan.sh +vcf-format-plan attempts=10 hits=7 fallback=3 parsed_samples=21 +vcf-format-plan attempts=10 hits=10 fallback=0 parsed_samples=30 +``` + +The full compressed matrix was not re-recorded for this guard-only change +because the machine was under unrelated CPU load during the interrupted run. +The parse-heavy 10k BCF outputs were re-compared byte-for-byte for exact and +interpreter modes. + ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 1ee33b6fb..20a270030 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -87,6 +87,30 @@ Validated `GT2` payloads and exact-layout `AB` float payloads can be written directly into `v->indiv` instead of going through scratch arrays. Any direct writer must save the entry length and roll back before returning fallback. +## Guard Policy + +Planned FORMAT parsing is optimistic and self-validating: parsing each field is +also the shape check. When a guard fails, the parser rolls back any direct +`v->indiv` writes and falls back to the more general parser. + +Each cached exact/general FORMAT plan keeps small runtime guard state: + +- attempts, hits, fallbacks, +- consecutive miss streak, +- a disabled flag with a cooldown counter. + +The guard is tuned for piecewise fixed-format VCFs with infrequent weird rows. +An isolated fallback does not disable the fast path; the next success resets the +miss streak. A fast path is paused only after eight consecutive misses, or +after at least 128 attempts with more than 10% fallbacks. Paused plans are not +blacklisted forever: after 256 skipped records, the plan probes the fast path +again so later fixed-format regions can recover the optimized path.
+ +For exact CCDG kernels, a paused exact guard routes the row to the compiled +general planner. For general plans, a paused strict guard skips directly to the +measured-width general planner, and a paused general guard returns to legacy +htslib parsing. + ## Edge Fixture `test/format-plan-edge.vcf` is CCDG-shaped but includes records that exercise diff --git a/vcf.c b/vcf.c index f3da83736..77d69205f 100644 --- a/vcf.c +++ b/vcf.c @@ -3230,6 +3230,69 @@ static int vcf_format_plan_mode(void) return mode; } +typedef struct { + uint32_t attempts; + uint32_t hits; + uint32_t fallbacks; + uint16_t miss_streak; + uint16_t cooldown; + uint8_t disabled; +} vcf_format_fast_guard_t; + +enum { + VCF_FORMAT_FAST_DISABLE_STREAK = 8, + VCF_FORMAT_FAST_PROBE_ATTEMPTS = 128, + VCF_FORMAT_FAST_MAX_FALLBACK_PCT = 10, + VCF_FORMAT_FAST_COOLDOWN_RECORDS = 256 +}; + +static inline int vcf_format_fast_guard_enabled(vcf_format_fast_guard_t *guard) +{ + if (!guard->disabled) + return 1; + if (guard->cooldown) { + guard->cooldown--; + return 0; + } + guard->attempts = 0; + guard->hits = 0; + guard->fallbacks = 0; + guard->miss_streak = 0; + guard->disabled = 0; + return 1; +} + +static inline void vcf_format_fast_guard_success(vcf_format_fast_guard_t *guard) +{ + if (guard->attempts != UINT32_MAX) + guard->attempts++; + if (guard->hits != UINT32_MAX) + guard->hits++; + guard->miss_streak = 0; +} + +static inline void vcf_format_fast_guard_fallback(vcf_format_fast_guard_t *guard) +{ + if (guard->attempts != UINT32_MAX) + guard->attempts++; + if (guard->fallbacks != UINT32_MAX) + guard->fallbacks++; + if (guard->miss_streak != UINT16_MAX) + guard->miss_streak++; + + if (guard->miss_streak >= VCF_FORMAT_FAST_DISABLE_STREAK) { + guard->disabled = 1; + guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; + return; + } + if (guard->attempts >= VCF_FORMAT_FAST_PROBE_ATTEMPTS && + (uint64_t) guard->fallbacks * 100 > + (uint64_t) guard->attempts * VCF_FORMAT_FAST_MAX_FALLBACK_PCT) { + 
guard->disabled = 1; + guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; + } +} + typedef struct { char format[64]; const bcf_hdr_t *hdr; @@ -3244,6 +3307,7 @@ typedef struct { int key_pgt; int key_pid; int key_pl; + vcf_format_fast_guard_t guard; } vcf_format_plan_t; typedef struct { @@ -3261,6 +3325,8 @@ typedef struct { int strict_supported; int n_ops; vcf_format_op_t ops[MAX_N_FMT]; + vcf_format_fast_guard_t strict_guard; + vcf_format_fast_guard_t general_guard; } vcf_format_general_plan_t; typedef enum { @@ -4426,14 +4492,24 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, plan = vcf_format_general_plan_get(h, p); if (!plan) goto fallback; + if (!vcf_format_fast_guard_enabled(&plan->general_guard)) { + vcf_format_plan_stats.fallback++; + return -3; + } nsamples = bcf_hdr_nsamples(h); if (!nsamples) return 0; - if (plan->strict_supported) { + if (plan->strict_supported && + vcf_format_fast_guard_enabled(&plan->strict_guard)) { ret = vcf_parse_format_general_strict(s, h, v, plan, q); + if (ret == 0) { + vcf_format_fast_guard_success(&plan->strict_guard); + return ret; + } if (ret != -4) return ret; + vcf_format_fast_guard_fallback(&plan->strict_guard); } if (vcf_plan_measure_general(s, h, plan, q, widths) < 0) goto fallback; @@ -4524,9 +4600,12 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; + vcf_format_fast_guard_success(&plan->general_guard); return 0; fallback: + if (plan) + vcf_format_fast_guard_fallback(&plan->general_guard); vcf_format_plan_stats.fallback++; return -3; } @@ -4571,7 +4650,7 @@ static int vcf_plan_phase_widths(const bcf_hdr_t *h, const vcf_format_plan_t *pl static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { - vcf_format_plan_t *plan; + vcf_format_plan_t *plan = NULL; kstring_t *mem; int nsamples, ad_w, pl_w, sample, nwords, pgt_w = 0, pid_w = 0; 
int max_ad_count = 0, max_pl_count = 0; @@ -4597,6 +4676,8 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, plan = vcf_format_plan_get(h, p); if (!plan) return vcf_parse_format_general_planned(s, h, v, p, q); + if (!vcf_format_fast_guard_enabled(&plan->guard)) + return vcf_parse_format_general_planned(s, h, v, p, q); nsamples = bcf_hdr_nsamples(h); if (!nsamples) @@ -4757,10 +4838,13 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; + vcf_format_fast_guard_success(&plan->guard); return 0; fallback: v->indiv.l = indiv_l0; + if (plan) + vcf_format_fast_guard_fallback(&plan->guard); vcf_format_plan_stats.fallback++; return -3; } From ff9b3b574e1b670b3744b84f328b4460d5e47583 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 18:41:40 +0200 Subject: [PATCH 10/38] Add dynamic strict FORMAT numeric executor --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 61 +++++ docs/FORMAT_PLAN_SPEC.md | 14 +- vcf.c | 342 ++++++++++++++++++++--------- 3 files changed, 310 insertions(+), 107 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 554ad6312..497c7de81 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -431,6 +431,67 @@ because the machine was under unrelated CPU load during the interrupted run. The parse-heavy 10k BCF outputs were re-compared byte-for-byte for exact and interpreter modes. +## Follow-Up: Generic Strict Numeric Executor + +The next iteration removed the two hard-coded shape executors and replaced them +with a generic strict fixed-numeric executor. 
This is the dynamic-exact version +of the FORMAT planner: + +- the executor is keyed by resolved row op kinds and widths, not FORMAT field + names; +- any fixed-width numeric op sequence is eligible; +- leading `GT2` and scalar float fields are written directly into the final BCF + `indiv` buffer; +- integer fields carry min/max range metadata from parse into encode so + `bcf_enc_vint()` does not rescan scratch arrays; +- any mismatch rolls back direct writes and falls back to the measured-width + general planner or legacy parser. + +Correctness checks: + +```text +make -j4 test/test_view +./test/test_format_plan.sh +cmp baseline/exact/interp BCF outputs for /tmp/ccdg_chr22_10k.vcf.gz +cmp baseline/exact/interp compressed BCF outputs for /tmp/ccdg_chr22_10k.vcf.gz +``` + +The mixed edge fixture remains byte-identical: + +```text +vcf-format-plan attempts=10 hits=7 fallback=3 parsed_samples=21 +vcf-format-plan attempts=10 hits=10 fallback=0 parsed_samples=30 +``` + +Parse-heavy 10k CCDG, VCF.gz to uncompressed BCF, real seconds: + +| Mode | Run 1 | Run 2 | Run 3 | +|---|---:|---:|---:| +| Baseline | 2.86 | 2.87 | 2.85 | +| Exact kernels | 1.85 | 1.85 | 1.86 | +| Dynamic strict/interp | 1.87 | 1.88 | 1.88 | + +After removing the old shape-specific templates, a cleanup check still showed +exact and dynamic strict essentially tied: + +| Mode | Real seconds | +|---|---:| +| Exact kernels | 1.89 | +| Dynamic strict/interp | 1.87 | + +Single-run compressed VCF.gz to compressed BCF.gz, real seconds: + +| Mode | Real seconds | +|---|---:| +| Baseline | 10.08 | +| Exact kernels | 9.01 | +| Dynamic strict/interp | 8.58 | + +The compressed result should be read as directional because compression noise is +larger, but the outputs were byte-identical and the main parse-heavy result is +the key signal: the dynamic strict path is now within measurement noise of the +hand-written exact CCDG kernel without matching on CCDG field names. 
+ ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 20a270030..271e5048d 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -75,15 +75,21 @@ For rows whose widths can be predicted from the header and allele count, the interpreter can try a strict path before the observed-width pass. The strict path validates the observed maximum width while parsing and falls back to the observed-width interpreter if the row is sparse, malformed, string-bearing, or -otherwise not byte-identical. Common numeric opcode tapes such as -`GT2:INT2:INT1:INT1:INT3` and `GT2:FLOAT1:INT2:INT1:INT1:INT3` have -shape-level executors that avoid the per-op switch. +otherwise not byte-identical. + +Strict numeric rows now use a generic fixed-schema executor rather than +FORMAT-name special cases. It accepts any fixed-width numeric opcode sequence, +direct-writes a leading `GT2`/`FLOAT1` prefix into `v->indiv`, parses remaining +integer/float fields into row-local scratch, carries integer min/max metadata +from parse to encode, and rolls back direct writes on the first mismatch. This +gives CCDG-like rows exact-kernel performance while keeping the executor keyed +by dynamic row shape rather than field names. Planned integer parsing must be overflow-safe. If a value is outside the BCF int32 payload range, the planned parser falls back so the generic parser keeps its warning and missing-value behavior. -Validated `GT2` payloads and exact-layout `AB` float payloads can be written +Validated `GT2` payloads and leading scalar float payloads can be written directly into `v->indiv` instead of going through scratch arrays. Any direct writer must save the entry length and roll back before returning fallback. 
diff --git a/vcf.c b/vcf.c index 77d69205f..d0c980f73 100644 --- a/vcf.c +++ b/vcf.c @@ -3341,12 +3341,6 @@ typedef enum { VCF_FORMAT_ROW_STR } vcf_format_row_kind_t; -typedef enum { - VCF_FORMAT_SHAPE_NONE, - VCF_FORMAT_SHAPE_GT2_I2_I1_I1_I3, - VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3 -} vcf_format_shape_kind_t; - typedef struct { int key; int width; @@ -4052,6 +4046,17 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible(const char **sp, int32_t return vcf_plan_int_value(sp, out); } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible_range(const char **sp, int32_t *out, + vcf_plan_int_range_t *range) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + *out = bcf_int32_missing; + vcf_plan_int_range_add(range, *out); + return 0; + } + return vcf_plan_int_value_range(sp, out, range); +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float *out) { if (**sp == ':' || **sp == '\t' || **sp == '\0') { @@ -4073,6 +4078,23 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted(const cha return vcf_plan_parse_int_vector2_counted(sp, out, nread); } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(const char **sp, + int32_t *out, + int *nread, + vcf_plan_int_range_t *range) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + out[0] = bcf_int32_missing; + out[1] = bcf_int32_vector_end; + vcf_plan_int_range_add(range, out[0]); + vcf_plan_int_range_add(range, out[1]); + if (nread) + *nread = 1; + return 0; + } + return vcf_plan_parse_int_vector2_counted_range(sp, out, nread, range); +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible(const char **sp, int32_t *out) { return vcf_plan_parse_int_vector2_flexible_counted(sp, out, NULL); @@ -4091,6 +4113,25 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted(const cha return vcf_plan_parse_int_vector3_counted(sp, out, nread); } +VCF_PLAN_ALWAYS_INLINE int 
vcf_plan_parse_int_vector3_flexible_counted_range(const char **sp, + int32_t *out, + int *nread, + vcf_plan_int_range_t *range) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + out[0] = bcf_int32_missing; + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + vcf_plan_int_range_add(range, out[0]); + vcf_plan_int_range_add(range, out[1]); + vcf_plan_int_range_add(range, out[2]); + if (nread) + *nread = 1; + return 0; + } + return vcf_plan_parse_int_vector3_counted_range(sp, out, nread, range); +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible(const char **sp, int32_t *out) { return vcf_plan_parse_int_vector3_flexible_counted(sp, out, NULL); @@ -4220,6 +4261,47 @@ static int vcf_format_general_encode_row_ops_from(kstring_t *dst, kstring_t *mem return 0; } +static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring_t *mem, + int nsamples, int n_ops, + const vcf_format_row_op_t *row_ops, + const vcf_plan_int_range_t *ranges, + int first_op) +{ + int j; + + for (j = first_op; j < n_ops; j++) { + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset; + + bcf_enc_int1(dst, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (vcf_enc_gt2_int8(dst, nsamples, (int32_t *)buf) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_STR) { + if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) + return -1; + if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { + if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) + return -1; + if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_INT1 || + op->kind == VCF_FORMAT_ROW_INT2 || + op->kind == VCF_FORMAT_ROW_INT3 || + op->kind == VCF_FORMAT_ROW_INTN) { + if (bcf_enc_vint_known_range(dst, nsamples * op->width, (int32_t *)buf, + op->width, 
ranges[j].min, ranges[j].max) < 0) + return -1; + } else { + if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) + return -1; + } + } + return 0; +} + static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt) { int i, n = nsamples * 2; @@ -4235,59 +4317,82 @@ static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt) return 0; } -static vcf_format_shape_kind_t vcf_format_general_shape_kind(const vcf_format_row_op_t *ops, - int n_ops) -{ - if (n_ops == 5 && - ops[0].kind == VCF_FORMAT_ROW_GT2 && - ops[1].kind == VCF_FORMAT_ROW_INT2 && - ops[2].kind == VCF_FORMAT_ROW_INT1 && - ops[3].kind == VCF_FORMAT_ROW_INT1 && - ops[4].kind == VCF_FORMAT_ROW_INT3) - return VCF_FORMAT_SHAPE_GT2_I2_I1_I1_I3; - if (n_ops == 6 && - ops[0].kind == VCF_FORMAT_ROW_GT2 && - ops[1].kind == VCF_FORMAT_ROW_FLOAT1 && - ops[2].kind == VCF_FORMAT_ROW_INT2 && - ops[3].kind == VCF_FORMAT_ROW_INT1 && - ops[4].kind == VCF_FORMAT_ROW_INT1 && - ops[5].kind == VCF_FORMAT_ROW_INT3) - return VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3; - return VCF_FORMAT_SHAPE_NONE; -} - -static int vcf_parse_format_general_shape(kstring_t *s, const bcf_hdr_t *h, - bcf1_t *v, - const vcf_format_general_plan_t *plan, - char *q, - vcf_format_shape_kind_t shape, - vcf_format_row_op_t *row_ops) +static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int n_ops) +{ + int j; + + for (j = 0; j < n_ops; j++) { + if (row_ops[j].kind != VCF_FORMAT_ROW_GT2 && + row_ops[j].kind != VCF_FORMAT_ROW_FLOAT1) + break; + } + return j; +} + +static int vcf_format_general_fixed_numeric_supported(const vcf_format_row_op_t *row_ops, + int n_ops) +{ + int j; + + for (j = 0; j < n_ops; j++) { + switch (row_ops[j].kind) { + case VCF_FORMAT_ROW_GT2: + case VCF_FORMAT_ROW_INT1: + case VCF_FORMAT_ROW_INT2: + case VCF_FORMAT_ROW_INT3: + case VCF_FORMAT_ROW_INTN: + case VCF_FORMAT_ROW_FLOAT1: + case VCF_FORMAT_ROW_FLOATN: + break; + default: + return 0; + } + } + return 1; +} + +static 
int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q, + vcf_format_row_op_t *row_ops) { kstring_t *mem = (kstring_t*)&h->mem; int nsamples = bcf_hdr_nsamples(h), sample, j; - int max_i2 = 0, max_i3 = 0; - int direct_ops = shape == VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3 ? 2 : 1; - size_t indiv_l0 = v->indiv.l, gt8_off, f1_off = 0; - uint8_t *gt8, *f1_le = NULL; + int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); + int max_counts[MAX_N_FMT]; + vcf_plan_int_range_t ranges[MAX_N_FMT]; + size_t indiv_l0 = v->indiv.l; + size_t direct_offsets[MAX_N_FMT]; const char *cur = q + 1, *end = s->s + s->l; - bcf_enc_int1(&v->indiv, row_ops[0].key); - if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) - return -1; - gt8_off = v->indiv.l; - v->indiv.l += (size_t)nsamples * 2; - if (direct_ops == 2) { - bcf_enc_int1(&v->indiv, row_ops[1].key); - if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) - return -1; - f1_off = v->indiv.l; - v->indiv.l += (size_t)nsamples * sizeof(float); + if (!vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) + return -4; + + for (j = 0; j < plan->n_ops; j++) { + max_counts[j] = 0; + direct_offsets[j] = 0; + vcf_plan_int_range_init(&ranges[j]); + } + + for (j = 0; j < direct_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + + bcf_enc_int1(&v->indiv, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) + return -1; + direct_offsets[j] = v->indiv.l; + v->indiv.l += (size_t)nsamples * 2; + } else { + if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) + return -1; + direct_offsets[j] = v->indiv.l; + v->indiv.l += 
(size_t)nsamples * sizeof(float); + } } - gt8 = (uint8_t *)v->indiv.s + gt8_off; - if (direct_ops == 2) - f1_le = (uint8_t *)v->indiv.s + f1_off; mem->l = 0; for (j = direct_ops; j < plan->n_ops; j++) { @@ -4304,54 +4409,84 @@ static int vcf_parse_format_general_shape(kstring_t *s, const bcf_hdr_t *h, } for (sample = 0; sample < nsamples && cur < end; sample++) { - int nread; - int op = direct_ops; - - if (vcf_plan_gt2_u8(&cur, &gt8[sample * 2]) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = j < direct_ops + ? (uint8_t *)v->indiv.s + direct_offsets[j] + + sample * (size_t)(op->kind == VCF_FORMAT_ROW_GT2 ? 2 : op->size) + : (uint8_t *)mem->s + op->offset + sample * (size_t)op->size; + int n = op->width; - if (shape == VCF_FORMAT_SHAPE_GT2_F1_I2_I1_I1_I3) { - float f1_val; - if (vcf_plan_float_scalar_flexible(&cur, &f1_val) < 0 || vcf_plan_expect_sep(&cur, ':') < 0) + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: + if (j < direct_ops) { + if (vcf_plan_gt2_u8(&cur, buf) < 0) + goto fallback; + } else if (vcf_plan_gt2(&cur, (int32_t *)buf) < 0) { + goto fallback; + } + break; + case VCF_FORMAT_ROW_INT1: + if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_INT2: + if (vcf_plan_parse_int_vector2_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_INT3: + if (vcf_plan_parse_int_vector3_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_INTN: + if (vcf_plan_parse_int_vector_counted_range(&cur, (int32_t *)buf, op->width, &n, &ranges[j]) < 0) + goto fallback; + break; + case VCF_FORMAT_ROW_FLOAT1: + if (j < direct_ops) { + float f; + if (vcf_plan_float_scalar_flexible(&cur, &f) < 0) + goto fallback; + float_to_le(f, buf); + } else if (vcf_plan_float_scalar_flexible(&cur, 
(float *)buf) < 0) { + goto fallback; + } + break; + case VCF_FORMAT_ROW_FLOATN: + if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) + goto fallback; + n = vcf_plan_float_vector_count((float *)buf, op->width); + break; + default: goto fallback; - float_to_le(f1_val, f1_le + (size_t)sample * sizeof(float)); - } - - if (vcf_plan_parse_int_vector2_flexible_counted(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size), &nread) < 0 || - vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (max_i2 < nread) - max_i2 = nread; - op++; - - if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size)) < 0 || - vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - op++; - - if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size)) < 0 || - vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - op++; - - if (vcf_plan_parse_int_vector3_flexible_counted(&cur, (int32_t *)(mem->s + row_ops[op].offset + sample * (size_t)row_ops[op].size), &nread) < 0) - goto fallback; - if (max_i3 < nread) - max_i3 = nread; + } + if (max_counts[j] < n) + max_counts[j] = n; - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - goto fallback; + if (j + 1 < plan->n_ops) { + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + goto fallback; + } + } } - if (sample != nsamples || max_i2 != 2 || max_i3 != 3) + if (sample != nsamples) goto fallback; + for (j = 0; j < plan->n_ops; j++) + if (max_counts[j] != row_ops[j].width) + goto fallback; v->n_fmt = plan->n_ops; v->n_sample = nsamples; - if (vcf_format_general_encode_row_ops_from(&v->indiv, mem, nsamples, plan->n_ops, row_ops, direct_ops) < 0) + if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, nsamples, + plan->n_ops, row_ops, + ranges, 
direct_ops) < 0) return -1; vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; @@ -4370,6 +4505,7 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, kstring_t *mem; int widths[MAX_N_FMT], max_counts[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; + vcf_plan_int_range_t ranges[MAX_N_FMT]; int nsamples = bcf_hdr_nsamples(h), sample, j, vcf44; const char *cur, *end; @@ -4378,13 +4514,11 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, if (widths[j] <= 0 || widths[j] > 64) return -4; max_counts[j] = 0; + vcf_plan_int_range_init(&ranges[j]); } vcf_format_general_resolve_ops(plan, v, widths, row_ops); - { - vcf_format_shape_kind_t shape = vcf_format_general_shape_kind(row_ops, plan->n_ops); - if (shape != VCF_FORMAT_SHAPE_NONE) - return vcf_parse_format_general_shape(s, h, v, plan, q, shape, row_ops); - } + if (vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) + return vcf_parse_format_general_fixed_numeric(s, h, v, plan, q, row_ops); mem = (kstring_t*)&h->mem; mem->l = 0; @@ -4421,19 +4555,19 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, n = vcf_plan_int_vector_count((int32_t *)buf, op->width); break; case VCF_FORMAT_ROW_INT1: - if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)buf) < 0) + if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) return -4; break; case VCF_FORMAT_ROW_INT2: - if (vcf_plan_parse_int_vector2_flexible_counted(&cur, (int32_t *)buf, &n) < 0) + if (vcf_plan_parse_int_vector2_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) return -4; break; case VCF_FORMAT_ROW_INT3: - if (vcf_plan_parse_int_vector3_flexible_counted(&cur, (int32_t *)buf, &n) < 0) + if (vcf_plan_parse_int_vector3_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) return -4; break; case VCF_FORMAT_ROW_INTN: - if (vcf_plan_parse_int_vector_counted(&cur, (int32_t *)buf, op->width, &n) < 0) 
+ if (vcf_plan_parse_int_vector_counted_range(&cur, (int32_t *)buf, op->width, &n, &ranges[j]) < 0) return -4; break; case VCF_FORMAT_ROW_FLOAT1: @@ -4472,7 +4606,9 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, v->n_fmt = plan->n_ops; v->n_sample = nsamples; - if (vcf_format_general_encode_row_ops(&v->indiv, mem, nsamples, plan->n_ops, row_ops) < 0) + if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, nsamples, + plan->n_ops, row_ops, + ranges, 0) < 0) return -1; vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; From 544660cb245f8c5967cbdd6575a76b6c8bcf9e48 Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 18:47:00 +0200 Subject: [PATCH 11/38] Tighten dynamic FORMAT executor --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 34 +++++++++++-- docs/FORMAT_PLAN_SPEC.md | 4 ++ test/format-plan-edge.vcf | 6 +++ vcf.c | 78 ++++++++++-------------------- 4 files changed, 64 insertions(+), 58 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 497c7de81..44f661cc7 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -422,8 +422,8 @@ the expected mixed behavior: ```text ./test/test_format_plan.sh -vcf-format-plan attempts=10 hits=7 fallback=3 parsed_samples=21 -vcf-format-plan attempts=10 hits=10 fallback=0 parsed_samples=30 +vcf-format-plan attempts=14 hits=11 fallback=3 parsed_samples=33 +vcf-format-plan attempts=14 hits=14 fallback=0 parsed_samples=42 ``` The full compressed matrix was not re-recorded for this guard-only change @@ -456,11 +456,14 @@ cmp baseline/exact/interp BCF outputs for /tmp/ccdg_chr22_10k.vcf.gz cmp baseline/exact/interp compressed BCF outputs for /tmp/ccdg_chr22_10k.vcf.gz ``` -The mixed edge fixture remains byte-identical: +The mixed edge fixture remains byte-identical. 
It now includes reordered +numeric FORMAT fields, a scalar float away from the first FORMAT positions, +non-CCDG fixed-width numeric tags, and integer values that cross BCF int8/int16 +encoding thresholds: ```text -vcf-format-plan attempts=10 hits=7 fallback=3 parsed_samples=21 -vcf-format-plan attempts=10 hits=10 fallback=0 parsed_samples=30 +vcf-format-plan attempts=14 hits=11 fallback=3 parsed_samples=33 +vcf-format-plan attempts=14 hits=14 fallback=0 parsed_samples=42 ``` Parse-heavy 10k CCDG, VCF.gz to uncompressed BCF, real seconds: @@ -492,6 +495,27 @@ larger, but the outputs were byte-identical and the main parse-heavy result is the key signal: the dynamic strict path is now within measurement noise of the hand-written exact CCDG kernel without matching on CCDG field names. +## Follow-Up: Subagent Review Cleanup + +Three review passes suggested tightening the generic executor rather than adding +more field-name cases. The resulting cleanup: + +- rolls back `v->indiv` on hard errors after direct writes, not only on shape + fallback; +- skips range updates for padding vector-end sentinels while preserving the + `bcf_enc_vint()` min/max contract; +- precomputes per-op base pointers and strides before the sample loop; +- removes the stale non-range encoder variant; +- expands the edge fixture with reordered and non-CCDG numeric FORMAT rows. + +Post-cleanup parse-heavy 10k CCDG, VCF.gz to uncompressed BCF, real seconds: + +| Mode | Run 1 | Run 2 | Run 3 | +|---|---:|---:|---:| +| Baseline | 2.83 | 2.84 | 2.86 | +| Exact kernels | 1.85 | 1.89 | 1.86 | +| Dynamic strict/interp | 1.86 | 1.83 | 1.83 | + ## Findings The planned FORMAT parser is viable. 
With all four dominant CCDG layouts covered, diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 271e5048d..716e6bab1 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -124,6 +124,10 @@ common awkward cases: - the exact CCDG layouts, - reordered fields, +- reordered numeric fields with `GT` and scalar floats away from the first + FORMAT position, +- non-CCDG numeric tag names with fixed widths, +- integer values around BCF int8/int16 type boundaries, - multiallelic AD/PL and GL, - haploid GT, - multidigit allele indexes, diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf index aa9b185b9..3f45fbbe7 100644 --- a/test/format-plan-edge.vcf +++ b/test/format-plan-edge.vcf @@ -11,6 +11,8 @@ ##FORMAT= ##FORMAT= ##FORMAT= +##FORMAT= +##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 chr22 10510061 . A T 64.12 PASS . GT:AB:AD:DP:GQ:PL 0/0:.:3,0:3:9:0,9,104 0/1:0.5:5,4:9:99:99,0,123 ./.:.:0,0:0:.:. chr22 10510352 . AT A 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 1/1:0,5:5:15:1|1:10510352_AT_A:225,15,0 0/1:3,2:5:20:0|1:10510352_AT_A:20,0,200 ./.:0,0:0:.:.:.:. @@ -22,3 +24,7 @@ chr22 10560000 . A C,G 50 PASS . GT:GL:DP:GQ 0/1:-0.1,-1.2,-9.9,-2.0,-3.0,-4.0:7 chr22 10570000 . A T 50 PASS . GT:AD:DP:GQ:PL 0:3,0:3:10:0,10,100 1:0,3:3:20:100,10,0 .:0,0:0:.:. chr22 10580000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . GT:AD:DP:GQ:PL 10/10:0,0,0,0,0,0,0,0,0,0,7:7:20:200,190,180,170,160,150,140,130,120,110,100,90,80,70,60,50,40,30,20,10,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460 0/10:3,0,0,0,0,0,0,0,0,0,2:5:30:0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650 ./.:0,0,0,0,0,0,0,0,0,0,0:0:.:. 
chr22 10585000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/0:.:3:10:. 0/1:.:5:20:. ./.:.:0:.:. +chr22 10590000 . A T 50 PASS . DP:GQ:GT:AD:PL 11:50:0/1:6,5:80,0,90 8:45:0/0:8,0:0,45,100 0:.:./.:0,0:. +chr22 10591000 . A T 50 PASS . AD:PL:GT:DP:GQ 4,3:70,0,80:0/1:7:60 9,0:0,70,120:0/0:9:50 0,0:.:./.:0:. +chr22 10592000 . A T 50 PASS . GT:DP:AB:GQ:AD:PL 0/1:12:0.42:70:7,5:90,0,100 0/0:10:0.01:60:10,0:0,60,120 ./.:0:.:.:0,0:. +chr22 10593000 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:12:3,4,5,6 0/0:-129,20:8:8,0,0,0 ./.:.,.:0:.,.,.,. diff --git a/vcf.c b/vcf.c index d0c980f73..10154400a 100644 --- a/vcf.c +++ b/vcf.c @@ -2934,6 +2934,8 @@ static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, int32_t min, int32_t max) { int i; + // min/max must match bcf_enc_vint()'s scan: missing and vector-end values + // may affect max, but are excluded from min. if (n <= 0) { return bcf_enc_size(s, 0, BCF_BT_NULL); } else if (n == 1) { @@ -3677,10 +3679,8 @@ static int vcf_plan_parse_int_vector_counted_range(const char **sp, int32_t *out nvals = i; if (nread) *nread = nvals; - for (; i < width; i++) { + for (; i < width; i++) out[i] = bcf_int32_vector_end; - vcf_plan_int_range_add(range, out[i]); - } if (*s == ',') return -1; *sp = s; @@ -3725,7 +3725,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted_range(const char * return -1; if (*s != ',') { out[1] = bcf_int32_vector_end; - vcf_plan_int_range_add(range, out[1]); *sp = s; if (nread) *nread = 1; @@ -3792,8 +3791,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char * if (*s != ',') { out[1] = bcf_int32_vector_end; out[2] = bcf_int32_vector_end; - vcf_plan_int_range_add(range, out[1]); - vcf_plan_int_range_add(range, out[2]); *sp = s; if (nread) *nread = 1; @@ -3804,7 +3801,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char * return -1; if (*s != ',') { out[2] = bcf_int32_vector_end; - vcf_plan_int_range_add(range, out[2]); *sp = s; if 
(nread) *nread = 2; @@ -4087,7 +4083,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(con out[0] = bcf_int32_missing; out[1] = bcf_int32_vector_end; vcf_plan_int_range_add(range, out[0]); - vcf_plan_int_range_add(range, out[1]); if (nread) *nread = 1; return 0; @@ -4123,8 +4118,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted_range(con out[1] = bcf_int32_vector_end; out[2] = bcf_int32_vector_end; vcf_plan_int_range_add(range, out[0]); - vcf_plan_int_range_add(range, out[1]); - vcf_plan_int_range_add(range, out[2]); if (nread) *nread = 1; return 0; @@ -4228,39 +4221,6 @@ static int vcf_format_general_encode_row_ops(kstring_t *dst, kstring_t *mem, return 0; } -static int vcf_format_general_encode_row_ops_from(kstring_t *dst, kstring_t *mem, - int nsamples, int n_ops, - const vcf_format_row_op_t *row_ops, - int first_op) -{ - int j; - - for (j = first_op; j < n_ops; j++) { - const vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = (uint8_t*)mem->s + op->offset; - - bcf_enc_int1(dst, op->key); - if (op->kind == VCF_FORMAT_ROW_GT2) { - if (vcf_enc_gt2_int8(dst, nsamples, (int32_t *)buf) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_STR) { - if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) - return -1; - if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { - if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) - return -1; - if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) - return -1; - } else { - if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) - return -1; - } - } - return 0; -} - static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring_t *mem, int nsamples, int n_ops, const vcf_format_row_op_t *row_ops, @@ -4364,6 +4324,8 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const 
bcf_hdr_t vcf_plan_int_range_t ranges[MAX_N_FMT]; size_t indiv_l0 = v->indiv.l; size_t direct_offsets[MAX_N_FMT]; + uint8_t *op_base[MAX_N_FMT]; + size_t op_stride[MAX_N_FMT]; const char *cur = q + 1, *end = s->s + s->l; if (!vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) @@ -4382,13 +4344,13 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t if (op->kind == VCF_FORMAT_ROW_GT2) { if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) - return -1; + goto error; direct_offsets[j] = v->indiv.l; v->indiv.l += (size_t)nsamples * 2; } else { if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) - return -1; + goto error; direct_offsets[j] = v->indiv.l; v->indiv.l += (size_t)nsamples * sizeof(float); } @@ -4399,22 +4361,29 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t vcf_format_row_op_t *op = &row_ops[j]; if ((uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) - return -1; + goto error; if (align_mem(mem) < 0) - return -1; + goto error; op->offset = mem->l; if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) - return -1; + goto error; mem->l += nsamples * (size_t) op->size; } + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + if (j < direct_ops) { + op_base[j] = (uint8_t *)v->indiv.s + direct_offsets[j]; + op_stride[j] = op->kind == VCF_FORMAT_ROW_GT2 ? 2 : (size_t)op->size; + } else { + op_base[j] = (uint8_t *)mem->s + op->offset; + op_stride[j] = (size_t)op->size; + } + } for (sample = 0; sample < nsamples && cur < end; sample++) { for (j = 0; j < plan->n_ops; j++) { vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = j < direct_ops - ? (uint8_t *)v->indiv.s + direct_offsets[j] + - sample * (size_t)(op->kind == VCF_FORMAT_ROW_GT2 ? 
2 : op->size) - : (uint8_t *)mem->s + op->offset + sample * (size_t)op->size; + uint8_t *buf = op_base[j] + sample * op_stride[j]; int n = op->width; switch (op->kind) { @@ -4487,7 +4456,7 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, nsamples, plan->n_ops, row_ops, ranges, direct_ops) < 0) - return -1; + goto error; vcf_format_plan_stats.hits++; vcf_format_plan_stats.parsed_samples += nsamples; return 0; @@ -4495,6 +4464,9 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t fallback: v->indiv.l = indiv_l0; return -4; +error: + v->indiv.l = indiv_l0; + return -1; } static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, From 45439acbb5f71c383f93e29bdf7d776dd9543ade Mon Sep 17 00:00:00 2001 From: jhl-oai Date: Tue, 28 Apr 2026 18:56:20 +0200 Subject: [PATCH 12/38] Benchmark dynamic FORMAT conversions --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 54 ++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 44f661cc7..9f0dc5cce 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -516,6 +516,60 @@ Post-cleanup parse-heavy 10k CCDG, VCF.gz to uncompressed BCF, real seconds: | Exact kernels | 1.85 | 1.89 | 1.86 | | Dynamic strict/interp | 1.86 | 1.83 | 1.83 | +## Follow-Up: Broader Operation Checks + +After the dynamic strict executor cleanup, the 10k CCDG conversion matrix was +rerun with `test/test_view`. Outputs were compared byte-for-byte against the +baseline output for every exact/interp cell. 
+ +Single-run format conversion matrix, real seconds: + +| Conversion | Baseline | Exact kernels | Dynamic strict/interp | +|---|---:|---:|---:| +| VCF.gz -> BCF.gz | 8.73 | 7.78 | 8.58 | +| BCF -> BCF.gz | 6.85 | 6.92 | 7.02 | +| BCF -> VCF.gz | 11.18 | 11.22 | 11.15 | +| VCF.gz -> VCF.gz | 13.26 | 12.34 | 13.01 | +| VCF.gz -> uncompressed BCF | 2.83 | 1.85 | 2.58 | + +The `VCF.gz -> uncompressed BCF` interp cell above was a noisy outlier; a direct +focused rerun of that same parse-heavy case reproduced exact-speed dynamic +strict behavior: + +| Mode | Run 1 | Run 2 | Run 3 | +|---|---:|---:|---:| +| Exact kernels | 1.84 | 1.82 | 1.85 | +| Dynamic strict/interp | 1.84 | 1.84 | 1.85 | + +Read-only scan with `test_view -B` isolates input decode/parse without output +formatting or compression: + +| Input | Mode | Run 1 | Run 2 | Run 3 | +|---|---|---:|---:|---:| +| VCF.gz | Baseline | 2.59 | 2.61 | 2.58 | +| VCF.gz | Exact kernels | 1.62 | 1.62 | 1.65 | +| VCF.gz | Dynamic strict/interp | 1.62 | 1.63 | 1.62 | +| BCF | Baseline | 0.62 | 0.61 | 0.61 | +| BCF | Exact kernels | 0.61 | 0.61 | 0.63 | +| BCF | Dynamic strict/interp | 0.61 | 0.62 | 0.61 | + +Threaded compressed output with `test_view -@ 4` makes the parser win visible +again even for compressed-to-compressed workflows: + +| Conversion | Baseline | Exact kernels | Dynamic strict/interp | +|---|---:|---:|---:| +| VCF.gz -> BCF.gz, `-@ 4` | 2.64 | 2.03 | 2.06 | +| VCF.gz -> VCF.gz, `-@ 4` | 3.96 | 3.03 | 3.02 | + +BCF-input conversions and BCF read-only scans remain unchanged, as expected, +because the optimization only affects VCF FORMAT parsing. + +An attempted bcftools rebuild against this htslib worktree failed at link time +because the sibling bcftools checkout expects `bcf_write_take_ownership`, which +is not present in this htslib worktree. Operation-level bcftools timings should +therefore be rerun only after pairing this htslib branch with a matching +bcftools revision or porting that API. 
+ ## Findings The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, From 3ec2e9f9c1fceed34915d098b9a6a2f9b1c62e6c Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 14:23:37 +0200 Subject: [PATCH 13/38] Clarify FORMAT plan benchmark state --- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 613 +++++------------------------ docs/FORMAT_PLAN_SPEC.md | 192 ++++----- 2 files changed, 177 insertions(+), 628 deletions(-) diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 9f0dc5cce..7a35c8ead 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -1,35 +1,30 @@ -# CCDG FORMAT Plan MVP Benchmark +# CCDG FORMAT Plan Benchmark Checkpoint -Date: 2026-04-28 +Date: 2026-04-29 Worktree: `/tmp/htslib-vcf-avx-sanity` Branch: `codex/vcf-avx-sanity` -## Goal +## Current Takeaway -Estimate whether a runtime-planned VCF FORMAT parser can improve end-to-end -compressed VCF/BCF conversion performance on a wide CCDG VCF. +The experimental FORMAT planner is viable, but the current large CCDG win comes +from the handwritten exact CCDG kernels, not yet from the fully dynamic +strict/interpreter path. -The MVP implementation is gated by: +The dynamic general planner is correct and modestly faster than baseline. It is +the path we want to improve next, using the exact kernels as a performance +oracle. -```sh -HTS_VCF_FORMAT_PLAN=1 -HTS_VCF_FORMAT_PLAN_STATS=1 -``` - -It dynamically caches observed FORMAT layouts. 
The current MVP has direct -executors for the four dominant CCDG layouts: +## Modes -```text -GT:AB:AD:DP:GQ:PL -GT:AD:DP:GQ:PL -GT:AB:AD:DP:GQ:PGT:PID:PL -GT:AD:DP:GQ:PGT:PID:PL +```sh +HTS_VCF_FORMAT_PLAN=0 # baseline generic parser +HTS_VCF_FORMAT_PLAN=1 # exact CCDG kernels, then dynamic general fallback +HTS_VCF_FORMAT_PLAN=interp # dynamic general planner only +HTS_VCF_FORMAT_PLAN_STATS=1 # print planner counters from test/test_view ``` -Other layouts fall back to the existing generic FORMAT parser. - ## Data Source file: @@ -38,128 +33,97 @@ Source file: /Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` -Subset used for this benchmark: - -```text -/tmp/ccdg_chr22_10k.vcf -``` - -The subset contains 10,000 variant records plus header lines. It is wide: -3,202 samples and about 866 MiB uncompressed. - -Compressed inputs prepared from the subset: +Subset used for the current benchmark: ```text /tmp/ccdg_chr22_10k.vcf.gz /tmp/ccdg_chr22_10k.bcf ``` -Approximate input sizes: +The subset contains 10,000 variant records and 3,202 samples. The observed +FORMAT distribution is: -```text -ccdg_chr22_10k.vcf.gz 118 MiB by ls, 129 MiB by du -ccdg_chr22_10k.bcf 152 MiB by ls, 160 MiB by du -``` +| Records | FORMAT | +|---:|---| +| 4,681 | `GT:AB:AD:DP:GQ:PL` | +| 3,774 | `GT:AB:AD:DP:GQ:PGT:PID:PL` | +| 813 | `GT:AD:DP:GQ:PL` | +| 732 | `GT:AD:DP:GQ:PGT:PID:PL` | -## FORMAT Coverage +The exact CCDG tier covers all four layouts. -On the 10k CCDG subset after adding `PGT:PID` support: +## Clean Sanity Rerun -```text -attempts=10000 -hits=10000 -fallback=0 -parsed_samples=32020000 -``` +These numbers were rerun after noticing that an earlier table mislabeled the +dynamic/interpreter result. Timings are single wall-clock runs on the 10k CCDG +subset, so treat them as directional. 
-The planned parser therefore handled 100% of records and parsed 32.0 million -sample FORMAT entries directly. +| Mode | VCF.gz read-only | VCF.gz -> uncompressed BCF | +|---|---:|---:| +| Baseline | 2.58 s | 2.83 s | +| Exact + dynamic fallback | 1.61 s | 1.86 s | +| Dynamic general only | 2.34 s | 2.55 s | -For comparison, before `PGT:PID` support, coverage was: +Planner counters on VCF.gz -> uncompressed BCF: -```text -attempts=10000 -hits=5494 -fallback=4506 -parsed_samples=17591788 -``` +| Mode | Attempts | Hits | Fallback | Parsed samples | +|---|---:|---:|---:|---:| +| Exact + dynamic fallback | 10,000 | 10,000 | 0 | 32,020,000 | +| Dynamic general only | 10,000 | 10,000 | 0 | 32,020,000 | -The fallback records were almost entirely the two layouts containing -`PGT:PID`. +Both planned modes are byte-identical against baseline in the sanity tests, but +the exact tier is much faster. -## Four-Cell Compressed Conversion Benchmark +## Broader Conversion Matrix -All cells are compressed input to compressed output. Each timing is a single -wall-clock run using `/usr/bin/time -p`; treat these as directional, not a -statistically rigorous benchmark. +Earlier single-run compressed conversion checks used `test/test_view` and +compared outputs byte-for-byte with `cmp`. 
-| Conversion | Baseline real | FORMAT plan real | Change | +| Conversion | Baseline | Exact + dynamic fallback | Dynamic general only | |---|---:|---:|---:| -| VCF.gz -> BCF | 9.150 s | 8.266 s | 9.7% faster | -| BCF -> BCF | 7.168 s | 7.221 s | neutral, 0.7% slower | -| BCF -> VCF.gz | 11.367 s | 11.487 s | neutral, 1.1% slower | -| VCF.gz -> VCF.gz | 13.405 s | 12.670 s | 5.5% faster | +| VCF.gz -> BCF.gz | 8.73 s | 7.78 s | 8.58 s | +| BCF -> BCF.gz | 6.85 s | 6.92 s | 7.02 s | +| BCF -> VCF.gz | 11.18 s | 11.22 s | 11.15 s | +| VCF.gz -> VCF.gz | 13.26 s | 12.34 s | 13.01 s | +| VCF.gz -> uncompressed BCF | 2.83 s | 1.85 s | 2.58 s | -Command shapes: - -```sh -./test/test_view.baseline -b -p /tmp/bench_base_vcf_to_bcf.bcf /tmp/ccdg_chr22_10k.vcf.gz -env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -b -p /tmp/bench_plan_vcf_to_bcf.bcf /tmp/ccdg_chr22_10k.vcf.gz +BCF-input conversions are unchanged, as expected, because this optimization only +affects VCF text FORMAT parsing. -./test/test_view.baseline -b -p /tmp/bench_base_bcf_to_bcf.bcf /tmp/ccdg_chr22_10k.bcf -env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -b -p /tmp/bench_plan_bcf_to_bcf.bcf /tmp/ccdg_chr22_10k.bcf - -./test/test_view.baseline -z -p /tmp/bench_base_bcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.bcf -env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -z -p /tmp/bench_plan_bcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.bcf - -./test/test_view.baseline -z -p /tmp/bench_base_vcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.vcf.gz -env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -z -p /tmp/bench_plan_vcf_to_vcf.vcf.gz /tmp/ccdg_chr22_10k.vcf.gz -``` - -For each cell, the baseline output and planned-parser output were compared with -`cmp` and matched byte-for-byte. The BCF-input cells have -`attempts=0 hits=0 fallback=0` because they never enter the VCF text FORMAT -parser. 
- -## Compressed VCF to Uncompressed BCF - -This additional case keeps compressed VCF input but removes output compression -by writing BCF at compression level 0. +Threaded compressed output with `test_view -@ 4` makes the parser win visible +even for compressed-to-compressed workflows: -| Conversion | Baseline real | FORMAT plan real | Change | +| Conversion | Baseline | Exact + dynamic fallback | Dynamic general only | |---|---:|---:|---:| -| VCF.gz -> uncompressed BCF | 2.817 s | 1.930 s | 31.5% faster | +| VCF.gz -> BCF.gz, `-@ 4` | 2.64 s | 2.03 s | 2.06 s | +| VCF.gz -> VCF.gz, `-@ 4` | 3.96 s | 3.03 s | 3.02 s | -Command shape: - -```sh -./test/test_view.baseline -b -l 0 -p /tmp/bench_base_vcfgz_to_ubcf.bcf /tmp/ccdg_chr22_10k.vcf.gz -env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 ./test/test_view -b -l 0 -p /tmp/bench_plan_vcfgz_to_ubcf.bcf /tmp/ccdg_chr22_10k.vcf.gz -``` +The threaded dynamic-only numbers should be rerun before drawing strong +conclusions; the clean single-thread rerun shows dynamic-only is not yet at +exact-kernel speed. -The baseline and planned-parser outputs were compared with `cmp` and matched -byte-for-byte. +## Edge Fixture -## Parse-Only Reference Timings +`./test/test_format_plan.sh` compares baseline, `HTS_VCF_FORMAT_PLAN=1`, and +`HTS_VCF_FORMAT_PLAN=interp` on `test/format-plan-edge.vcf`. 
-For context, earlier parse-only tests on the same subsets showed a much larger -effect because output compression was removed from the critical path: +Current output: -| Dataset | Baseline parse-only | FORMAT plan parse-only | Change | -|---|---:|---:|---:| -| 10k CCDG subset, pre-`PGT:PID` executor | about 2.30 s | about 1.64 s | about 29% faster | -| 100k CCDG subset, pre-`PGT:PID` executor | 23.94 s | 17.71 s | about 26% faster | -| 100k CCDG subset, all-hit executor | 24.22 s | 14.95 s | about 38% faster | -| 100k CCDG VCF.gz -> uncompressed BCF, all-hit executor | 26.65 s | 18.12 s | about 32% faster | +```text +vcf-format-plan attempts=14 hits=11 fallback=3 parsed_samples=33 +vcf-format-plan attempts=14 hits=14 fallback=0 parsed_samples=42 +``` -The all-hit executor was byte-identical against baseline on the 10k BCF output -and on a targeted one-record phased-layout test. +The first line is `HTS_VCF_FORMAT_PLAN=1`: exact kernels claim the CCDG-shaped +rows and intentionally fall back for rows outside their narrow shape. The +second line is dynamic-only: the general planner handles all 14 fixture rows. ## Profiling Notes After `PGT:PID` support, the generic FORMAT fallback is no longer a meaningful -cost for the CCDG benchmark. A macOS `sample` profile of -`VCF.gz -> uncompressed BCF` on the 100k subset showed the next hot areas: +cost for the CCDG benchmark when exact kernels are enabled. A macOS `sample` +profile of VCF.gz -> uncompressed BCF on the 100k subset showed the next hot +areas inside the planned path: ```text vcf_plan_parse_int_vector 189 samples @@ -173,423 +137,30 @@ vcf_plan_float_value 24 samples read 16 samples ``` -This is a statistical sample, not exact cycle accounting, but it is useful -directionally. The next parser-side targets are direct integer-vector parsing -for AD/PL and reducing repeated `bcf_enc_vint` work in the planned path. 
- -## Follow-Up: Fixed-Width AD/PL Parsing - -The first follow-up optimization added fixed-width planned parsers for the most -common biallelic case: - -```text -AD width = 2 -PL width = 3 -``` - -On the 10k subset, about 82% of records are biallelic, so this removes a large -number of generic integer-vector loop iterations and helper calls while leaving -multi-allelic rows on the generic planned-vector parser. - -Correctness checks remained byte-identical against baseline for: - -```text -/tmp/ccdg_one_phase.vcf -> uncompressed BCF -/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF -``` - -Directional timings after the fixed-width parser change: - -| Dataset | Previous all-hit plan | Fixed-width AD/PL plan | Change | -|---|---:|---:|---:| -| 100k CCDG VCF -> uncompressed BCF | 14.95 s | 13.1-13.6 s | about 9-12% faster | -| 100k CCDG VCF.gz -> uncompressed BCF | 18.12 s | 15.6-16.5 s | about 9-14% faster | - -An attempted range-tracked replacement for `bcf_enc_vint` was also tested. It -preserved byte identity, but it slowed these same parse-heavy cases, so it was -not kept. The likely issue is that tracking ranges during parse adds enough -per-value work to outweigh skipping `bcf_enc_vint`'s later range scan. - -## Follow-Up: Compiled Op Interpreter - -A second planned-parser tier was added to test whether a more general compiled -FORMAT op interpreter can recover much of the exact-kernel benefit while -covering more layouts. The exact CCDG kernels still run for -`HTS_VCF_FORMAT_PLAN=1`; `HTS_VCF_FORMAT_PLAN=interp` skips those kernels and -uses only the compiled op interpreter. - -Correctness checks: - -```text -./test/test_format_plan.sh -HTS_VCF_FORMAT_PLAN=interp ./test/test_view -b -l 0 test/format-plan-edge.vcf -/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF -``` - -All planned outputs were compared against baseline with `cmp` and matched -byte-for-byte. 
The CCDG 10k VCF-input cases had 10,000 attempts, 10,000 hits, -0 fallback, and 32,020,000 parsed samples for both exact and interpreter modes. - -Single-pass 10k CCDG conversion matrix, real seconds: - -| Conversion | Baseline | Exact kernels | Compiled interp | Exact vs baseline | Interp vs baseline | -|---|---:|---:|---:|---:|---:| -| VCF.gz -> BCF.gz | 9.11 | 7.97 | 8.79 | 12.5% faster | 3.5% faster | -| BCF -> BCF.gz | 7.03 | 7.06 | 7.03 | neutral | neutral | -| BCF -> VCF.gz | 11.20 | 11.32 | 11.21 | neutral | neutral | -| VCF.gz -> VCF.gz | 13.18 | 12.01 | 12.92 | 8.9% faster | 2.0% faster | -| VCF.gz -> uncompressed BCF | 2.79 | 1.64 | 2.61 | 41.2% faster | 6.5% faster | - -Parse-heavy uncompressed reference: - -| Conversion | Baseline | Exact kernels | Compiled interp | Exact vs baseline | Interp vs baseline | -|---|---:|---:|---:|---:|---:| -| VCF -> uncompressed BCF | 2.56 | 1.36 | 2.33 | 46.9% faster | 9.0% faster | - -The compiled interpreter is useful for validating the architecture, but it is -not yet where the performance is. Its per-sample dynamic dispatch, generic -width pass, generic vector loops, and indirect per-op buffer handling leave it -much closer to the baseline parser than to the exact CCDG kernels. This argues -for a hybrid approach: use the interpreter as a safe coverage layer and add -small specialized op handlers for the very common shapes inside it, especially -diploid GT, scalar ints, biallelic AD, biallelic PL, and fixed-width strings. - -## Follow-Up: Opcode Tape Specialization - -The compiled interpreter was then changed from "inspect each op type while -parsing" to a row-specific opcode tape. 
The FORMAT string is still cached as a -flexible op list, but after the row width pass each op is resolved to a narrower -handler: - -```text -GT2, GT-dynamic, INT1, INT2, INT3, INTN, FLOAT1, FLOATN, STR -``` - -This preserves the flexible interpreter path for arbitrary defined -String/Integer/Float FORMAT layouts, while avoiding repeated `is_gt` / type -checks and using the same fixed-width integer helpers as the exact CCDG kernel -when the observed row width permits it. - -Correctness checks remained byte-identical against baseline for: - -```text -./test/test_format_plan.sh -HTS_VCF_FORMAT_PLAN=interp ./test/test_view -b -l 0 test/format-plan-edge.vcf -/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF -``` - -Single-pass 10k CCDG conversion matrix after opcode specialization, real -seconds: - -| Conversion | Baseline | Exact kernels | Opcode interp | Exact vs baseline | Interp vs baseline | -|---|---:|---:|---:|---:|---:| -| VCF.gz -> BCF.gz | 9.19 | 7.99 | 9.28 | 13.1% faster | neutral/noisy | -| BCF -> BCF.gz | 8.04 | 8.22 | 8.10 | neutral | neutral | -| BCF -> VCF.gz | 12.71 | 12.04 | 12.99 | neutral/noisy | neutral/noisy | -| VCF.gz -> VCF.gz | 13.76 | 12.33 | 13.88 | 10.4% faster | neutral/noisy | -| VCF.gz -> uncompressed BCF | 2.87 | 1.68 | 2.43 | 41.5% faster | 15.3% faster | - -Parse-heavy uncompressed reference: - -| Conversion | Baseline | Exact kernels | Opcode interp | Exact vs baseline | Interp vs baseline | -|---|---:|---:|---:|---:|---:| -| VCF -> uncompressed BCF | 2.57 | 1.42 | 2.12 | 44.7% faster | 17.5% faster | - -Relative to the first compiled interpreter measurement, opcode specialization -improved the parse-heavy uncompressed case from 2.33 s to 2.12 s and VCF.gz to -uncompressed BCF from 2.61 s to 2.43 s. That is real movement, but the exact -kernels remain substantially faster because they also avoid the generic width -measurement, per-op buffer indirection, and per-sample opcode switch. 
- -## Follow-Up: Strict Width and Shape Executors - -The next iteration hardened correctness and tested more aggressive FORMAT -planning: - -- planned integer parsing now detects BCF int32 payload overflow and falls back, - avoiding undefined overflow and preserving generic warning/missing behavior; -- exact AD/PL paths validate that the observed max vector width matches the - emitted width, falling back for sparse rows that generic htslib would encode - narrower; -- the general interpreter can skip the observed-width pass for strict - header/allele-count-derived numeric rows; -- common numeric opcode tapes `GT2:INT2:INT1:INT1:INT3` and - `GT2:FLOAT1:INT2:INT1:INT1:INT3` use shape-level executors; -- validated `GT2` rows emit BCF int8 directly instead of calling - `bcf_enc_vint()`. - -`test/format-plan-edge.vcf` now includes an all-missing AD/PL row to verify that -the exact path falls back when its expected vector width would not match generic -observed-width BCF output. - -Correctness checks remained byte-identical: - -```text -./test/test_format_plan.sh -/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF, exact mode -/tmp/ccdg_chr22_10k.vcf -> uncompressed BCF, interpreter mode -``` - -Parse-heavy 10k CCDG reference after these changes: - -| Conversion | Baseline | Exact kernels | Strict/shape interp | -|---|---:|---:|---:| -| VCF -> uncompressed BCF | 2.63 s | 1.61 s | 2.31 s | - -Full 10k CCDG compressed matrix, real seconds: - -| Conversion | Baseline | Exact kernels | Strict/shape interp | -|---|---:|---:|---:| -| VCF.gz -> BCF.gz | 9.26 | 8.22 | 8.94 | -| BCF -> BCF.gz | 7.18 | 7.20 | 7.16 | -| BCF -> VCF.gz | 11.45 | 11.33 | 11.85 | -| VCF.gz -> VCF.gz | 14.37 | 13.55 | 13.52 | -| VCF.gz -> uncompressed BCF | 2.94 | 1.92 | 2.66 | - -On a 3k CCDG subset containing only non-phase FORMAT layouts, the strict/shape -interpreter improved over baseline but still did not approach the exact kernel: - -| Dataset | Baseline | Exact kernels | Strict/shape interp | 
-|---|---:|---:|---:| -| 3k non-phase VCF -> uncompressed BCF | 0.68 s | 0.34 s | 0.58 s | - -The takeaway is mixed. The hardening is worth keeping, and direct `GT2` -encoding is simple and safe. However, shape-level dispatch alone does not close -the remaining gap. The next high-ROI parser-side experiment should reduce -memory traffic by parsing validated fixed-width fields directly into final BCF -payload buffers, or specialize complete row executors that combine parse, -validation, and encode rather than only replacing the opcode switch. - -## Follow-Up: Direct Payload Sinks - -The next pass tested direct final-buffer output for fields whose BCF -representation is known before parsing: - -- exact `GT2` writes directly to a final `INT8` payload instead of scratch - `int32_t` values plus `bcf_enc_vint()`; -- exact `AB` writes directly to a final float payload; -- strict shape executors direct-write `GT2` and optional leading `FLOAT1` - payloads, with rollback on fallback; -- exact AD/DP/GQ/PL also carry integer range metadata into a known-range encoder - to avoid the range pass in `bcf_enc_vint()`. - -Correctness remained byte-identical for the edge fixture and 10k CCDG exact and -interpreter modes. - -Parse-heavy 10k CCDG reference: - -| Conversion | Baseline | Exact kernels | Direct-sink interp | -|---|---:|---:|---:| -| VCF -> uncompressed BCF | 2.51-2.68 s | 1.57-1.58 s | 2.29-2.39 s | - -Full 10k CCDG compressed matrix, real seconds: - -| Conversion | Baseline | Exact kernels | Direct-sink interp | -|---|---:|---:|---:| -| VCF.gz -> BCF.gz | 9.51 | 8.53 | 9.28 | -| BCF -> BCF.gz | 7.46 | 7.46 | 7.46 | -| BCF -> VCF.gz | 11.95 | 12.00 | 12.02 | -| VCF.gz -> VCF.gz | 14.16 | 12.95 | 13.62 | -| VCF.gz -> uncompressed BCF | 2.95 | 1.92 | 2.67 | - -The direct sinks are safe but small on this dataset. 
The known-range encoder -was also byte-identical but did not produce a clear timing win, suggesting that -range tracking during parse still mostly trades one cost for another. Broader -direct integer output likely needs either a cheap type-prediction/rollback -strategy or complete fused row executors that avoid both scratch traffic and -post-parse encoding for multiple fields at once. - -## Follow-Up: Optimistic Guards - -The fast paths now have a small circuit breaker in the cached plan state. This -is tuned for the practical expectation that files are piecewise fixed-format, -with occasional weird rows rather than uniformly weird records. - -The fast parser still validates as it parses and immediately rolls back on any -mismatch. The new guard only decides whether to keep trying that fast parser on -later records: - -- a success resets the consecutive-miss streak; -- isolated weird rows fall back once and do not disable the fast path; -- eight consecutive misses pause the fast path; -- after 128 attempts, more than 10% fallbacks also pauses it; -- paused paths cool down for 256 skipped records, then re-probe so later - fixed-format regions can recover. - -The clean CCDG path is unchanged: on the 10k subset, exact mode still reports -`10000 hits / 0 fallbacks`. The edge fixture remains byte-identical and keeps -the expected mixed behavior: - -```text -./test/test_format_plan.sh -vcf-format-plan attempts=14 hits=11 fallback=3 parsed_samples=33 -vcf-format-plan attempts=14 hits=14 fallback=0 parsed_samples=42 -``` - -The full compressed matrix was not re-recorded for this guard-only change -because the machine was under unrelated CPU load during the interrupted run. -The parse-heavy 10k BCF outputs were re-compared byte-for-byte for exact and -interpreter modes. - -## Follow-Up: Generic Strict Numeric Executor +This is statistical sampling, not exact cycle accounting. 
Directionally, the +next parser-side targets are integer-vector parsing, `PGT/PID` string handling, +per-sample dispatch, and repeated BCF integer encoding work. -The next iteration removed the two hard-coded shape executors and replaced them -with a generic strict fixed-numeric executor. This is the dynamic-exact version -of the FORMAT planner: +## Checkpoint Recommendation -- the executor is keyed by resolved row op kinds and widths, not FORMAT field - names; -- any fixed-width numeric op sequence is eligible; -- leading `GT2` and scalar float fields are written directly into the final BCF - `indiv` buffer; -- integer fields carry min/max range metadata from parse into encode so - `bcf_enc_vint()` does not rescan scratch arrays; -- any mismatch rolls back direct writes and falls back to the measured-width - general planner or legacy parser. +Commit this state as an honest experimental checkpoint: -Correctness checks: +- keep the exact CCDG kernels because they establish the upper-bound target; +- keep the dynamic general planner and edge fixture because they are the path to + a general solution; +- keep benchmark docs explicit that dynamic-only is not yet the big win; +- do not open an upstream-facing PR until the dynamic executor closes more of + the gap or the PR is framed as an experimental CCDG-specialized prototype. -```text -make -j4 test/test_view -./test/test_format_plan.sh -cmp baseline/exact/interp BCF outputs for /tmp/ccdg_chr22_10k.vcf.gz -cmp baseline/exact/interp compressed BCF outputs for /tmp/ccdg_chr22_10k.vcf.gz -``` - -The mixed edge fixture remains byte-identical. 
It now includes reordered -numeric FORMAT fields, a scalar float away from the first FORMAT positions, -non-CCDG fixed-width numeric tags, and integer values that cross BCF int8/int16 -encoding thresholds: - -```text -vcf-format-plan attempts=14 hits=11 fallback=3 parsed_samples=33 -vcf-format-plan attempts=14 hits=14 fallback=0 parsed_samples=42 -``` - -Parse-heavy 10k CCDG, VCF.gz to uncompressed BCF, real seconds: - -| Mode | Run 1 | Run 2 | Run 3 | -|---|---:|---:|---:| -| Baseline | 2.86 | 2.87 | 2.85 | -| Exact kernels | 1.85 | 1.85 | 1.86 | -| Dynamic strict/interp | 1.87 | 1.88 | 1.88 | - -After removing the old shape-specific templates, a cleanup check still showed -exact and dynamic strict essentially tied: - -| Mode | Real seconds | -|---|---:| -| Exact kernels | 1.89 | -| Dynamic strict/interp | 1.87 | - -Single-run compressed VCF.gz to compressed BCF.gz, real seconds: - -| Mode | Real seconds | -|---|---:| -| Baseline | 10.08 | -| Exact kernels | 9.01 | -| Dynamic strict/interp | 8.58 | - -The compressed result should be read as directional because compression noise is -larger, but the outputs were byte-identical and the main parse-heavy result is -the key signal: the dynamic strict path is now within measurement noise of the -hand-written exact CCDG kernel without matching on CCDG field names. - -## Follow-Up: Subagent Review Cleanup - -Three review passes suggested tightening the generic executor rather than adding -more field-name cases. The resulting cleanup: - -- rolls back `v->indiv` on hard errors after direct writes, not only on shape - fallback; -- skips range updates for padding vector-end sentinels while preserving the - `bcf_enc_vint()` min/max contract; -- precomputes per-op base pointers and strides before the sample loop; -- removes the stale non-range encoder variant; -- expands the edge fixture with reordered and non-CCDG numeric FORMAT rows. 
- -Post-cleanup parse-heavy 10k CCDG, VCF.gz to uncompressed BCF, real seconds: - -| Mode | Run 1 | Run 2 | Run 3 | -|---|---:|---:|---:| -| Baseline | 2.83 | 2.84 | 2.86 | -| Exact kernels | 1.85 | 1.89 | 1.86 | -| Dynamic strict/interp | 1.86 | 1.83 | 1.83 | - -## Follow-Up: Broader Operation Checks - -After the dynamic strict executor cleanup, the 10k CCDG conversion matrix was -rerun with `test/test_view`. Outputs were compared byte-for-byte against the -baseline output for every exact/interp cell. - -Single-run format conversion matrix, real seconds: - -| Conversion | Baseline | Exact kernels | Dynamic strict/interp | -|---|---:|---:|---:| -| VCF.gz -> BCF.gz | 8.73 | 7.78 | 8.58 | -| BCF -> BCF.gz | 6.85 | 6.92 | 7.02 | -| BCF -> VCF.gz | 11.18 | 11.22 | 11.15 | -| VCF.gz -> VCF.gz | 13.26 | 12.34 | 13.01 | -| VCF.gz -> uncompressed BCF | 2.83 | 1.85 | 2.58 | - -The `VCF.gz -> uncompressed BCF` interp cell above was a noisy outlier; a direct -focused rerun of that same parse-heavy case reproduced exact-speed dynamic -strict behavior: - -| Mode | Run 1 | Run 2 | Run 3 | -|---|---:|---:|---:| -| Exact kernels | 1.84 | 1.82 | 1.85 | -| Dynamic strict/interp | 1.84 | 1.84 | 1.85 | - -Read-only scan with `test_view -B` isolates input decode/parse without output -formatting or compression: - -| Input | Mode | Run 1 | Run 2 | Run 3 | -|---|---|---:|---:|---:| -| VCF.gz | Baseline | 2.59 | 2.61 | 2.58 | -| VCF.gz | Exact kernels | 1.62 | 1.62 | 1.65 | -| VCF.gz | Dynamic strict/interp | 1.62 | 1.63 | 1.62 | -| BCF | Baseline | 0.62 | 0.61 | 0.61 | -| BCF | Exact kernels | 0.61 | 0.61 | 0.63 | -| BCF | Dynamic strict/interp | 0.61 | 0.62 | 0.61 | - -Threaded compressed output with `test_view -@ 4` makes the parser win visible -again even for compressed-to-compressed workflows: - -| Conversion | Baseline | Exact kernels | Dynamic strict/interp | -|---|---:|---:|---:| -| VCF.gz -> BCF.gz, `-@ 4` | 2.64 | 2.03 | 2.06 | -| VCF.gz -> VCF.gz, `-@ 4` | 3.96 | 3.03 | 3.02 | 
+## Next Work -BCF-input conversions and BCF read-only scans remain unchanged, as expected, -because the optimization only affects VCF FORMAT parsing. +The highest-value next step is to make a dynamic fixed-shape executor that +captures the exact-kernel benefits without matching on CCDG field names. The +target is exact-like speed for piecewise fixed FORMAT regions with quick +fallback when a row leaves the proven shape. An attempted bcftools rebuild against this htslib worktree failed at link time because the sibling bcftools checkout expects `bcf_write_take_ownership`, which is not present in this htslib worktree. Operation-level bcftools timings should -therefore be rerun only after pairing this htslib branch with a matching -bcftools revision or porting that API. - -## Findings - -The planned FORMAT parser is viable. With all four dominant CCDG layouts covered, -parse-heavy VCF to uncompressed BCF conversion improves by about 30-40% on the -100k subset. - -For fully compressed-to-compressed conversion, output/input compression and VCF -formatting absorb much of the parser win. The MVP still improved VCF-input -conversions by about 5-10%, while BCF-input conversions were unchanged as -expected. When output compression is removed, VCF.gz to uncompressed BCF improves -by about 32%, much closer to the parse-only gain. - -The practical takeaway is that FORMAT planning is a better optimization target -than top-level VCF delimiter SIMD scanning. The earlier delimiter-only probe had -100% record coverage but was essentially neutral, while FORMAT planning moved -the parse-heavy workload substantially. - -The next highest-value extension is not more FORMAT layout coverage for this -CCDG benchmark, because coverage is already 100%. It is reducing the cost inside -the planned path and then possibly pipelining decompression/parse/encode once -the single-threaded parser work has been squeezed further. 
After the fixed-width -AD/PL parser, `bcf_enc_vint` and input decompression remain the most obvious -next bottlenecks. +be rerun only after pairing this htslib branch with a matching bcftools revision +or porting that API. diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 716e6bab1..47c0509e1 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -1,139 +1,121 @@ # FORMAT Plan Parser Spec -This document describes the intended direction for the experimental -`HTS_VCF_FORMAT_PLAN=1` VCF FORMAT parser. +This document describes the current experimental `HTS_VCF_FORMAT_PLAN` VCF +FORMAT parser and the direction for making it more general. ## Goal -Keep the existing parser as the source of truth, but add a runtime-compiled -fast path for common FORMAT layouts. The fast path should be opportunistic: -compile a plan for repeated FORMAT strings, execute known-safe operations -directly, and fall back to the generic parser whenever the record leaves the -supported subset. +Keep the existing htslib FORMAT parser as the source of truth, but add +opportunistic fast paths for repeated FORMAT layouts. A fast path may only claim +a record when it can produce byte-identical BCF. Otherwise it must return `-3` +and let the existing parser handle the row. -## Architecture +## Current Architecture -The parser is tiered: +`HTS_VCF_FORMAT_PLAN=1` enables a tiered parser: -1. Exact kernels for dominant production layouts. The current CCDG kernels cover +1. Handwritten exact kernels for the four dominant CCDG FORMAT layouts: `GT:AB:AD:DP:GQ:PL`, `GT:AD:DP:GQ:PL`, `GT:AB:AD:DP:GQ:PGT:PID:PL`, and `GT:AD:DP:GQ:PGT:PID:PL`. -2. A compiled op-list interpreter for regular FORMAT layouts. It caches the - FORMAT string, resolves header IDs once, then executes per-field operations - for GT, integer vectors, float vectors, and strings. -3. 
Generic htslib parsing for everything else, including sample subsetting, - duplicate FORMAT tags, undefined tags that require dummy header insertion, - unsupported header types, malformed values, or future VCF constructs. - -The cache key is the literal FORMAT column. Record-specific widths are still -computed per row because BCF stores each FORMAT field as a rectangular -sample-by-value array, and the width depends on observed ploidy, vector length, -string length, and allele count. +2. A dynamic general FORMAT planner keyed by the literal FORMAT column and + header pointer. It resolves field IDs/types once and executes row-specific + operations for GT, integer vectors, float vectors, and strings. +3. The existing generic htslib FORMAT parser for unsupported or suspicious + rows. -## Correctness Rules +`HTS_VCF_FORMAT_PLAN=interp` or `HTS_VCF_FORMAT_PLAN=general` skips the exact +CCDG kernels and runs only the dynamic general planner. This mode is useful for +isolating how much performance the general approach has captured. + +## Measured State + +On the 10k CCDG subset, the exact tier is currently the large win. A clean +sanity rerun on 2026-04-29 showed: + +| Mode | VCF.gz read-only | VCF.gz -> uncompressed BCF | +|---|---:|---:| +| Baseline | 2.58 s | 2.83 s | +| `HTS_VCF_FORMAT_PLAN=1` | 1.61 s | 1.86 s | +| `HTS_VCF_FORMAT_PLAN=interp` | 2.34 s | 2.55 s | + +The earlier docs overstated the dynamic strict/interpreter result. The dynamic +planner is correct and modestly faster than baseline, but it does not yet match +the handwritten CCDG kernels. + +The next development target is to move the exact-kernel advantages into a +dynamic shape executor so common fixed-format regions can get exact-like speed +without field-name-specific kernels. -The planned parser must produce byte-identical BCF to the generic parser for any -record it claims. If it cannot prove that, it must return `-3` so the existing -parser handles the record. 
+## Correctness Rules -Required invariants: +The planned parser must preserve these invariants: - No planned parsing while `h->keep_samples` is active. - Header IDs and types are resolved before execution. -- Duplicate tags use the generic parser. +- Duplicate FORMAT tags use the generic parser. - Undefined tags use the generic parser, preserving current dummy-header behavior and warnings. - GT encoding must match generic htslib phasing semantics, including haploid - genotypes and VCF 4.4 prefix phasing. -- Numeric vectors use observed row width and pad shorter samples with vector-end - sentinels. + genotypes, missing alleles, multidigit allele indexes, and VCF 4.4 prefix + phasing. +- Numeric vectors use observed or provably fixed row width and pad shorter + samples with vector-end sentinels. - Strings use observed maximum byte length and zero-pad shorter samples. -- Integer and float overflow/error behavior should either match generic htslib - or force fallback. - -## Current MVP - -The current implementation keeps the CCDG exact kernels as the first tier and -adds a general compiled op-list tier for defined FORMAT fields with type -`String`, `Integer`, or `Float`. The op-list tier handles: +- Integer and float overflow/error behavior must either match generic htslib or + force fallback. +- Any fast path that writes directly into `v->indiv` must save the original + length and roll back before fallback. -- arbitrary field order, -- haploid, diploid, multidigit, missing, and phased GT values, -- integer and float vectors with row-local observed widths, -- string fields with row-local observed widths, -- multiallelic `Number=R` and `Number=G` rows by using observed vector width. +## Dynamic Planner -The MVP intentionally falls back for sample subsetting, duplicate tags, -undefined tags, unsupported header types, and malformed values. 
- -After the row width pass, the interpreter resolves each cached FORMAT op to a -row-specific opcode such as `GT2`, `GT`, `INT1`, `INT2`, `INT3`, `INTN`, -`FLOAT1`, `FLOATN`, or `STR`. This keeps layout coverage flexible while -memoizing the common "muscle memory" for repeated shapes. +The general planner compiles the literal FORMAT string into a cached op list. +After seeing a record, it resolves the ops to row-local opcodes such as `GT2`, +`GT`, `INT1`, `INT2`, `INT3`, `INTN`, `FLOAT1`, `FLOATN`, and `STR`. For rows whose widths can be predicted from the header and allele count, the -interpreter can try a strict path before the observed-width pass. The strict -path validates the observed maximum width while parsing and falls back to the -observed-width interpreter if the row is sparse, malformed, string-bearing, or -otherwise not byte-identical. - -Strict numeric rows now use a generic fixed-schema executor rather than -FORMAT-name special cases. It accepts any fixed-width numeric opcode sequence, -direct-writes a leading `GT2`/`FLOAT1` prefix into `v->indiv`, parses remaining -integer/float fields into row-local scratch, carries integer min/max metadata -from parse to encode, and rolls back direct writes on the first mismatch. This -gives CCDG-like rows exact-kernel performance while keeping the executor keyed -by dynamic row shape rather than field names. - -Planned integer parsing must be overflow-safe. If a value is outside the BCF -int32 payload range, the planned parser falls back so the generic parser keeps -its warning and missing-value behavior. - -Validated `GT2` payloads and leading scalar float payloads can be written -directly into `v->indiv` instead of going through scratch arrays. Any direct -writer must save the entry length and roll back before returning fallback. +planner first tries a strict numeric executor. 
That path validates shape while +parsing, carries integer min/max metadata into BCF integer encoding, and can +direct-write a leading `GT2`/`FLOAT1` prefix. If the row is sparse, stringy, +malformed, or otherwise not byte-identical, it falls back to the measured-width +general planner. -## Guard Policy +Today, the strict/general path still has enough overhead that it trails the +handwritten CCDG kernels on the CCDG benchmark. Likely remaining gaps include +per-field dispatch, measured-width/string handling for `PGT/PID`, scratch-buffer +traffic, and generic encode costs. -Planned FORMAT parsing is optimistic and self-validating: parsing each field is -also the shape check. When a guard fails, the parser rolls back any direct -`v->indiv` writes and falls back to the more general parser. +## Guard Policy -Each cached exact/general FORMAT plan keeps small runtime guard state: +Each cached exact/general plan has a small runtime guard: - attempts, hits, fallbacks, - consecutive miss streak, -- a temporary cooldown flag. +- temporary cooldown. -The guard is tuned for piecewise fixed-format VCFs with infrequent weird rows. -An isolated fallback does not disable the fast path; the next success resets the -miss streak. A fast path is paused only after eight consecutive misses, or -after at least 128 attempts with more than 10% fallbacks. Paused plans are not -blacklisted forever: after 256 skipped records, the plan probes the fast path -again so later fixed-format regions can recover the optimized path. +An isolated fallback does not disable the fast path. A plan is paused only +after eight consecutive misses, or after at least 128 attempts with more than +10% fallbacks. After 256 skipped records, the plan probes again so later +fixed-format regions can recover the optimized path. -For exact CCDG kernels, a paused exact guard routes the row to the compiled +For exact CCDG kernels, a paused exact guard routes the row to the dynamic general planner. 
For general plans, a paused strict guard skips directly to the -measured-width general planner, and a paused general guard returns to legacy -htslib parsing. +measured-width planner, and a paused general guard returns to legacy htslib +parsing. ## Edge Fixture -`test/format-plan-edge.vcf` is CCDG-shaped but includes records that exercise -common awkward cases: +`test/format-plan-edge.vcf` is CCDG-shaped but includes awkward realistic rows: - the exact CCDG layouts, -- reordered fields, -- reordered numeric fields with `GT` and scalar floats away from the first - FORMAT position, +- reordered FORMAT fields, - non-CCDG numeric tag names with fixed widths, -- integer values around BCF int8/int16 type boundaries, +- integer values around BCF int8/int16 boundaries, - multiallelic AD/PL and GL, - haploid GT, - multidigit allele indexes, - fixed integer vectors, - string FORMAT fields, -- exact-kernel fallbacks such as haploid GT and multidigit allele indexes. +- exact-kernel fallbacks that the dynamic planner can still handle. Run: @@ -141,21 +123,17 @@ Run: ./test/test_format_plan.sh ``` -The script writes BCF through the generic parser and through -`HTS_VCF_FORMAT_PLAN=1`, compares them with `cmp`, and prints plan hit/fallback -statistics. `HTS_VCF_FORMAT_PLAN=interp` or `HTS_VCF_FORMAT_PLAN=general` -skips the exact kernels and runs the compiled op-list interpreter directly, -which is useful for isolating interpreter performance. +The script writes BCF through the generic parser, `HTS_VCF_FORMAT_PLAN=1`, and +`HTS_VCF_FORMAT_PLAN=interp`, then compares the outputs with `cmp`. ## Next Work -- Add more exact kernels only after coverage data shows that they dominate real - inputs. -- Add plan- or shape-level executors for dominant opcode sequences so hot rows - can also avoid the per-sample opcode switch. -- Extend direct final-buffer output only where BCF type selection is - byte-identical, or where the direct writer can cheaply roll back. 
-- Add overflow-compatible numeric parsing or force fallback before committing to - the plan on extreme integer/float values. -- Integrate the edge fixture into the standard htslib test runner once the - experimental flag graduates beyond local benchmarking. +- Make a dynamic fixed-shape executor that captures the CCDG exact-kernel wins + without matching on field names. +- Specialize common string-bearing shapes such as `PGT/PID` without baking in + CCDG tag names. +- Reduce per-sample opcode dispatch in hot FORMAT shapes. +- Expand direct final-buffer output only where BCF type selection remains + byte-identical or can cheaply roll back. +- Keep the exact kernels as a performance oracle while iterating, then remove + or demote them once the dynamic executor catches up. From ba09a368d31984b68947dd383c2497b6003f0824 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 14:46:04 +0200 Subject: [PATCH 14/38] Add dynamic FORMAT likelihood shape executor --- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 218 ++++++++ test/test_view.c | 11 + vcf.c | 468 +++++++++++++++++- 3 files changed, 688 insertions(+), 9 deletions(-) create mode 100644 docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md new file mode 100644 index 000000000..21c9d88e9 --- /dev/null +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -0,0 +1,218 @@ +# Dynamic FORMAT Shape Executor Scratchpad + +Date: 2026-04-29 + +Branch: `codex/vcf-avx-sanity` + +## Goal + +Make the general-purpose VCF FORMAT planned parser approach the handwritten +exact CCDG kernel speed without matching on field names. The planner should stay +general, but the hot executor should become shape-specialized once a repeated +FORMAT layout proves stable. + +The production htslib parser remains the source of truth. Any optimized path +must either emit byte-identical BCF or return `-3` and fall back. 
+ +## Current Baseline + +Known modes: + +```sh +HTS_VCF_FORMAT_PLAN=0 # existing generic parser +HTS_VCF_FORMAT_PLAN=1 # exact CCDG kernels, then dynamic fallback +HTS_VCF_FORMAT_PLAN=interp # dynamic planner only +HTS_VCF_FORMAT_PLAN_STATS=1 # counters from test/test_view +``` + +Current 10k CCDG sanity timing: + +| Mode | VCF.gz read-only | VCF.gz -> uncompressed BCF | +|---|---:|---:| +| Baseline | 2.58 s | 2.83 s | +| Exact + dynamic fallback | 1.61 s | 1.86 s | +| Dynamic general only | 2.34 s | 2.55 s | + +The performance target is the exact CCDG tier. The first milestone is not to +delete exact kernels, but to make a dynamic shape executor selected without tag +name special cases reach the same neighborhood. + +## Working Hypothesis + +The exact kernels are faster because they do less work in the sample loop: + +- no per-field switch dispatch for every sample, +- fewer scratch-buffer passes, +- direct writes into final BCF payloads for cheap fixed fields, +- integer min/max tracking is carried directly into BCF integer width selection, +- CCDG `PGT/PID` string handling is tailored instead of fully generic, +- fallback checks are simple and close to the parse step. + +The dynamic planner should keep general discovery, but execute as a compact +fixed-shape kernel after resolving header metadata and row-local widths. + +## Design Direction + +Compile the literal FORMAT column plus header metadata into a plan as today, then +derive a shape descriptor from the row and header: + +```text +GT2, FLOAT1, INT_R, INT1, INT1, STR1, STR1, INT_G +``` + +This shape says what to parse, not which tag names are present. Field IDs, +header types, and BCF keys still come from the generic plan. 
+ +The executor should be monomorphic for common shapes: + +- `GT2 + fixed numeric fields` +- `GT2 + FLOAT1 + fixed numeric fields` +- `GT2 + fixed numeric fields + fixed strings` +- `GT dynamic + fixed numeric fields` +- measured-width fallback for strings or sparse/non-fixed rows + +The important constraint is to move per-field dispatch out of the per-sample hot +loop wherever possible. + +## Correctness Rules + +- Do not run planned parsing when `h->keep_samples` is active. +- Fall back on duplicate FORMAT tags, undefined tags, unsupported header types, + malformed rows, unsupported GT shape, or integer/float behavior that cannot be + made byte-identical. +- Preserve htslib GT semantics: haploid GT, missing alleles, multidigit allele + indexes, phased/unphased state, and VCF 4.4 prefix phasing. +- Preserve vector-end padding and string zero-padding. +- Save and roll back `v->indiv.l` before any direct final-buffer write that may + fall back. +- Keep exact kernels available as an oracle until dynamic shape execution closes + the gap. + +## Implementation Plan + +1. Add shape classification to the dynamic general plan path. + - Use existing `vcf_format_row_op_t` data where possible. + - Recognize fixed-width rows derived from `Number=1`, `Number=R`, + `Number=G`, and fixed `Number=N`. + - Reject rows needing measured widths unless handled by a specific executor. + +2. Add a first generic fixed-shape executor for CCDG-equivalent structures. + - No tag-name matching. + - Require leading `GT2`. + - Support any mix/order of fixed INT/REAL fields with widths 1, R, G, or + small fixed N. + - Initially support `Number=1` strings with measured max width so `PGT/PID` + can stay on the planned path. + +3. Reduce hot-loop dispatch. + - Precompute field offsets, widths, sizes, and parse actions. + - Prefer executor-family loops over `switch (op->kind)` per field per sample. + - Specialize common width parse helpers for 1, 2, 3, and small fixed widths. + +4. 
Direct-write final BCF output when safe. + - Continue direct `GT2` int8 output. + - Track integer ranges while parsing and use known-range encoding. + - For floats, serialize directly when field width is fixed. + - For strings, write final char payload after width is known. + +5. Instrument fallback reasons and executor choices. + - Add temporary counters or debug logging gated by env vars if useful. + - Track shape hits, shape fallbacks, strict hits, measured fallback hits. + +6. Benchmark and iterate. + - Correctness: `./test/test_format_plan.sh` + - CCDG subset: `/tmp/ccdg_chr22_10k.vcf.gz` + - Compare baseline, exact, and dynamic-only output with `cmp`. + - Primary target: dynamic-only `HTS_VCF_FORMAT_PLAN=interp` approaching + exact-mode time on VCF.gz -> uncompressed BCF. + +## Test Data + +Full source: + +```text +/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +``` + +Benchmark subset: + +```text +/tmp/ccdg_chr22_10k.vcf.gz +/tmp/ccdg_chr22_10k.bcf +``` + +Correctness fixture: + +```text +test/format-plan-edge.vcf +``` + +## Current Scratch Notes + +- `HTS_VCF_FORMAT_PLAN=interp` is the key mode for dynamic executor progress. +- Exact kernels should remain until dynamic-only is close enough to make them + redundant. +- Avoid hardcoding `AD`, `PL`, `DP`, `GQ`, `AB`, `PGT`, or `PID`; use their + header-derived type/number/width instead. +- CCDG-like FORMAT distributions are still the first target because they provide + a real, repeatable workload and a clear oracle. + +## 2026-04-29 Iteration Notes + +Implemented the first dynamic likelihood-shape executor in `vcf.c`. + +What changed: + +- Added an optional `HTS_VCF_FORMAT_PLAN_SHAPE_STATS` counter path in + `test/test_view`. +- Relaxed strict string handling so `Type=String,Number=1` FORMAT fields can be + handled by planned parsing with row-local byte-width measurement. 
+- Added a shape-specific width derivation for CCDG-like layouts where `AD` may + be declared as `Number=.` in the header but the row shape proves the observed + width is `n_allele`. +- Added a straight-line dynamic executor for: + +```text +GT2, optional FLOAT1, INT[n_allele], INT1, INT1, +optional STR1, optional STR1, INT[n_allele * (n_allele + 1) / 2] +``` + +This executor is selected by FORMAT type/order/width, not by tag names. It +still validates observed AD/PL counts and falls back on mismatch. + +Latest 10k CCDG VCF.gz -> uncompressed BCF single-run timings on the rebuilt +worktree: + +| Mode | Wall | User | Notes | +|---|---:|---:|---| +| Baseline | 2.78 s | 2.56 s | `HTS_VCF_FORMAT_PLAN=0` | +| Exact CCDG | 1.78 s | 1.61 s | exact kernels, shape hits 0 | +| Dynamic shape | 2.53 s | 1.71 s | `interp`, shape hits 10,000 | + +`cmp` passed for both dynamic-shape and exact outputs against baseline BCF. + +The important result is CPU parity is close: dynamic shape is within about 6% of +exact on user time in this run. Wall time is noisier, likely output/cache +effects, and should be rerun in a tighter benchmark loop. + +Next likely cuts: + +- Cache shape classification per `(header, FORMAT)` plan so we do less + per-record type/order checking. +- Split phase and non-phase shape executors to remove `has_phase` branches from + the sample loop. +- Consider separate `has_float` executor variants for the same reason. +- Compare a no-shape-stats build/run to estimate counter overhead, though it is + probably minor. +- Once dynamic shape is consistently at parity, demote the exact CCDG kernels to + oracle-only or remove them. + +## Open Questions + +- How much of the gap is parse-loop dispatch versus generic encode cost? +- Can string width measurement be cached per shape region, or does row-local + width variation force a cheap scan every time? 
+- Is it better to build several executor families by op sequence, or one generic + fixed-shape executor with parse-function pointers? +- Do temporary fallback reason counters pay for themselves during iteration, or + should they stay under an explicit debug environment variable? diff --git a/test/test_view.c b/test/test_view.c index b24ba9b46..d7c221128 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -42,6 +42,8 @@ extern void hts_vcf_simd_probe_stats(uint64_t *attempts, uint64_t *hits, extern void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, uint64_t *fallback, uint64_t *parsed_samples); +extern void hts_vcf_format_plan_shape_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback); struct opts { char *fn_ref; @@ -456,6 +458,15 @@ int main(int argc, char *argv[]) (unsigned long long) parsed_samples); } + if (getenv("HTS_VCF_FORMAT_PLAN_SHAPE_STATS")) { + uint64_t attempts = 0, hits = 0, fallback = 0; + hts_vcf_format_plan_shape_stats(&attempts, &hits, &fallback); + fprintf(stderr, + "vcf-format-likelihood-shape attempts=%llu hits=%llu fallback=%llu\n", + (unsigned long long) attempts, (unsigned long long) hits, + (unsigned long long) fallback); + } + if (fclose(stdout) != 0 && errno != EBADF) { fprintf(stderr, "Error closing standard output.\n"); exit_code = EXIT_FAILURE; diff --git a/vcf.c b/vcf.c index 10154400a..2edb52c4f 100644 --- a/vcf.c +++ b/vcf.c @@ -3207,6 +3207,9 @@ typedef struct { } vcf_format_plan_stats_t; static vcf_format_plan_stats_t vcf_format_plan_stats; +static uint64_t vcf_format_likelihood_shape_attempts; +static uint64_t vcf_format_likelihood_shape_hits; +static uint64_t vcf_format_likelihood_shape_fallback; void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, uint64_t *fallback, uint64_t *parsed_samples) @@ -3217,6 +3220,14 @@ void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, if (parsed_samples) *parsed_samples = vcf_format_plan_stats.parsed_samples; } +void 
hts_vcf_format_plan_shape_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback) +{ /* Copies out the vcf_format_likelihood_shape_* counters; each output pointer is optional and skipped when NULL. */ + if (attempts) *attempts = vcf_format_likelihood_shape_attempts; + if (hits) *hits = vcf_format_likelihood_shape_hits; + if (fallback) *fallback = vcf_format_likelihood_shape_fallback; +} + static int vcf_format_plan_mode(void) { static int mode = -1; @@ -3459,9 +3470,13 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma plan->ops[plan->n_ops].vl_type = bcf_hdr_id2length(h, BCF_HL_FMT, key); if (!plan->ops[plan->n_ops].is_gt) { int vl = plan->ops[plan->n_ops].vl_type; - if (htype == BCF_HT_STR || - (vl != BCF_VL_FIXED && vl != BCF_VL_A && vl != BCF_VL_R && vl != BCF_VL_G)) + if (htype == BCF_HT_STR) { + if (plan->ops[plan->n_ops].number != 1) + plan->strict_supported = 0; + } else if (vl != BCF_VL_FIXED && vl != BCF_VL_A && + vl != BCF_VL_R && vl != BCF_VL_G) { plan->strict_supported = 0; + } } plan->n_ops++; } @@ -4303,6 +4318,7 @@ static int vcf_format_general_fixed_numeric_supported(const vcf_format_row_op_t case VCF_FORMAT_ROW_INTN: case VCF_FORMAT_ROW_FLOAT1: case VCF_FORMAT_ROW_FLOATN: + case VCF_FORMAT_ROW_STR: break; default: return 0; @@ -4311,6 +4327,180 @@ static int vcf_format_general_fixed_numeric_supported(const vcf_format_row_op_t return 1; } +/* Nonzero when op->kind is one of the integer row opcodes (INT1, INT2, INT3 or INTN). */ static inline int vcf_format_row_is_int(const vcf_format_row_op_t *op) +{ + return op->kind == VCF_FORMAT_ROW_INT1 || + op->kind == VCF_FORMAT_ROW_INT2 || + op->kind == VCF_FORMAT_ROW_INT3 || + op->kind == VCF_FORMAT_ROW_INTN; +} +/* Fills widths[j] for every op in the plan. Non-GT String ops must have number == 1 and get a byte width measured from the sample columns; every other op takes its width from vcf_format_general_expected_width(). Returns 0 on success and -4 when a width cannot be derived or the sample text does not match the plan. NOTE(review): -4 looks like a "shape not applicable" code separate from the documented -3 fallback; confirm with the callers. */ +static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, + const vcf_format_general_plan_t *plan, + bcf1_t *v, char *q, int *widths) +{ + const char *cur, *end; + int has_string = 0, sample, j, nsamples = bcf_hdr_nsamples(h); + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + + if (!op->is_gt && op->htype == BCF_HT_STR) { + if (op->number != 1) + return -4; + widths[j] = 0; /* starts at 0; raised to the longest value seen in the sample scan */ + has_string = 1; + } else {
+ widths[j] = vcf_format_general_expected_width(op, v); + if (widths[j] <= 0 || widths[j] > 64) /* only widths 1..64 are accepted */ + return -4; + } + } + + if (!has_string) /* without String ops there is nothing to measure */ + return 0; + + cur = q + 1; /* NOTE(review): assumes q points at the separator just before the first sample column; confirm with callers */ + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + const char *field = cur; + + while (cur < end && *cur && *cur != ':' && *cur != '\t') /* advance to the end of the current sub-field */ + cur++; + if (!op->is_gt && op->htype == BCF_HT_STR) { + int w = cur - field; + if (j > 0) /* NOTE(review): non-leading fields are measured one byte wider; presumably this mirrors the generic parser, confirm */ + w++; + if (w <= 0) + w = 1; + if (widths[j] < w) /* keep the largest width seen across samples */ + widths[j] = w; + } + + if (j + 1 < plan->n_ops) { /* every field except the last must be followed by ':' */ + if (*cur != ':') + return -4; + cur++; + } else { /* the last field must end at a tab, a NUL or the end of the buffer */ + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + return -4; + } + } + } + if (sample != nsamples) /* the text ran out before every sample was scanned */ + return -4; + for (j = 0; j < plan->n_ops; j++) /* String ops that were never measured get a minimum width of 1 */ + if (!plan->ops[j].is_gt && plan->ops[j].htype == BCF_HT_STR && + widths[j] <= 0) + widths[j] = 1; + + return 0; +} +/* Derives widths[] for one specific op sequence, checked by header type and number rather than by tag name: GT (width 2), an optional Float with number 1, an Integer vector of n_allele values, two Integers with number 1, optionally two Strings with number 1, and a final Integer vector of pl_w = n_allele*(n_allele+1)/2 values. String widths are measured from the sample columns. Returns 0 on success and -4 when the plan or the record does not fit this sequence. */ +static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h, + const vcf_format_general_plan_t *plan, + bcf1_t *v, char *q, int *widths) +{ + const char *cur, *end; + int ad_w, pl_w, idx, sample, j, nsamples = bcf_hdr_nsamples(h); + int str1_idx = -1, str2_idx = -1; + + if (plan->n_ops != 5 && plan->n_ops != 6 && + plan->n_ops != 7 && plan->n_ops != 8) + return -4; + if (!plan->ops[0].is_gt) + return -4; + if (v->n_allele < 1 || v->n_allele > 8) /* at most 8 alleles, so pl_w is at most 36 */ + return -4; + ad_w = v->n_allele; + pl_w = v->n_allele * (v->n_allele + 1) / 2; + if (pl_w < 1 || pl_w > 36) + return -4; + + for (j = 0; j < plan->n_ops; j++) + widths[j] = 0; + widths[0] = 2; + + idx = 1; + if (idx < plan->n_ops && plan->ops[idx].htype == BCF_HT_REAL && + plan->ops[idx].number == 1) + widths[idx++] = 1; + if (idx + 3 >= plan->n_ops) /* at least four more ops are required: vector, two scalars, final vector */ + return -4; + if (plan->ops[idx].htype != BCF_HT_INT) + return -4; + widths[idx++] = ad_w; /* only the Integer type is checked here, not the declared number */ + if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) + return -4; + widths[idx++] = 1; + if
(plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) + return -4; + widths[idx++] = 1; + if (plan->n_ops - idx == 3) { /* two String ops sit before the final vector */ + if (plan->ops[idx].htype != BCF_HT_STR || plan->ops[idx].number != 1 || + plan->ops[idx + 1].htype != BCF_HT_STR || plan->ops[idx + 1].number != 1) + return -4; + str1_idx = idx++; + str2_idx = idx++; + } else if (plan->n_ops - idx != 1) { + return -4; + } + if (plan->ops[idx].htype != BCF_HT_INT) + return -4; + widths[idx++] = pl_w; + if (idx != plan->n_ops) + return -4; + + if (str1_idx < 0) /* no String ops, so every width is already known */ + return 0; + + cur = q + 1; + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + for (j = 0; j < plan->n_ops; j++) { + const char *field = cur; + + while (cur < end && *cur && *cur != ':' && *cur != '\t') + cur++; + if (j == str1_idx || j == str2_idx) { /* only the two String ops are measured */ + int w = cur - field; + if (j > 0) + w++; + if (w <= 0) + w = 1; + if (widths[j] < w) + widths[j] = w; + } + if (j + 1 < plan->n_ops) { + if (*cur != ':') + return -4; + cur++; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + return -4; + } + } + } + if (sample != nsamples) /* the text ran out before every sample was scanned */ + return -4; + if (widths[str1_idx] <= 0) + widths[str1_idx] = 1; + if (widths[str2_idx] <= 0) + widths[str2_idx] = 1; + + return 0; +} + static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, @@ -4426,6 +4616,10 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t goto fallback; n = vcf_plan_float_vector_count((float *)buf, op->width); break; + case VCF_FORMAT_ROW_STR: + if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) + goto fallback; + break; default: goto fallback; } @@ -4469,6 +4663,252 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t return -1; } +static int vcf_parse_format_general_likelihood_shape(kstring_t *s, + const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, +
char *q, + vcf_format_row_op_t *row_ops) +{ + kstring_t *mem = (kstring_t*)&h->mem; + int nsamples = bcf_hdr_nsamples(h); + int ad_w, pl_w, sample, idx, ad_idx, dp_idx, gq_idx, pl_idx; + int has_float = 0, has_phase = 0, float_idx = -1, str1_idx = -1, str2_idx = -1; + int max_ad_count = 0, max_pl_count = 0, nwords; + vcf_plan_int_range_t ad_range, dp_range, gq_range, pl_range; + size_t indiv_l0 = v->indiv.l; + size_t gt8_off, float_le_off = 0; + size_t ad_off, dp_off, gq_off, str1_off = 0, str2_off = 0, pl_off, total_bytes; + uint8_t *gt8, *float_le = NULL; + int32_t *ad, *dp, *gq, *pl; + char *str1 = NULL, *str2 = NULL; + const char *cur, *end; + + vcf_format_likelihood_shape_attempts++; + if (plan->n_ops != 5 && plan->n_ops != 6 && + plan->n_ops != 7 && plan->n_ops != 8) + return -4; + if (row_ops[0].kind != VCF_FORMAT_ROW_GT2) + return -4; + if (v->n_allele < 1 || v->n_allele > 8) + return -4; + + ad_w = v->n_allele; + pl_w = v->n_allele * (v->n_allele + 1) / 2; + if (pl_w < 1 || pl_w > 36) + return -4; + + idx = 1; + if (idx < plan->n_ops && row_ops[idx].kind == VCF_FORMAT_ROW_FLOAT1) { + has_float = 1; + float_idx = idx++; + } + if (idx + 3 >= plan->n_ops) + return -4; + ad_idx = idx++; + dp_idx = idx++; + gq_idx = idx++; + if (plan->n_ops - idx == 3) { + if (row_ops[idx].kind != VCF_FORMAT_ROW_STR || + row_ops[idx + 1].kind != VCF_FORMAT_ROW_STR) + return -4; + has_phase = 1; + str1_idx = idx++; + str2_idx = idx++; + } else if (plan->n_ops - idx != 1) { + return -4; + } + pl_idx = idx++; + if (idx != plan->n_ops) + return -4; + + if (!vcf_format_row_is_int(&row_ops[ad_idx]) || + row_ops[ad_idx].width != ad_w || + row_ops[dp_idx].kind != VCF_FORMAT_ROW_INT1 || + row_ops[gq_idx].kind != VCF_FORMAT_ROW_INT1 || + !vcf_format_row_is_int(&row_ops[pl_idx]) || + row_ops[pl_idx].width != pl_w) + return -4; + + vcf_plan_int_range_init(&ad_range); + vcf_plan_int_range_init(&dp_range); + vcf_plan_int_range_init(&gq_range); + vcf_plan_int_range_init(&pl_range); + + 
bcf_enc_int1(&v->indiv, row_ops[0].key); + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) + goto error; + gt8_off = v->indiv.l; + v->indiv.l += (size_t)nsamples * 2; + if (has_float) { + bcf_enc_int1(&v->indiv, row_ops[float_idx].key); + if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) + goto error; + float_le_off = v->indiv.l; + v->indiv.l += (size_t)nsamples * sizeof(float); + } + gt8 = (uint8_t *)v->indiv.s + gt8_off; + if (has_float) + float_le = (uint8_t *)v->indiv.s + float_le_off; + + mem->l = 0; + if (align_mem(mem) < 0) + goto error; + total_bytes = (size_t) nsamples * (ad_w + 1 + 1 + pl_w) * sizeof(int32_t); + if (has_phase) + total_bytes += (size_t) nsamples * + (row_ops[str1_idx].width + row_ops[str2_idx].width); + if (total_bytes > INT_MAX) + goto error; + if (ks_resize(mem, mem->l + total_bytes) < 0) + goto error; + + ad_off = mem->l; mem->l += (size_t) nsamples * ad_w * sizeof(int32_t); + dp_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); + gq_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); + if (has_phase) { + str1_off = mem->l; mem->l += (size_t) nsamples * row_ops[str1_idx].width; + str2_off = mem->l; mem->l += (size_t) nsamples * row_ops[str2_idx].width; + } + pl_off = mem->l; mem->l += (size_t) nsamples * pl_w * sizeof(int32_t); + + ad = (int32_t *) (mem->s + ad_off); + dp = (int32_t *) (mem->s + dp_off); + gq = (int32_t *) (mem->s + gq_off); + if (has_phase) { + str1 = mem->s + str1_off; + str2 = mem->s + str2_off; + } + pl = (int32_t *) (mem->s + pl_off); + + cur = q + 1; + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + int nread; + + if (vcf_plan_gt2_u8(&cur, &gt8[sample * 2]) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (has_float) { + float f; + if (vcf_plan_float_value(&cur, &f) < 0) + goto 
fallback; + float_to_le(f, float_le + (size_t)sample * sizeof(float)); + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + } + if (ad_w == 2) { + if (vcf_plan_parse_int_vector2_counted_range(&cur, &ad[sample * 2], &nread, &ad_range) < 0) + goto fallback; + } else if (ad_w == 3) { + if (vcf_plan_parse_int_vector3_counted_range(&cur, &ad[sample * 3], &nread, &ad_range) < 0) + goto fallback; + } else if (vcf_plan_parse_int_vector_counted_range(&cur, &ad[sample * ad_w], ad_w, &nread, &ad_range) < 0) { + goto fallback; + } + if (max_ad_count < nread) + max_ad_count = nread; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_int_value_range(&cur, &dp[sample], &dp_range) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_int_value_range(&cur, &gq[sample], &gq_range) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (has_phase) { + if (vcf_plan_copy_string(&cur, &str1[sample * row_ops[str1_idx].width], + row_ops[str1_idx].width) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + if (vcf_plan_copy_string(&cur, &str2[sample * row_ops[str2_idx].width], + row_ops[str2_idx].width) < 0) + goto fallback; + if (vcf_plan_expect_sep(&cur, ':') < 0) + goto fallback; + } + if (pl_w == 3) { + if (vcf_plan_parse_int_vector3_counted_range(&cur, &pl[sample * 3], &nread, &pl_range) < 0) + goto fallback; + } else if (vcf_plan_parse_int_vector_counted_range(&cur, &pl[sample * pl_w], pl_w, &nread, &pl_range) < 0) { + goto fallback; + } + if (max_pl_count < nread) + max_pl_count = nread; + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + goto fallback; + } + if (sample != nsamples) + goto fallback; + if (max_ad_count != ad_w || max_pl_count != pl_w) + goto fallback; + + v->n_fmt = plan->n_ops; + v->n_sample = nsamples; + bcf_enc_int1(&v->indiv, row_ops[ad_idx].key); + nwords = nsamples * ad_w; + if 
(bcf_enc_vint_known_range(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max) < 0) + goto error; + bcf_enc_int1(&v->indiv, row_ops[dp_idx].key); + if (bcf_enc_vint_known_range(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max) < 0) + goto error; + bcf_enc_int1(&v->indiv, row_ops[gq_idx].key); + if (bcf_enc_vint_known_range(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max) < 0) + goto error; + if (has_phase) { + bcf_enc_int1(&v->indiv, row_ops[str1_idx].key); + if (bcf_enc_size(&v->indiv, row_ops[str1_idx].width, BCF_BT_CHAR) < 0 || + kputsn(str1, (size_t) nsamples * row_ops[str1_idx].width, &v->indiv) < 0) + goto error; + bcf_enc_int1(&v->indiv, row_ops[str2_idx].key); + if (bcf_enc_size(&v->indiv, row_ops[str2_idx].width, BCF_BT_CHAR) < 0 || + kputsn(str2, (size_t) nsamples * row_ops[str2_idx].width, &v->indiv) < 0) + goto error; + } + bcf_enc_int1(&v->indiv, row_ops[pl_idx].key); + nwords = nsamples * pl_w; + if (bcf_enc_vint_known_range(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max) < 0) + goto error; + + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += nsamples; + vcf_format_likelihood_shape_hits++; + return 0; + +fallback: + v->indiv.l = indiv_l0; + vcf_format_likelihood_shape_fallback++; + return -4; +error: + v->indiv.l = indiv_l0; + return -1; +} + +static int vcf_parse_format_general_likelihood_strict(kstring_t *s, + const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q) +{ + int widths[MAX_N_FMT]; + vcf_format_row_op_t row_ops[MAX_N_FMT]; + + if (vcf_format_general_likelihood_widths(s, h, plan, v, q, widths) < 0) + return -4; + vcf_format_general_resolve_ops(plan, v, widths, row_ops); + return vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops); +} + static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, @@ -4478,17 +4918,18 @@ static int vcf_parse_format_general_strict(kstring_t *s, const 
bcf_hdr_t *h, int widths[MAX_N_FMT], max_counts[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; vcf_plan_int_range_t ranges[MAX_N_FMT]; - int nsamples = bcf_hdr_nsamples(h), sample, j, vcf44; + int nsamples = bcf_hdr_nsamples(h), sample, j, vcf44, ret; const char *cur, *end; + if (vcf_format_general_strict_widths(s, h, plan, v, q, widths) < 0) + return -4; for (j = 0; j < plan->n_ops; j++) { - widths[j] = vcf_format_general_expected_width(&plan->ops[j], v); - if (widths[j] <= 0 || widths[j] > 64) - return -4; max_counts[j] = 0; vcf_plan_int_range_init(&ranges[j]); } vcf_format_general_resolve_ops(plan, v, widths, row_ops); + if ((ret = vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops)) != -4) + return ret; if (vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) return vcf_parse_format_general_fixed_numeric(s, h, v, plan, q, row_ops); @@ -4594,7 +5035,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, kstring_t *mem; int widths[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; - int nsamples, sample, j, vcf44, ret; + int nsamples, sample, j, vcf44, ret, strict_enabled; const char *cur, *end; plan = vcf_format_general_plan_get(h, p); @@ -4608,8 +5049,17 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, nsamples = bcf_hdr_nsamples(h); if (!nsamples) return 0; - if (plan->strict_supported && - vcf_format_fast_guard_enabled(&plan->strict_guard)) { + strict_enabled = vcf_format_fast_guard_enabled(&plan->strict_guard); + if (strict_enabled) { + ret = vcf_parse_format_general_likelihood_strict(s, h, v, plan, q); + if (ret == 0) { + vcf_format_fast_guard_success(&plan->strict_guard); + return ret; + } + if (ret != -4) + return ret; + } + if (plan->strict_supported && strict_enabled) { ret = vcf_parse_format_general_strict(s, h, v, plan, q); if (ret == 0) { vcf_format_fast_guard_success(&plan->strict_guard); From 61b4bd175ba33e230df60b7fa512037778c2d8e7 Mon Sep 17 00:00:00 
2001 From: Codex Date: Wed, 29 Apr 2026 14:54:34 +0200 Subject: [PATCH 15/38] Add VCF FORMAT shape benchmark corpus --- bench/format-shape/.gitignore | 7 + bench/format-shape/README.md | 97 ++++++++++++++ bench/format-shape/inputs.tsv | 11 ++ bench/format-shape/results/checks.tsv | 21 +++ bench/format-shape/results/timings.tsv | 31 +++++ bench/format-shape/scripts/make_synthetic.pl | 131 +++++++++++++++++++ bench/format-shape/scripts/run_bench.sh | 81 ++++++++++++ 7 files changed, 379 insertions(+) create mode 100644 bench/format-shape/.gitignore create mode 100644 bench/format-shape/README.md create mode 100644 bench/format-shape/inputs.tsv create mode 100644 bench/format-shape/results/checks.tsv create mode 100644 bench/format-shape/results/timings.tsv create mode 100755 bench/format-shape/scripts/make_synthetic.pl create mode 100755 bench/format-shape/scripts/run_bench.sh diff --git a/bench/format-shape/.gitignore b/bench/format-shape/.gitignore new file mode 100644 index 000000000..b7ed964ef --- /dev/null +++ b/bench/format-shape/.gitignore @@ -0,0 +1,7 @@ +public/*.vcf.gz +public/remote-indexes/*.tbi +synthetic/*.vcf.gz +results/*.bcf +results/*.stderr +!results/timings.tsv +!results/checks.tsv diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md new file mode 100644 index 000000000..937fe6a3a --- /dev/null +++ b/bench/format-shape/README.md @@ -0,0 +1,97 @@ +# VCF FORMAT Shape Benchmark Corpus + +This directory is a local benchmark corpus for the experimental VCF FORMAT +planner in `vcf.c`. It is intentionally kept under the repository worktree +instead of `/tmp` so the inputs survive restarts. 
+ +## Layout + +```text +bench/format-shape/ + inputs.tsv input manifest used by the benchmark script + public/ downloaded public VCF slices + synthetic/ generated VCFs covering targeted FORMAT shapes + scripts/make_synthetic.pl deterministic synthetic VCF generator + scripts/run_bench.sh baseline/exact/interp timing and cmp runner + results/ generated timing logs and BCF outputs +``` + +The downloaded/generated VCF inputs are intentionally ignored by git to avoid +accidentally pushing large benchmark data. The manifest, scripts, docs, and +small result summaries are tracked; the local data can be regenerated from the +commands below. + +`results/` can be regenerated at any time and may become large. The script +keeps BCF outputs locally so `cmp` checks are inspectable, but `.gitignore` +excludes those large files. + +## Public Inputs + +The public files were sliced with `tabix -h URL REGION | ./bgzip -c > file`. +They are small enough to keep in the worktree but diverse enough to catch +non-FORMAT and real-world INFO-heavy workloads. 
+ +| File | Source | Shape | +|---|---|---| +| `public/ccdg_chr22_10k.vcf.gz` | local CCDG subset | 3,202-sample CCDG likelihood FORMAT | +| `public/1000g_chr22_genotypes_16050k_16150k.vcf.gz` | 1000 Genomes Phase 3 chr22 genotypes | sample-rich `GT` FORMAT | +| `public/1000g_wgs_sites_chr22_16050k_16300k.vcf.gz` | 1000 Genomes Phase 3 WGS sites | sites-only | +| `public/clinvar_grch38_chr22_16050k_20000k.vcf.gz` | ClinVar GRCh38 VCF | sites-only clinical annotations | +| `public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz` | gnomAD v4.1 exomes chr22 | sites-only, INFO-heavy | + +Source URLs used: + +```text +https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz +https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz +https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz +https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr22.vcf.bgz +``` + +## Synthetic Inputs + +The synthetic files are generated by: + +```sh +bench/format-shape/scripts/make_synthetic.pl bench/format-shape/synthetic +for f in bench/format-shape/synthetic/*.vcf; do ./bgzip -f "$f"; done +``` + +They cover: + +- CCDG-like likelihood layouts with optional `AB` and `PGT/PID`, +- reordered likelihood fields, +- fixed numeric vectors, +- float-vector plus string FORMAT fields, +- multiallelic AD/PL likelihood rows. 
+ +## Running + +Build the tools first: + +```sh +make test/test_view tabix bgzip +``` + +Run all inputs: + +```sh +bench/format-shape/scripts/run_bench.sh +``` + +The script runs each input in three modes: + +```text +baseline: HTS_VCF_FORMAT_PLAN=0 +exact: HTS_VCF_FORMAT_PLAN=1 +interp: HTS_VCF_FORMAT_PLAN=interp +``` + +It writes: + +```text +bench/format-shape/results/timings.tsv +bench/format-shape/results/checks.tsv +``` + +`checks.tsv` compares exact and interp BCF output against baseline with `cmp`. diff --git a/bench/format-shape/inputs.tsv b/bench/format-shape/inputs.tsv new file mode 100644 index 000000000..f5ae75eb9 --- /dev/null +++ b/bench/format-shape/inputs.tsv @@ -0,0 +1,11 @@ +name path source +ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset +1000g_chr22_genotypes bench/format-shape/public/1000g_chr22_genotypes_16050k_16150k.vcf.gz 1000 Genomes Phase 3 chr22 genotypes slice +1000g_wgs_sites bench/format-shape/public/1000g_wgs_sites_chr22_16050k_16300k.vcf.gz 1000 Genomes Phase 3 WGS sites-only slice +clinvar_grch38_chr22 bench/format-shape/public/clinvar_grch38_chr22_16050k_20000k.vcf.gz ClinVar GRCh38 chr22 slice +gnomad_v4.1_exomes_sites bench/format-shape/public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz gnomAD v4.1 exomes sites chr22 slice +synthetic_ccdg_likelihood bench/format-shape/synthetic/synthetic_ccdg_likelihood.vcf.gz synthetic CCDG-like likelihood FORMAT +synthetic_reordered_likelihood bench/format-shape/synthetic/synthetic_reordered_likelihood.vcf.gz synthetic reordered likelihood FORMAT +synthetic_fixed_numeric bench/format-shape/synthetic/synthetic_fixed_numeric.vcf.gz synthetic fixed numeric FORMAT +synthetic_float_string bench/format-shape/synthetic/synthetic_float_string.vcf.gz synthetic float and string FORMAT +synthetic_multiallelic_likelihood bench/format-shape/synthetic/synthetic_multiallelic_likelihood.vcf.gz synthetic multiallelic likelihood FORMAT diff --git 
a/bench/format-shape/results/checks.tsv b/bench/format-shape/results/checks.tsv new file mode 100644 index 000000000..58c9e8f97 --- /dev/null +++ b/bench/format-shape/results/checks.tsv @@ -0,0 +1,21 @@ +name comparison status +ccdg_10k baseline_vs_exact ok +ccdg_10k baseline_vs_interp ok +1000g_chr22_genotypes baseline_vs_exact ok +1000g_chr22_genotypes baseline_vs_interp ok +1000g_wgs_sites baseline_vs_exact ok +1000g_wgs_sites baseline_vs_interp ok +clinvar_grch38_chr22 baseline_vs_exact ok +clinvar_grch38_chr22 baseline_vs_interp ok +gnomad_v4.1_exomes_sites baseline_vs_exact ok +gnomad_v4.1_exomes_sites baseline_vs_interp ok +synthetic_ccdg_likelihood baseline_vs_exact ok +synthetic_ccdg_likelihood baseline_vs_interp ok +synthetic_reordered_likelihood baseline_vs_exact ok +synthetic_reordered_likelihood baseline_vs_interp ok +synthetic_fixed_numeric baseline_vs_exact ok +synthetic_fixed_numeric baseline_vs_interp ok +synthetic_float_string baseline_vs_exact ok +synthetic_float_string baseline_vs_interp ok +synthetic_multiallelic_likelihood baseline_vs_exact ok +synthetic_multiallelic_likelihood baseline_vs_interp ok diff --git a/bench/format-shape/results/timings.tsv b/bench/format-shape/results/timings.tsv new file mode 100644 index 000000000..a854a561b --- /dev/null +++ b/bench/format-shape/results/timings.tsv @@ -0,0 +1,31 @@ +name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback +ccdg_10k baseline 2.71 2.49 0.18 0 0 0 0 0 0 0 +ccdg_10k exact 1.76 1.58 0.15 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.85 1.67 0.16 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_genotypes baseline 0.04 0.03 0 0 0 0 0 0 0 0 +1000g_chr22_genotypes exact 0.02 0.01 0 1170 1170 0 2929680 1170 0 0 +1000g_chr22_genotypes interp 0.02 0.01 0 1170 1170 0 2929680 1170 0 0 +1000g_wgs_sites baseline 0.01 0 0 0 0 0 0 0 0 0 +1000g_wgs_sites exact 0.01 0 0 0 0 0 0 0 0 0 +1000g_wgs_sites interp 0.01 0 0 0 0 0 0 0 0 0 +clinvar_grch38_chr22 
baseline 0.01 0 0 0 0 0 0 0 0 0 +clinvar_grch38_chr22 exact 0.01 0 0 0 0 0 0 0 0 0 +clinvar_grch38_chr22 interp 0.01 0 0 0 0 0 0 0 0 0 +gnomad_v4.1_exomes_sites baseline 0.47 0.4 0.05 0 0 0 0 0 0 0 +gnomad_v4.1_exomes_sites exact 0.47 0.4 0.05 0 0 0 0 0 0 0 +gnomad_v4.1_exomes_sites interp 0.48 0.4 0.06 0 0 0 0 0 0 0 +synthetic_ccdg_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 +synthetic_ccdg_likelihood exact 0.01 0 0 2000 2000 0 16000 0 0 0 +synthetic_ccdg_likelihood interp 0.01 0 0 2000 2000 0 16000 2000 2000 0 +synthetic_reordered_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 +synthetic_reordered_likelihood exact 0.01 0 0 2000 2000 0 16000 0 0 0 +synthetic_reordered_likelihood interp 0.01 0 0 2000 2000 0 16000 0 0 0 +synthetic_fixed_numeric baseline 0.01 0 0 0 0 0 0 0 0 0 +synthetic_fixed_numeric exact 0 0 0 2000 2000 0 16000 2000 0 0 +synthetic_fixed_numeric interp 0 0 0 2000 2000 0 16000 2000 0 0 +synthetic_float_string baseline 0 0 0 0 0 0 0 0 0 0 +synthetic_float_string exact 0.01 0 0 2000 2000 0 16000 2000 0 0 +synthetic_float_string interp 0.01 0 0 2000 2000 0 16000 2000 0 0 +synthetic_multiallelic_likelihood baseline 0 0 0 0 0 0 0 0 0 0 +synthetic_multiallelic_likelihood exact 0 0 0 1200 1200 0 9600 0 0 0 +synthetic_multiallelic_likelihood interp 0 0 0 1200 1200 0 9600 1200 1200 0 diff --git a/bench/format-shape/scripts/make_synthetic.pl b/bench/format-shape/scripts/make_synthetic.pl new file mode 100755 index 000000000..5266b4dc0 --- /dev/null +++ b/bench/format-shape/scripts/make_synthetic.pl @@ -0,0 +1,131 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +my $outdir = shift @ARGV or die "usage: make_synthetic.pl OUTDIR\n"; +my @samples = map { "S$_" } 1..8; + +sub header { + my ($fh) = @_; + print $fh "##fileformat=VCFv4.3\n"; + print $fh "##contig=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh 
"##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", join("\t", @samples), "\n"; +} + +sub open_vcf { + my ($name) = @_; + open my $fh, ">", "$outdir/$name.vcf" or die "$outdir/$name.vcf: $!\n"; + header($fh); + return $fh; +} + +sub genotype { + my ($i, $s, $n_alt) = @_; + return "./." if (($i + $s) % 29) == 0; + return "0/0" if (($i + $s) % 5) == 0; + return "1|1" if $n_alt == 1 && (($i + $s) % 11) == 0; + return $n_alt > 1 && (($i + $s) % 7) == 0 ? "1/2" : "0/1"; +} + +sub ad { + my ($i, $s, $n_allele) = @_; + return "." if (($i + $s) % 37) == 0; + return join(",", map { (($i * 3 + $s * 5 + $_ * 7) % 40) } 0..($n_allele - 1)); +} + +sub pl { + my ($i, $s, $n_allele) = @_; + return "." if (($i + $s) % 41) == 0; + my $n = $n_allele * ($n_allele + 1) / 2; + return join(",", map { (($i + $s + $_) * 13) % 500 } 0..($n - 1)); +} + +sub gl { + my ($i, $s, $n_allele) = @_; + return "." if (($i + $s) % 31) == 0; + my $n = $n_allele * ($n_allele + 1) / 2; + return join(",", map { sprintf("%.2f", -1 * ((($i + $s + $_) % 20) / 3.0)) } 0..($n - 1)); +} + +my $fh = open_vcf("synthetic_ccdg_likelihood"); +for my $i (1..2000) { + my $pos = 20000000 + $i; + my $phase = $i % 2 == 0; + my $fmt = $phase ? "GT:AB:AD:DP:GQ:PGT:PID:PL" : "GT:AB:AD:DP:GQ:PL"; + my @vals; + for my $s (0..$#samples) { + my $gt = genotype($i, $s, 1); + my $ab = $gt eq "0/1" ? sprintf("%.2f", (($i + $s) % 90) / 100) : "."; + my $base = join(":", $gt, $ab, ad($i, $s, 2), (($i+$s)%80), (($i+$s)%99)); + if ($phase) { + push @vals, join(":", $base, ($gt =~ /\|/ ? 
$gt : "0|1"), "${pos}_A_T", pl($i, $s, 2)); + } else { + push @vals, join(":", $base, pl($i, $s, 2)); + } + } + print $fh join("\t", "chr22", $pos, ".", "A", "T", 50, "PASS", ".", $fmt, @vals), "\n"; +} +close $fh; + +$fh = open_vcf("synthetic_reordered_likelihood"); +for my $i (1..2000) { + my $pos = 20100000 + $i; + my @vals; + for my $s (0..$#samples) { + push @vals, join(":", (($i+$s)%80), (($i+$s)%99), genotype($i, $s, 1), ad($i, $s, 2), pl($i, $s, 2)); + } + print $fh join("\t", "chr22", $pos, ".", "G", "C", 50, "PASS", ".", "DP:GQ:GT:AD:PL", @vals), "\n"; +} +close $fh; + +$fh = open_vcf("synthetic_fixed_numeric"); +for my $i (1..2000) { + my $pos = 20200000 + $i; + my @vals; + for my $s (0..$#samples) { + my $hq = (($i+$s)%150) . "," . (($i+$s+9)%150); + my $sb = join(",", map { ($i + $s + $_) % 30 } 0..3); + push @vals, join(":", genotype($i, $s, 1), $hq, (($i+$s)%60), $sb); + } + print $fh join("\t", "chr22", $pos, ".", "C", "A", 50, "PASS", ".", "GT:HQ:MIN_DP:SB", @vals), "\n"; +} +close $fh; + +$fh = open_vcf("synthetic_float_string"); +for my $i (1..2000) { + my $pos = 20300000 + $i; + my @vals; + for my $s (0..$#samples) { + my $ft = (($i+$s)%13) == 0 ? 
"LowQual" : "PASS"; + push @vals, join(":", genotype($i, $s, 2), gl($i, $s, 3), $ft, (($i+$s)%80), (($i+$s)%99)); + } + print $fh join("\t", "chr22", $pos, ".", "A", "C,G", 50, "PASS", ".", "GT:GL:FT:DP:GQ", @vals), "\n"; +} +close $fh; + +$fh = open_vcf("synthetic_multiallelic_likelihood"); +for my $i (1..1200) { + my $pos = 20400000 + $i; + my $n_alt = 1 + ($i % 3); + my @alts = qw(C G T); + my $alt = join(",", @alts[0..($n_alt - 1)]); + my @vals; + for my $s (0..$#samples) { + push @vals, join(":", genotype($i, $s, $n_alt), ad($i, $s, $n_alt + 1), (($i+$s)%90), (($i+$s)%99), pl($i, $s, $n_alt + 1)); + } + print $fh join("\t", "chr22", $pos, ".", "A", $alt, 50, "PASS", ".", "GT:AD:DP:GQ:PL", @vals), "\n"; +} +close $fh; diff --git a/bench/format-shape/scripts/run_bench.sh b/bench/format-shape/scripts/run_bench.sh new file mode 100755 index 000000000..4d7a1845c --- /dev/null +++ b/bench/format-shape/scripts/run_bench.sh @@ -0,0 +1,81 @@ +#!/bin/sh +set -eu + +test_view=${TEST_VIEW:-./test/test_view} +inputs=${1:-bench/format-shape/inputs.tsv} +outdir=${OUTDIR:-bench/format-shape/results} +mkdir -p "$outdir" + +timings="$outdir/timings.tsv" +checks="$outdir/checks.tsv" + +printf 'name\tmode\treal\tuser\tsys\tattempts\thits\tfallback\tparsed_samples\tshape_attempts\tshape_hits\tshape_fallback\n' > "$timings" +printf 'name\tcomparison\tstatus\n' > "$checks" + +tail -n +2 "$inputs" | while IFS=' ' read -r name path source +do + base_out="$outdir/$name.baseline.bcf" + exact_out="$outdir/$name.exact.bcf" + interp_out="$outdir/$name.interp.bcf" + + for mode in baseline exact interp + do + err="$outdir/$name.$mode.stderr" + out="$outdir/$name.$mode.bcf" + case "$mode" in + baseline) + env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" + ;; + exact) + env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 HTS_VCF_FORMAT_PLAN_SHAPE_STATS=1 \ + /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" + ;; + interp) + env 
HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 HTS_VCF_FORMAT_PLAN_SHAPE_STATS=1 \ + /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" + ;; + esac + + awk -v name="$name" -v mode="$mode" ' + /^real / { real=$2 } + /^user / { user=$2 } + /^sys / { sys=$2 } + /^vcf-format-plan / { + for (i=1; i<=NF; i++) { + split($i, kv, "=") + if (kv[1] == "attempts") attempts=kv[2] + else if (kv[1] == "hits") hits=kv[2] + else if (kv[1] == "fallback") fallback=kv[2] + else if (kv[1] == "parsed_samples") parsed=kv[2] + } + } + /^vcf-format-likelihood-shape / { + for (i=1; i<=NF; i++) { + split($i, kv, "=") + if (kv[1] == "attempts") shape_attempts=kv[2] + else if (kv[1] == "hits") shape_hits=kv[2] + else if (kv[1] == "fallback") shape_fallback=kv[2] + } + } + END { + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + name, mode, real+0, user+0, sys+0, + attempts+0, hits+0, fallback+0, parsed+0, + shape_attempts+0, shape_hits+0, shape_fallback+0 + } + ' "$err" >> "$timings" + done + + if cmp "$base_out" "$exact_out" >/dev/null 2>&1; then + printf '%s\tbaseline_vs_exact\tok\n' "$name" >> "$checks" + else + printf '%s\tbaseline_vs_exact\tDIFF\n' "$name" >> "$checks" + fi + if cmp "$base_out" "$interp_out" >/dev/null 2>&1; then + printf '%s\tbaseline_vs_interp\tok\n' "$name" >> "$checks" + else + printf '%s\tbaseline_vs_interp\tDIFF\n' "$name" >> "$checks" + fi +done + +printf 'wrote %s and %s\n' "$timings" "$checks" From fea9b9e13f843351beda43780197193625c31cec Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 15:16:21 +0200 Subject: [PATCH 16/38] Add large FORMAT shape benchmark pass --- bench/format-shape/.gitignore | 8 + bench/format-shape/README.md | 41 +++++- bench/format-shape/large/inputs.tsv | 7 + bench/format-shape/large/results/checks.tsv | 13 ++ bench/format-shape/large/results/timings.tsv | 19 +++ bench/format-shape/results/timings.tsv | 22 +-- .../scripts/make_large_synthetic.pl | 137 ++++++++++++++++++ 
bench/format-shape/scripts/run_bench.sh | 4 + ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 31 ++++ vcf.c | 52 +++---- 10 files changed, 292 insertions(+), 42 deletions(-) create mode 100644 bench/format-shape/large/inputs.tsv create mode 100644 bench/format-shape/large/results/checks.tsv create mode 100644 bench/format-shape/large/results/timings.tsv create mode 100644 bench/format-shape/scripts/make_large_synthetic.pl diff --git a/bench/format-shape/.gitignore b/bench/format-shape/.gitignore index b7ed964ef..1f2b779e7 100644 --- a/bench/format-shape/.gitignore +++ b/bench/format-shape/.gitignore @@ -1,7 +1,15 @@ public/*.vcf.gz public/remote-indexes/*.tbi synthetic/*.vcf.gz +large/**/*.vcf.gz +large/**/*.vcf.bgz +large/**/*.tbi +large/results/*.bcf +large/results/*.stderr +large/results/*.tmp results/*.bcf results/*.stderr !results/timings.tsv !results/checks.tsv +!large/results/timings.tsv +!large/results/checks.tsv diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index 937fe6a3a..27a1f1048 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -1,8 +1,8 @@ # VCF FORMAT Shape Benchmark Corpus -This directory is a local benchmark corpus for the experimental VCF FORMAT -planner in `vcf.c`. It is intentionally kept under the repository worktree -instead of `/tmp` so the inputs survive restarts. +This directory is a local test and benchmark corpus for the experimental VCF +FORMAT planner in `vcf.c`. It is intentionally kept under the repository +worktree instead of `/tmp` so the inputs survive restarts. 
## Layout @@ -11,7 +11,9 @@ bench/format-shape/ inputs.tsv input manifest used by the benchmark script public/ downloaded public VCF slices synthetic/ generated VCFs covering targeted FORMAT shapes + large/ meaningful multi-second benchmark inputs/results scripts/make_synthetic.pl deterministic synthetic VCF generator + scripts/make_large_synthetic.pl scripts/run_bench.sh baseline/exact/interp timing and cmp runner results/ generated timing logs and BCF outputs ``` @@ -27,6 +29,10 @@ excludes those large files. ## Public Inputs +The small `public/` and `synthetic/` inputs are smoke/correctness fixtures. They +are not large enough to provide stable timing signal except for the CCDG 10k +subset. Use `large/inputs.tsv` for optimization decisions. + The public files were sliced with `tabix -h URL REGION | ./bgzip -c > file`. They are small enough to keep in the worktree but diverse enough to catch non-FORMAT and real-world INFO-heavy workloads. @@ -79,6 +85,16 @@ Run all inputs: bench/format-shape/scripts/run_bench.sh ``` +Run only the meaningful large corpus: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +`KEEP_OUTPUTS=0` still writes temporary BCF files and compares them with `cmp`, +but deletes the large BCF outputs after each input is checked. + The script runs each input in three modes: ```text @@ -95,3 +111,22 @@ bench/format-shape/results/checks.tsv ``` `checks.tsv` compares exact and interp BCF output against baseline with `cmp`. + +## Large Corpus + +`large/inputs.tsv` currently contains: + +- the CCDG 10k subset, +- the full 1000 Genomes chr22 Phase 3 genotype VCF, +- four generated 2,048-sample synthetic FORMAT workloads: + CCDG-like likelihood, reordered likelihood, multiallelic likelihood, and + float/string FORMAT. 
+ +The latest large run is summarized in: + +```text +bench/format-shape/large/results/timings.tsv +bench/format-shape/large/results/checks.tsv +``` + +All exact and interp outputs in that run compared byte-identical to baseline. diff --git a/bench/format-shape/large/inputs.tsv b/bench/format-shape/large/inputs.tsv new file mode 100644 index 000000000..3dca38e81 --- /dev/null +++ b/bench/format-shape/large/inputs.tsv @@ -0,0 +1,7 @@ +name path source +ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset, 10k records x 3,202 samples +1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF +large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likelihood_2048s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 2,048 samples +large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples +large_multiallelic_likelihood_2048s bench/format-shape/large/synthetic/large_multiallelic_likelihood_2048s.vcf.gz synthetic multiallelic likelihood FORMAT, 16k records x 2,048 samples +large_float_string_2048s bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT, 16k records x 2,048 samples diff --git a/bench/format-shape/large/results/checks.tsv b/bench/format-shape/large/results/checks.tsv new file mode 100644 index 000000000..1ec35d27b --- /dev/null +++ b/bench/format-shape/large/results/checks.tsv @@ -0,0 +1,13 @@ +name comparison status +ccdg_10k baseline_vs_exact ok +ccdg_10k baseline_vs_interp ok +1000g_chr22_full_genotypes baseline_vs_exact ok +1000g_chr22_full_genotypes baseline_vs_interp ok +large_ccdg_likelihood_2048s baseline_vs_exact ok +large_ccdg_likelihood_2048s baseline_vs_interp ok +large_reordered_likelihood_2048s baseline_vs_exact ok +large_reordered_likelihood_2048s 
baseline_vs_interp ok +large_multiallelic_likelihood_2048s baseline_vs_exact ok +large_multiallelic_likelihood_2048s baseline_vs_interp ok +large_float_string_2048s baseline_vs_exact ok +large_float_string_2048s baseline_vs_interp ok diff --git a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv new file mode 100644 index 000000000..fd189c759 --- /dev/null +++ b/bench/format-shape/large/results/timings.tsv @@ -0,0 +1,19 @@ +name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback +ccdg_10k baseline 3.43 2.6 0.19 0 0 0 0 0 0 0 +ccdg_10k exact 1.84 1.64 0.17 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.82 1.63 0.17 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_full_genotypes baseline 28.01 26.76 1.12 0 0 0 0 0 0 0 +1000g_chr22_full_genotypes exact 10.2 9.32 0.78 1103547 1103547 0 2763281688 1103547 0 0 +1000g_chr22_full_genotypes interp 10.19 9.32 0.81 1103547 1103547 0 2763281688 1103547 0 0 +large_ccdg_likelihood_2048s baseline 4.38 4.1 0.22 0 0 0 0 0 0 0 +large_ccdg_likelihood_2048s exact 2.99 2.77 0.2 20000 20000 0 40960000 0 0 0 +large_ccdg_likelihood_2048s interp 3.1 2.87 0.21 20000 20000 0 40960000 20000 20000 0 +large_reordered_likelihood_2048s baseline 3.16 2.97 0.15 0 0 0 0 0 0 0 +large_reordered_likelihood_2048s exact 2.81 2.65 0.14 20000 20000 0 40960000 0 0 0 +large_reordered_likelihood_2048s interp 2.77 2.62 0.13 20000 20000 0 40960000 0 0 0 +large_multiallelic_likelihood_2048s baseline 3.42 3.22 0.17 0 0 0 0 0 0 0 +large_multiallelic_likelihood_2048s exact 2.25 2.09 0.14 16000 16000 0 32768000 0 0 0 +large_multiallelic_likelihood_2048s interp 2.21 2.05 0.14 16000 16000 0 32768000 16000 16000 0 +large_float_string_2048s baseline 3.18 2.95 0.19 0 0 0 0 0 0 0 +large_float_string_2048s exact 3.08 2.84 0.21 16000 16000 0 32768000 16000 0 0 +large_float_string_2048s interp 3.07 2.84 0.21 16000 16000 0 32768000 16000 0 0 diff --git 
a/bench/format-shape/results/timings.tsv b/bench/format-shape/results/timings.tsv index a854a561b..7966f9d75 100644 --- a/bench/format-shape/results/timings.tsv +++ b/bench/format-shape/results/timings.tsv @@ -1,7 +1,7 @@ name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 2.71 2.49 0.18 0 0 0 0 0 0 0 -ccdg_10k exact 1.76 1.58 0.15 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.85 1.67 0.16 10000 10000 0 32020000 10000 10000 0 +ccdg_10k baseline 2.68 2.46 0.18 0 0 0 0 0 0 0 +ccdg_10k exact 1.74 1.57 0.15 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.85 1.66 0.16 10000 10000 0 32020000 10000 10000 0 1000g_chr22_genotypes baseline 0.04 0.03 0 0 0 0 0 0 0 0 1000g_chr22_genotypes exact 0.02 0.01 0 1170 1170 0 2929680 1170 0 0 1000g_chr22_genotypes interp 0.02 0.01 0 1170 1170 0 2929680 1170 0 0 @@ -11,21 +11,21 @@ ccdg_10k interp 1.85 1.67 0.16 10000 10000 0 32020000 10000 10000 0 clinvar_grch38_chr22 baseline 0.01 0 0 0 0 0 0 0 0 0 clinvar_grch38_chr22 exact 0.01 0 0 0 0 0 0 0 0 0 clinvar_grch38_chr22 interp 0.01 0 0 0 0 0 0 0 0 0 -gnomad_v4.1_exomes_sites baseline 0.47 0.4 0.05 0 0 0 0 0 0 0 -gnomad_v4.1_exomes_sites exact 0.47 0.4 0.05 0 0 0 0 0 0 0 -gnomad_v4.1_exomes_sites interp 0.48 0.4 0.06 0 0 0 0 0 0 0 +gnomad_v4.1_exomes_sites baseline 0.46 0.4 0.05 0 0 0 0 0 0 0 +gnomad_v4.1_exomes_sites exact 0.47 0.41 0.04 0 0 0 0 0 0 0 +gnomad_v4.1_exomes_sites interp 0.45 0.4 0.05 0 0 0 0 0 0 0 synthetic_ccdg_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 synthetic_ccdg_likelihood exact 0.01 0 0 2000 2000 0 16000 0 0 0 synthetic_ccdg_likelihood interp 0.01 0 0 2000 2000 0 16000 2000 2000 0 synthetic_reordered_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 -synthetic_reordered_likelihood exact 0.01 0 0 2000 2000 0 16000 0 0 0 +synthetic_reordered_likelihood exact 0 0 0 2000 2000 0 16000 0 0 0 synthetic_reordered_likelihood interp 0.01 0 0 2000 2000 0 16000 0 0 0 synthetic_fixed_numeric baseline 0.01 0 0 0 0 
0 0 0 0 0 -synthetic_fixed_numeric exact 0 0 0 2000 2000 0 16000 2000 0 0 -synthetic_fixed_numeric interp 0 0 0 2000 2000 0 16000 2000 0 0 -synthetic_float_string baseline 0 0 0 0 0 0 0 0 0 0 +synthetic_fixed_numeric exact 0.01 0 0 2000 2000 0 16000 2000 0 0 +synthetic_fixed_numeric interp 0.01 0 0 2000 2000 0 16000 2000 0 0 +synthetic_float_string baseline 0.01 0 0 0 0 0 0 0 0 0 synthetic_float_string exact 0.01 0 0 2000 2000 0 16000 2000 0 0 synthetic_float_string interp 0.01 0 0 2000 2000 0 16000 2000 0 0 -synthetic_multiallelic_likelihood baseline 0 0 0 0 0 0 0 0 0 0 +synthetic_multiallelic_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 synthetic_multiallelic_likelihood exact 0 0 0 1200 1200 0 9600 0 0 0 synthetic_multiallelic_likelihood interp 0 0 0 1200 1200 0 9600 1200 1200 0 diff --git a/bench/format-shape/scripts/make_large_synthetic.pl b/bench/format-shape/scripts/make_large_synthetic.pl new file mode 100644 index 000000000..a90c8ed67 --- /dev/null +++ b/bench/format-shape/scripts/make_large_synthetic.pl @@ -0,0 +1,137 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +my $outdir = shift @ARGV or die "usage: make_large_synthetic.pl OUTDIR [NSAMPLES]\n"; +my $nsamples = shift @ARGV || 2048; +my $scale = shift @ARGV || 1; +my @samples = map { "S$_" } 1..$nsamples; + +sub header { + my ($fh) = @_; + print $fh "##fileformat=VCFv4.3\n"; + print $fh "##contig=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "##FORMAT=\n"; + print $fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", join("\t", @samples), "\n"; +} + +sub open_vcf { + my ($name) = @_; + open my $fh, ">", "$outdir/$name.vcf" or die "$outdir/$name.vcf: $!\n"; + header($fh); + return $fh; +} + +sub 
genotype { + my ($i, $s, $n_alt) = @_; + return "./." if (($i + $s) % 97) == 0; + return "0/0" if (($i + $s) % 5) == 0; + return "1|1" if $n_alt == 1 && (($i + $s) % 23) == 0; + return $n_alt > 1 && (($i + $s) % 7) == 0 ? "1/2" : "0/1"; +} + +sub ad { + my ($i, $s, $n_allele) = @_; + return "." if (($i + $s) % 131) == 0; + return join(",", map { (($i * 3 + $s * 5 + $_ * 7) % 120) } 0..($n_allele - 1)); +} + +sub pl { + my ($i, $s, $n_allele) = @_; + return "." if (($i + $s) % 137) == 0; + my $n = $n_allele * ($n_allele + 1) / 2; + return join(",", map { (($i + $s + $_) * 13) % 700 } 0..($n - 1)); +} + +sub gl { + my ($i, $s, $n_allele) = @_; + return "." if (($i + $s) % 127) == 0; + my $n = $n_allele * ($n_allele + 1) / 2; + return join(",", map { sprintf("%.2f", -1 * ((($i + $s + $_) % 30) / 4.0)) } 0..($n - 1)); +} + +sub write_ccdg_like { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 21000000 + $i; + my $phase = $i % 2 == 0; + my $fmt = $phase ? "GT:AB:AD:DP:GQ:PGT:PID:PL" : "GT:AB:AD:DP:GQ:PL"; + my @vals; + for my $s (0..$#samples) { + my $gt = genotype($i, $s, 1); + my $ab = $gt eq "0/1" ? sprintf("%.2f", (($i + $s) % 90) / 100) : "."; + my $base = join(":", $gt, $ab, ad($i, $s, 2), (($i+$s)%160), (($i+$s)%99)); + if ($phase) { + push @vals, join(":", $base, ($gt =~ /\|/ ? 
$gt : "0|1"), "${pos}_A_T", pl($i, $s, 2)); + } else { + push @vals, join(":", $base, pl($i, $s, 2)); + } + } + print $fh join("\t", "chr22", $pos, ".", "A", "T", 50, "PASS", ".", $fmt, @vals), "\n"; + } + close $fh; +} + +sub write_reordered { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 22000000 + $i; + my @vals; + for my $s (0..$#samples) { + push @vals, join(":", (($i+$s)%160), (($i+$s)%99), genotype($i, $s, 1), ad($i, $s, 2), pl($i, $s, 2)); + } + print $fh join("\t", "chr22", $pos, ".", "G", "C", 50, "PASS", ".", "DP:GQ:GT:AD:PL", @vals), "\n"; + } + close $fh; +} + +sub write_multiallelic { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 23000000 + $i; + my $n_alt = 1 + ($i % 3); + my @alts = qw(C G T); + my $alt = join(",", @alts[0..($n_alt - 1)]); + my @vals; + for my $s (0..$#samples) { + push @vals, join(":", genotype($i, $s, $n_alt), ad($i, $s, $n_alt + 1), (($i+$s)%160), (($i+$s)%99), pl($i, $s, $n_alt + 1)); + } + print $fh join("\t", "chr22", $pos, ".", "A", $alt, 50, "PASS", ".", "GT:AD:DP:GQ:PL", @vals), "\n"; + } + close $fh; +} + +sub write_float_string { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 24000000 + $i; + my @vals; + for my $s (0..$#samples) { + my $ft = (($i+$s)%17) == 0 ? 
"LowQual" : "PASS"; + push @vals, join(":", genotype($i, $s, 2), gl($i, $s, 3), $ft, (($i+$s)%160), (($i+$s)%99)); + } + print $fh join("\t", "chr22", $pos, ".", "A", "C,G", 50, "PASS", ".", "GT:GL:FT:DP:GQ", @vals), "\n"; + } + close $fh; +} + +write_ccdg_like("large_ccdg_likelihood_${nsamples}s", 20000 * $scale); +write_reordered("large_reordered_likelihood_${nsamples}s", 20000 * $scale); +write_multiallelic("large_multiallelic_likelihood_${nsamples}s", 16000 * $scale); +write_float_string("large_float_string_${nsamples}s", 16000 * $scale); diff --git a/bench/format-shape/scripts/run_bench.sh b/bench/format-shape/scripts/run_bench.sh index 4d7a1845c..bb88fe439 100755 --- a/bench/format-shape/scripts/run_bench.sh +++ b/bench/format-shape/scripts/run_bench.sh @@ -4,6 +4,7 @@ set -eu test_view=${TEST_VIEW:-./test/test_view} inputs=${1:-bench/format-shape/inputs.tsv} outdir=${OUTDIR:-bench/format-shape/results} +keep_outputs=${KEEP_OUTPUTS:-1} mkdir -p "$outdir" timings="$outdir/timings.tsv" @@ -76,6 +77,9 @@ do else printf '%s\tbaseline_vs_interp\tDIFF\n' "$name" >> "$checks" fi + if [ "$keep_outputs" = 0 ]; then + rm -f "$base_out" "$exact_out" "$interp_out" + fi done printf 'wrote %s and %s\n' "$timings" "$checks" diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index 21c9d88e9..a3b191005 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -207,6 +207,37 @@ Next likely cuts: - Once dynamic shape is consistently at parity, demote the exact CCDG kernels to oracle-only or remove them. +## 2026-04-29 Large-Corpus Check + +The small public/synthetic slices were too short to provide timing signal, so +the meaningful benchmark set moved to `bench/format-shape/large/inputs.tsv`. +The large corpus includes the CCDG 10k subset, full 1000 Genomes chr22 +genotypes, and 2,048-sample generated FORMAT workloads. 
+ +Latest large run used: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +All exact/interp outputs compared byte-identical to baseline. Timing summary: + +| Input | Baseline user | Exact user | Dynamic interp user | Shape hits | +|---|---:|---:|---:|---:| +| CCDG 10k | 2.60 s | 1.64 s | 1.63 s | 10,000 | +| 1000G chr22 full GT | 26.76 s | 9.32 s | 9.32 s | 0 | +| Large CCDG-like synthetic | 4.10 s | 2.77 s | 2.87 s | 20,000 | +| Large reordered likelihood | 2.97 s | 2.65 s | 2.62 s | 0 | +| Large multiallelic likelihood | 3.22 s | 2.09 s | 2.05 s | 16,000 | +| Large float/string | 2.95 s | 2.84 s | 2.84 s | 0 | + +The dynamic likelihood shape path is now at parity or close enough on the +meaningful workloads. The remaining visible gap is the generated CCDG-like +phase-heavy synthetic case, where dynamic-only is about 3-4% slower than exact. +That looks acceptable for this checkpoint; the next optimization target remains +cached shape classification to remove repeated deterministic row-level checks. + ## Open Questions - How much of the gap is parse-loop dispatch versus generic encode cost? 
diff --git a/vcf.c b/vcf.c index 2edb52c4f..02237acc7 100644 --- a/vcf.c +++ b/vcf.c @@ -4409,6 +4409,7 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h { const char *cur, *end; int ad_w, pl_w, idx, sample, j, nsamples = bcf_hdr_nsamples(h); + int has_float = 0; int str1_idx = -1, str2_idx = -1; if (plan->n_ops != 5 && plan->n_ops != 6 && @@ -4429,8 +4430,10 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h idx = 1; if (idx < plan->n_ops && plan->ops[idx].htype == BCF_HT_REAL && - plan->ops[idx].number == 1) + plan->ops[idx].number == 1) { + has_float = 1; widths[idx++] = 1; + } if (idx + 3 >= plan->n_ops) return -4; if (plan->ops[idx].htype != BCF_HT_INT) @@ -4463,33 +4466,24 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h cur = q + 1; end = s->s + s->l; for (sample = 0; sample < nsamples && cur < end; sample++) { - for (j = 0; j < plan->n_ops; j++) { - const char *field = cur; - - while (cur < end && *cur && *cur != ':' && *cur != '\t') - cur++; - if (j == str1_idx || j == str2_idx) { - int w = cur - field; - if (j > 0) - w++; - if (w <= 0) - w = 1; - if (widths[j] < w) - widths[j] = w; - } - if (j + 1 < plan->n_ops) { - if (*cur != ':') - return -4; - cur++; - } else { - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - return -4; - } - } + if (vcf_plan_skip_field(&cur, ':') < 0) + return -4; + if (has_float && vcf_plan_skip_field(&cur, ':') < 0) + return -4; + if (vcf_plan_skip_field(&cur, ':') < 0) + return -4; + if (vcf_plan_skip_field(&cur, ':') < 0) + return -4; + if (vcf_plan_skip_field(&cur, ':') < 0) + return -4; + if (vcf_plan_measure_string(&cur, ':', &widths[str1_idx]) < 0) + return -4; + if (vcf_plan_measure_string(&cur, ':', &widths[str2_idx]) < 0) + return -4; + while (cur < end && *cur && *cur != '\t') + cur++; + if (*cur == '\t') + cur++; } if (sample != nsamples) return -4; @@ -4497,6 +4491,8 @@ static int 
vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h widths[str1_idx] = 1; if (widths[str2_idx] <= 0) widths[str2_idx] = 1; + widths[str1_idx]++; + widths[str2_idx]++; return 0; } From e9442b79156cd9e87bfe0c7c5ae3a8efa0b1ef61 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 15:28:04 +0200 Subject: [PATCH 17/38] Cache dynamic FORMAT likelihood shapes --- bench/format-shape/README.md | 17 +- bench/format-shape/large/inputs.tsv | 4 + bench/format-shape/large/results/checks.tsv | 8 + bench/format-shape/large/results/timings.tsv | 48 +++-- .../scripts/make_large_synthetic.pl | 108 ++++++++++- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 45 +++++ test/format-plan-edge.vcf | 1 + vcf.c | 171 +++++++++++------- 8 files changed, 307 insertions(+), 95 deletions(-) diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index 27a1f1048..ea69f9f9b 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -118,9 +118,20 @@ bench/format-shape/results/checks.tsv - the CCDG 10k subset, - the full 1000 Genomes chr22 Phase 3 genotype VCF, -- four generated 2,048-sample synthetic FORMAT workloads: - CCDG-like likelihood, reordered likelihood, multiallelic likelihood, and - float/string FORMAT. +- eight generated 2,048-sample synthetic FORMAT workloads: + CCDG-like likelihood, reordered likelihood, multiallelic likelihood, + float/string FORMAT, variable phase-string widths, row-local likelihood + fallbacks, GT-first wrong-order likelihood-like rows, and two-string + float rows. 
+ +To refresh only the newer cache-regression synthetic files without rewriting the +older large VCFs: + +```sh +SYNTHETIC_ONLY_NEW=1 \ + bench/format-shape/scripts/make_large_synthetic.pl \ + bench/format-shape/large/synthetic 2048 +``` The latest large run is summarized in: diff --git a/bench/format-shape/large/inputs.tsv b/bench/format-shape/large/inputs.tsv index 3dca38e81..795882a7e 100644 --- a/bench/format-shape/large/inputs.tsv +++ b/bench/format-shape/large/inputs.tsv @@ -5,3 +5,7 @@ large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likeli large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples large_multiallelic_likelihood_2048s bench/format-shape/large/synthetic/large_multiallelic_likelihood_2048s.vcf.gz synthetic multiallelic likelihood FORMAT, 16k records x 2,048 samples large_float_string_2048s bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT, 16k records x 2,048 samples +large_phase_width_variation_2048s bench/format-shape/large/synthetic/large_phase_width_variation_2048s.vcf.gz synthetic likelihood FORMAT with variable PGT/PID widths, 12k records x 2,048 samples +large_mixed_likelihood_2048s bench/format-shape/large/synthetic/large_mixed_likelihood_2048s.vcf.gz synthetic likelihood FORMAT with row-local unsupported/wrong-width rows, 12k records x 2,048 samples +large_gt_first_reordered_2048s bench/format-shape/large/synthetic/large_gt_first_reordered_2048s.vcf.gz synthetic GT-first reordered non-shape likelihood FORMAT, 12k records x 2,048 samples +large_two_string_float_2048s bench/format-shape/large/synthetic/large_two_string_float_2048s.vcf.gz synthetic two-string float FORMAT, 12k records x 2,048 samples diff --git a/bench/format-shape/large/results/checks.tsv b/bench/format-shape/large/results/checks.tsv index 1ec35d27b..3dfeed0c0 100644 --- 
a/bench/format-shape/large/results/checks.tsv +++ b/bench/format-shape/large/results/checks.tsv @@ -11,3 +11,11 @@ large_multiallelic_likelihood_2048s baseline_vs_exact ok large_multiallelic_likelihood_2048s baseline_vs_interp ok large_float_string_2048s baseline_vs_exact ok large_float_string_2048s baseline_vs_interp ok +large_phase_width_variation_2048s baseline_vs_exact ok +large_phase_width_variation_2048s baseline_vs_interp ok +large_mixed_likelihood_2048s baseline_vs_exact ok +large_mixed_likelihood_2048s baseline_vs_interp ok +large_gt_first_reordered_2048s baseline_vs_exact ok +large_gt_first_reordered_2048s baseline_vs_interp ok +large_two_string_float_2048s baseline_vs_exact ok +large_two_string_float_2048s baseline_vs_interp ok diff --git a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv index fd189c759..ca14b806d 100644 --- a/bench/format-shape/large/results/timings.tsv +++ b/bench/format-shape/large/results/timings.tsv @@ -1,19 +1,31 @@ name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 3.43 2.6 0.19 0 0 0 0 0 0 0 -ccdg_10k exact 1.84 1.64 0.17 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.82 1.63 0.17 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_full_genotypes baseline 28.01 26.76 1.12 0 0 0 0 0 0 0 -1000g_chr22_full_genotypes exact 10.2 9.32 0.78 1103547 1103547 0 2763281688 1103547 0 0 -1000g_chr22_full_genotypes interp 10.19 9.32 0.81 1103547 1103547 0 2763281688 1103547 0 0 -large_ccdg_likelihood_2048s baseline 4.38 4.1 0.22 0 0 0 0 0 0 0 -large_ccdg_likelihood_2048s exact 2.99 2.77 0.2 20000 20000 0 40960000 0 0 0 -large_ccdg_likelihood_2048s interp 3.1 2.87 0.21 20000 20000 0 40960000 20000 20000 0 -large_reordered_likelihood_2048s baseline 3.16 2.97 0.15 0 0 0 0 0 0 0 -large_reordered_likelihood_2048s exact 2.81 2.65 0.14 20000 20000 0 40960000 0 0 0 -large_reordered_likelihood_2048s interp 2.77 2.62 0.13 20000 20000 0 
40960000 0 0 0 -large_multiallelic_likelihood_2048s baseline 3.42 3.22 0.17 0 0 0 0 0 0 0 -large_multiallelic_likelihood_2048s exact 2.25 2.09 0.14 16000 16000 0 32768000 0 0 0 -large_multiallelic_likelihood_2048s interp 2.21 2.05 0.14 16000 16000 0 32768000 16000 16000 0 -large_float_string_2048s baseline 3.18 2.95 0.19 0 0 0 0 0 0 0 -large_float_string_2048s exact 3.08 2.84 0.21 16000 16000 0 32768000 16000 0 0 -large_float_string_2048s interp 3.07 2.84 0.21 16000 16000 0 32768000 16000 0 0 +ccdg_10k baseline 2.7 2.53 0.13 0 0 0 0 0 0 0 +ccdg_10k exact 1.76 1.61 0.13 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.75 1.6 0.13 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_full_genotypes baseline 26.95 26.23 0.61 0 0 0 0 0 0 0 +1000g_chr22_full_genotypes exact 9.86 9.16 0.58 1103547 1103547 0 2763281688 0 0 0 +1000g_chr22_full_genotypes interp 9.72 9.11 0.57 1103547 1103547 0 2763281688 0 0 0 +large_ccdg_likelihood_2048s baseline 4.1 3.93 0.15 0 0 0 0 0 0 0 +large_ccdg_likelihood_2048s exact 2.89 2.74 0.14 20000 20000 0 40960000 0 0 0 +large_ccdg_likelihood_2048s interp 2.85 2.69 0.13 20000 20000 0 40960000 20000 20000 0 +large_reordered_likelihood_2048s baseline 2.98 2.87 0.09 0 0 0 0 0 0 0 +large_reordered_likelihood_2048s exact 2.68 2.56 0.08 20000 20000 0 40960000 0 0 0 +large_reordered_likelihood_2048s interp 2.67 2.57 0.08 20000 20000 0 40960000 0 0 0 +large_multiallelic_likelihood_2048s baseline 3.25 3.11 0.1 0 0 0 0 0 0 0 +large_multiallelic_likelihood_2048s exact 2.15 2.05 0.09 16000 16000 0 32768000 0 0 0 +large_multiallelic_likelihood_2048s interp 2.09 1.99 0.09 16000 16000 0 32768000 16000 16000 0 +large_float_string_2048s baseline 3.02 2.87 0.14 0 0 0 0 0 0 0 +large_float_string_2048s exact 2.92 2.76 0.14 16000 16000 0 32768000 0 0 0 +large_float_string_2048s interp 2.93 2.77 0.14 16000 16000 0 32768000 0 0 0 +large_phase_width_variation_2048s baseline 2.68 2.51 0.13 0 0 0 0 0 0 0 +large_phase_width_variation_2048s exact 2.15 2 0.14 12000 12000 0 
24576000 0 0 0 +large_phase_width_variation_2048s interp 2.14 1.99 0.14 12000 12000 0 24576000 12000 12000 0 +large_mixed_likelihood_2048s baseline 2.22 2.14 0.07 0 0 0 0 0 0 0 +large_mixed_likelihood_2048s exact 1.64 1.56 0.06 12000 11400 600 23347200 7355 6650 705 +large_mixed_likelihood_2048s interp 1.65 1.58 0.06 12000 12000 0 24576000 11295 10236 1059 +large_gt_first_reordered_2048s baseline 1.75 1.69 0.05 0 0 0 0 0 0 0 +large_gt_first_reordered_2048s exact 1.57 1.5 0.05 12000 12000 0 24576000 0 0 0 +large_gt_first_reordered_2048s interp 1.57 1.5 0.05 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s baseline 2.35 2.21 0.12 0 0 0 0 0 0 0 +large_two_string_float_2048s exact 2.49 2.36 0.12 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s interp 2.47 2.32 0.12 12000 12000 0 24576000 0 0 0 diff --git a/bench/format-shape/scripts/make_large_synthetic.pl b/bench/format-shape/scripts/make_large_synthetic.pl index a90c8ed67..785242b2d 100644 --- a/bench/format-shape/scripts/make_large_synthetic.pl +++ b/bench/format-shape/scripts/make_large_synthetic.pl @@ -131,7 +131,107 @@ sub write_float_string { close $fh; } -write_ccdg_like("large_ccdg_likelihood_${nsamples}s", 20000 * $scale); -write_reordered("large_reordered_likelihood_${nsamples}s", 20000 * $scale); -write_multiallelic("large_multiallelic_likelihood_${nsamples}s", 16000 * $scale); -write_float_string("large_float_string_${nsamples}s", 16000 * $scale); +sub write_phase_width_variation { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 25000000 + $i; + my @vals; + for my $s (0..$#samples) { + my $gt = genotype($i, $s, 1); + my $pgt = (($i + $s) % 29) == 0 ? "." : ($gt =~ /\|/ ? $gt : "0|1"); + my $pid; + if (($i + $s) % 31 == 0) { + $pid = "."; + } elsif (($i + $s) % 7 == 0) { + $pid = "${pos}_${s}_A_T_LONG_PHASE_SET"; + } elsif (($i + $s) % 5 == 0) { + $pid = "${pos}_A_T"; + } else { + $pid = "P" . 
(($i + $s) % 97); + } + push @vals, join(":", $gt, ad($i, $s, 2), (($i+$s)%160), + (($i+$s)%99), $pgt, $pid, pl($i, $s, 2)); + } + print $fh join("\t", "chr22", $pos, ".", "A", "T", 50, "PASS", ".", + "GT:AD:DP:GQ:PGT:PID:PL", @vals), "\n"; + } + close $fh; +} + +sub write_mixed_likelihood { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 26000000 + $i; + my $n_alt = ($i % 17) == 0 ? 8 : (($i % 11) == 0 ? 2 : 1); + my @alts = qw(C G T AA AC AG AT GA); + my $alt = join(",", @alts[0..($n_alt - 1)]); + my $n_allele = $n_alt + 1; + my @vals; + for my $s (0..$#samples) { + my $ad = ad($i, $s, $n_allele); + my $pl = pl($i, $s, $n_allele); + if (($i % 19) == 0 && $ad ne ".") { + my @ad = split /,/, $ad; + pop @ad; + $ad = join(",", @ad); + } + if (($i % 23) == 0 && $pl ne ".") { + my @pl = split /,/, $pl; + pop @pl; + $pl = join(",", @pl); + } + push @vals, join(":", genotype($i, $s, $n_alt), $ad, + (($i+$s)%160), (($i+$s)%99), $pl); + } + print $fh join("\t", "chr22", $pos, ".", "A", $alt, 50, "PASS", ".", + "GT:AD:DP:GQ:PL", @vals), "\n"; + } + close $fh; +} + +sub write_gt_first_reordered { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 27000000 + $i; + my @vals; + for my $s (0..$#samples) { + push @vals, join(":", genotype($i, $s, 1), (($i+$s)%160), + ad($i, $s, 2), (($i+$s)%99), pl($i, $s, 2)); + } + print $fh join("\t", "chr22", $pos, ".", "G", "C", 50, "PASS", ".", + "GT:DP:AD:GQ:PL", @vals), "\n"; + } + close $fh; +} + +sub write_two_string_float { + my ($name, $records) = @_; + my $fh = open_vcf($name); + for my $i (1..$records) { + my $pos = 28000000 + $i; + my @vals; + for my $s (0..$#samples) { + my $ft = (($i+$s)%17) == 0 ? "LowQual" : "PASS"; + my $pid = (($i+$s)%13) == 0 ? "." : "PS" . 
(($i * 11 + $s) % 100000); + push @vals, join(":", genotype($i, $s, 2), $ft, $pid, + gl($i, $s, 3), (($i+$s)%160)); + } + print $fh join("\t", "chr22", $pos, ".", "A", "C,G", 50, "PASS", ".", + "GT:FT:PID:GL:DP", @vals), "\n"; + } + close $fh; +} + +unless ($ENV{SYNTHETIC_ONLY_NEW}) { + write_ccdg_like("large_ccdg_likelihood_${nsamples}s", 20000 * $scale); + write_reordered("large_reordered_likelihood_${nsamples}s", 20000 * $scale); + write_multiallelic("large_multiallelic_likelihood_${nsamples}s", 16000 * $scale); + write_float_string("large_float_string_${nsamples}s", 16000 * $scale); +} +write_phase_width_variation("large_phase_width_variation_${nsamples}s", 12000 * $scale); +write_mixed_likelihood("large_mixed_likelihood_${nsamples}s", 12000 * $scale); +write_gt_first_reordered("large_gt_first_reordered_${nsamples}s", 12000 * $scale); +write_two_string_float("large_two_string_float_${nsamples}s", 12000 * $scale); diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index a3b191005..ad6103474 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -238,6 +238,51 @@ phase-heavy synthetic case, where dynamic-only is about 3-4% slower than exact. That looks acceptable for this checkpoint; the next optimization target remains cached shape classification to remove repeated deterministic row-level checks. +## 2026-04-29 Cached Shape Classification + +Added FORMAT-level likelihood-shape classification to the dynamic general plan. +The cache only records deterministic facts from the FORMAT/header order and +types: + +```text +GT2, optional FLOAT1, INT[n_allele], INT1, INT1, +optional STR1, optional STR1, INT[ploidy likelihood width] +``` + +Row-level facts remain uncached. Each record still validates `n_allele`, +AD/PL widths, GT syntax, observed vector counts, separators, sample count, and +phase-string widths before using the likelihood executor. 
+ +The large benchmark corpus now includes four extra cache-regression workloads: + +- variable-width `PGT/PID` likelihood rows, +- likelihood rows with mixed row-local fallbacks and later positive hits, +- GT-first but wrong-order likelihood-like rows, +- non-likelihood rows with two strings plus float vectors. + +Latest full large-corpus run: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +All exact and interp outputs compared byte-identical to baseline. Highlights: + +| Input | Exact user | Dynamic interp user | Dynamic shape attempts | Dynamic shape hits | +|---|---:|---:|---:|---:| +| CCDG 10k | 1.61 s | 1.60 s | 10,000 | 10,000 | +| 1000G chr22 full GT | 9.16 s | 9.11 s | 0 | 0 | +| Large CCDG-like synthetic | 2.74 s | 2.69 s | 20,000 | 20,000 | +| Large multiallelic likelihood | 2.05 s | 1.99 s | 16,000 | 16,000 | +| Variable phase widths | 2.00 s | 1.99 s | 12,000 | 12,000 | +| Mixed row-local fallbacks | 1.56 s | 1.58 s | 11,295 | 10,236 | +| GT-first reordered negative | 1.50 s | 1.50 s | 0 | 0 | +| Two-string float negative | 2.36 s | 2.32 s | 0 | 0 | + +The important negative-cache result is the full 1000G GT-only workload: +dynamic mode no longer pays 1,103,547 failed likelihood-shape probes. + ## Open Questions - How much of the gap is parse-loop dispatch versus generic encode cost? diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf index 3f45fbbe7..62a942695 100644 --- a/test/format-plan-edge.vcf +++ b/test/format-plan-edge.vcf @@ -28,3 +28,4 @@ chr22 10590000 . A T 50 PASS . DP:GQ:GT:AD:PL 11:50:0/1:6,5:80,0,90 8:45:0/0:8,0 chr22 10591000 . A T 50 PASS . AD:PL:GT:DP:GQ 4,3:70,0,80:0/1:7:60 9,0:0,70,120:0/0:9:50 0,0:.:./.:0:. chr22 10592000 . A T 50 PASS . GT:DP:AB:GQ:AD:PL 0/1:12:0.42:70:7,5:90,0,100 0/0:10:0.01:60:10,0:0,60,120 ./.:0:.:.:0,0:. chr22 10593000 . A T 50 PASS . 
GT:HQ:MIN_DP:SB 0/1:127,128:12:3,4,5,6 0/0:-129,20:8:8,0,0,0 ./.:.,.:0:.,.,.,. +chr22 10594000 . A T 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 0|1:4,5:9:50:0|1:P1:90,0,90 0/1:3,2:5:20:0|1:10594000_A_T_LONG_PHASE_SET:20,0,200 ./.:0,0:0:.:.:.:. diff --git a/vcf.c b/vcf.c index 02237acc7..cf835d6cd 100644 --- a/vcf.c +++ b/vcf.c @@ -3336,6 +3336,16 @@ typedef struct { const bcf_hdr_t *hdr; int supported; int strict_supported; + int likelihood_supported; + int likelihood_has_float; + int likelihood_has_phase; + int likelihood_float_idx; + int likelihood_ad_idx; + int likelihood_dp_idx; + int likelihood_gq_idx; + int likelihood_str1_idx; + int likelihood_str2_idx; + int likelihood_pl_idx; int n_ops; vcf_format_op_t ops[MAX_N_FMT]; vcf_format_fast_guard_t strict_guard; @@ -3415,6 +3425,64 @@ static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, return plan->supported; } +static int vcf_format_general_classify_likelihood(vcf_format_general_plan_t *plan) +{ + int idx; + + plan->likelihood_supported = 0; + plan->likelihood_has_float = 0; + plan->likelihood_has_phase = 0; + plan->likelihood_float_idx = -1; + plan->likelihood_ad_idx = -1; + plan->likelihood_dp_idx = -1; + plan->likelihood_gq_idx = -1; + plan->likelihood_str1_idx = -1; + plan->likelihood_str2_idx = -1; + plan->likelihood_pl_idx = -1; + + if (plan->n_ops != 5 && plan->n_ops != 6 && + plan->n_ops != 7 && plan->n_ops != 8) + return 0; + if (!plan->ops[0].is_gt) + return 0; + + idx = 1; + if (idx < plan->n_ops && plan->ops[idx].htype == BCF_HT_REAL && + plan->ops[idx].number == 1) { + plan->likelihood_has_float = 1; + plan->likelihood_float_idx = idx++; + } + if (idx + 3 >= plan->n_ops) + return 0; + if (plan->ops[idx].htype != BCF_HT_INT) + return 0; + plan->likelihood_ad_idx = idx++; + if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) + return 0; + plan->likelihood_dp_idx = idx++; + if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) + return 0; + 
plan->likelihood_gq_idx = idx++; + if (plan->n_ops - idx == 3) { + if (plan->ops[idx].htype != BCF_HT_STR || plan->ops[idx].number != 1 || + plan->ops[idx + 1].htype != BCF_HT_STR || plan->ops[idx + 1].number != 1) + return 0; + plan->likelihood_has_phase = 1; + plan->likelihood_str1_idx = idx++; + plan->likelihood_str2_idx = idx++; + } else if (plan->n_ops - idx != 1) { + return 0; + } + if (plan->ops[idx].htype != BCF_HT_INT) + return 0; + plan->likelihood_pl_idx = idx++; + if (idx != plan->n_ops) + return 0; + + plan->likelihood_supported = 1; + return 1; +} + static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *format) { enum { N_PLAN_CACHE = 8 }; @@ -3484,6 +3552,7 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (!plan->n_ops) return 0; + vcf_format_general_classify_likelihood(plan); plan->supported = 1; return 1; } @@ -4408,14 +4477,11 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h bcf1_t *v, char *q, int *widths) { const char *cur, *end; - int ad_w, pl_w, idx, sample, j, nsamples = bcf_hdr_nsamples(h); - int has_float = 0; - int str1_idx = -1, str2_idx = -1; + int ad_w, pl_w, sample, j, nsamples = bcf_hdr_nsamples(h); + int str1_idx = plan->likelihood_str1_idx; + int str2_idx = plan->likelihood_str2_idx; - if (plan->n_ops != 5 && plan->n_ops != 6 && - plan->n_ops != 7 && plan->n_ops != 8) - return -4; - if (!plan->ops[0].is_gt) + if (!plan->likelihood_supported) return -4; if (v->n_allele < 1 || v->n_allele > 8) return -4; @@ -4427,38 +4493,12 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h for (j = 0; j < plan->n_ops; j++) widths[j] = 0; widths[0] = 2; - - idx = 1; - if (idx < plan->n_ops && plan->ops[idx].htype == BCF_HT_REAL && - plan->ops[idx].number == 1) { - has_float = 1; - widths[idx++] = 1; - } - if (idx + 3 >= plan->n_ops) - return -4; - if (plan->ops[idx].htype != BCF_HT_INT) - return -4; - widths[idx++] = ad_w; 
- if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) - return -4; - widths[idx++] = 1; - if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) - return -4; - widths[idx++] = 1; - if (plan->n_ops - idx == 3) { - if (plan->ops[idx].htype != BCF_HT_STR || plan->ops[idx].number != 1 || - plan->ops[idx + 1].htype != BCF_HT_STR || plan->ops[idx + 1].number != 1) - return -4; - str1_idx = idx++; - str2_idx = idx++; - } else if (plan->n_ops - idx != 1) { - return -4; - } - if (plan->ops[idx].htype != BCF_HT_INT) - return -4; - widths[idx++] = pl_w; - if (idx != plan->n_ops) - return -4; + if (plan->likelihood_has_float) + widths[plan->likelihood_float_idx] = 1; + widths[plan->likelihood_ad_idx] = ad_w; + widths[plan->likelihood_dp_idx] = 1; + widths[plan->likelihood_gq_idx] = 1; + widths[plan->likelihood_pl_idx] = pl_w; if (str1_idx < 0) return 0; @@ -4468,7 +4508,7 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h for (sample = 0; sample < nsamples && cur < end; sample++) { if (vcf_plan_skip_field(&cur, ':') < 0) return -4; - if (has_float && vcf_plan_skip_field(&cur, ':') < 0) + if (plan->likelihood_has_float && vcf_plan_skip_field(&cur, ':') < 0) return -4; if (vcf_plan_skip_field(&cur, ':') < 0) return -4; @@ -4668,8 +4708,9 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, { kstring_t *mem = (kstring_t*)&h->mem; int nsamples = bcf_hdr_nsamples(h); - int ad_w, pl_w, sample, idx, ad_idx, dp_idx, gq_idx, pl_idx; - int has_float = 0, has_phase = 0, float_idx = -1, str1_idx = -1, str2_idx = -1; + int ad_w, pl_w, sample; + int ad_idx, dp_idx, gq_idx, pl_idx; + int has_float, has_phase, float_idx, str1_idx, str2_idx; int max_ad_count = 0, max_pl_count = 0, nwords; vcf_plan_int_range_t ad_range, dp_range, gq_range, pl_range; size_t indiv_l0 = v->indiv.l; @@ -4680,10 +4721,10 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, char *str1 = NULL, *str2 = NULL; const char 
*cur, *end; - vcf_format_likelihood_shape_attempts++; - if (plan->n_ops != 5 && plan->n_ops != 6 && - plan->n_ops != 7 && plan->n_ops != 8) + if (!plan->likelihood_supported) return -4; + + vcf_format_likelihood_shape_attempts++; if (row_ops[0].kind != VCF_FORMAT_ROW_GT2) return -4; if (v->n_allele < 1 || v->n_allele > 8) @@ -4694,34 +4735,23 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, if (pl_w < 1 || pl_w > 36) return -4; - idx = 1; - if (idx < plan->n_ops && row_ops[idx].kind == VCF_FORMAT_ROW_FLOAT1) { - has_float = 1; - float_idx = idx++; - } - if (idx + 3 >= plan->n_ops) - return -4; - ad_idx = idx++; - dp_idx = idx++; - gq_idx = idx++; - if (plan->n_ops - idx == 3) { - if (row_ops[idx].kind != VCF_FORMAT_ROW_STR || - row_ops[idx + 1].kind != VCF_FORMAT_ROW_STR) - return -4; - has_phase = 1; - str1_idx = idx++; - str2_idx = idx++; - } else if (plan->n_ops - idx != 1) { - return -4; - } - pl_idx = idx++; - if (idx != plan->n_ops) - return -4; + has_float = plan->likelihood_has_float; + has_phase = plan->likelihood_has_phase; + float_idx = plan->likelihood_float_idx; + ad_idx = plan->likelihood_ad_idx; + dp_idx = plan->likelihood_dp_idx; + gq_idx = plan->likelihood_gq_idx; + str1_idx = plan->likelihood_str1_idx; + str2_idx = plan->likelihood_str2_idx; + pl_idx = plan->likelihood_pl_idx; if (!vcf_format_row_is_int(&row_ops[ad_idx]) || row_ops[ad_idx].width != ad_w || row_ops[dp_idx].kind != VCF_FORMAT_ROW_INT1 || row_ops[gq_idx].kind != VCF_FORMAT_ROW_INT1 || + (has_float && row_ops[float_idx].kind != VCF_FORMAT_ROW_FLOAT1) || + (has_phase && (row_ops[str1_idx].kind != VCF_FORMAT_ROW_STR || + row_ops[str2_idx].kind != VCF_FORMAT_ROW_STR)) || !vcf_format_row_is_int(&row_ops[pl_idx]) || row_ops[pl_idx].width != pl_w) return -4; @@ -4924,7 +4954,8 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, vcf_plan_int_range_init(&ranges[j]); } vcf_format_general_resolve_ops(plan, v, widths, row_ops); - if ((ret = 
vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops)) != -4) + if (plan->likelihood_supported && + (ret = vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops)) != -4) return ret; if (vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) return vcf_parse_format_general_fixed_numeric(s, h, v, plan, q, row_ops); @@ -5046,7 +5077,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, if (!nsamples) return 0; strict_enabled = vcf_format_fast_guard_enabled(&plan->strict_guard); - if (strict_enabled) { + if (plan->likelihood_supported && strict_enabled) { ret = vcf_parse_format_general_likelihood_strict(s, h, v, plan, q); if (ret == 0) { vcf_format_fast_guard_success(&plan->strict_guard); From 3185c8a9e3e9955dae27870c6f05845d891b5966 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 15:37:08 +0200 Subject: [PATCH 18/38] Add dynamic GT-only FORMAT fast path --- bench/format-shape/.gitignore | 1 + bench/format-shape/large/results/timings.tsv | 54 ++++++------- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 26 +++++++ test/format-plan-header-mismatch.vcf | 9 +++ test/test_format_plan.sh | 19 +++-- vcf.c | 75 +++++++++++++++++++ 6 files changed, 149 insertions(+), 35 deletions(-) create mode 100644 test/format-plan-header-mismatch.vcf diff --git a/bench/format-shape/.gitignore b/bench/format-shape/.gitignore index 1f2b779e7..801993aef 100644 --- a/bench/format-shape/.gitignore +++ b/bench/format-shape/.gitignore @@ -7,6 +7,7 @@ large/**/*.tbi large/results/*.bcf large/results/*.stderr large/results/*.tmp +large/results-*/* results/*.bcf results/*.stderr !results/timings.tsv diff --git a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv index ca14b806d..3589a8ef2 100644 --- a/bench/format-shape/large/results/timings.tsv +++ b/bench/format-shape/large/results/timings.tsv @@ -1,31 +1,31 @@ name mode real user sys attempts hits fallback parsed_samples 
shape_attempts shape_hits shape_fallback -ccdg_10k baseline 2.7 2.53 0.13 0 0 0 0 0 0 0 -ccdg_10k exact 1.76 1.61 0.13 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.75 1.6 0.13 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_full_genotypes baseline 26.95 26.23 0.61 0 0 0 0 0 0 0 -1000g_chr22_full_genotypes exact 9.86 9.16 0.58 1103547 1103547 0 2763281688 0 0 0 -1000g_chr22_full_genotypes interp 9.72 9.11 0.57 1103547 1103547 0 2763281688 0 0 0 -large_ccdg_likelihood_2048s baseline 4.1 3.93 0.15 0 0 0 0 0 0 0 -large_ccdg_likelihood_2048s exact 2.89 2.74 0.14 20000 20000 0 40960000 0 0 0 -large_ccdg_likelihood_2048s interp 2.85 2.69 0.13 20000 20000 0 40960000 20000 20000 0 -large_reordered_likelihood_2048s baseline 2.98 2.87 0.09 0 0 0 0 0 0 0 -large_reordered_likelihood_2048s exact 2.68 2.56 0.08 20000 20000 0 40960000 0 0 0 -large_reordered_likelihood_2048s interp 2.67 2.57 0.08 20000 20000 0 40960000 0 0 0 -large_multiallelic_likelihood_2048s baseline 3.25 3.11 0.1 0 0 0 0 0 0 0 -large_multiallelic_likelihood_2048s exact 2.15 2.05 0.09 16000 16000 0 32768000 0 0 0 -large_multiallelic_likelihood_2048s interp 2.09 1.99 0.09 16000 16000 0 32768000 16000 16000 0 -large_float_string_2048s baseline 3.02 2.87 0.14 0 0 0 0 0 0 0 -large_float_string_2048s exact 2.92 2.76 0.14 16000 16000 0 32768000 0 0 0 -large_float_string_2048s interp 2.93 2.77 0.14 16000 16000 0 32768000 0 0 0 -large_phase_width_variation_2048s baseline 2.68 2.51 0.13 0 0 0 0 0 0 0 -large_phase_width_variation_2048s exact 2.15 2 0.14 12000 12000 0 24576000 0 0 0 -large_phase_width_variation_2048s interp 2.14 1.99 0.14 12000 12000 0 24576000 12000 12000 0 -large_mixed_likelihood_2048s baseline 2.22 2.14 0.07 0 0 0 0 0 0 0 +ccdg_10k baseline 2.82 2.63 0.14 0 0 0 0 0 0 0 +ccdg_10k exact 1.82 1.64 0.14 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.8 1.63 0.14 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_full_genotypes baseline 25.57 24.86 0.62 0 0 0 0 0 0 0 +1000g_chr22_full_genotypes exact 
6.41 5.77 0.57 1103547 1103547 0 2763281688 0 0 0 +1000g_chr22_full_genotypes interp 6.22 5.61 0.57 1103547 1103547 0 2763281688 0 0 0 +large_ccdg_likelihood_2048s baseline 4.15 3.98 0.15 0 0 0 0 0 0 0 +large_ccdg_likelihood_2048s exact 2.85 2.69 0.14 20000 20000 0 40960000 0 0 0 +large_ccdg_likelihood_2048s interp 2.9 2.74 0.14 20000 20000 0 40960000 20000 20000 0 +large_reordered_likelihood_2048s baseline 3.01 2.91 0.09 0 0 0 0 0 0 0 +large_reordered_likelihood_2048s exact 2.72 2.59 0.09 20000 20000 0 40960000 0 0 0 +large_reordered_likelihood_2048s interp 2.72 2.6 0.08 20000 20000 0 40960000 0 0 0 +large_multiallelic_likelihood_2048s baseline 3.3 3.18 0.1 0 0 0 0 0 0 0 +large_multiallelic_likelihood_2048s exact 2.16 2.05 0.09 16000 16000 0 32768000 0 0 0 +large_multiallelic_likelihood_2048s interp 2.13 2.01 0.1 16000 16000 0 32768000 16000 16000 0 +large_float_string_2048s baseline 3.04 2.87 0.15 0 0 0 0 0 0 0 +large_float_string_2048s exact 2.9 2.75 0.14 16000 16000 0 32768000 0 0 0 +large_float_string_2048s interp 2.99 2.81 0.14 16000 16000 0 32768000 0 0 0 +large_phase_width_variation_2048s baseline 2.73 2.57 0.14 0 0 0 0 0 0 0 +large_phase_width_variation_2048s exact 2.14 1.98 0.14 12000 12000 0 24576000 0 0 0 +large_phase_width_variation_2048s interp 2.21 2.05 0.13 12000 12000 0 24576000 12000 12000 0 +large_mixed_likelihood_2048s baseline 2.27 2.18 0.07 0 0 0 0 0 0 0 large_mixed_likelihood_2048s exact 1.64 1.56 0.06 12000 11400 600 23347200 7355 6650 705 large_mixed_likelihood_2048s interp 1.65 1.58 0.06 12000 12000 0 24576000 11295 10236 1059 -large_gt_first_reordered_2048s baseline 1.75 1.69 0.05 0 0 0 0 0 0 0 -large_gt_first_reordered_2048s exact 1.57 1.5 0.05 12000 12000 0 24576000 0 0 0 -large_gt_first_reordered_2048s interp 1.57 1.5 0.05 12000 12000 0 24576000 0 0 0 +large_gt_first_reordered_2048s baseline 1.79 1.73 0.05 0 0 0 0 0 0 0 +large_gt_first_reordered_2048s exact 1.58 1.52 0.05 12000 12000 0 24576000 0 0 0 +large_gt_first_reordered_2048s 
interp 1.59 1.53 0.05 12000 12000 0 24576000 0 0 0 large_two_string_float_2048s baseline 2.35 2.21 0.12 0 0 0 0 0 0 0 -large_two_string_float_2048s exact 2.49 2.36 0.12 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s interp 2.47 2.32 0.12 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s exact 2.43 2.29 0.12 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s interp 2.46 2.32 0.12 12000 12000 0 24576000 0 0 0 diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index ad6103474..64ecf63ed 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -283,6 +283,32 @@ All exact and interp outputs compared byte-identical to baseline. Highlights: The important negative-cache result is the full 1000G GT-only workload: dynamic mode no longer pays 1,103,547 failed likelihood-shape probes. +## 2026-04-29 GT-Only Fast Path + +Added a tiny general-plan executor for the common `FORMAT=GT` / diploid `GT2` +shape. This is still shape-based rather than data-set specific: + +- requires a single FORMAT op and that op must be `GT`, +- requires allele indexes that fit the existing one-digit `GT2` parser, +- falls through to the existing strict/measured paths for haploid, dynamic GT, + malformed rows, or any unsupported row-local detail. + +Also tightened the older exact-name CCDG kernels so they only claim a FORMAT +after checking the relevant header types and scalar counts. A new +`format-plan-header-mismatch.vcf` fixture keeps this honest by using CCDG-shaped +names with `AD` declared as a string. + +Latest full large-corpus run remained byte-identical to baseline for exact and +interp. 
The main win is the full 1000G chr22 GT-only workload: + +| Input | Baseline user | Exact user | Dynamic interp user | +|---|---:|---:|---:| +| 1000G chr22 full GT | 24.86 s | 5.77 s | 5.61 s | + +The previous cached-shape run was about 9.1 s user in dynamic mode on this +input, so the direct GT-only executor removes roughly 39% of the remaining +planned-parser CPU for this large real workload. + ## Open Questions - How much of the gap is parse-loop dispatch versus generic encode cost? diff --git a/test/format-plan-header-mismatch.vcf b/test/format-plan-header-mismatch.vcf new file mode 100644 index 000000000..583233048 --- /dev/null +++ b/test/format-plan-header-mismatch.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10600000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/1:3,4:7:42:90,0,120 0/0:5,0:5:15:0,15,200 ./.:.:0:.:. diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh index ba3a61836..78b6cdc8a 100755 --- a/test/test_format_plan.sh +++ b/test/test_format_plan.sh @@ -2,7 +2,7 @@ set -eu test_view=${TEST_VIEW:-./test/test_view} -input=${1:-test/format-plan-edge.vcf} +inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf"} tmpdir=${TMPDIR:-/tmp} base=${tmpdir}/hts-format-plan-base.$$ plan=${tmpdir}/hts-format-plan-plan.$$ @@ -12,10 +12,13 @@ interp_stats=${tmpdir}/hts-format-plan-interp-stats.$$ trap 'rm -f "$base" "$plan" "$interp" "$stats" "$interp_stats"' EXIT HUP INT TERM -env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 "$input" > "$base" -env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$plan" 2> "$stats" -env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$interp" 2> "$interp_stats" -cmp "$base" "$plan" -cmp "$base" "$interp" -cat "$stats" -cat "$interp_stats" +for input in $inputs +do + env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 
"$input" > "$base" + env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$plan" 2> "$stats" + env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$interp" 2> "$interp_stats" + cmp "$base" "$plan" + cmp "$base" "$interp" + cat "$stats" + cat "$interp_stats" +done diff --git a/vcf.c b/vcf.c index cf835d6cd..a115625a0 100644 --- a/vcf.c +++ b/vcf.c @@ -3421,6 +3421,23 @@ static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, (plan->has_ab && plan->key_ab < 0) || (plan->has_phase && (plan->key_pgt < 0 || plan->key_pid < 0))) plan->supported = 0; + if (plan->supported && + (bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_gt) != BCF_HT_STR || + bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_ad) != BCF_HT_INT || + bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_dp) != BCF_HT_INT || + bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_gq) != BCF_HT_INT || + bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_pl) != BCF_HT_INT || + bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_dp) != 1 || + bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_gq) != 1 || + (plan->has_ab && + (bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_ab) != BCF_HT_REAL || + bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_ab) != 1)) || + (plan->has_phase && + (bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_pgt) != BCF_HT_STR || + bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_pid) != BCF_HT_STR || + bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_pgt) != 1 || + bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_pid) != 1)))) + plan->supported = 0; return plan->supported; } @@ -4537,6 +4554,55 @@ static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h return 0; } +static int vcf_parse_format_general_gt2_only(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q) +{ + int nsamples = bcf_hdr_nsamples(h), sample; + size_t indiv_l0 = v->indiv.l; + uint8_t *gt8; + const char *cur, *end; + + if (plan->n_ops != 1 || !plan->ops[0].is_gt || 
v->n_allele > 10) + return -4; + + bcf_enc_int1(&v->indiv, plan->ops[0].key); + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) + goto error; + gt8 = (uint8_t *)v->indiv.s + v->indiv.l; + v->indiv.l += (size_t)nsamples * 2; + + cur = q + 1; + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (vcf_plan_gt2_u8(&cur, &gt8[sample * 2]) < 0) + goto fallback; + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else + goto fallback; + } + if (sample != nsamples) + goto fallback; + + v->n_fmt = 1; + v->n_sample = nsamples; + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += nsamples; + return 0; + +fallback: + v->indiv.l = indiv_l0; + return -4; +error: + v->indiv.l = indiv_l0; + return -1; +} + static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, @@ -5077,6 +5143,15 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, if (!nsamples) return 0; strict_enabled = vcf_format_fast_guard_enabled(&plan->strict_guard); + if (strict_enabled && plan->n_ops == 1 && plan->ops[0].is_gt) { + ret = vcf_parse_format_general_gt2_only(s, h, v, plan, q); + if (ret == 0) { + vcf_format_fast_guard_success(&plan->strict_guard); + return ret; + } + if (ret != -4) + return ret; + } if (plan->likelihood_supported && strict_enabled) { ret = vcf_parse_format_general_likelihood_strict(s, h, v, plan, q); if (ret == 0) { vcf_format_fast_guard_success(&plan->strict_guard); From 3d51a29a619d4e3f8ebe2b8e1fbe13d6a5dda8b9 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 16:52:35 +0200 Subject: [PATCH 19/38] Tighten dynamic likelihood parsing --- bench/format-shape/large/results/timings.tsv | 60 +++---- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 29 +++ test/format-plan-edge.vcf | 5 + vcf.c | 168 +++++++++++++++++- 4 files changed, 226 insertions(+), 36 deletions(-) diff --git
a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv index 3589a8ef2..a3c2565c7 100644 --- a/bench/format-shape/large/results/timings.tsv +++ b/bench/format-shape/large/results/timings.tsv @@ -1,31 +1,31 @@ name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 2.82 2.63 0.14 0 0 0 0 0 0 0 -ccdg_10k exact 1.82 1.64 0.14 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.8 1.63 0.14 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_full_genotypes baseline 25.57 24.86 0.62 0 0 0 0 0 0 0 -1000g_chr22_full_genotypes exact 6.41 5.77 0.57 1103547 1103547 0 2763281688 0 0 0 -1000g_chr22_full_genotypes interp 6.22 5.61 0.57 1103547 1103547 0 2763281688 0 0 0 -large_ccdg_likelihood_2048s baseline 4.15 3.98 0.15 0 0 0 0 0 0 0 -large_ccdg_likelihood_2048s exact 2.85 2.69 0.14 20000 20000 0 40960000 0 0 0 -large_ccdg_likelihood_2048s interp 2.9 2.74 0.14 20000 20000 0 40960000 20000 20000 0 -large_reordered_likelihood_2048s baseline 3.01 2.91 0.09 0 0 0 0 0 0 0 -large_reordered_likelihood_2048s exact 2.72 2.59 0.09 20000 20000 0 40960000 0 0 0 -large_reordered_likelihood_2048s interp 2.72 2.6 0.08 20000 20000 0 40960000 0 0 0 -large_multiallelic_likelihood_2048s baseline 3.3 3.18 0.1 0 0 0 0 0 0 0 -large_multiallelic_likelihood_2048s exact 2.16 2.05 0.09 16000 16000 0 32768000 0 0 0 -large_multiallelic_likelihood_2048s interp 2.13 2.01 0.1 16000 16000 0 32768000 16000 16000 0 -large_float_string_2048s baseline 3.04 2.87 0.15 0 0 0 0 0 0 0 -large_float_string_2048s exact 2.9 2.75 0.14 16000 16000 0 32768000 0 0 0 -large_float_string_2048s interp 2.99 2.81 0.14 16000 16000 0 32768000 0 0 0 -large_phase_width_variation_2048s baseline 2.73 2.57 0.14 0 0 0 0 0 0 0 -large_phase_width_variation_2048s exact 2.14 1.98 0.14 12000 12000 0 24576000 0 0 0 -large_phase_width_variation_2048s interp 2.21 2.05 0.13 12000 12000 0 24576000 12000 12000 0 -large_mixed_likelihood_2048s baseline 
2.27 2.18 0.07 0 0 0 0 0 0 0 -large_mixed_likelihood_2048s exact 1.64 1.56 0.06 12000 11400 600 23347200 7355 6650 705 -large_mixed_likelihood_2048s interp 1.65 1.58 0.06 12000 12000 0 24576000 11295 10236 1059 -large_gt_first_reordered_2048s baseline 1.79 1.73 0.05 0 0 0 0 0 0 0 -large_gt_first_reordered_2048s exact 1.58 1.52 0.05 12000 12000 0 24576000 0 0 0 -large_gt_first_reordered_2048s interp 1.59 1.53 0.05 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s baseline 2.35 2.21 0.12 0 0 0 0 0 0 0 -large_two_string_float_2048s exact 2.43 2.29 0.12 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s interp 2.46 2.32 0.12 12000 12000 0 24576000 0 0 0 +ccdg_10k baseline 2.74 2.52 0.18 0 0 0 0 0 0 0 +ccdg_10k exact 1.77 1.59 0.16 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.73 1.56 0.16 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_full_genotypes baseline 26.48 25.45 0.95 0 0 0 0 0 0 0 +1000g_chr22_full_genotypes exact 6.36 5.64 0.67 1103547 1103547 0 2763281688 0 0 0 +1000g_chr22_full_genotypes interp 6.39 5.68 0.67 1103547 1103547 0 2763281688 0 0 0 +large_ccdg_likelihood_2048s baseline 4.59 4.34 0.18 0 0 0 0 0 0 0 +large_ccdg_likelihood_2048s exact 3.09 2.93 0.15 20000 20000 0 40960000 0 0 0 +large_ccdg_likelihood_2048s interp 3.15 2.96 0.15 20000 20000 0 40960000 20000 20000 0 +large_reordered_likelihood_2048s baseline 3.32 3.19 0.1 0 0 0 0 0 0 0 +large_reordered_likelihood_2048s exact 2.92 2.82 0.09 20000 20000 0 40960000 0 0 0 +large_reordered_likelihood_2048s interp 2.93 2.82 0.1 20000 20000 0 40960000 0 0 0 +large_multiallelic_likelihood_2048s baseline 3.64 3.51 0.11 0 0 0 0 0 0 0 +large_multiallelic_likelihood_2048s exact 2.4 2.26 0.11 16000 16000 0 32768000 0 0 0 +large_multiallelic_likelihood_2048s interp 2.18 2.07 0.1 16000 16000 0 32768000 16000 16000 0 +large_float_string_2048s baseline 3.33 3.15 0.17 0 0 0 0 0 0 0 +large_float_string_2048s exact 3.26 3.07 0.15 16000 16000 0 32768000 0 0 0 +large_float_string_2048s interp 3.2 3.04 
0.15 16000 16000 0 32768000 0 0 0 +large_phase_width_variation_2048s baseline 3.02 2.83 0.15 0 0 0 0 0 0 0 +large_phase_width_variation_2048s exact 2.29 2.13 0.15 12000 12000 0 24576000 0 0 0 +large_phase_width_variation_2048s interp 2.36 2.2 0.15 12000 12000 0 24576000 12000 12000 0 +large_mixed_likelihood_2048s baseline 2.51 2.42 0.08 0 0 0 0 0 0 0 +large_mixed_likelihood_2048s exact 1.8 1.72 0.07 12000 11400 600 23347200 7355 6650 705 +large_mixed_likelihood_2048s interp 1.82 1.74 0.07 12000 12000 0 24576000 11295 10236 1059 +large_gt_first_reordered_2048s baseline 1.98 1.91 0.06 0 0 0 0 0 0 0 +large_gt_first_reordered_2048s exact 1.77 1.68 0.05 12000 12000 0 24576000 0 0 0 +large_gt_first_reordered_2048s interp 1.73 1.67 0.05 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s baseline 2.61 2.47 0.13 0 0 0 0 0 0 0 +large_two_string_float_2048s exact 2.7 2.55 0.13 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s interp 2.72 2.57 0.13 12000 12000 0 24576000 0 0 0 diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index 64ecf63ed..3613e218c 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -309,6 +309,35 @@ The previous cached-shape run was about 9.1 s user in dynamic mode on this input, so the direct GT-only executor removes roughly 39% of the remaining planned-parser CPU for this large real workload. +## 2026-04-29 Multiallelic Parse Tightening + +Added two small low-risk likelihood executor refinements: + +- avoid retrying the likelihood shape executor inside the strict path when the + same row already reached the likelihood executor and failed row-local checks; +- add fixed-width integer vector parsers for AD width 4 and PL widths 6/10, + covering common triallelic and quad-allelic `Number=G` likelihood rows. 
+ +The fixed-width parsers still use the same scalar integer parser and range +tracking, and they preserve short-vector padding and trailing-comma fallback +behavior. + +Small edge coverage now includes: + +- row-local likelihood fallback from short AD/PL in individual samples, +- missing AD/PL with another sample proving full row width, +- GT-only fast-path hits plus haploid and multidigit GT fallbacks. + +Latest full large-corpus run remained byte-identical to baseline. Timings were +noisier overall than the previous pass, but the important rows were: + +| Input | Exact user | Dynamic interp user | Notes | +|---|---:|---:|---| +| CCDG 10k | 1.59 s | 1.56 s | likelihood shape parity | +| 1000G chr22 full GT | 5.64 s | 5.68 s | GT-only fast path retained | +| Large multiallelic likelihood | 2.26 s | 2.07 s | dynamic ahead of exact | +| Mixed row-local fallbacks | 1.72 s | 1.74 s | byte-clean fallback path | + ## Open Questions - How much of the gap is parse-loop dispatch versus generic encode cost? diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf index 62a942695..4622ae912 100644 --- a/test/format-plan-edge.vcf +++ b/test/format-plan-edge.vcf @@ -24,8 +24,13 @@ chr22 10560000 . A C,G 50 PASS . GT:GL:DP:GQ 0/1:-0.1,-1.2,-9.9,-2.0,-3.0,-4.0:7 chr22 10570000 . A T 50 PASS . GT:AD:DP:GQ:PL 0:3,0:3:10:0,10,100 1:0,3:3:20:100,10,0 .:0,0:0:.:. chr22 10580000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . 
GT:AD:DP:GQ:PL 10/10:0,0,0,0,0,0,0,0,0,0,7:7:20:200,190,180,170,160,150,140,130,120,110,100,90,80,70,60,50,40,30,20,10,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460 0/10:3,0,0,0,0,0,0,0,0,0,2:5:30:0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650 ./.:0,0,0,0,0,0,0,0,0,0,0:0:.:. chr22 10585000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/0:.:3:10:. 0/1:.:5:20:. ./.:.:0:.:. +chr22 10586000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/1:3,4:7:50:90,0,120 0/1:3:3:20:80,0 ./.:0,0:0:.:. +chr22 10587000 . A C,G 50 PASS . GT:AD:DP:GQ:PL 1/2:1,4,5:10:60:100,80,70,60,0,20 0/2:.:5:30:. ./.:0,0,0:0:.:. chr22 10590000 . A T 50 PASS . DP:GQ:GT:AD:PL 11:50:0/1:6,5:80,0,90 8:45:0/0:8,0:0,45,100 0:.:./.:0,0:. chr22 10591000 . A T 50 PASS . AD:PL:GT:DP:GQ 4,3:70,0,80:0/1:7:60 9,0:0,70,120:0/0:9:50 0,0:.:./.:0:. chr22 10592000 . A T 50 PASS . GT:DP:AB:GQ:AD:PL 0/1:12:0.42:70:7,5:90,0,100 0/0:10:0.01:60:10,0:0,60,120 ./.:0:.:.:0,0:. chr22 10593000 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:12:3,4,5,6 0/0:-129,20:8:8,0,0,0 ./.:.,.:0:.,.,.,. chr22 10594000 . A T 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 0|1:4,5:9:50:0|1:P1:90,0,90 0/1:3,2:5:20:0|1:10594000_A_T_LONG_PHASE_SET:20,0,200 ./.:0,0:0:.:.:.:. +chr22 10595000 . A T 50 PASS . GT 0/1 1|1 ./. +chr22 10596000 . A T 50 PASS . GT 0 1 . +chr22 10597000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . GT 10/10 0/10 ./. 
diff --git a/vcf.c b/vcf.c index a115625a0..7b66a5c62 100644 --- a/vcf.c +++ b/vcf.c @@ -3918,6 +3918,148 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char * return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector4_counted_range(const char **sp, int32_t *out, int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + int i = 4; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + out[3] = bcf_int32_vector_end; + i = 1; + goto done; + } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { + out[2] = bcf_int32_vector_end; + out[3] = bcf_int32_vector_end; + i = 2; + goto done; + } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s != ',') { + out[3] = bcf_int32_vector_end; + i = 3; + goto done; + } + s++; + if (vcf_plan_int_value_range(&s, &out[3], range) < 0) + return -1; + if (*s == ',') + return -1; +done: + *sp = s; + if (nread) + *nread = i; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector6_counted_range(const char **sp, int32_t *out, int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + int i = 6, j; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { i = 1; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { i = 2; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s != ',') { i = 3; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[3], range) < 0) + return -1; + if (*s != ',') { i = 4; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[4], range) < 0) + return -1; + if (*s != ',') { i = 5; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[5], range) < 0) + return -1; + if (*s == ',') + return -1; + goto done; +fill: + for (j = 
i; j < 6; j++) + out[j] = bcf_int32_vector_end; +done: + *sp = s; + if (nread) + *nread = i; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector10_counted_range(const char **sp, int32_t *out, int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + int i = 10, j; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { i = 1; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { i = 2; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s != ',') { i = 3; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[3], range) < 0) + return -1; + if (*s != ',') { i = 4; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[4], range) < 0) + return -1; + if (*s != ',') { i = 5; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[5], range) < 0) + return -1; + if (*s != ',') { i = 6; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[6], range) < 0) + return -1; + if (*s != ',') { i = 7; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[7], range) < 0) + return -1; + if (*s != ',') { i = 8; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[8], range) < 0) + return -1; + if (*s != ',') { i = 9; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[9], range) < 0) + return -1; + if (*s == ',') + return -1; + goto done; +fill: + for (j = i; j < 10; j++) + out[j] = bcf_int32_vector_end; +done: + *sp = s; + if (nread) + *nread = i; + return 0; +} + VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t *out) { return vcf_plan_parse_int_vector3_counted(sp, out, NULL); @@ -4898,6 +5040,9 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, } else if (ad_w == 3) { if (vcf_plan_parse_int_vector3_counted_range(&cur, &ad[sample * 3], &nread, &ad_range) < 0) goto fallback; + } else if (ad_w == 4) { + if 
(vcf_plan_parse_int_vector4_counted_range(&cur, &ad[sample * 4], &nread, &ad_range) < 0) + goto fallback; } else if (vcf_plan_parse_int_vector_counted_range(&cur, &ad[sample * ad_w], ad_w, &nread, &ad_range) < 0) { goto fallback; } @@ -4928,6 +5073,12 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, if (pl_w == 3) { if (vcf_plan_parse_int_vector3_counted_range(&cur, &pl[sample * 3], &nread, &pl_range) < 0) goto fallback; + } else if (pl_w == 6) { + if (vcf_plan_parse_int_vector6_counted_range(&cur, &pl[sample * 6], &nread, &pl_range) < 0) + goto fallback; + } else if (pl_w == 10) { + if (vcf_plan_parse_int_vector10_counted_range(&cur, &pl[sample * 10], &nread, &pl_range) < 0) + goto fallback; } else if (vcf_plan_parse_int_vector_counted_range(&cur, &pl[sample * pl_w], pl_w, &nread, &pl_range) < 0) { goto fallback; } @@ -4990,21 +5141,25 @@ static int vcf_parse_format_general_likelihood_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, - char *q) + char *q, int *attempted_shape) { int widths[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; + if (attempted_shape) + *attempted_shape = 0; if (vcf_format_general_likelihood_widths(s, h, plan, v, q, widths) < 0) return -4; vcf_format_general_resolve_ops(plan, v, widths, row_ops); + if (attempted_shape) + *attempted_shape = 1; return vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops); } static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, - char *q) + char *q, int try_likelihood) { kstring_t *mem; int widths[MAX_N_FMT], max_counts[MAX_N_FMT]; @@ -5020,7 +5175,7 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, vcf_plan_int_range_init(&ranges[j]); } vcf_format_general_resolve_ops(plan, v, widths, row_ops); - if (plan->likelihood_supported && + if (try_likelihood && plan->likelihood_supported && (ret = 
vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops)) != -4) return ret; if (vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) @@ -5128,7 +5283,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, kstring_t *mem; int widths[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; - int nsamples, sample, j, vcf44, ret, strict_enabled; + int nsamples, sample, j, vcf44, ret, strict_enabled, likelihood_tried; const char *cur, *end; plan = vcf_format_general_plan_get(h, p); @@ -5143,6 +5298,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, if (!nsamples) return 0; strict_enabled = vcf_format_fast_guard_enabled(&plan->strict_guard); + likelihood_tried = 0; if (strict_enabled && plan->n_ops == 1 && plan->ops[0].is_gt) { ret = vcf_parse_format_general_gt2_only(s, h, v, plan, q); if (ret == 0) { @@ -5153,7 +5309,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, return ret; } if (plan->likelihood_supported && strict_enabled) { - ret = vcf_parse_format_general_likelihood_strict(s, h, v, plan, q); + ret = vcf_parse_format_general_likelihood_strict(s, h, v, plan, q, &likelihood_tried); if (ret == 0) { vcf_format_fast_guard_success(&plan->strict_guard); return ret; @@ -5162,7 +5318,7 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, return ret; } if (plan->strict_supported && strict_enabled) { - ret = vcf_parse_format_general_strict(s, h, v, plan, q); + ret = vcf_parse_format_general_strict(s, h, v, plan, q, !likelihood_tried); if (ret == 0) { vcf_format_fast_guard_success(&plan->strict_guard); return ret; From 949eee3a4e24d50aa3c27f254803a4fce11d030f Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 17:01:09 +0200 Subject: [PATCH 20/38] Skip integer sentinel checks when proven absent --- bench/format-shape/large/results/timings.tsv | 56 ++++++------ ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 29 ++++++ 
test/format-plan-edge.vcf | 1 + vcf.c | 90 ++++++++++++++----- 4 files changed, 124 insertions(+), 52 deletions(-) diff --git a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv index a3c2565c7..682f9dd58 100644 --- a/bench/format-shape/large/results/timings.tsv +++ b/bench/format-shape/large/results/timings.tsv @@ -1,31 +1,31 @@ name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 2.74 2.52 0.18 0 0 0 0 0 0 0 -ccdg_10k exact 1.77 1.59 0.16 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.73 1.56 0.16 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_full_genotypes baseline 26.48 25.45 0.95 0 0 0 0 0 0 0 -1000g_chr22_full_genotypes exact 6.36 5.64 0.67 1103547 1103547 0 2763281688 0 0 0 -1000g_chr22_full_genotypes interp 6.39 5.68 0.67 1103547 1103547 0 2763281688 0 0 0 -large_ccdg_likelihood_2048s baseline 4.59 4.34 0.18 0 0 0 0 0 0 0 -large_ccdg_likelihood_2048s exact 3.09 2.93 0.15 20000 20000 0 40960000 0 0 0 -large_ccdg_likelihood_2048s interp 3.15 2.96 0.15 20000 20000 0 40960000 20000 20000 0 -large_reordered_likelihood_2048s baseline 3.32 3.19 0.1 0 0 0 0 0 0 0 -large_reordered_likelihood_2048s exact 2.92 2.82 0.09 20000 20000 0 40960000 0 0 0 -large_reordered_likelihood_2048s interp 2.93 2.82 0.1 20000 20000 0 40960000 0 0 0 -large_multiallelic_likelihood_2048s baseline 3.64 3.51 0.11 0 0 0 0 0 0 0 -large_multiallelic_likelihood_2048s exact 2.4 2.26 0.11 16000 16000 0 32768000 0 0 0 -large_multiallelic_likelihood_2048s interp 2.18 2.07 0.1 16000 16000 0 32768000 16000 16000 0 -large_float_string_2048s baseline 3.33 3.15 0.17 0 0 0 0 0 0 0 -large_float_string_2048s exact 3.26 3.07 0.15 16000 16000 0 32768000 0 0 0 -large_float_string_2048s interp 3.2 3.04 0.15 16000 16000 0 32768000 0 0 0 -large_phase_width_variation_2048s baseline 3.02 2.83 0.15 0 0 0 0 0 0 0 -large_phase_width_variation_2048s exact 2.29 2.13 0.15 12000 12000 0 24576000 0 0 0 
+ccdg_10k baseline 3.01 2.83 0.14 0 0 0 0 0 0 0 +ccdg_10k exact 1.93 1.74 0.15 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.88 1.73 0.14 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_full_genotypes baseline 28.61 27.65 0.82 0 0 0 0 0 0 0 +1000g_chr22_full_genotypes exact 6.7 6.02 0.6 1103547 1103547 0 2763281688 0 0 0 +1000g_chr22_full_genotypes interp 6.73 6.09 0.6 1103547 1103547 0 2763281688 0 0 0 +large_ccdg_likelihood_2048s baseline 4.55 4.37 0.17 0 0 0 0 0 0 0 +large_ccdg_likelihood_2048s exact 3.21 3.03 0.15 20000 20000 0 40960000 0 0 0 +large_ccdg_likelihood_2048s interp 3.13 2.98 0.15 20000 20000 0 40960000 20000 20000 0 +large_reordered_likelihood_2048s baseline 3.26 3.14 0.09 0 0 0 0 0 0 0 +large_reordered_likelihood_2048s exact 2.96 2.84 0.09 20000 20000 0 40960000 0 0 0 +large_reordered_likelihood_2048s interp 2.97 2.87 0.09 20000 20000 0 40960000 0 0 0 +large_multiallelic_likelihood_2048s baseline 3.6 3.47 0.11 0 0 0 0 0 0 0 +large_multiallelic_likelihood_2048s exact 2.41 2.29 0.1 16000 16000 0 32768000 0 0 0 +large_multiallelic_likelihood_2048s interp 2.24 2.12 0.1 16000 16000 0 32768000 16000 16000 0 +large_float_string_2048s baseline 3.33 3.15 0.16 0 0 0 0 0 0 0 +large_float_string_2048s exact 3.19 3.03 0.15 16000 16000 0 32768000 0 0 0 +large_float_string_2048s interp 3.3 3.11 0.15 16000 16000 0 32768000 0 0 0 +large_phase_width_variation_2048s baseline 2.99 2.81 0.16 0 0 0 0 0 0 0 +large_phase_width_variation_2048s exact 2.38 2.19 0.16 12000 12000 0 24576000 0 0 0 large_phase_width_variation_2048s interp 2.36 2.2 0.15 12000 12000 0 24576000 12000 12000 0 -large_mixed_likelihood_2048s baseline 2.51 2.42 0.08 0 0 0 0 0 0 0 -large_mixed_likelihood_2048s exact 1.8 1.72 0.07 12000 11400 600 23347200 7355 6650 705 -large_mixed_likelihood_2048s interp 1.82 1.74 0.07 12000 12000 0 24576000 11295 10236 1059 -large_gt_first_reordered_2048s baseline 1.98 1.91 0.06 0 0 0 0 0 0 0 -large_gt_first_reordered_2048s exact 1.77 1.68 0.05 12000 12000 0 
24576000 0 0 0 +large_mixed_likelihood_2048s baseline 2.46 2.37 0.08 0 0 0 0 0 0 0 +large_mixed_likelihood_2048s exact 1.79 1.71 0.07 12000 11400 600 23347200 7355 6650 705 +large_mixed_likelihood_2048s interp 1.8 1.72 0.07 12000 12000 0 24576000 11295 10236 1059 +large_gt_first_reordered_2048s baseline 1.94 1.87 0.06 0 0 0 0 0 0 0 +large_gt_first_reordered_2048s exact 1.74 1.68 0.05 12000 12000 0 24576000 0 0 0 large_gt_first_reordered_2048s interp 1.73 1.67 0.05 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s baseline 2.61 2.47 0.13 0 0 0 0 0 0 0 -large_two_string_float_2048s exact 2.7 2.55 0.13 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s interp 2.72 2.57 0.13 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s baseline 2.58 2.43 0.13 0 0 0 0 0 0 0 +large_two_string_float_2048s exact 2.71 2.58 0.13 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s interp 2.71 2.56 0.13 12000 12000 0 24576000 0 0 0 diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index 3613e218c..b0619716f 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -338,6 +338,35 @@ noisier overall than the previous pass, but the important rows were: | Large multiallelic likelihood | 2.26 s | 2.07 s | dynamic ahead of exact | | Mixed row-local fallbacks | 1.72 s | 1.74 s | byte-clean fallback path | +## 2026-04-29 No-Special Integer Encode + +Added a conservative `has_special` bit to planned integer range tracking. The +parser now records when it has observed `bcf_int32_missing` or +`bcf_int32_vector_end`, including vector-end padding from short fixed-width +vectors. The known-range encoder uses that proof to skip per-value sentinel +checks in int8/int16 output only when the row contains no missing/vector-end +values. + +Safety rule: min/max alone never proves this. 
Missing and vector-end sentinels +can still select int8/int16 BCF encodings, so the fast loop is gated only by the +parser-maintained flag. + +Small edge coverage now includes integer boundary rows spanning int8/int16/int32 +choices, plus existing rows with scalar missing values, short fixed vectors, and +explicit vector missing values. + +Latest full large-corpus run: + +| Input | Exact user | Dynamic interp user | Notes | +|---|---:|---:|---| +| CCDG 10k | 1.74 s | 1.73 s | real likelihood parity | +| 1000G chr22 full GT | 6.02 s | 6.09 s | GT-only path retained | +| Large CCDG-like synthetic | 3.03 s | 2.98 s | dynamic slightly ahead | +| Large multiallelic likelihood | 2.29 s | 2.12 s | dynamic ahead | +| Mixed row-local fallbacks | 1.71 s | 1.72 s | byte-clean fallback path | + +All exact and interp outputs compared byte-identical to baseline. + ## Open Questions - How much of the gap is parse-loop dispatch versus generic encode cost? diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf index 4622ae912..8467573cc 100644 --- a/test/format-plan-edge.vcf +++ b/test/format-plan-edge.vcf @@ -30,6 +30,7 @@ chr22 10590000 . A T 50 PASS . DP:GQ:GT:AD:PL 11:50:0/1:6,5:80,0,90 8:45:0/0:8,0 chr22 10591000 . A T 50 PASS . AD:PL:GT:DP:GQ 4,3:70,0,80:0/1:7:60 9,0:0,70,120:0/0:9:50 0,0:.:./.:0:. chr22 10592000 . A T 50 PASS . GT:DP:AB:GQ:AD:PL 0/1:12:0.42:70:7,5:90,0,100 0/0:10:0.01:60:10,0:0,60,120 ./.:0:.:.:0,0:. chr22 10593000 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:12:3,4,5,6 0/0:-129,20:8:8,0,0,0 ./.:.,.:0:.,.,.,. +chr22 10593500 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:32767:3,4,5,6 0/0:-32760,32768:-32761:8,0,0,0 ./.:32767,32768:0:127,128,32767,32768 chr22 10594000 . A T 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 0|1:4,5:9:50:0|1:P1:90,0,90 0/1:3,2:5:20:0|1:10594000_A_T_LONG_PHASE_SET:20,0,200 ./.:0,0:0:.:.:.:. chr22 10595000 . A T 50 PASS . GT 0/1 1|1 ./. chr22 10596000 . A T 50 PASS . GT 0 1 . 
diff --git a/vcf.c b/vcf.c index 7b66a5c62..214345608 100644 --- a/vcf.c +++ b/vcf.c @@ -2930,8 +2930,8 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; } -static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, - int32_t min, int32_t max) +static int bcf_enc_vint_known_range_special(kstring_t *s, int n, int32_t *a, int wsize, + int32_t min, int32_t max, int has_special) { int i; // min/max must match bcf_enc_vint()'s scan: missing and vector-end values @@ -2948,10 +2948,15 @@ static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, ks_resize(s, s->l + n) < 0) return -1; uint8_t *p = (uint8_t *) s->s + s->l; - for (i = 0; i < n; ++i, p++) { - if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; - else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; - else *p = a[i]; + if (has_special) { + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + } else { + for (i = 0; i < n; ++i, p++) + *p = a[i]; } s->l += n; } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { @@ -2960,14 +2965,22 @@ static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, ks_resize(s, s->l + n * sizeof(int16_t)) < 0) return -1; p = (uint8_t *) s->s + s->l; - for (i = 0; i < n; ++i) - { - int16_t x; - if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; - else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; - else x = a[i]; - i16_to_le(x, p); - p += sizeof(int16_t); + if (has_special) { + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + } else { + for (i = 0; i < n; ++i) + { + i16_to_le((int16_t)a[i], p); + p += sizeof(int16_t); + } } s->l += n * sizeof(int16_t); } else 
{ @@ -2987,6 +3000,12 @@ static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, return 0; } +static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, + int32_t min, int32_t max) +{ + return bcf_enc_vint_known_range_special(s, n, a, wsize, min, max, 1); +} + #ifdef VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; @@ -3375,6 +3394,7 @@ typedef struct { typedef struct { int32_t min; int32_t max; + int has_special; } vcf_plan_int_range_t; #if defined(__GNUC__) @@ -3689,10 +3709,13 @@ VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_init(vcf_plan_int_range_t *range) { range->min = INT32_MAX; range->max = INT32_MIN; + range->has_special = 0; } VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add(vcf_plan_int_range_t *range, int32_t val) { + if (val == bcf_int32_missing || val == bcf_int32_vector_end) + range->has_special = 1; if (range->max < val) range->max = val; if (range->min > val && val > INT32_MIN + 1) @@ -3780,6 +3803,8 @@ static int vcf_plan_parse_int_vector_counted_range(const char **sp, int32_t *out nvals = i; if (nread) *nread = nvals; + if (i < width) + range->has_special = 1; for (; i < width; i++) out[i] = bcf_int32_vector_end; if (*s == ',') @@ -3827,6 +3852,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted_range(const char * if (*s != ',') { out[1] = bcf_int32_vector_end; *sp = s; + range->has_special = 1; if (nread) *nread = 1; return 0; @@ -3893,6 +3919,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char * out[1] = bcf_int32_vector_end; out[2] = bcf_int32_vector_end; *sp = s; + range->has_special = 1; if (nread) *nread = 1; return 0; @@ -3903,6 +3930,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char * if (*s != ',') { out[2] = bcf_int32_vector_end; *sp = s; + range->has_special = 1; if (nread) *nread = 2; return 0; @@ -3930,6 +3958,7 @@ VCF_PLAN_ALWAYS_INLINE int 
vcf_plan_parse_int_vector4_counted_range(const char * out[1] = bcf_int32_vector_end; out[2] = bcf_int32_vector_end; out[3] = bcf_int32_vector_end; + range->has_special = 1; i = 1; goto done; } @@ -3939,6 +3968,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector4_counted_range(const char * if (*s != ',') { out[2] = bcf_int32_vector_end; out[3] = bcf_int32_vector_end; + range->has_special = 1; i = 2; goto done; } @@ -3947,6 +3977,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector4_counted_range(const char * return -1; if (*s != ',') { out[3] = bcf_int32_vector_end; + range->has_special = 1; i = 3; goto done; } @@ -3994,6 +4025,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector6_counted_range(const char * return -1; goto done; fill: + range->has_special = 1; for (j = i; j < 6; j++) out[j] = bcf_int32_vector_end; done: @@ -4051,6 +4083,7 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector10_counted_range(const char return -1; goto done; fill: + range->has_special = 1; for (j = i; j < 10; j++) out[j] = bcf_int32_vector_end; done: @@ -4494,8 +4527,9 @@ static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring op->kind == VCF_FORMAT_ROW_INT2 || op->kind == VCF_FORMAT_ROW_INT3 || op->kind == VCF_FORMAT_ROW_INTN) { - if (bcf_enc_vint_known_range(dst, nsamples * op->width, (int32_t *)buf, - op->width, ranges[j].min, ranges[j].max) < 0) + if (bcf_enc_vint_known_range_special(dst, nsamples * op->width, (int32_t *)buf, + op->width, ranges[j].min, ranges[j].max, + ranges[j].has_special) < 0) return -1; } else { if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) @@ -5100,13 +5134,16 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, v->n_sample = nsamples; bcf_enc_int1(&v->indiv, row_ops[ad_idx].key); nwords = nsamples * ad_w; - if (bcf_enc_vint_known_range(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nwords, ad, ad_w, 
ad_range.min, ad_range.max, + ad_range.has_special) < 0) goto error; bcf_enc_int1(&v->indiv, row_ops[dp_idx].key); - if (bcf_enc_vint_known_range(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max, + dp_range.has_special) < 0) goto error; bcf_enc_int1(&v->indiv, row_ops[gq_idx].key); - if (bcf_enc_vint_known_range(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max, + gq_range.has_special) < 0) goto error; if (has_phase) { bcf_enc_int1(&v->indiv, row_ops[str1_idx].key); @@ -5120,7 +5157,8 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, } bcf_enc_int1(&v->indiv, row_ops[pl_idx].key); nwords = nsamples * pl_w; - if (bcf_enc_vint_known_range(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max, + pl_range.has_special) < 0) goto error; vcf_format_plan_stats.hits++; @@ -5627,13 +5665,16 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, v->n_sample = nsamples; bcf_enc_int1(&v->indiv, plan->key_ad); nwords = nsamples * ad_w; - if (bcf_enc_vint_known_range(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max, + ad_range.has_special) < 0) return -1; bcf_enc_int1(&v->indiv, plan->key_dp); - if (bcf_enc_vint_known_range(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max, + dp_range.has_special) < 0) return -1; bcf_enc_int1(&v->indiv, plan->key_gq); - if (bcf_enc_vint_known_range(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, gq, 1, 
gq_range.min, gq_range.max, + gq_range.has_special) < 0) return -1; if (plan->has_phase) { bcf_enc_int1(&v->indiv, plan->key_pgt); @@ -5649,7 +5690,8 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } bcf_enc_int1(&v->indiv, plan->key_pl); nwords = nsamples * pl_w; - if (bcf_enc_vint_known_range(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max) < 0) + if (bcf_enc_vint_known_range_special(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max, + pl_range.has_special) < 0) return -1; vcf_format_plan_stats.hits++; From aae9423ac92391f41ee52c91bbfb6541c72836c9 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 17:08:36 +0200 Subject: [PATCH 21/38] Elide likelihood row op rebuild --- bench/format-shape/large/results/timings.tsv | 60 ++++++++--------- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 24 +++++++ vcf.c | 66 +++++++++---------- 3 files changed, 85 insertions(+), 65 deletions(-) diff --git a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv index 682f9dd58..42a3a118e 100644 --- a/bench/format-shape/large/results/timings.tsv +++ b/bench/format-shape/large/results/timings.tsv @@ -1,31 +1,31 @@ name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 3.01 2.83 0.14 0 0 0 0 0 0 0 -ccdg_10k exact 1.93 1.74 0.15 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.88 1.73 0.14 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_full_genotypes baseline 28.61 27.65 0.82 0 0 0 0 0 0 0 -1000g_chr22_full_genotypes exact 6.7 6.02 0.6 1103547 1103547 0 2763281688 0 0 0 -1000g_chr22_full_genotypes interp 6.73 6.09 0.6 1103547 1103547 0 2763281688 0 0 0 -large_ccdg_likelihood_2048s baseline 4.55 4.37 0.17 0 0 0 0 0 0 0 -large_ccdg_likelihood_2048s exact 3.21 3.03 0.15 20000 20000 0 40960000 0 0 0 -large_ccdg_likelihood_2048s interp 3.13 2.98 0.15 20000 20000 0 40960000 20000 20000 0 -large_reordered_likelihood_2048s baseline 
3.26 3.14 0.09 0 0 0 0 0 0 0 -large_reordered_likelihood_2048s exact 2.96 2.84 0.09 20000 20000 0 40960000 0 0 0 -large_reordered_likelihood_2048s interp 2.97 2.87 0.09 20000 20000 0 40960000 0 0 0 -large_multiallelic_likelihood_2048s baseline 3.6 3.47 0.11 0 0 0 0 0 0 0 -large_multiallelic_likelihood_2048s exact 2.41 2.29 0.1 16000 16000 0 32768000 0 0 0 -large_multiallelic_likelihood_2048s interp 2.24 2.12 0.1 16000 16000 0 32768000 16000 16000 0 -large_float_string_2048s baseline 3.33 3.15 0.16 0 0 0 0 0 0 0 -large_float_string_2048s exact 3.19 3.03 0.15 16000 16000 0 32768000 0 0 0 -large_float_string_2048s interp 3.3 3.11 0.15 16000 16000 0 32768000 0 0 0 -large_phase_width_variation_2048s baseline 2.99 2.81 0.16 0 0 0 0 0 0 0 -large_phase_width_variation_2048s exact 2.38 2.19 0.16 12000 12000 0 24576000 0 0 0 -large_phase_width_variation_2048s interp 2.36 2.2 0.15 12000 12000 0 24576000 12000 12000 0 -large_mixed_likelihood_2048s baseline 2.46 2.37 0.08 0 0 0 0 0 0 0 -large_mixed_likelihood_2048s exact 1.79 1.71 0.07 12000 11400 600 23347200 7355 6650 705 -large_mixed_likelihood_2048s interp 1.8 1.72 0.07 12000 12000 0 24576000 11295 10236 1059 -large_gt_first_reordered_2048s baseline 1.94 1.87 0.06 0 0 0 0 0 0 0 -large_gt_first_reordered_2048s exact 1.74 1.68 0.05 12000 12000 0 24576000 0 0 0 -large_gt_first_reordered_2048s interp 1.73 1.67 0.05 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s baseline 2.58 2.43 0.13 0 0 0 0 0 0 0 -large_two_string_float_2048s exact 2.71 2.58 0.13 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s interp 2.71 2.56 0.13 12000 12000 0 24576000 0 0 0 +ccdg_10k baseline 2.95 2.76 0.13 0 0 0 0 0 0 0 +ccdg_10k exact 1.88 1.73 0.13 10000 10000 0 32020000 0 0 0 +ccdg_10k interp 1.85 1.7 0.13 10000 10000 0 32020000 10000 10000 0 +1000g_chr22_full_genotypes baseline 27.79 27.05 0.65 0 0 0 0 0 0 0 +1000g_chr22_full_genotypes exact 6.59 6 0.57 1103547 1103547 0 2763281688 0 0 0 +1000g_chr22_full_genotypes interp 7.18 
6.51 0.6 1103547 1103547 0 2763281688 0 0 0 +large_ccdg_likelihood_2048s baseline 4.28 4.03 0.21 0 0 0 0 0 0 0 +large_ccdg_likelihood_2048s exact 2.95 2.77 0.16 20000 20000 0 40960000 0 0 0 +large_ccdg_likelihood_2048s interp 2.93 2.74 0.18 20000 20000 0 40960000 20000 20000 0 +large_reordered_likelihood_2048s baseline 3.06 2.91 0.13 0 0 0 0 0 0 0 +large_reordered_likelihood_2048s exact 2.76 2.63 0.12 20000 20000 0 40960000 0 0 0 +large_reordered_likelihood_2048s interp 2.76 2.62 0.12 20000 20000 0 40960000 0 0 0 +large_multiallelic_likelihood_2048s baseline 3.35 3.18 0.15 0 0 0 0 0 0 0 +large_multiallelic_likelihood_2048s exact 2.29 2.13 0.13 16000 16000 0 32768000 0 0 0 +large_multiallelic_likelihood_2048s interp 2.06 1.92 0.13 16000 16000 0 32768000 16000 16000 0 +large_float_string_2048s baseline 3.15 2.92 0.19 0 0 0 0 0 0 0 +large_float_string_2048s exact 3.04 2.85 0.18 16000 16000 0 32768000 0 0 0 +large_float_string_2048s interp 3 2.8 0.18 16000 16000 0 32768000 0 0 0 +large_phase_width_variation_2048s baseline 2.78 2.58 0.18 0 0 0 0 0 0 0 +large_phase_width_variation_2048s exact 2.25 2.04 0.19 12000 12000 0 24576000 0 0 0 +large_phase_width_variation_2048s interp 2.25 2.05 0.18 12000 12000 0 24576000 12000 12000 0 +large_mixed_likelihood_2048s baseline 2.3 2.18 0.1 0 0 0 0 0 0 0 +large_mixed_likelihood_2048s exact 1.69 1.58 0.09 12000 11400 600 23347200 7355 6650 705 +large_mixed_likelihood_2048s interp 1.7 1.59 0.09 12000 12000 0 24576000 11295 10236 1059 +large_gt_first_reordered_2048s baseline 1.84 1.74 0.07 0 0 0 0 0 0 0 +large_gt_first_reordered_2048s exact 1.62 1.54 0.06 12000 12000 0 24576000 0 0 0 +large_gt_first_reordered_2048s interp 1.62 1.53 0.07 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s baseline 2.46 2.27 0.17 0 0 0 0 0 0 0 +large_two_string_float_2048s exact 2.64 2.46 0.16 12000 12000 0 24576000 0 0 0 +large_two_string_float_2048s interp 2.65 2.47 0.16 12000 12000 0 24576000 0 0 0 diff --git 
a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index b0619716f..3c6baa578 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -367,6 +367,30 @@ Latest full large-corpus run: All exact and interp outputs compared byte-identical to baseline. +## 2026-04-29 Likelihood Row-Op Elision + +Removed `row_ops` construction from the dynamic likelihood strict path. The +likelihood executor now consumes cached plan indices and row-local widths +directly; generic strict still builds row ops only after the likelihood attempt +fails and it needs fixed-numeric/general parsing. + +This keeps row-local validation unchanged: + +- allele count remains limited per row, +- AD/PL counts still must prove the expected width, +- phase string widths are still measured for the current row, +- malformed GT/separators/sample counts still fall back. + +Latest full large-corpus run stayed byte-identical to baseline. Highlights: + +| Input | Exact user | Dynamic interp user | Notes | +|---|---:|---:|---| +| CCDG 10k | 1.73 s | 1.70 s | real likelihood slightly ahead | +| Large CCDG-like synthetic | 2.77 s | 2.74 s | dynamic slightly ahead | +| Large multiallelic likelihood | 2.13 s | 1.92 s | dynamic ahead | +| Variable phase widths | 2.04 s | 2.05 s | phase widths still row-local | +| Mixed row-local fallbacks | 1.58 s | 1.59 s | fallback path byte-clean | + ## Open Questions - How much of the gap is parse-loop dispatch versus generic encode cost? 
diff --git a/vcf.c b/vcf.c index 214345608..55ff17284 100644 --- a/vcf.c +++ b/vcf.c @@ -4946,7 +4946,7 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, bcf1_t *v, const vcf_format_general_plan_t *plan, char *q, - vcf_format_row_op_t *row_ops) + int *widths) { kstring_t *mem = (kstring_t*)&h->mem; int nsamples = bcf_hdr_nsamples(h); @@ -4967,7 +4967,7 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, return -4; vcf_format_likelihood_shape_attempts++; - if (row_ops[0].kind != VCF_FORMAT_ROW_GT2) + if (widths[0] != 2) return -4; if (v->n_allele < 1 || v->n_allele > 8) return -4; @@ -4987,15 +4987,13 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, str2_idx = plan->likelihood_str2_idx; pl_idx = plan->likelihood_pl_idx; - if (!vcf_format_row_is_int(&row_ops[ad_idx]) || - row_ops[ad_idx].width != ad_w || - row_ops[dp_idx].kind != VCF_FORMAT_ROW_INT1 || - row_ops[gq_idx].kind != VCF_FORMAT_ROW_INT1 || - (has_float && row_ops[float_idx].kind != VCF_FORMAT_ROW_FLOAT1) || - (has_phase && (row_ops[str1_idx].kind != VCF_FORMAT_ROW_STR || - row_ops[str2_idx].kind != VCF_FORMAT_ROW_STR)) || - !vcf_format_row_is_int(&row_ops[pl_idx]) || - row_ops[pl_idx].width != pl_w) + if (widths[ad_idx] != ad_w || + widths[dp_idx] != 1 || + widths[gq_idx] != 1 || + (has_float && widths[float_idx] != 1) || + (has_phase && (widths[str1_idx] <= 0 || + widths[str2_idx] <= 0)) || + widths[pl_idx] != pl_w) return -4; vcf_plan_int_range_init(&ad_range); @@ -5003,14 +5001,14 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, vcf_plan_int_range_init(&gq_range); vcf_plan_int_range_init(&pl_range); - bcf_enc_int1(&v->indiv, row_ops[0].key); + bcf_enc_int1(&v->indiv, plan->ops[0].key); if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) goto error; gt8_off = v->indiv.l; v->indiv.l += (size_t)nsamples * 2; if (has_float) { - bcf_enc_int1(&v->indiv, 
row_ops[float_idx].key); + bcf_enc_int1(&v->indiv, plan->ops[float_idx].key); if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) goto error; @@ -5027,7 +5025,7 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, total_bytes = (size_t) nsamples * (ad_w + 1 + 1 + pl_w) * sizeof(int32_t); if (has_phase) total_bytes += (size_t) nsamples * - (row_ops[str1_idx].width + row_ops[str2_idx].width); + (widths[str1_idx] + widths[str2_idx]); if (total_bytes > INT_MAX) goto error; if (ks_resize(mem, mem->l + total_bytes) < 0) @@ -5037,8 +5035,8 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, dp_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); gq_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); if (has_phase) { - str1_off = mem->l; mem->l += (size_t) nsamples * row_ops[str1_idx].width; - str2_off = mem->l; mem->l += (size_t) nsamples * row_ops[str2_idx].width; + str1_off = mem->l; mem->l += (size_t) nsamples * widths[str1_idx]; + str2_off = mem->l; mem->l += (size_t) nsamples * widths[str2_idx]; } pl_off = mem->l; mem->l += (size_t) nsamples * pl_w * sizeof(int32_t); @@ -5093,13 +5091,13 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; if (has_phase) { - if (vcf_plan_copy_string(&cur, &str1[sample * row_ops[str1_idx].width], - row_ops[str1_idx].width) < 0) + if (vcf_plan_copy_string(&cur, &str1[sample * widths[str1_idx]], + widths[str1_idx]) < 0) goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; - if (vcf_plan_copy_string(&cur, &str2[sample * row_ops[str2_idx].width], - row_ops[str2_idx].width) < 0) + if (vcf_plan_copy_string(&cur, &str2[sample * widths[str2_idx]], + widths[str2_idx]) < 0) goto fallback; if (vcf_plan_expect_sep(&cur, ':') < 0) goto fallback; @@ -5132,30 +5130,30 @@ static int vcf_parse_format_general_likelihood_shape(kstring_t *s, 
v->n_fmt = plan->n_ops; v->n_sample = nsamples; - bcf_enc_int1(&v->indiv, row_ops[ad_idx].key); + bcf_enc_int1(&v->indiv, plan->ops[ad_idx].key); nwords = nsamples * ad_w; if (bcf_enc_vint_known_range_special(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max, ad_range.has_special) < 0) goto error; - bcf_enc_int1(&v->indiv, row_ops[dp_idx].key); + bcf_enc_int1(&v->indiv, plan->ops[dp_idx].key); if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max, dp_range.has_special) < 0) goto error; - bcf_enc_int1(&v->indiv, row_ops[gq_idx].key); + bcf_enc_int1(&v->indiv, plan->ops[gq_idx].key); if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max, gq_range.has_special) < 0) goto error; if (has_phase) { - bcf_enc_int1(&v->indiv, row_ops[str1_idx].key); - if (bcf_enc_size(&v->indiv, row_ops[str1_idx].width, BCF_BT_CHAR) < 0 || - kputsn(str1, (size_t) nsamples * row_ops[str1_idx].width, &v->indiv) < 0) + bcf_enc_int1(&v->indiv, plan->ops[str1_idx].key); + if (bcf_enc_size(&v->indiv, widths[str1_idx], BCF_BT_CHAR) < 0 || + kputsn(str1, (size_t) nsamples * widths[str1_idx], &v->indiv) < 0) goto error; - bcf_enc_int1(&v->indiv, row_ops[str2_idx].key); - if (bcf_enc_size(&v->indiv, row_ops[str2_idx].width, BCF_BT_CHAR) < 0 || - kputsn(str2, (size_t) nsamples * row_ops[str2_idx].width, &v->indiv) < 0) + bcf_enc_int1(&v->indiv, plan->ops[str2_idx].key); + if (bcf_enc_size(&v->indiv, widths[str2_idx], BCF_BT_CHAR) < 0 || + kputsn(str2, (size_t) nsamples * widths[str2_idx], &v->indiv) < 0) goto error; } - bcf_enc_int1(&v->indiv, row_ops[pl_idx].key); + bcf_enc_int1(&v->indiv, plan->ops[pl_idx].key); nwords = nsamples * pl_w; if (bcf_enc_vint_known_range_special(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max, pl_range.has_special) < 0) @@ -5182,16 +5180,14 @@ static int vcf_parse_format_general_likelihood_strict(kstring_t *s, char *q, int *attempted_shape) { int widths[MAX_N_FMT]; - 
vcf_format_row_op_t row_ops[MAX_N_FMT]; if (attempted_shape) *attempted_shape = 0; if (vcf_format_general_likelihood_widths(s, h, plan, v, q, widths) < 0) return -4; - vcf_format_general_resolve_ops(plan, v, widths, row_ops); if (attempted_shape) *attempted_shape = 1; - return vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops); + return vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, widths); } static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, @@ -5212,10 +5208,10 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, max_counts[j] = 0; vcf_plan_int_range_init(&ranges[j]); } - vcf_format_general_resolve_ops(plan, v, widths, row_ops); if (try_likelihood && plan->likelihood_supported && - (ret = vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, row_ops)) != -4) + (ret = vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, widths)) != -4) return ret; + vcf_format_general_resolve_ops(plan, v, widths, row_ops); if (vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) return vcf_parse_format_general_fixed_numeric(s, h, v, plan, q, row_ops); From 8f1a9436a067f395fdf104e9cf1050e6b8b6e217 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 17:48:26 +0200 Subject: [PATCH 22/38] Refactor dynamic FORMAT parsing to composable ops --- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 57 +++ test/format-plan-composable.vcf | 16 + test/test_format_plan.sh | 2 +- vcf.c | 353 +++++------------- 4 files changed, 160 insertions(+), 268 deletions(-) create mode 100644 test/format-plan-composable.vcf diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index 3c6baa578..c51988ca2 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -400,3 +400,60 @@ Latest full large-corpus run stayed byte-identical to baseline. 
Highlights: fixed-shape executor with parse-function pointers? - Do temporary fallback reason counters pay for themselves during iteration, or should they stay under an explicit debug environment variable? + +## 2026-04-29 Composable MVP Pivot + +The planned parser has been refactored toward the MVP design: + +```text +FORMAT/header -> per-tag compiled ops -> one composable executor -> fallback +``` + +The dynamic `interp` path no longer routes through separate GT-only, +likelihood-shape, fixed-numeric, and measured-general executor ladders. It +builds one row-local op list from header `Type`/`Number` metadata, parses all +supported ops in FORMAT order, and falls back to the production parser for the +whole row when compile-time support or row-local validation fails. + +Supported per-tag MVP ops include: + +- `GT`, with fast `GT2` storage when the row is diploid/simple; +- `Integer` and `Float` with fixed `Number=N`, `Number=A`, `Number=R`, + `Number=G`, or bounded measured `Number=.`; +- `String,Number=1` with row-local width measurement. + +This intentionally trades the previous likelihood-family microkernel speed for +broader composability. Rows such as `GT:AD`, `GT:AD:DP:XX:PL`, reordered +numeric/string tags, and supersets with normal header-described tags can now use +the same planned executor without a full-row shape match. + +Added `test/format-plan-composable.vcf` to cover subsets, supersets, +reordered fields, measured numeric fields, strings, and a deliberate row-local +fallback. `./test/test_format_plan.sh` compares baseline, exact, and interp +outputs byte-for-byte. + +Latest full large-corpus composable MVP run: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-composable-mvp \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +All exact and interp outputs compared byte-identical to baseline. 
+ +| Input | Baseline user | Exact user | Dynamic interp user | Notes | +|---|---:|---:|---:|---| +| CCDG 10k | 2.61 s | 1.66 s | 2.35 s | broader MVP, some row fallback | +| 1000G chr22 full GT | 25.88 s | 8.70 s | 8.70 s | composable GT path parity with exact | +| Large CCDG-like synthetic | 4.17 s | 2.78 s | 3.84 s | lost likelihood microkernel speed | +| Large reordered likelihood | 3.01 s | 2.49 s | 2.49 s | parity; no special likelihood shape needed | +| Large multiallelic likelihood | 3.23 s | 2.20 s | 3.11 s | generic op loop slower than microkernel | +| Large float/string | 2.97 s | 3.01 s | 3.01 s | parity with exact/general | +| Variable phase widths | 2.68 s | 2.07 s | 2.54 s | string measurement still row-local | +| Mixed row-local fallbacks | 2.25 s | 1.90 s | 2.06 s | byte-clean fallback path | +| GT-first reordered negative | 1.79 s | 1.47 s | 1.45 s | composable path slightly ahead | +| Two-string float negative | 2.28 s | 2.61 s | 2.58 s | planned path slower than baseline here | + +Takeaway: the MVP architecture is much less brittle and supports tag-level +composition, but parity with the removed likelihood-shape executor will require +generic per-op optimizations or a later optional executor-generation layer. diff --git a/test/format-plan-composable.vcf b/test/format-plan-composable.vcf new file mode 100644 index 000000000..942502853 --- /dev/null +++ b/test/format-plan-composable.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10610000 . A T 50 PASS . GT:AD 0/1:4,5 0/0:9,0 ./.:0,0 +chr22 10610010 . A T 50 PASS . GT:AD:DP:XX:PL 0/1:4,5:9:7,8:90,0,120 0/0:9,0:9:1,2:0,30,200 ./.:0,0:0:.,.:. +chr22 10610020 . A C,G 50 PASS . DP:XS:GT:XX:AD:GQ:PL 12:alpha:1/2:3,4:1,5,6:60:100,90,80,70,0,20 8:beta:0/2:5,6:4,0,4:35:80,70,60,50,40,0 0:.:./.:.,.:0,0,0:.:. +chr22 10610030 . G C 50 PASS . 
GT:AD:DP:GQ:PL 0/1:3,4:7:50:70,0 0/0:6,0:6:35:0,50 ./.:0,0:0:.:. +chr22 10610040 . G C 50 PASS . GT:AD:DP:VX:PL 0/1:3,4:7:1,2,3:70,0,90 0/0:6,0:6:5:0,50,120 ./.:0,0:0:.:. diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh index 78b6cdc8a..ba4ecaa6f 100755 --- a/test/test_format_plan.sh +++ b/test/test_format_plan.sh @@ -2,7 +2,7 @@ set -eu test_view=${TEST_VIEW:-./test/test_view} -inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf"} +inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf test/format-plan-composable.vcf"} tmpdir=${TMPDIR:-/tmp} base=${tmpdir}/hts-format-plan-base.$$ plan=${tmpdir}/hts-format-plan-plan.$$ diff --git a/vcf.c b/vcf.c index 55ff17284..94a266bd4 100644 --- a/vcf.c +++ b/vcf.c @@ -3348,6 +3348,7 @@ typedef struct { uint8_t htype; uint8_t is_gt; uint8_t vl_type; + uint8_t measured_width; } vcf_format_op_t; typedef struct { @@ -3573,14 +3574,19 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma plan->ops[plan->n_ops].htype = htype; plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 0; plan->ops[plan->n_ops].vl_type = bcf_hdr_id2length(h, BCF_HL_FMT, key); + plan->ops[plan->n_ops].measured_width = 0; if (!plan->ops[plan->n_ops].is_gt) { int vl = plan->ops[plan->n_ops].vl_type; if (htype == BCF_HT_STR) { if (plan->ops[plan->n_ops].number != 1) - plan->strict_supported = 0; + return 0; + plan->ops[plan->n_ops].measured_width = 1; } else if (vl != BCF_VL_FIXED && vl != BCF_VL_A && - vl != BCF_VL_R && vl != BCF_VL_G) { - plan->strict_supported = 0; + vl != BCF_VL_R && vl != BCF_VL_G && + vl != BCF_VL_VAR) { + return 0; + } else if (vl == BCF_VL_VAR) { + plan->ops[plan->n_ops].measured_width = 1; } } plan->n_ops++; @@ -4406,6 +4412,27 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible(const char **sp, return vcf_plan_parse_int_vector3_flexible_counted(sp, out, NULL); } +static int vcf_plan_parse_int_vector_flexible_counted_range(const char 
**sp, + int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) +{ + int i; + + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + out[0] = bcf_int32_missing; + vcf_plan_int_range_add(range, out[0]); + for (i = 1; i < width; i++) + out[i] = bcf_int32_vector_end; + range->has_special = 1; + if (nread) + *nread = 1; + return 0; + } + return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); +} + static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, bcf1_t *v, int *widths, vcf_format_row_op_t *row_ops) @@ -4421,7 +4448,7 @@ static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan row->offset = 0; if (op->is_gt) { row->kind = row->width == 2 && v->n_allele <= 10 ? VCF_FORMAT_ROW_GT2 : VCF_FORMAT_ROW_GT; - row->size = row->width * (int)sizeof(int32_t); + row->size = row->kind == VCF_FORMAT_ROW_GT2 ? 2 : row->width * (int)sizeof(int32_t); } else if (op->htype == BCF_HT_INT) { if (row->width == 1) row->kind = VCF_FORMAT_ROW_INT1; @@ -4464,6 +4491,7 @@ static int vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t * } static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt); +static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt); static int vcf_format_general_encode_row_ops(kstring_t *dst, kstring_t *mem, int nsamples, int n_ops, @@ -4477,7 +4505,7 @@ static int vcf_format_general_encode_row_ops(kstring_t *dst, kstring_t *mem, bcf_enc_int1(dst, op->key); if (op->kind == VCF_FORMAT_ROW_GT2) { - if (vcf_enc_gt2_int8(dst, nsamples, (int32_t *)buf) < 0) + if (vcf_enc_gt2_u8(dst, nsamples, buf) < 0) return -1; } else if (op->kind == VCF_FORMAT_ROW_STR) { if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) @@ -4511,7 +4539,7 @@ static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring bcf_enc_int1(dst, op->key); if (op->kind == VCF_FORMAT_ROW_GT2) { - if (vcf_enc_gt2_int8(dst, nsamples, (int32_t *)buf) < 0) + if 
(vcf_enc_gt2_u8(dst, nsamples, buf) < 0) return -1; } else if (op->kind == VCF_FORMAT_ROW_STR) { if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) @@ -4554,6 +4582,15 @@ static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt) return 0; } +static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt) +{ + int n = nsamples * 2; + + if (bcf_enc_size(dst, 2, BCF_BT_INT8) < 0) + return -1; + return kputsn((const char *)gt, n, dst) < 0 ? -1 : 0; +} + static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int n_ops) { int j; @@ -4566,8 +4603,8 @@ static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int return j; } -static int vcf_format_general_fixed_numeric_supported(const vcf_format_row_op_t *row_ops, - int n_ops) +static int vcf_format_general_composable_supported(const vcf_format_row_op_t *row_ops, + int n_ops) { int j; @@ -4602,16 +4639,14 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *q, int *widths) { const char *cur, *end; - int has_string = 0, sample, j, nsamples = bcf_hdr_nsamples(h); + int has_measured = 0, sample, j, nsamples = bcf_hdr_nsamples(h); for (j = 0; j < plan->n_ops; j++) { const vcf_format_op_t *op = &plan->ops[j]; - if (!op->is_gt && op->htype == BCF_HT_STR) { - if (op->number != 1) - return -4; + if (op->measured_width) { widths[j] = 0; - has_string = 1; + has_measured = 1; } else { widths[j] = vcf_format_general_expected_width(op, v); if (widths[j] <= 0 || widths[j] > 64) @@ -4619,7 +4654,7 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, } } - if (!has_string) + if (!has_measured) return 0; cur = q + 1; @@ -4628,15 +4663,23 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, for (j = 0; j < plan->n_ops; j++) { const vcf_format_op_t *op = &plan->ops[j]; const char *field = cur; + int w = 1; - while (cur < end && *cur && *cur != ':' && *cur != '\t') + while (cur < 
end && *cur && *cur != ':' && *cur != '\t') { + if (op->measured_width && + (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) && + *cur == ',') + w++; cur++; - if (!op->is_gt && op->htype == BCF_HT_STR) { - int w = cur - field; + } + if (op->measured_width && !op->is_gt && op->htype == BCF_HT_STR) { + w = cur - field; if (j > 0) w++; if (w <= 0) w = 1; + } + if (op->measured_width) { if (widths[j] < w) widths[j] = w; } @@ -4658,9 +4701,12 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, if (sample != nsamples) return -4; for (j = 0; j < plan->n_ops; j++) - if (!plan->ops[j].is_gt && plan->ops[j].htype == BCF_HT_STR && - widths[j] <= 0) - widths[j] = 1; + if (plan->ops[j].measured_width) { + if (widths[j] <= 0) + widths[j] = 1; + if (widths[j] > 64) + return -4; + } return 0; } @@ -4779,15 +4825,15 @@ static int vcf_parse_format_general_gt2_only(kstring_t *s, const bcf_hdr_t *h, return -1; } -static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t *h, - bcf1_t *v, - const vcf_format_general_plan_t *plan, - char *q, - vcf_format_row_op_t *row_ops) +static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q, + vcf_format_row_op_t *row_ops) { kstring_t *mem = (kstring_t*)&h->mem; int nsamples = bcf_hdr_nsamples(h), sample, j; - int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); + int direct_ops = 0; int max_counts[MAX_N_FMT]; vcf_plan_int_range_t ranges[MAX_N_FMT]; size_t indiv_l0 = v->indiv.l; @@ -4796,7 +4842,7 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t size_t op_stride[MAX_N_FMT]; const char *cur = q + 1, *end = s->s + s->l; - if (!vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) + if (!vcf_format_general_composable_supported(row_ops, plan->n_ops)) return -4; for (j = 0; j < plan->n_ops; j++) { @@ -4856,12 +4902,8 @@ static int 
vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t switch (op->kind) { case VCF_FORMAT_ROW_GT2: - if (j < direct_ops) { - if (vcf_plan_gt2_u8(&cur, buf) < 0) - goto fallback; - } else if (vcf_plan_gt2(&cur, (int32_t *)buf) < 0) { + if (vcf_plan_gt2_u8(&cur, buf) < 0) goto fallback; - } break; case VCF_FORMAT_ROW_INT1: if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) @@ -4876,7 +4918,8 @@ static int vcf_parse_format_general_fixed_numeric(kstring_t *s, const bcf_hdr_t goto fallback; break; case VCF_FORMAT_ROW_INTN: - if (vcf_plan_parse_int_vector_counted_range(&cur, (int32_t *)buf, op->width, &n, &ranges[j]) < 0) + if (vcf_plan_parse_int_vector_flexible_counted_range(&cur, (int32_t *)buf, + op->width, &n, &ranges[j]) < 0) goto fallback; break; case VCF_FORMAT_ROW_FLOAT1: @@ -5193,132 +5236,22 @@ static int vcf_parse_format_general_likelihood_strict(kstring_t *s, static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, - char *q, int try_likelihood) + char *q) { - kstring_t *mem; - int widths[MAX_N_FMT], max_counts[MAX_N_FMT]; + int widths[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; - vcf_plan_int_range_t ranges[MAX_N_FMT]; - int nsamples = bcf_hdr_nsamples(h), sample, j, vcf44, ret; - const char *cur, *end; if (vcf_format_general_strict_widths(s, h, plan, v, q, widths) < 0) return -4; - for (j = 0; j < plan->n_ops; j++) { - max_counts[j] = 0; - vcf_plan_int_range_init(&ranges[j]); - } - if (try_likelihood && plan->likelihood_supported && - (ret = vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, widths)) != -4) - return ret; vcf_format_general_resolve_ops(plan, v, widths, row_ops); - if (vcf_format_general_fixed_numeric_supported(row_ops, plan->n_ops)) - return vcf_parse_format_general_fixed_numeric(s, h, v, plan, q, row_ops); - - mem = (kstring_t*)&h->mem; - mem->l = 0; - for (j = 0; j < plan->n_ops; j++) { - vcf_format_row_op_t *op 
= &row_ops[j]; - - if (op->size < 0 || (uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) - return -1; - if (align_mem(mem) < 0) - return -1; - op->offset = mem->l; - if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) - return -1; - mem->l += nsamples * (size_t) op->size; - } - - cur = q + 1; - end = s->s + s->l; - vcf44 = bcf_get_version(h, NULL) >= VCF44; - for (sample = 0; sample < nsamples && cur < end; sample++) { - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = (uint8_t*)mem->s + op->offset + sample * (size_t)op->size; - int n = op->width; - - switch (op->kind) { - case VCF_FORMAT_ROW_GT2: - if (vcf_plan_gt2(&cur, (int32_t *)buf) < 0) - return -4; - break; - case VCF_FORMAT_ROW_GT: - if (vcf_plan_parse_gt_dynamic(&cur, (int32_t *)buf, op->width, vcf44) < 0) - return -4; - n = vcf_plan_int_vector_count((int32_t *)buf, op->width); - break; - case VCF_FORMAT_ROW_INT1: - if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) - return -4; - break; - case VCF_FORMAT_ROW_INT2: - if (vcf_plan_parse_int_vector2_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) - return -4; - break; - case VCF_FORMAT_ROW_INT3: - if (vcf_plan_parse_int_vector3_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) - return -4; - break; - case VCF_FORMAT_ROW_INTN: - if (vcf_plan_parse_int_vector_counted_range(&cur, (int32_t *)buf, op->width, &n, &ranges[j]) < 0) - return -4; - break; - case VCF_FORMAT_ROW_FLOAT1: - if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) - return -4; - break; - case VCF_FORMAT_ROW_FLOATN: - if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) - return -4; - n = vcf_plan_float_vector_count((float *)buf, op->width); - break; - case VCF_FORMAT_ROW_STR: - return -4; - } - if (max_counts[j] < n) - max_counts[j] = n; - - if (j + 1 < plan->n_ops) { - if (vcf_plan_expect_sep(&cur, ':') < 0) - return -4; - } else 
{ - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - return -4; - } - } - } - if (sample != nsamples) - return -4; - for (j = 0; j < plan->n_ops; j++) - if (max_counts[j] != row_ops[j].width) - return -4; - - v->n_fmt = plan->n_ops; - v->n_sample = nsamples; - if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, nsamples, - plan->n_ops, row_ops, - ranges, 0) < 0) - return -1; - vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += nsamples; - return 0; + return vcf_parse_format_general_composable(s, h, v, plan, q, row_ops); } static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { vcf_format_general_plan_t *plan; - kstring_t *mem; - int widths[MAX_N_FMT]; - vcf_format_row_op_t row_ops[MAX_N_FMT]; - int nsamples, sample, j, vcf44, ret, strict_enabled, likelihood_tried; - const char *cur, *end; + int nsamples, ret; plan = vcf_format_general_plan_get(h, p); if (!plan) @@ -5331,127 +5264,13 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, nsamples = bcf_hdr_nsamples(h); if (!nsamples) return 0; - strict_enabled = vcf_format_fast_guard_enabled(&plan->strict_guard); - likelihood_tried = 0; - if (strict_enabled && plan->n_ops == 1 && plan->ops[0].is_gt) { - ret = vcf_parse_format_general_gt2_only(s, h, v, plan, q); - if (ret == 0) { - vcf_format_fast_guard_success(&plan->strict_guard); - return ret; - } - if (ret != -4) - return ret; - } - if (plan->likelihood_supported && strict_enabled) { - ret = vcf_parse_format_general_likelihood_strict(s, h, v, plan, q, &likelihood_tried); - if (ret == 0) { - vcf_format_fast_guard_success(&plan->strict_guard); - return ret; - } - if (ret != -4) - return ret; - } - if (plan->strict_supported && strict_enabled) { - ret = vcf_parse_format_general_strict(s, h, v, plan, q, !likelihood_tried); - if (ret == 0) { - vcf_format_fast_guard_success(&plan->strict_guard); - return ret; - } - if (ret != 
-4) - return ret; - vcf_format_fast_guard_fallback(&plan->strict_guard); - } - if (vcf_plan_measure_general(s, h, plan, q, widths) < 0) - goto fallback; - vcf_format_general_resolve_ops(plan, v, widths, row_ops); - - mem = (kstring_t*)&h->mem; - mem->l = 0; - for (j = 0; j < plan->n_ops; j++) { - vcf_format_row_op_t *op = &row_ops[j]; - - if (op->size < 0 || (uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) - return -1; - if (align_mem(mem) < 0) - return -1; - op->offset = mem->l; - if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) - return -1; - mem->l += nsamples * (size_t) op->size; - } - - cur = q + 1; - end = s->s + s->l; - vcf44 = bcf_get_version(h, NULL) >= VCF44; - for (sample = 0; sample < nsamples && cur < end; sample++) { - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = (uint8_t*)mem->s + op->offset + sample * (size_t)op->size; - - switch (op->kind) { - case VCF_FORMAT_ROW_GT2: - if (vcf_plan_gt2(&cur, (int32_t *)buf) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_GT: - if (vcf_plan_parse_gt_dynamic(&cur, (int32_t *)buf, op->width, vcf44) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INT1: - if (vcf_plan_int_scalar_flexible(&cur, (int32_t *)buf) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INT2: - if (vcf_plan_parse_int_vector2_flexible(&cur, (int32_t *)buf) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INT3: - if (vcf_plan_parse_int_vector3_flexible(&cur, (int32_t *)buf) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INTN: - if (vcf_plan_parse_int_vector_dynamic(&cur, (int32_t *)buf, op->width) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_FLOAT1: - if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_FLOATN: - if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_STR: - if (vcf_plan_copy_string(&cur, (char *)buf, 
op->width) < 0) - goto fallback; - break; - } - - if (j + 1 < plan->n_ops) { - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - } else { - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - goto fallback; - } - } + ret = vcf_parse_format_general_strict(s, h, v, plan, q); + if (ret == 0) { + vcf_format_fast_guard_success(&plan->general_guard); + return ret; } - if (sample != nsamples) - goto fallback; - - v->n_fmt = plan->n_ops; - v->n_sample = nsamples; - if (vcf_format_general_encode_row_ops(&v->indiv, mem, nsamples, plan->n_ops, row_ops) < 0) - return -1; - - vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += nsamples; - vcf_format_fast_guard_success(&plan->general_guard); - return 0; + if (ret != -4) + return ret; fallback: if (plan) From 006059a9ae6685dbe2d9e18fac0e600bb2eb6fc9 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 18:08:26 +0200 Subject: [PATCH 23/38] Harden composable FORMAT parser --- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 39 + test/format-plan-gt-header-shape.vcf | 6 + test/test_format_plan.sh | 2 +- vcf.c | 758 +----------------- 4 files changed, 83 insertions(+), 722 deletions(-) create mode 100644 test/format-plan-gt-header-shape.vcf diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index c51988ca2..ceae4fe27 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -457,3 +457,42 @@ All exact and interp outputs compared byte-identical to baseline. Takeaway: the MVP architecture is much less brittle and supports tag-level composition, but parity with the removed likelihood-shape executor will require generic per-op optimizations or a later optional executor-generation layer. 
+ +## 2026-04-29 Composable Production Hardening + +Follow-up productionizing pass: + +- tightened `GT` compile validation so the composable path only claims + `GT` when the header declares `Type=String,Number=1`; +- added `test/format-plan-gt-header-shape.vcf` to prove malformed-but-readable + `GT` headers fall back instead of being planned; +- restored direct writes for leading fixed-encoding ops (`GT2` and `FLOAT1`) + inside the composable executor; +- added a non-negative integer fast path before falling back to the full signed / + missing integer parser; +- routed generic `INTN` widths 4, 6, and 10 through the fixed-width counted + parsers; +- trimmed the unused dynamic likelihood-shape executor scaffolding from the + general planned path now that `interp` uses the composable executor. + +Latest full large-corpus run: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-composable-prod \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +All exact and interp outputs compared byte-identical to baseline.
+ +| Input | Baseline user | Exact user | Dynamic interp user | Notes | +|---|---:|---:|---:|---| +| CCDG 10k | 2.62 s | 1.49 s | 2.29 s | partial fallback from row-local strictness | +| 1000G chr22 full GT | 26.02 s | 8.70 s | 9.09 s | GT-only composable path; noisy/slower pass | +| Large CCDG-like synthetic | 4.20 s | 2.68 s | 3.81 s | still behind old likelihood microkernel | +| Large reordered likelihood | 3.02 s | 2.47 s | 2.44 s | composable path at parity | +| Large multiallelic likelihood | 3.28 s | 1.95 s | 2.79 s | fixed-width INTN recovered part of gap | +| Large float/string | 2.99 s | 2.99 s | 2.94 s | parity/slightly ahead | +| Variable phase widths | 2.70 s | 2.01 s | 2.56 s | row-local string measurement remains cost | +| Mixed row-local fallbacks | 2.25 s | 1.76 s | 1.94 s | byte-clean fallback path | +| GT-first reordered negative | 1.77 s | 1.47 s | 1.44 s | composable path slightly ahead | +| Two-string float negative | 2.29 s | 2.55 s | 2.55 s | planned path still slower than baseline | diff --git a/test/format-plan-gt-header-shape.vcf b/test/format-plan-gt-header-shape.vcf new file mode 100644 index 000000000..5e3c79e19 --- /dev/null +++ b/test/format-plan-gt-header-shape.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10620000 . A T 50 PASS . 
GT:DP 0/1:7 0/0:5 ./.:0 diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh index ba4ecaa6f..e247a558e 100755 --- a/test/test_format_plan.sh +++ b/test/test_format_plan.sh @@ -2,7 +2,7 @@ set -eu test_view=${TEST_VIEW:-./test/test_view} -inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf test/format-plan-composable.vcf"} +inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf test/format-plan-composable.vcf test/format-plan-gt-header-shape.vcf"} tmpdir=${TMPDIR:-/tmp} base=${tmpdir}/hts-format-plan-base.$$ plan=${tmpdir}/hts-format-plan-plan.$$ diff --git a/vcf.c b/vcf.c index 94a266bd4..9621181f0 100644 --- a/vcf.c +++ b/vcf.c @@ -3226,9 +3226,6 @@ typedef struct { } vcf_format_plan_stats_t; static vcf_format_plan_stats_t vcf_format_plan_stats; -static uint64_t vcf_format_likelihood_shape_attempts; -static uint64_t vcf_format_likelihood_shape_hits; -static uint64_t vcf_format_likelihood_shape_fallback; void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, uint64_t *fallback, uint64_t *parsed_samples) @@ -3242,9 +3239,9 @@ void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, void hts_vcf_format_plan_shape_stats(uint64_t *attempts, uint64_t *hits, uint64_t *fallback) { - if (attempts) *attempts = vcf_format_likelihood_shape_attempts; - if (hits) *hits = vcf_format_likelihood_shape_hits; - if (fallback) *fallback = vcf_format_likelihood_shape_fallback; + if (attempts) *attempts = 0; + if (hits) *hits = 0; + if (fallback) *fallback = 0; } static int vcf_format_plan_mode(void) @@ -3355,20 +3352,8 @@ typedef struct { char format[256]; const bcf_hdr_t *hdr; int supported; - int strict_supported; - int likelihood_supported; - int likelihood_has_float; - int likelihood_has_phase; - int likelihood_float_idx; - int likelihood_ad_idx; - int likelihood_dp_idx; - int likelihood_gq_idx; - int likelihood_str1_idx; - int likelihood_str2_idx; - int likelihood_pl_idx; int n_ops; vcf_format_op_t 
ops[MAX_N_FMT]; - vcf_format_fast_guard_t strict_guard; vcf_format_fast_guard_t general_guard; } vcf_format_general_plan_t; @@ -3463,64 +3448,6 @@ static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, return plan->supported; } -static int vcf_format_general_classify_likelihood(vcf_format_general_plan_t *plan) -{ - int idx; - - plan->likelihood_supported = 0; - plan->likelihood_has_float = 0; - plan->likelihood_has_phase = 0; - plan->likelihood_float_idx = -1; - plan->likelihood_ad_idx = -1; - plan->likelihood_dp_idx = -1; - plan->likelihood_gq_idx = -1; - plan->likelihood_str1_idx = -1; - plan->likelihood_str2_idx = -1; - plan->likelihood_pl_idx = -1; - - if (plan->n_ops != 5 && plan->n_ops != 6 && - plan->n_ops != 7 && plan->n_ops != 8) - return 0; - if (!plan->ops[0].is_gt) - return 0; - - idx = 1; - if (idx < plan->n_ops && plan->ops[idx].htype == BCF_HT_REAL && - plan->ops[idx].number == 1) { - plan->likelihood_has_float = 1; - plan->likelihood_float_idx = idx++; - } - if (idx + 3 >= plan->n_ops) - return 0; - if (plan->ops[idx].htype != BCF_HT_INT) - return 0; - plan->likelihood_ad_idx = idx++; - if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) - return 0; - plan->likelihood_dp_idx = idx++; - if (plan->ops[idx].htype != BCF_HT_INT || plan->ops[idx].number != 1) - return 0; - plan->likelihood_gq_idx = idx++; - if (plan->n_ops - idx == 3) { - if (plan->ops[idx].htype != BCF_HT_STR || plan->ops[idx].number != 1 || - plan->ops[idx + 1].htype != BCF_HT_STR || plan->ops[idx + 1].number != 1) - return 0; - plan->likelihood_has_phase = 1; - plan->likelihood_str1_idx = idx++; - plan->likelihood_str2_idx = idx++; - } else if (plan->n_ops - idx != 1) { - return 0; - } - if (plan->ops[idx].htype != BCF_HT_INT) - return 0; - plan->likelihood_pl_idx = idx++; - if (idx != plan->n_ops) - return 0; - - plan->likelihood_supported = 1; - return 1; -} - static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *format) 
{ enum { N_PLAN_CACHE = 8 }; @@ -3547,10 +3474,9 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma memset(plan, 0, sizeof(*plan)); if (strlen(format) >= sizeof(plan->format)) return 0; - strcpy(plan->format, format); - strcpy(tmp, format); + strcpy(plan->format, format); + strcpy(tmp, format); plan->hdr = h; - plan->strict_supported = 1; for (tok = strtok_r(tmp, ":", &saveptr); tok; tok = strtok_r(NULL, ":", &saveptr)) { @@ -3575,7 +3501,11 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 0; plan->ops[plan->n_ops].vl_type = bcf_hdr_id2length(h, BCF_HL_FMT, key); plan->ops[plan->n_ops].measured_width = 0; - if (!plan->ops[plan->n_ops].is_gt) { + if (plan->ops[plan->n_ops].is_gt) { + if (htype != BCF_HT_STR || plan->ops[plan->n_ops].number != 1 || + plan->ops[plan->n_ops].vl_type != BCF_VL_FIXED) + return 0; + } else { int vl = plan->ops[plan->n_ops].vl_type; if (htype == BCF_HT_STR) { if (plan->ops[plan->n_ops].number != 1) @@ -3595,7 +3525,6 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (!plan->n_ops) return 0; - vcf_format_general_classify_likelihood(plan); plan->supported = 1; return 1; } @@ -3618,30 +3547,6 @@ static vcf_format_general_plan_t *vcf_format_general_plan_get(const bcf_hdr_t *h return cache[ncache++].supported ? &cache[ncache-1] : NULL; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2(const char **sp, int32_t out[2]) -{ - const char *s = *sp; - int a0, a1, phased; - - if (s[0] == '.' 
&& (s[1] == '/' || s[1] == '|') && s[2] == '.') { - out[0] = 0; - out[1] = 0; - *sp = s + 3; - return 0; - } - if (!(s[0] >= '0' && s[0] <= '9') || (s[1] != '/' && s[1] != '|') || - !(s[2] >= '0' && s[2] <= '9')) - return -1; - - a0 = s[0] - '0'; - a1 = s[2] - '0'; - phased = s[1] == '|'; - out[0] = ((a0 + 1) << 1) | phased; - out[1] = ((a1 + 1) << 1) | phased; - *sp = s + 3; - return 0; -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2_u8(const char **sp, uint8_t out[2]) { const char *s = *sp; @@ -3701,16 +3606,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_vector_count(const int32_t *vals, int width) -{ - int i; - - for (i = 0; i < width; i++) - if (vals[i] == bcf_int32_vector_end) - break; - return i; -} - VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_init(vcf_plan_int_range_t *range) { range->min = INT32_MAX; @@ -3759,6 +3654,22 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value_range(const char **sp, int32_t *out, vcf_plan_int_range_t *range) { + const char *s = *sp; + uint32_t val = 0, cutoff = BCF_MAX_BT_INT32 / 10, cutlim = BCF_MAX_BT_INT32 % 10; + + if (*s >= '0' && *s <= '9') { + do { + uint32_t digit = *s - '0'; + if (val > cutoff || (val == cutoff && digit > cutlim)) + return -1; + val = val * 10 + digit; + s++; + } while (*s >= '0' && *s <= '9'); + *out = (int32_t)val; + *sp = s; + vcf_plan_int_range_add(range, *out); + return 0; + } if (vcf_plan_int_value(sp, out) < 0) return -1; vcf_plan_int_range_add(range, *out); @@ -3819,11 +3730,6 @@ static int vcf_plan_parse_int_vector_counted_range(const char **sp, int32_t *out return 0; } -static int vcf_plan_parse_int_vector(const char **sp, int32_t *out, int width) -{ - return vcf_plan_parse_int_vector_counted(sp, out, width, NULL); -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted(const char **sp, int32_t *out, int *nread) { const 
char *s = *sp; @@ -3874,11 +3780,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted_range(const char * return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2(const char **sp, int32_t *out) -{ - return vcf_plan_parse_int_vector2_counted(sp, out, NULL); -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted(const char **sp, int32_t *out, int *nread) { const char *s = *sp; @@ -4099,11 +4000,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector10_counted_range(const char return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3(const char **sp, int32_t *out) -{ - return vcf_plan_parse_int_vector3_counted(sp, out, NULL); -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_expect_sep(const char **sp, int sep) { if (**sp != sep) @@ -4156,142 +4052,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int return 0; } -static int vcf_plan_measure_general(kstring_t *s, const bcf_hdr_t *h, - const vcf_format_general_plan_t *plan, - char *q, int *widths) -{ - const char *cur = q + 1, *end = s->s + s->l; - int sample, j, nsamples = bcf_hdr_nsamples(h); - - for (j = 0; j < plan->n_ops; j++) - widths[j] = 0; - - for (sample = 0; sample < nsamples && cur < end; sample++) { - for (j = 0; j < plan->n_ops; j++) { - const char *field = cur; - const vcf_format_op_t *op = &plan->ops[j]; - int w = 1; - - while (cur < end && *cur && *cur != ':' && *cur != '\t') { - if (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) { - if (*cur == ',') - w++; - } else if (op->is_gt) { - if (*cur == '/' || *cur == '|') - w++; - } - cur++; - } - - if (op->htype == BCF_HT_STR && !op->is_gt) { - w = cur - field; - if (j > 0) - w++; - } - if (w <= 0) - w = 1; - if (widths[j] < w) - widths[j] = w; - - if (j + 1 < plan->n_ops) { - if (*cur != ':') - return -1; - cur++; - } else { - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - return -1; - } - } - } - - return sample == nsamples ? 
0 : -1; -} - -static int vcf_plan_parse_gt_dynamic(const char **sp, int32_t *out, int width, int vcf44) -{ - const char *s = *sp; - int l = 0, ploidy = 0, anyunphased = 0, phasingprfx = 0, unknown1 = 0; - int32_t is_phased = 0; - - if (vcf44 && (*s == '|' || *s == '/')) { - is_phased = *s++ == '|'; - phasingprfx = 1; - } - - for (;;) { - uint32_t val = 0; - - if (l >= width) - return -1; - ploidy++; - if (*s == '.') { - s++; - out[l++] = is_phased; - if (l == 1) - unknown1 = 1; - } else if (*s >= '0' && *s <= '9') { - do { - if (val > ((uint32_t)INT32_MAX >> 1) - 1) - return -1; - val = val * 10 + (*s - '0'); - s++; - } while (*s >= '0' && *s <= '9'); - if (val > ((uint32_t)INT32_MAX >> 1) - 1) - return -1; - out[l++] = ((val + 1) << 1) | is_phased; - } else { - return -1; - } - - anyunphased |= (ploidy != 1) && !is_phased; - is_phased = *s == '|'; - if (*s != '|' && *s != '/') - break; - s++; - } - - if (!phasingprfx) { - if (ploidy == 1) { - if (!unknown1) - out[0] |= 1; - } else { - out[0] |= anyunphased ? 
0 : 1; - } - } - for (; l < width; l++) - out[l] = bcf_int32_vector_end; - - *sp = s; - return 0; -} - -static int vcf_plan_parse_int_vector_dynamic(const char **sp, int32_t *out, int width) -{ - const char *s = *sp; - int i = 0; - - if (*s == ':' || *s == '\t' || *s == '\0') { - out[i++] = bcf_int32_missing; - } else { - for (;;) { - if (i >= width || vcf_plan_int_value(&s, &out[i]) < 0) - return -1; - i++; - if (*s != ',') - break; - s++; - } - } - for (; i < width; i++) - out[i] = bcf_int32_vector_end; - *sp = s; - return 0; -} - static int vcf_plan_parse_float_vector_dynamic(const char **sp, float *out, int width) { const char *s = *sp; @@ -4344,18 +4104,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float return vcf_plan_float_value(sp, out); } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted(const char **sp, int32_t *out, int *nread) -{ - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - out[0] = bcf_int32_missing; - out[1] = bcf_int32_vector_end; - if (nread) - *nread = 1; - return 0; - } - return vcf_plan_parse_int_vector2_counted(sp, out, nread); -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(const char **sp, int32_t *out, int *nread, @@ -4372,24 +4120,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(con return vcf_plan_parse_int_vector2_counted_range(sp, out, nread, range); } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible(const char **sp, int32_t *out) -{ - return vcf_plan_parse_int_vector2_flexible_counted(sp, out, NULL); -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted(const char **sp, int32_t *out, int *nread) -{ - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - out[0] = bcf_int32_missing; - out[1] = bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - if (nread) - *nread = 1; - return 0; - } - return vcf_plan_parse_int_vector3_counted(sp, out, nread); -} - 
VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted_range(const char **sp, int32_t *out, int *nread, @@ -4407,11 +4137,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted_range(con return vcf_plan_parse_int_vector3_counted_range(sp, out, nread, range); } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible(const char **sp, int32_t *out) -{ - return vcf_plan_parse_int_vector3_flexible_counted(sp, out, NULL); -} - static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, int32_t *out, int width, @@ -4430,6 +4155,16 @@ static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, *nread = 1; return 0; } + switch (width) { + case 4: + return vcf_plan_parse_int_vector4_counted_range(sp, out, nread, range); + case 6: + return vcf_plan_parse_int_vector6_counted_range(sp, out, nread, range); + case 10: + return vcf_plan_parse_int_vector10_counted_range(sp, out, nread, range); + default: + break; + } return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); } @@ -4490,41 +4225,8 @@ static int vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t * } } -static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt); static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt); -static int vcf_format_general_encode_row_ops(kstring_t *dst, kstring_t *mem, - int nsamples, int n_ops, - const vcf_format_row_op_t *row_ops) -{ - int j; - - for (j = 0; j < n_ops; j++) { - const vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = (uint8_t*)mem->s + op->offset; - - bcf_enc_int1(dst, op->key); - if (op->kind == VCF_FORMAT_ROW_GT2) { - if (vcf_enc_gt2_u8(dst, nsamples, buf) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_STR) { - if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) - return -1; - if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == 
VCF_FORMAT_ROW_FLOATN) { - if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) - return -1; - if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) - return -1; - } else { - if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) - return -1; - } - } - return 0; -} - static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring_t *mem, int nsamples, int n_ops, const vcf_format_row_op_t *row_ops, @@ -4567,21 +4269,6 @@ static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring return 0; } -static int vcf_enc_gt2_int8(kstring_t *dst, int nsamples, int32_t *gt) -{ - int i, n = nsamples * 2; - uint8_t *p; - - if (bcf_enc_size(dst, 2, BCF_BT_INT8) < 0 || - ks_resize(dst, dst->l + n) < 0) - return -1; - p = (uint8_t *)dst->s + dst->l; - for (i = 0; i < n; i++) - p[i] = (uint8_t)gt[i]; - dst->l += n; - return 0; -} - static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt) { int n = nsamples * 2; @@ -4626,14 +4313,6 @@ static int vcf_format_general_composable_supported(const vcf_format_row_op_t *ro return 1; } -static inline int vcf_format_row_is_int(const vcf_format_row_op_t *op) -{ - return op->kind == VCF_FORMAT_ROW_INT1 || - op->kind == VCF_FORMAT_ROW_INT2 || - op->kind == VCF_FORMAT_ROW_INT3 || - op->kind == VCF_FORMAT_ROW_INTN; -} - static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const vcf_format_general_plan_t *plan, bcf1_t *v, char *q, int *widths) @@ -4711,120 +4390,6 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, return 0; } -static int vcf_format_general_likelihood_widths(kstring_t *s, const bcf_hdr_t *h, - const vcf_format_general_plan_t *plan, - bcf1_t *v, char *q, int *widths) -{ - const char *cur, *end; - int ad_w, pl_w, sample, j, nsamples = bcf_hdr_nsamples(h); - int str1_idx = plan->likelihood_str1_idx; - int str2_idx = plan->likelihood_str2_idx; - - if (!plan->likelihood_supported) - 
return -4; - if (v->n_allele < 1 || v->n_allele > 8) - return -4; - ad_w = v->n_allele; - pl_w = v->n_allele * (v->n_allele + 1) / 2; - if (pl_w < 1 || pl_w > 36) - return -4; - - for (j = 0; j < plan->n_ops; j++) - widths[j] = 0; - widths[0] = 2; - if (plan->likelihood_has_float) - widths[plan->likelihood_float_idx] = 1; - widths[plan->likelihood_ad_idx] = ad_w; - widths[plan->likelihood_dp_idx] = 1; - widths[plan->likelihood_gq_idx] = 1; - widths[plan->likelihood_pl_idx] = pl_w; - - if (str1_idx < 0) - return 0; - - cur = q + 1; - end = s->s + s->l; - for (sample = 0; sample < nsamples && cur < end; sample++) { - if (vcf_plan_skip_field(&cur, ':') < 0) - return -4; - if (plan->likelihood_has_float && vcf_plan_skip_field(&cur, ':') < 0) - return -4; - if (vcf_plan_skip_field(&cur, ':') < 0) - return -4; - if (vcf_plan_skip_field(&cur, ':') < 0) - return -4; - if (vcf_plan_skip_field(&cur, ':') < 0) - return -4; - if (vcf_plan_measure_string(&cur, ':', &widths[str1_idx]) < 0) - return -4; - if (vcf_plan_measure_string(&cur, ':', &widths[str2_idx]) < 0) - return -4; - while (cur < end && *cur && *cur != '\t') - cur++; - if (*cur == '\t') - cur++; - } - if (sample != nsamples) - return -4; - if (widths[str1_idx] <= 0) - widths[str1_idx] = 1; - if (widths[str2_idx] <= 0) - widths[str2_idx] = 1; - widths[str1_idx]++; - widths[str2_idx]++; - - return 0; -} - -static int vcf_parse_format_general_gt2_only(kstring_t *s, const bcf_hdr_t *h, - bcf1_t *v, - const vcf_format_general_plan_t *plan, - char *q) -{ - int nsamples = bcf_hdr_nsamples(h), sample; - size_t indiv_l0 = v->indiv.l; - uint8_t *gt8; - const char *cur, *end; - - if (plan->n_ops != 1 || !plan->ops[0].is_gt || v->n_allele > 10) - return -4; - - bcf_enc_int1(&v->indiv, plan->ops[0].key); - if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) - goto error; - gt8 = (uint8_t *)v->indiv.s + v->indiv.l; - v->indiv.l += (size_t)nsamples * 2; - - cur = q + 
1; - end = s->s + s->l; - for (sample = 0; sample < nsamples && cur < end; sample++) { - if (vcf_plan_gt2_u8(&cur, >8[sample * 2]) < 0) - goto fallback; - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - goto fallback; - } - if (sample != nsamples) - goto fallback; - - v->n_fmt = 1; - v->n_sample = nsamples; - vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += nsamples; - return 0; - -fallback: - v->indiv.l = indiv_l0; - return -4; -error: - v->indiv.l = indiv_l0; - return -1; -} - static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, @@ -4833,7 +4398,7 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, { kstring_t *mem = (kstring_t*)&h->mem; int nsamples = bcf_hdr_nsamples(h), sample, j; - int direct_ops = 0; + int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); int max_counts[MAX_N_FMT]; vcf_plan_int_range_t ranges[MAX_N_FMT]; size_t indiv_l0 = v->indiv.l; @@ -4984,255 +4549,6 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, return -1; } -static int vcf_parse_format_general_likelihood_shape(kstring_t *s, - const bcf_hdr_t *h, - bcf1_t *v, - const vcf_format_general_plan_t *plan, - char *q, - int *widths) -{ - kstring_t *mem = (kstring_t*)&h->mem; - int nsamples = bcf_hdr_nsamples(h); - int ad_w, pl_w, sample; - int ad_idx, dp_idx, gq_idx, pl_idx; - int has_float, has_phase, float_idx, str1_idx, str2_idx; - int max_ad_count = 0, max_pl_count = 0, nwords; - vcf_plan_int_range_t ad_range, dp_range, gq_range, pl_range; - size_t indiv_l0 = v->indiv.l; - size_t gt8_off, float_le_off = 0; - size_t ad_off, dp_off, gq_off, str1_off = 0, str2_off = 0, pl_off, total_bytes; - uint8_t *gt8, *float_le = NULL; - int32_t *ad, *dp, *gq, *pl; - char *str1 = NULL, *str2 = NULL; - const char *cur, *end; - - if (!plan->likelihood_supported) - return -4; - - 
vcf_format_likelihood_shape_attempts++; - if (widths[0] != 2) - return -4; - if (v->n_allele < 1 || v->n_allele > 8) - return -4; - - ad_w = v->n_allele; - pl_w = v->n_allele * (v->n_allele + 1) / 2; - if (pl_w < 1 || pl_w > 36) - return -4; - - has_float = plan->likelihood_has_float; - has_phase = plan->likelihood_has_phase; - float_idx = plan->likelihood_float_idx; - ad_idx = plan->likelihood_ad_idx; - dp_idx = plan->likelihood_dp_idx; - gq_idx = plan->likelihood_gq_idx; - str1_idx = plan->likelihood_str1_idx; - str2_idx = plan->likelihood_str2_idx; - pl_idx = plan->likelihood_pl_idx; - - if (widths[ad_idx] != ad_w || - widths[dp_idx] != 1 || - widths[gq_idx] != 1 || - (has_float && widths[float_idx] != 1) || - (has_phase && (widths[str1_idx] <= 0 || - widths[str2_idx] <= 0)) || - widths[pl_idx] != pl_w) - return -4; - - vcf_plan_int_range_init(&ad_range); - vcf_plan_int_range_init(&dp_range); - vcf_plan_int_range_init(&gq_range); - vcf_plan_int_range_init(&pl_range); - - bcf_enc_int1(&v->indiv, plan->ops[0].key); - if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) - goto error; - gt8_off = v->indiv.l; - v->indiv.l += (size_t)nsamples * 2; - if (has_float) { - bcf_enc_int1(&v->indiv, plan->ops[float_idx].key); - if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) - goto error; - float_le_off = v->indiv.l; - v->indiv.l += (size_t)nsamples * sizeof(float); - } - gt8 = (uint8_t *)v->indiv.s + gt8_off; - if (has_float) - float_le = (uint8_t *)v->indiv.s + float_le_off; - - mem->l = 0; - if (align_mem(mem) < 0) - goto error; - total_bytes = (size_t) nsamples * (ad_w + 1 + 1 + pl_w) * sizeof(int32_t); - if (has_phase) - total_bytes += (size_t) nsamples * - (widths[str1_idx] + widths[str2_idx]); - if (total_bytes > INT_MAX) - goto error; - if (ks_resize(mem, mem->l + total_bytes) < 0) - goto error; - - ad_off = mem->l; mem->l += 
(size_t) nsamples * ad_w * sizeof(int32_t); - dp_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); - gq_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); - if (has_phase) { - str1_off = mem->l; mem->l += (size_t) nsamples * widths[str1_idx]; - str2_off = mem->l; mem->l += (size_t) nsamples * widths[str2_idx]; - } - pl_off = mem->l; mem->l += (size_t) nsamples * pl_w * sizeof(int32_t); - - ad = (int32_t *) (mem->s + ad_off); - dp = (int32_t *) (mem->s + dp_off); - gq = (int32_t *) (mem->s + gq_off); - if (has_phase) { - str1 = mem->s + str1_off; - str2 = mem->s + str2_off; - } - pl = (int32_t *) (mem->s + pl_off); - - cur = q + 1; - end = s->s + s->l; - for (sample = 0; sample < nsamples && cur < end; sample++) { - int nread; - - if (vcf_plan_gt2_u8(&cur, >8[sample * 2]) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (has_float) { - float f; - if (vcf_plan_float_value(&cur, &f) < 0) - goto fallback; - float_to_le(f, float_le + (size_t)sample * sizeof(float)); - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - } - if (ad_w == 2) { - if (vcf_plan_parse_int_vector2_counted_range(&cur, &ad[sample * 2], &nread, &ad_range) < 0) - goto fallback; - } else if (ad_w == 3) { - if (vcf_plan_parse_int_vector3_counted_range(&cur, &ad[sample * 3], &nread, &ad_range) < 0) - goto fallback; - } else if (ad_w == 4) { - if (vcf_plan_parse_int_vector4_counted_range(&cur, &ad[sample * 4], &nread, &ad_range) < 0) - goto fallback; - } else if (vcf_plan_parse_int_vector_counted_range(&cur, &ad[sample * ad_w], ad_w, &nread, &ad_range) < 0) { - goto fallback; - } - if (max_ad_count < nread) - max_ad_count = nread; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_int_value_range(&cur, &dp[sample], &dp_range) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_int_value_range(&cur, &gq[sample], &gq_range) < 0) - goto fallback; - if 
(vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (has_phase) { - if (vcf_plan_copy_string(&cur, &str1[sample * widths[str1_idx]], - widths[str1_idx]) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_copy_string(&cur, &str2[sample * widths[str2_idx]], - widths[str2_idx]) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - } - if (pl_w == 3) { - if (vcf_plan_parse_int_vector3_counted_range(&cur, &pl[sample * 3], &nread, &pl_range) < 0) - goto fallback; - } else if (pl_w == 6) { - if (vcf_plan_parse_int_vector6_counted_range(&cur, &pl[sample * 6], &nread, &pl_range) < 0) - goto fallback; - } else if (pl_w == 10) { - if (vcf_plan_parse_int_vector10_counted_range(&cur, &pl[sample * 10], &nread, &pl_range) < 0) - goto fallback; - } else if (vcf_plan_parse_int_vector_counted_range(&cur, &pl[sample * pl_w], pl_w, &nread, &pl_range) < 0) { - goto fallback; - } - if (max_pl_count < nread) - max_pl_count = nread; - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - goto fallback; - } - if (sample != nsamples) - goto fallback; - if (max_ad_count != ad_w || max_pl_count != pl_w) - goto fallback; - - v->n_fmt = plan->n_ops; - v->n_sample = nsamples; - bcf_enc_int1(&v->indiv, plan->ops[ad_idx].key); - nwords = nsamples * ad_w; - if (bcf_enc_vint_known_range_special(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max, - ad_range.has_special) < 0) - goto error; - bcf_enc_int1(&v->indiv, plan->ops[dp_idx].key); - if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max, - dp_range.has_special) < 0) - goto error; - bcf_enc_int1(&v->indiv, plan->ops[gq_idx].key); - if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max, - gq_range.has_special) < 0) - goto error; - if (has_phase) { - bcf_enc_int1(&v->indiv, plan->ops[str1_idx].key); - if (bcf_enc_size(&v->indiv, widths[str1_idx], BCF_BT_CHAR) < 0 
|| - kputsn(str1, (size_t) nsamples * widths[str1_idx], &v->indiv) < 0) - goto error; - bcf_enc_int1(&v->indiv, plan->ops[str2_idx].key); - if (bcf_enc_size(&v->indiv, widths[str2_idx], BCF_BT_CHAR) < 0 || - kputsn(str2, (size_t) nsamples * widths[str2_idx], &v->indiv) < 0) - goto error; - } - bcf_enc_int1(&v->indiv, plan->ops[pl_idx].key); - nwords = nsamples * pl_w; - if (bcf_enc_vint_known_range_special(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max, - pl_range.has_special) < 0) - goto error; - - vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += nsamples; - vcf_format_likelihood_shape_hits++; - return 0; - -fallback: - v->indiv.l = indiv_l0; - vcf_format_likelihood_shape_fallback++; - return -4; -error: - v->indiv.l = indiv_l0; - return -1; -} - -static int vcf_parse_format_general_likelihood_strict(kstring_t *s, - const bcf_hdr_t *h, - bcf1_t *v, - const vcf_format_general_plan_t *plan, - char *q, int *attempted_shape) -{ - int widths[MAX_N_FMT]; - - if (attempted_shape) - *attempted_shape = 0; - if (vcf_format_general_likelihood_widths(s, h, plan, v, q, widths) < 0) - return -4; - if (attempted_shape) - *attempted_shape = 1; - return vcf_parse_format_general_likelihood_shape(s, h, v, plan, q, widths); -} - static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, From b43c38385cea6ec55455fa05b3e14606cb4fb841 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 18:17:14 +0200 Subject: [PATCH 24/38] Compact composable FORMAT row widths --- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 29 +++++++++++++++ vcf.c | 37 ++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index ceae4fe27..2bfd51f30 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -496,3 +496,32 @@ 
All exact and interp outputs compared byte-identical to baseline. | Mixed row-local fallbacks | 2.25 s | 1.76 s | 1.94 s | byte-clean fallback path | | GT-first reordered negative | 1.77 s | 1.47 s | 1.44 s | composable path slightly ahead | | Two-string float negative | 2.29 s | 2.55 s | 2.55 s | planned path still slower than baseline | + +## 2026-04-29 Underfilled Vector Compaction + +Added a composable per-op fallback reduction for fixed-width vector fields. If +an `INT2`/`INT3`/`INTN`/`FLOATN` op was parsed into the conservative +header-derived width, but the row's observed maximum vector count is smaller, +the executor now compacts that field's scratch buffer to the observed row width +and encodes it directly instead of falling back for the whole row. + +This keeps the fallback boundary for unsupported/malformed data, but avoids +production fallback for byte-identical rows where the production parser would +also emit a narrower BCF vector width. + +Latest full large-corpus run remained byte-identical to baseline. The main +effect is fallback reduction on mixed row-local cases: + +| Input | Exact user | Dynamic interp user | Dynamic hits/fallback | +|---|---:|---:|---:| +| CCDG 10k | 1.50 s | 2.28 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 9.08 s | 8.89 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 2.66 s | 3.76 s | 20,000 / 0 | +| Large multiallelic likelihood | 1.90 s | 2.78 s | 16,000 / 0 | +| Variable phase widths | 1.97 s | 2.50 s | 12,000 / 0 | +| Mixed row-local fallbacks | 1.75 s | 1.86 s | 12,000 / 0 | +| GT-first reordered negative | 1.43 s | 1.41 s | 12,000 / 0 | + +The attempted pointer-increment / reduced-bookkeeping hot-loop rewrite was +tested separately and reverted because it slowed the targeted likelihood-heavy +benchmarks despite remaining byte-correct. 
diff --git a/vcf.c b/vcf.c index 9621181f0..4c03fec05 100644 --- a/vcf.c +++ b/vcf.c @@ -4313,6 +4313,33 @@ static int vcf_format_general_composable_supported(const vcf_format_row_op_t *ro return 1; } +static int vcf_format_row_can_compact(const vcf_format_row_op_t *op) +{ + return op->kind == VCF_FORMAT_ROW_INT2 || + op->kind == VCF_FORMAT_ROW_INT3 || + op->kind == VCF_FORMAT_ROW_INTN || + op->kind == VCF_FORMAT_ROW_FLOATN; +} + +static void vcf_format_compact_row_op(kstring_t *mem, int nsamples, + vcf_format_row_op_t *op, int width) +{ + size_t elem_size = op->kind == VCF_FORMAT_ROW_FLOATN ? sizeof(float) : sizeof(int32_t); + size_t old_stride = (size_t) op->width * elem_size; + size_t new_stride = (size_t) width * elem_size; + char *base = mem->s + op->offset; + int sample; + + for (sample = 1; sample < nsamples; sample++) + memmove(base + sample * new_stride, base + sample * old_stride, new_stride); + op->width = width; + op->size = (int)new_stride; + if (op->kind == VCF_FORMAT_ROW_INT2 || op->kind == VCF_FORMAT_ROW_INT3) + op->kind = width == 1 ? VCF_FORMAT_ROW_INT1 : + width == 2 ? VCF_FORMAT_ROW_INT2 : + width == 3 ? 
VCF_FORMAT_ROW_INT3 : VCF_FORMAT_ROW_INTN; +} + static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const vcf_format_general_plan_t *plan, bcf1_t *v, char *q, int *widths) @@ -4527,9 +4554,15 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, } if (sample != nsamples) goto fallback; - for (j = 0; j < plan->n_ops; j++) - if (max_counts[j] != row_ops[j].width) + for (j = 0; j < plan->n_ops; j++) { + if (max_counts[j] <= 0 || max_counts[j] > row_ops[j].width) goto fallback; + if (max_counts[j] < row_ops[j].width) { + if (!vcf_format_row_can_compact(&row_ops[j])) + goto fallback; + vcf_format_compact_row_op(mem, nsamples, &row_ops[j], max_counts[j]); + } + } v->n_fmt = plan->n_ops; v->n_sample = nsamples; From b96d9823a3f4029cef45385a74af868d855ad520 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 18:39:58 +0200 Subject: [PATCH 25/38] Trim FORMAT parser to dynamic path --- bench/format-shape/README.md | 12 +- bench/format-shape/scripts/run_bench.sh | 33 +- docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 175 ++--- docs/DYNAMIC_FORMAT_PLAN_README.md | 73 ++ ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 84 +- docs/FORMAT_PLAN_SPEC.md | 164 ++-- test/test_view.c | 22 - vcf.c | 722 +++--------------- 8 files changed, 388 insertions(+), 897 deletions(-) create mode 100644 docs/DYNAMIC_FORMAT_PLAN_README.md diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index ea69f9f9b..e1d440f5e 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -14,7 +14,7 @@ bench/format-shape/ large/ meaningful multi-second benchmark inputs/results scripts/make_synthetic.pl deterministic synthetic VCF generator scripts/make_large_synthetic.pl - scripts/run_bench.sh baseline/exact/interp timing and cmp runner + scripts/run_bench.sh baseline/plan/interp timing and cmp runner results/ generated timing logs and BCF outputs ``` @@ -95,11 +95,13 @@ KEEP_OUTPUTS=0 
OUTDIR=bench/format-shape/large/results \ `KEEP_OUTPUTS=0` still writes temporary BCF files and compares them with `cmp`, but deletes the large BCF outputs after each input is checked. -The script runs each input in three modes: +The script runs each input in three modes. `plan` and `interp` both use the +dynamic per-tag FORMAT planner; both are kept so old comparisons can still check +the `HTS_VCF_FORMAT_PLAN=1` spelling against the explicit dynamic spelling. ```text baseline: HTS_VCF_FORMAT_PLAN=0 -exact: HTS_VCF_FORMAT_PLAN=1 +plan: HTS_VCF_FORMAT_PLAN=1 interp: HTS_VCF_FORMAT_PLAN=interp ``` @@ -110,7 +112,7 @@ bench/format-shape/results/timings.tsv bench/format-shape/results/checks.tsv ``` -`checks.tsv` compares exact and interp BCF output against baseline with `cmp`. +`checks.tsv` compares plan and interp BCF output against baseline with `cmp`. ## Large Corpus @@ -140,4 +142,4 @@ bench/format-shape/large/results/timings.tsv bench/format-shape/large/results/checks.tsv ``` -All exact and interp outputs in that run compared byte-identical to baseline. +All plan and interp outputs in that run compared byte-identical to baseline. 
diff --git a/bench/format-shape/scripts/run_bench.sh b/bench/format-shape/scripts/run_bench.sh index bb88fe439..7fad74863 100755 --- a/bench/format-shape/scripts/run_bench.sh +++ b/bench/format-shape/scripts/run_bench.sh @@ -10,16 +10,16 @@ mkdir -p "$outdir" timings="$outdir/timings.tsv" checks="$outdir/checks.tsv" -printf 'name\tmode\treal\tuser\tsys\tattempts\thits\tfallback\tparsed_samples\tshape_attempts\tshape_hits\tshape_fallback\n' > "$timings" +printf 'name\tmode\treal\tuser\tsys\tattempts\thits\tfallback\tparsed_samples\n' > "$timings" printf 'name\tcomparison\tstatus\n' > "$checks" tail -n +2 "$inputs" | while IFS=' ' read -r name path source do base_out="$outdir/$name.baseline.bcf" - exact_out="$outdir/$name.exact.bcf" + plan_out="$outdir/$name.plan.bcf" interp_out="$outdir/$name.interp.bcf" - for mode in baseline exact interp + for mode in baseline plan interp do err="$outdir/$name.$mode.stderr" out="$outdir/$name.$mode.bcf" @@ -27,12 +27,12 @@ do baseline) env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" ;; - exact) - env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 HTS_VCF_FORMAT_PLAN_SHAPE_STATS=1 \ + plan) + env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 \ /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" ;; interp) - env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 HTS_VCF_FORMAT_PLAN_SHAPE_STATS=1 \ + env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 \ /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" ;; esac @@ -50,27 +50,18 @@ do else if (kv[1] == "parsed_samples") parsed=kv[2] } } - /^vcf-format-likelihood-shape / { - for (i=1; i<=NF; i++) { - split($i, kv, "=") - if (kv[1] == "attempts") shape_attempts=kv[2] - else if (kv[1] == "hits") shape_hits=kv[2] - else if (kv[1] == "fallback") shape_fallback=kv[2] - } - } END { - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", name, mode, 
real+0, user+0, sys+0, - attempts+0, hits+0, fallback+0, parsed+0, - shape_attempts+0, shape_hits+0, shape_fallback+0 + attempts+0, hits+0, fallback+0, parsed+0 } ' "$err" >> "$timings" done - if cmp "$base_out" "$exact_out" >/dev/null 2>&1; then - printf '%s\tbaseline_vs_exact\tok\n' "$name" >> "$checks" + if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then + printf '%s\tbaseline_vs_plan\tok\n' "$name" >> "$checks" else - printf '%s\tbaseline_vs_exact\tDIFF\n' "$name" >> "$checks" + printf '%s\tbaseline_vs_plan\tDIFF\n' "$name" >> "$checks" fi if cmp "$base_out" "$interp_out" >/dev/null 2>&1; then printf '%s\tbaseline_vs_interp\tok\n' "$name" >> "$checks" @@ -78,7 +69,7 @@ do printf '%s\tbaseline_vs_interp\tDIFF\n' "$name" >> "$checks" fi if [ "$keep_outputs" = 0 ]; then - rm -f "$base_out" "$exact_out" "$interp_out" + rm -f "$base_out" "$plan_out" "$interp_out" fi done diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 7a35c8ead..161111ec1 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -2,30 +2,34 @@ Date: 2026-04-29 -Worktree: `/tmp/htslib-vcf-avx-sanity` - Branch: `codex/vcf-avx-sanity` +This file is a checkpoint for the CCDG-oriented FORMAT parser work. Earlier +versions of the branch used handwritten exact CCDG kernels; those kernels have +now been removed. The current production candidate is dynamic-only. + ## Current Takeaway -The experimental FORMAT planner is viable, but the current large CCDG win comes -from the handwritten exact CCDG kernels, not yet from the fully dynamic -strict/interpreter path. +The dynamic FORMAT planner is byte-correct on the CCDG subset and larger FORMAT +benchmark corpus. It is no longer a narrow full-string kernel: it compiles each +FORMAT tag from header metadata and uses one composable executor for supported +tags. -The dynamic general planner is correct and modestly faster than baseline. 
It is -the path we want to improve next, using the exact kernels as a performance -oracle. +On the CCDG 10k subset, the dynamic-only path is faster than baseline but slower +than the historical handwritten exact kernels. On GT-only and several +reordered/synthetic workloads, the dynamic path is much closer to the previous +target and can be materially faster than baseline. ## Modes ```sh -HTS_VCF_FORMAT_PLAN=0 # baseline generic parser -HTS_VCF_FORMAT_PLAN=1 # exact CCDG kernels, then dynamic general fallback -HTS_VCF_FORMAT_PLAN=interp # dynamic general planner only +HTS_VCF_FORMAT_PLAN=0 # production parser +HTS_VCF_FORMAT_PLAN=1 # dynamic per-tag planner, then production fallback +HTS_VCF_FORMAT_PLAN=interp # same dynamic planner; retained for comparisons HTS_VCF_FORMAT_PLAN_STATS=1 # print planner counters from test/test_view ``` -## Data +## CCDG Data Source file: @@ -33,14 +37,7 @@ Source file: /Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` -Subset used for the current benchmark: - -```text -/tmp/ccdg_chr22_10k.vcf.gz -/tmp/ccdg_chr22_10k.bcf -``` - -The subset contains 10,000 variant records and 3,202 samples. The observed +The 10k subset contains 10,000 variant records and 3,202 samples. The observed FORMAT distribution is: | Records | FORMAT | @@ -50,117 +47,45 @@ FORMAT distribution is: | 813 | `GT:AD:DP:GQ:PL` | | 732 | `GT:AD:DP:GQ:PGT:PID:PL` | -The exact CCDG tier covers all four layouts. - -## Clean Sanity Rerun - -These numbers were rerun after noticing that an earlier table mislabeled the -dynamic/interpreter result. Timings are single wall-clock runs on the 10k CCDG -subset, so treat them as directional. 
- -| Mode | VCF.gz read-only | VCF.gz -> uncompressed BCF | -|---|---:|---:| -| Baseline | 2.58 s | 2.83 s | -| Exact + dynamic fallback | 1.61 s | 1.86 s | -| Dynamic general only | 2.34 s | 2.55 s | - -Planner counters on VCF.gz -> uncompressed BCF: - -| Mode | Attempts | Hits | Fallback | Parsed samples | -|---|---:|---:|---:|---:| -| Exact + dynamic fallback | 10,000 | 10,000 | 0 | 32,020,000 | -| Dynamic general only | 10,000 | 10,000 | 0 | 32,020,000 | - -Both planned modes are byte-identical against baseline in the sanity tests, but -the exact tier is much faster. - -## Broader Conversion Matrix - -Earlier single-run compressed conversion checks used `test/test_view` and -compared outputs byte-for-byte with `cmp`. - -| Conversion | Baseline | Exact + dynamic fallback | Dynamic general only | -|---|---:|---:|---:| -| VCF.gz -> BCF.gz | 8.73 s | 7.78 s | 8.58 s | -| BCF -> BCF.gz | 6.85 s | 6.92 s | 7.02 s | -| BCF -> VCF.gz | 11.18 s | 11.22 s | 11.15 s | -| VCF.gz -> VCF.gz | 13.26 s | 12.34 s | 13.01 s | -| VCF.gz -> uncompressed BCF | 2.83 s | 1.85 s | 2.58 s | - -BCF-input conversions are unchanged, as expected, because this optimization only -affects VCF text FORMAT parsing. +The current dynamic planner can compile these layouts from tag metadata rather +than matching the whole FORMAT string. -Threaded compressed output with `test_view -@ 4` makes the parser win visible -even for compressed-to-compressed workflows: +## Latest Large-Corpus Result -| Conversion | Baseline | Exact + dynamic fallback | Dynamic general only | -|---|---:|---:|---:| -| VCF.gz -> BCF.gz, `-@ 4` | 2.64 s | 2.03 s | 2.06 s | -| VCF.gz -> VCF.gz, `-@ 4` | 3.96 s | 3.03 s | 3.02 s | +The most recent post-trim run used: -The threaded dynamic-only numbers should be rerun before drawing strong -conclusions; the clean single-thread rerun shows dynamic-only is not yet at -exact-kernel speed. 
- -## Edge Fixture - -`./test/test_format_plan.sh` compares baseline, `HTS_VCF_FORMAT_PLAN=1`, and -`HTS_VCF_FORMAT_PLAN=interp` on `test/format-plan-edge.vcf`. - -Current output: - -```text -vcf-format-plan attempts=14 hits=11 fallback=3 parsed_samples=33 -vcf-format-plan attempts=14 hits=14 fallback=0 parsed_samples=42 -``` - -The first line is `HTS_VCF_FORMAT_PLAN=1`: exact kernels claim the CCDG-shaped -rows and intentionally fall back for rows outside their narrow shape. The -second line is dynamic-only: the general planner handles all 14 fixture rows. - -## Profiling Notes - -After `PGT:PID` support, the generic FORMAT fallback is no longer a meaningful -cost for the CCDG benchmark when exact kernels are enabled. A macOS `sample` -profile of VCF.gz -> uncompressed BCF on the 100k subset showed the next hot -areas inside the planned path: - -```text -vcf_plan_parse_int_vector 189 samples -libdeflate input decompress 158 samples -vcf_parse_format 154 samples -bcf_enc_vint 83 samples -vcf_plan_int_value 42 samples -vcf_plan_copy_string 33 samples -vcf_plan_gt2 27 samples -vcf_plan_float_value 24 samples -read 16 samples +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv ``` -This is statistical sampling, not exact cycle accounting. Directionally, the -next parser-side targets are integer-vector parsing, `PGT/PID` string handling, -per-sample dispatch, and repeated BCF integer encoding work. +All planned outputs compared byte-identical to baseline. 
-## Checkpoint Recommendation - -Commit this state as an honest experimental checkpoint: - -- keep the exact CCDG kernels because they establish the upper-bound target; -- keep the dynamic general planner and edge fixture because they are the path to - a general solution; -- keep benchmark docs explicit that dynamic-only is not yet the big win; -- do not open an upstream-facing PR until the dynamic executor closes more of - the gap or the PR is framed as an experimental CCDG-specialized prototype. +| Input | Baseline user | Dynamic `1` user | Dynamic `interp` user | Hits/fallback | +|---|---:|---:|---:|---:| +| CCDG 10k | 2.62 s | 2.25 s | 2.24 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 26.05 s | 7.98 s | 8.01 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.24 s | 3.78 s | 3.77 s | 20,000 / 0 | +| Large reordered likelihood | 3.00 s | 2.42 s | 2.44 s | 20,000 / 0 | +| Large multiallelic likelihood | 3.16 s | 2.73 s | 2.73 s | 16,000 / 0 | +| Large float/string | 2.93 s | 2.97 s | 2.97 s | 16,000 / 0 | +| Variable phase widths | 2.61 s | 2.50 s | 2.48 s | 12,000 / 0 | +| Mixed row-local fallbacks | 2.22 s | 1.87 s | 1.86 s | 12,000 / 0 | +| GT-first reordered negative | 1.75 s | 1.44 s | 1.45 s | 12,000 / 0 | +| Two-string float negative | 2.28 s | 2.56 s | 2.54 s | 12,000 / 0 | + +## Historical Note + +The removed exact kernels remain useful as a performance reference in old +benchmark logs, but they are no longer live code. New optimization work should +measure `HTS_VCF_FORMAT_PLAN=1` and `interp` as dynamic-only variants and should +compare both against `HTS_VCF_FORMAT_PLAN=0`. ## Next Work -The highest-value next step is to make a dynamic fixed-shape executor that -captures the exact-kernel benefits without matching on CCDG field names. The -target is exact-like speed for piecewise fixed FORMAT regions with quick -fallback when a row leaves the proven shape. 
- -An attempted bcftools rebuild against this htslib worktree failed at link time -because the sibling bcftools checkout expects `bcf_write_take_ownership`, which -is not present in this htslib worktree. Operation-level bcftools timings should -be rerun only after pairing this htslib branch with a matching bcftools revision -or porting that API. +- Reduce the CCDG fallback rate without introducing full-string special cases. +- Add selected-sample support so `keep_samples` does not force production + fallback. +- Lower per-op dispatch and scratch-buffer overhead on likelihood-shaped rows. +- Keep expanding edge fixtures when a new supported FORMAT tag or width pattern + is added. diff --git a/docs/DYNAMIC_FORMAT_PLAN_README.md b/docs/DYNAMIC_FORMAT_PLAN_README.md new file mode 100644 index 000000000..37c57a31b --- /dev/null +++ b/docs/DYNAMIC_FORMAT_PLAN_README.md @@ -0,0 +1,73 @@ +# Dynamic FORMAT Plan + +This branch adds an optional dynamic fast path for parsing VCF `FORMAT` sample +columns. The goal is to speed up common, header-described FORMAT layouts +without hardcoding exact full FORMAT strings such as `GT:AD:DP:GQ:PL`. + +## How It Works + +When `HTS_VCF_FORMAT_PLAN` is enabled, `vcf_parse_format()` first tries to +compile the record's literal FORMAT string into a small list of per-tag +operations. Compilation uses the active header, so the plan records each tag's +header key, type, declared number, and whether the row needs width measurement. + +The executor then parses samples with that op list and writes BCF's transposed +FORMAT layout. If compilation or row-local validation fails, it returns to the +existing production parser for the whole FORMAT column. + +Supported environment values: + +- unset or `0`: use the production parser only. +- `1`, `interp`, or `general`: use the dynamic plan, with production fallback. + +The old exact FORMAT kernels and optional SIMD tab-scanning front-end have been +removed. 
All enabled spellings now route through the same dynamic path. + +## Supported Cases + +The fast path is tag-composable rather than full-string-specialized. It can +handle subsets, reordered fields, and supersets when each tag is described by +header metadata that the executor supports. + +Currently supported FORMAT tag shapes: + +- `GT` declared as `Type=String,Number=1`, with simple diploid encodings on the + fast path. +- Integer fields with fixed `Number=N`, `Number=A`, `Number=R`, `Number=G`, or + bounded measured `Number=.` row widths. +- Float fields with the same number models as integer fields. +- String fields declared as `Type=String,Number=1`, measured per row. + +Examples that can use the dynamic path include `GT:AD`, `GT:AD:DP:PL`, +`GT:AB:AD:DP:GQ:PGT:PID:PL`, and reordered/superset layouts with additional +supported tags. + +## Fallback Behavior + +Fallback is intentionally whole-row for the MVP. The dynamic parser does not +mix optimized handling for some tags with production handling for other tags in +the same FORMAT column. This keeps BCF layout, warning behavior, and error +recovery aligned with the existing parser when a row is unusual. + +Known fallback cases include: + +- sample subsetting via `keep_samples`; +- undefined FORMAT tags that require production header repair; +- unsupported header types or number models; +- malformed sample separators or unexpected sample cardinality; +- row-local widths above the current bounded fast-path limit; +- GT encodings outside the simple fast-path representation. + +## Tests And Benchmarks + +Focused validation lives in `test/test_format_plan.sh`, including byte-for-byte +comparisons between production parsing and planned parsing on edge-case FORMAT +fixtures. + +The larger benchmark corpus lives under `bench/format-shape/large/`. The +benchmark script runs `baseline`, `plan`, and `interp` modes. `plan` is +`HTS_VCF_FORMAT_PLAN=1`; `interp` is the explicit dynamic spelling. 
After the +dynamic-only trim both enabled modes use the same dynamic executor. + +Latest documented results are in +`docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md`. diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index 2bfd51f30..5b91b8b81 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -6,36 +6,35 @@ Branch: `codex/vcf-avx-sanity` ## Goal -Make the general-purpose VCF FORMAT planned parser approach the handwritten -exact CCDG kernel speed without matching on field names. The planner should stay -general, but the hot executor should become shape-specialized once a repeated -FORMAT layout proves stable. +Make the general-purpose VCF FORMAT planned parser as fast as possible without +matching on field names. The planner should stay general and tag-composable, +while suspicious rows continue to fall back to the production parser. The production htslib parser remains the source of truth. Any optimized path must either emit byte-identical BCF or return `-3` and fall back. 
## Current Baseline -Known modes: +Current modes after the dynamic-only production trim: ```sh HTS_VCF_FORMAT_PLAN=0 # existing generic parser -HTS_VCF_FORMAT_PLAN=1 # exact CCDG kernels, then dynamic fallback -HTS_VCF_FORMAT_PLAN=interp # dynamic planner only +HTS_VCF_FORMAT_PLAN=1 # dynamic planner, then production fallback +HTS_VCF_FORMAT_PLAN=interp # same dynamic planner, explicit spelling HTS_VCF_FORMAT_PLAN_STATS=1 # counters from test/test_view ``` -Current 10k CCDG sanity timing: +Latest 10k CCDG timing from the large-corpus run: -| Mode | VCF.gz read-only | VCF.gz -> uncompressed BCF | -|---|---:|---:| -| Baseline | 2.58 s | 2.83 s | -| Exact + dynamic fallback | 1.61 s | 1.86 s | -| Dynamic general only | 2.34 s | 2.55 s | +| Mode | User time | +|---|---:| +| Baseline | 2.62 s | +| Dynamic `1` | 2.25 s | +| Dynamic `interp` | 2.24 s | -The performance target is the exact CCDG tier. The first milestone is not to -delete exact kernels, but to make a dynamic shape executor selected without tag -name special cases reach the same neighborhood. +Older sections below record the experimental path through exact kernels and a +dynamic shape tier. Those paths have been removed from live code; the final +section is the current production-trim state. ## Working Hypothesis @@ -149,9 +148,10 @@ test/format-plan-edge.vcf ## Current Scratch Notes -- `HTS_VCF_FORMAT_PLAN=interp` is the key mode for dynamic executor progress. -- Exact kernels should remain until dynamic-only is close enough to make them - redundant. +- `HTS_VCF_FORMAT_PLAN=1` and `HTS_VCF_FORMAT_PLAN=interp` now exercise the same + dynamic executor. +- Historical exact-kernel numbers remain useful only as a performance reference + in older benchmark notes. - Avoid hardcoding `AD`, `PL`, `DP`, `GQ`, `AB`, `PGT`, or `PID`; use their header-derived type/number/width instead. 
- CCDG-like FORMAT distributions are still the first target because they provide @@ -163,8 +163,8 @@ Implemented the first dynamic likelihood-shape executor in `vcf.c`. What changed: -- Added an optional `HTS_VCF_FORMAT_PLAN_SHAPE_STATS` counter path in - `test/test_view`. +- Added a temporary shape counter path in `test/test_view`; it was later removed + with the dynamic-only production trim. - Relaxed strict string handling so `Type=String,Number=1` FORMAT fields can be handled by planned parsing with row-local byte-width measurement. - Added a shape-specific width derivation for CCDG-like layouts where `AD` may @@ -525,3 +525,45 @@ effect is fallback reduction on mixed row-local cases: The attempted pointer-increment / reduced-bookkeeping hot-loop rewrite was tested separately and reverted because it slowed the targeted likelihood-heavy benchmarks despite remaining byte-correct. + +## 2026-04-29 Dynamic-Only Production Trim + +Removed the optional SIMD tab-scanning front-end and the old hardcoded exact +FORMAT kernels. The optimized FORMAT entry point is now: + +```text +HTS_VCF_FORMAT_PLAN enabled -> dynamic per-tag plan -> composable executor -> production fallback +``` + +`HTS_VCF_FORMAT_PLAN=1`, `interp`, and `general` all route through the same +dynamic executor. The benchmark harness now labels `HTS_VCF_FORMAT_PLAN=1` as +`plan`; older result directories may still contain a historical `exact` label, +but that is no longer a separate hardcoded kernel path. + +Source cleanup removed the SIMD probe/stat plumbing, SIMD intrinsics includes, +shape-stat plumbing, exact shape compiler/cache, exact phase-width pass, and +exact GT/AB/AD/DP/GQ/PL microkernel. Relative to `origin/develop`, the live +source delta after adding inline docs is 1,467 added lines in `vcf.c` plus 14 +added lines in `test/test_view.c`. 
+ +Latest full large-corpus run: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +All output comparisons remained byte-identical to baseline. + +| Input | Baseline user | Dynamic `1` user | Dynamic `interp` user | Hits/fallback | +|---|---:|---:|---:|---:| +| CCDG 10k | 2.62 s | 2.25 s | 2.24 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 26.05 s | 7.98 s | 8.01 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.24 s | 3.78 s | 3.77 s | 20,000 / 0 | +| Large reordered likelihood | 3.00 s | 2.42 s | 2.44 s | 20,000 / 0 | +| Large multiallelic likelihood | 3.16 s | 2.73 s | 2.73 s | 16,000 / 0 | +| Large float/string | 2.93 s | 2.97 s | 2.97 s | 16,000 / 0 | +| Variable phase widths | 2.61 s | 2.50 s | 2.48 s | 12,000 / 0 | +| Mixed row-local fallbacks | 2.22 s | 1.87 s | 1.86 s | 12,000 / 0 | +| GT-first reordered negative | 1.75 s | 1.44 s | 1.45 s | 12,000 / 0 | +| Two-string float negative | 2.28 s | 2.56 s | 2.54 s | 12,000 / 0 | diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 47c0509e1..42d34f014 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -1,92 +1,98 @@ # FORMAT Plan Parser Spec -This document describes the current experimental `HTS_VCF_FORMAT_PLAN` VCF -FORMAT parser and the direction for making it more general. +This document describes the current `HTS_VCF_FORMAT_PLAN` VCF FORMAT parser. +The older exact CCDG kernels and dynamic likelihood-shape tier were removed; +the optimized path is now a single dynamic per-tag planner with production +fallback. ## Goal -Keep the existing htslib FORMAT parser as the source of truth, but add -opportunistic fast paths for repeated FORMAT layouts. A fast path may only claim -a record when it can produce byte-identical BCF. Otherwise it must return `-3` -and let the existing parser handle the row. 
+Keep the existing htslib FORMAT parser as the source of truth, while adding an +opportunistic fast path for repeated, header-described FORMAT layouts. The fast +path may only claim a row when it can produce byte-identical BCF. Otherwise it +must return `-3` and let the existing parser handle the whole FORMAT column. ## Current Architecture -`HTS_VCF_FORMAT_PLAN=1` enables a tiered parser: +`HTS_VCF_FORMAT_PLAN` controls the planned parser: -1. Handwritten exact kernels for the four dominant CCDG FORMAT layouts: - `GT:AB:AD:DP:GQ:PL`, `GT:AD:DP:GQ:PL`, - `GT:AB:AD:DP:GQ:PGT:PID:PL`, and `GT:AD:DP:GQ:PGT:PID:PL`. -2. A dynamic general FORMAT planner keyed by the literal FORMAT column and - header pointer. It resolves field IDs/types once and executes row-specific - operations for GT, integer vectors, float vectors, and strings. -3. The existing generic htslib FORMAT parser for unsupported or suspicious - rows. +```text +unset / 0 production parser only +1 dynamic per-tag planner, then production fallback +interp/general same dynamic per-tag planner, then production fallback +``` + +All enabled spellings now use the same implementation. The benchmark harness +may still run both `1` and `interp` as separate modes, but they are intended to +match except for normal timing noise. -`HTS_VCF_FORMAT_PLAN=interp` or `HTS_VCF_FORMAT_PLAN=general` skips the exact -CCDG kernels and runs only the dynamic general planner. This mode is useful for -isolating how much performance the general approach has captured. +The planned parser has four stages: -## Measured State +1. Compile the literal FORMAT string and active header into a cached list of + per-tag operations. +2. Resolve row-local widths from header `Number` metadata, allele count, and a + bounded measurement pass for strings or `Number=.` numeric vectors. +3. Parse sample fields into BCF's transposed FORMAT layout with a composable + executor. +4. Fall back to the production FORMAT parser for unsupported or suspicious rows. 
-On the 10k CCDG subset, the exact tier is currently the large win. A clean -sanity rerun on 2026-04-29 showed: +## Supported Tags -| Mode | VCF.gz read-only | VCF.gz -> uncompressed BCF | -|---|---:|---:| -| Baseline | 2.58 s | 2.83 s | -| `HTS_VCF_FORMAT_PLAN=1` | 1.61 s | 1.86 s | -| `HTS_VCF_FORMAT_PLAN=interp` | 2.34 s | 2.55 s | +The planner is tag-composable rather than full-string-specialized. It can claim +layouts such as `GT:AD`, `GT:AD:DP:PL`, +`GT:AB:AD:DP:GQ:PGT:PID:PL`, reordered fields, and supersets when each tag has +supported header metadata. -The earlier docs overstated the dynamic strict/interpreter result. The dynamic -planner is correct and modestly faster than baseline, but it does not yet match -the handwritten CCDG kernels. +Supported FORMAT tag shapes: -The next development target is to move the exact-kernel advantages into a -dynamic shape executor so common fixed-format regions can get exact-like speed -without field-name-specific kernels. +- `GT` declared as `Type=String,Number=1`, with simple diploid encodings on the + fast path. +- Integer fields with fixed `Number=N`, `Number=A`, `Number=R`, `Number=G`, or + bounded measured `Number=.` row widths. +- Float fields with the same number models as integer fields. +- String fields declared as `Type=String,Number=1`, measured per row. + +Unsupported tags or unsupported row-local encodings fall back whole-row. ## Correctness Rules The planned parser must preserve these invariants: - No planned parsing while `h->keep_samples` is active. -- Header IDs and types are resolved before execution. -- Duplicate FORMAT tags use the generic parser. -- Undefined tags use the generic parser, preserving current dummy-header - behavior and warnings. -- GT encoding must match generic htslib phasing semantics, including haploid - genotypes, missing alleles, multidigit allele indexes, and VCF 4.4 prefix - phasing. +- Header IDs, types, and number models are resolved before execution. 
+- Duplicate FORMAT tags use the production parser. +- Undefined tags use the production parser, preserving dummy-header behavior and + warnings. +- GT encoding must match htslib phasing semantics; encodings outside the simple + fast path must force fallback. - Numeric vectors use observed or provably fixed row width and pad shorter samples with vector-end sentinels. - Strings use observed maximum byte length and zero-pad shorter samples. -- Integer and float overflow/error behavior must either match generic htslib or - force fallback. +- Integer and float overflow/error behavior must either match production htslib + or force fallback. - Any fast path that writes directly into `v->indiv` must save the original length and roll back before fallback. ## Dynamic Planner -The general planner compiles the literal FORMAT string into a cached op list. -After seeing a record, it resolves the ops to row-local opcodes such as `GT2`, -`GT`, `INT1`, `INT2`, `INT3`, `INTN`, `FLOAT1`, `FLOATN`, and `STR`. +The planner compiles the literal FORMAT string into cached opcodes keyed by +header pointer plus FORMAT text. Header-local ids and type metadata make plans +unsafe to share across headers. -For rows whose widths can be predicted from the header and allele count, the -planner first tries a strict numeric executor. That path validates shape while -parsing, carries integer min/max metadata into BCF integer encoding, and can -direct-write a leading `GT2`/`FLOAT1` prefix. If the row is sparse, stringy, -malformed, or otherwise not byte-identical, it falls back to the measured-width -general planner. +After seeing a record, it resolves the reusable op list to row-local operations +such as `GT2`, `INT1`, `INT2`, `INT3`, `INTN`, `FLOAT1`, `FLOATN`, and `STR`. +`Number=A`, `Number=R`, and `Number=G` widths come from the current allele +count. String and `Number=.` numeric widths are measured across the row before +execution. 
-Today, the strict/general path still has enough overhead that it trails the -handwritten CCDG kernels on the CCDG benchmark. Likely remaining gaps include -per-field dispatch, measured-width/string handling for `PGT/PID`, scratch-buffer -traffic, and generic encode costs. +The executor writes BCF's transposed FORMAT layout. Leading fixed-width +`GT2`/`FLOAT1` rows can be written directly into `v->indiv`; other rows are +staged in header scratch memory and encoded after sample parsing so integer +range and observed-width metadata are known. ## Guard Policy -Each cached exact/general plan has a small runtime guard: +Each cached dynamic plan has a small runtime guard: - attempts, hits, fallbacks, - consecutive miss streak, @@ -97,43 +103,23 @@ after eight consecutive misses, or after at least 128 attempts with more than 10% fallbacks. After 256 skipped records, the plan probes again so later fixed-format regions can recover the optimized path. -For exact CCDG kernels, a paused exact guard routes the row to the dynamic -general planner. For general plans, a paused strict guard skips directly to the -measured-width planner, and a paused general guard returns to legacy htslib -parsing. - -## Edge Fixture +## Tests -`test/format-plan-edge.vcf` is CCDG-shaped but includes awkward realistic rows: +`./test/test_format_plan.sh` writes BCF through: -- the exact CCDG layouts, -- reordered FORMAT fields, -- non-CCDG numeric tag names with fixed widths, -- integer values around BCF int8/int16 boundaries, -- multiallelic AD/PL and GL, -- haploid GT, -- multidigit allele indexes, -- fixed integer vectors, -- string FORMAT fields, -- exact-kernel fallbacks that the dynamic planner can still handle. - -Run: - -```sh -./test/test_format_plan.sh -``` +- the production parser, +- `HTS_VCF_FORMAT_PLAN=1`, +- `HTS_VCF_FORMAT_PLAN=interp`. -The script writes BCF through the generic parser, `HTS_VCF_FORMAT_PLAN=1`, and -`HTS_VCF_FORMAT_PLAN=interp`, then compares the outputs with `cmp`. 
+It compares the planned outputs against baseline with `cmp`. The fixtures cover +subsets, supersets, reordered fields, measured numeric fields, strings, +malformed header shapes, and deliberate row-local fallback cases. ## Next Work -- Make a dynamic fixed-shape executor that captures the CCDG exact-kernel wins - without matching on field names. -- Specialize common string-bearing shapes such as `PGT/PID` without baking in - CCDG tag names. -- Reduce per-sample opcode dispatch in hot FORMAT shapes. -- Expand direct final-buffer output only where BCF type selection remains - byte-identical or can cheaply roll back. -- Keep the exact kernels as a performance oracle while iterating, then remove - or demote them once the dynamic executor catches up. +- Add selected-sample support so `keep_samples` does not require whole-row + fallback. +- Reduce per-sample opcode dispatch in hot FORMAT layouts. +- Improve string and measured-width handling without losing byte identity. +- Consider a later executor-generation layer if generic per-op dispatch remains + the main gap to historical exact-kernel speed. diff --git a/test/test_view.c b/test/test_view.c index d7c221128..9608a8661 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -37,13 +37,9 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "../htslib/vcf.h" #include "../htslib/hts_log.h" -extern void hts_vcf_simd_probe_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, uint64_t *tabs); extern void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, uint64_t *fallback, uint64_t *parsed_samples); -extern void hts_vcf_format_plan_shape_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback); struct opts { char *fn_ref; @@ -439,15 +435,6 @@ int main(int argc, char *argv[]) if (p.pool) hts_tpool_destroy(p.pool); - if (getenv("HTS_VCF_SIMD_STATS")) { - uint64_t attempts = 0, hits = 0, fallback = 0, tabs = 0; - hts_vcf_simd_probe_stats(&attempts, &hits, &fallback, &tabs); - fprintf(stderr, - "vcf-simd-tabs attempts=%llu hits=%llu fallback=%llu tabs=%llu\n", - (unsigned long long) attempts, (unsigned long long) hits, - (unsigned long long) fallback, (unsigned long long) tabs); - } - if (getenv("HTS_VCF_FORMAT_PLAN_STATS")) { uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; hts_vcf_format_plan_stats(&attempts, &hits, &fallback, &parsed_samples); @@ -458,15 +445,6 @@ int main(int argc, char *argv[]) (unsigned long long) parsed_samples); } - if (getenv("HTS_VCF_FORMAT_PLAN_SHAPE_STATS")) { - uint64_t attempts = 0, hits = 0, fallback = 0; - hts_vcf_format_plan_shape_stats(&attempts, &hits, &fallback); - fprintf(stderr, - "vcf-format-likelihood-shape attempts=%llu hits=%llu fallback=%llu\n", - (unsigned long long) attempts, (unsigned long long) hits, - (unsigned long long) fallback); - } - if (fclose(stdout) != 0 && errno != EBADF) { fprintf(stderr, "Error closing standard output.\n"); exit_code = EXIT_FAILURE; diff --git a/vcf.c b/vcf.c index 4c03fec05..c12e67792 100644 --- a/vcf.c +++ b/vcf.c @@ -42,14 +42,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "fuzz_settings.h" #endif -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#include -#endif - -#if defined(__AVX2__) -#include -#endif - #include "htslib/vcf.h" #include "htslib/bgzf.h" #include "htslib/tbx.h" @@ -3000,12 +2992,6 @@ static int bcf_enc_vint_known_range_special(kstring_t *s, int n, int32_t *a, int return 0; } -static int bcf_enc_vint_known_range(kstring_t *s, int n, int32_t *a, int wsize, - int32_t min, int32_t max) -{ - return bcf_enc_vint_known_range_special(s, n, a, wsize, min, max, 1); -} - #ifdef VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; @@ -3227,6 +3213,24 @@ typedef struct { static vcf_format_plan_stats_t vcf_format_plan_stats; +/* + * Dynamic FORMAT fast path. + * + * The production FORMAT parser below is intentionally very permissive: it can + * repair missing header declarations, deal with sample subsetting, and recover + * from many odd row shapes. The fast path here only claims rows that can be + * described by the existing FORMAT header metadata and parsed as a fixed list + * of per-tag operations. If any compile-time or row-local invariant fails, it + * returns -3 to let the production parser handle the whole FORMAT column. + * + * HTS_VCF_FORMAT_PLAN controls the feature: + * unset/0 use production parser only + * 1/interp/general + * use the dynamic per-tag plan, with production fallback + * + * Older experimental exact kernels have been removed; all enabled spellings + * now route through the same dynamic planner/executor. 
+ */ void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, uint64_t *fallback, uint64_t *parsed_samples) { @@ -3236,14 +3240,6 @@ void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, if (parsed_samples) *parsed_samples = vcf_format_plan_stats.parsed_samples; } -void hts_vcf_format_plan_shape_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback) -{ - if (attempts) *attempts = 0; - if (hits) *hits = 0; - if (fallback) *fallback = 0; -} - static int vcf_format_plan_mode(void) { static int mode = -1; @@ -3323,23 +3319,11 @@ static inline void vcf_format_fast_guard_fallback(vcf_format_fast_guard_t *guard } typedef struct { - char format[64]; - const bcf_hdr_t *hdr; - int supported; - int has_ab; - int has_phase; - int key_gt; - int key_ab; - int key_ad; - int key_dp; - int key_gq; - int key_pgt; - int key_pid; - int key_pl; - vcf_format_fast_guard_t guard; -} vcf_format_plan_t; - -typedef struct { + /* + * Header-derived operation for one FORMAT tag. This is the reusable, + * record-independent part of the plan: the tag key, declared type, declared + * length model, and whether the row must measure the width before parsing. + */ int key; int number; uint8_t htype; @@ -3349,6 +3333,11 @@ typedef struct { } vcf_format_op_t; typedef struct { + /* + * Cache key is the literal FORMAT string plus header pointer. This keeps + * repeated records on the same FORMAT layout from rebuilding the per-tag + * op list while still respecting that key ids/types are header-local. + */ char format[256]; const bcf_hdr_t *hdr; int supported; @@ -3370,6 +3359,10 @@ typedef enum { } vcf_format_row_kind_t; typedef struct { + /* + * Row-local operation. Header Number=A/R/G and measured Number=. fields + * depend on the current record, so width/size/offset are resolved per row. 
+ */ int key; int width; int size; @@ -3389,97 +3382,27 @@ typedef struct { #define VCF_PLAN_ALWAYS_INLINE static inline #endif -static int vcf_format_plan_compile(const bcf_hdr_t *h, const char *format, - vcf_format_plan_t *plan) -{ - memset(plan, 0, sizeof(*plan)); - if (strlen(format) >= sizeof(plan->format)) - return 0; - strcpy(plan->format, format); - plan->hdr = h; - - if (strcmp(format, "GT:AB:AD:DP:GQ:PL") == 0) { - plan->supported = 1; - plan->has_ab = 1; - } else if (strcmp(format, "GT:AD:DP:GQ:PL") == 0) { - plan->supported = 1; - } else if (strcmp(format, "GT:AB:AD:DP:GQ:PGT:PID:PL") == 0) { - plan->supported = 1; - plan->has_ab = 1; - plan->has_phase = 1; - } else if (strcmp(format, "GT:AD:DP:GQ:PGT:PID:PL") == 0) { - plan->supported = 1; - plan->has_phase = 1; - } else { - return 0; - } - - plan->key_gt = bcf_hdr_id2int(h, BCF_DT_ID, "GT"); - plan->key_ad = bcf_hdr_id2int(h, BCF_DT_ID, "AD"); - plan->key_dp = bcf_hdr_id2int(h, BCF_DT_ID, "DP"); - plan->key_gq = bcf_hdr_id2int(h, BCF_DT_ID, "GQ"); - plan->key_pl = bcf_hdr_id2int(h, BCF_DT_ID, "PL"); - plan->key_ab = plan->has_ab ? bcf_hdr_id2int(h, BCF_DT_ID, "AB") : -1; - plan->key_pgt = plan->has_phase ? bcf_hdr_id2int(h, BCF_DT_ID, "PGT") : -1; - plan->key_pid = plan->has_phase ? 
bcf_hdr_id2int(h, BCF_DT_ID, "PID") : -1; - if (plan->key_gt < 0 || plan->key_ad < 0 || plan->key_dp < 0 || - plan->key_gq < 0 || plan->key_pl < 0 || - (plan->has_ab && plan->key_ab < 0) || - (plan->has_phase && (plan->key_pgt < 0 || plan->key_pid < 0))) - plan->supported = 0; - if (plan->supported && - (bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_gt) != BCF_HT_STR || - bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_ad) != BCF_HT_INT || - bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_dp) != BCF_HT_INT || - bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_gq) != BCF_HT_INT || - bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_pl) != BCF_HT_INT || - bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_dp) != 1 || - bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_gq) != 1 || - (plan->has_ab && - (bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_ab) != BCF_HT_REAL || - bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_ab) != 1)) || - (plan->has_phase && - (bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_pgt) != BCF_HT_STR || - bcf_hdr_id2type(h, BCF_HL_FMT, plan->key_pid) != BCF_HT_STR || - bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_pgt) != 1 || - bcf_hdr_id2number(h, BCF_HL_FMT, plan->key_pid) != 1)))) - plan->supported = 0; - - return plan->supported; -} - -static vcf_format_plan_t *vcf_format_plan_get(const bcf_hdr_t *h, const char *format) -{ - enum { N_PLAN_CACHE = 8 }; - static vcf_format_plan_t cache[N_PLAN_CACHE]; - static int ncache = 0; - int i; - - for (i = 0; i < ncache; i++) - if (cache[i].hdr == h && strcmp(cache[i].format, format) == 0) - return cache[i].supported ? &cache[i] : NULL; - - if (ncache == N_PLAN_CACHE) - return NULL; - vcf_format_plan_compile(h, format, &cache[ncache]); - return cache[ncache++].supported ? 
&cache[ncache-1] : NULL; -} - static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *format, vcf_format_general_plan_t *plan) { char tmp[256], *tok, *saveptr = NULL; int i; - memset(plan, 0, sizeof(*plan)); - if (strlen(format) >= sizeof(plan->format)) - return 0; + memset(plan, 0, sizeof(*plan)); + if (strlen(format) >= sizeof(plan->format)) + return 0; strcpy(plan->format, format); strcpy(tmp, format); plan->hdr = h; - for (tok = strtok_r(tmp, ":", &saveptr); tok; - tok = strtok_r(NULL, ":", &saveptr)) { + /* + * Compile at tag granularity, not full FORMAT-shape granularity. This is + * what allows GT:AD, GT:AD:DP:PL, reordered fields, and supersets with + * additional header-described tags to share the same executor instead of + * needing exact string-specific kernels. + */ + for (tok = strtok_r(tmp, ":", &saveptr); tok; + tok = strtok_r(NULL, ":", &saveptr)) { int key, htype; if (plan->n_ops >= MAX_N_FMT) @@ -3491,11 +3414,17 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (plan->ops[i].key == key) return 0; - htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); - if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) - return 0; + htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); + if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) + return 0; - plan->ops[plan->n_ops].key = key; + /* + * Only compile tags with enough header information to reproduce the + * production BCF layout. Undefined tags and exotic types intentionally + * stay on the production parser, which can emit warnings and install + * dummy header records where appropriate. 
+ */ + plan->ops[plan->n_ops].key = key; plan->ops[plan->n_ops].number = bcf_hdr_id2number(h, BCF_HL_FMT, key); plan->ops[plan->n_ops].htype = htype; plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 0; @@ -4008,33 +3937,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_expect_sep(const char **sp, int sep) return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_skip_field(const char **sp, int sep) -{ - const char *s = *sp; - while (*s && *s != sep && *s != '\t') - s++; - if (*s != sep) - return -1; - *sp = s + 1; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_measure_string(const char **sp, int sep, int *max_l) -{ - const char *s = *sp, *t = s; - int l; - - while (*t && *t != sep && *t != '\t') - t++; - if (*t != sep) - return -1; - l = t - s; - if (*max_l < l) - *max_l = l; - *sp = t + 1; - return 0; -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int width) { const char *s = *sp, *t = s; @@ -4351,6 +4253,13 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const vcf_format_op_t *op = &plan->ops[j]; if (op->measured_width) { + /* + * Strings and Number=. numeric vectors need a first pass so the + * transposed FORMAT storage has one row-local stride. The bound is + * deliberately small; wide or malformed records fall back whole-row + * to the production parser rather than growing a second general + * allocator here. + */ widths[j] = 0; has_measured = 1; } else { @@ -4371,6 +4280,12 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const char *field = cur; int w = 1; + /* + * This pass validates the sample field separators at the same time + * as measuring widths. A single unexpected ':' or tab position is + * enough to reject the fast path, preserving production behavior for + * odd FORMAT/sample cardinality cases. 
+ */ while (cur < end && *cur && *cur != ':' && *cur != '\t') { if (op->measured_width && (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) && @@ -4437,6 +4352,13 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, if (!vcf_format_general_composable_supported(row_ops, plan->n_ops)) return -4; + /* + * The executor writes data in BCF's transposed FORMAT layout: all samples + * for FORMAT op 0, then all samples for op 1, etc. Leading fixed-width + * GT2/FLOAT1 rows can be written directly to v->indiv; the remaining rows + * are staged in h->mem so they can be parsed sample-major and encoded + * op-major once row-local ranges and widths are known. + */ for (j = 0; j < plan->n_ops; j++) { max_counts[j] = 0; direct_offsets[j] = 0; @@ -4492,6 +4414,11 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, uint8_t *buf = op_base[j] + sample * op_stride[j]; int n = op->width; + /* + * Each op parser consumes exactly one sample subfield and leaves cur + * on the following ':' or tab. Values that require production-only + * handling, such as non-simple GT encodings, return -4 via fallback. + */ switch (op->kind) { case VCF_FORMAT_ROW_GT2: if (vcf_plan_gt2_u8(&cur, buf) < 0) @@ -4558,6 +4485,12 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, if (max_counts[j] <= 0 || max_counts[j] > row_ops[j].width) goto fallback; if (max_counts[j] < row_ops[j].width) { + /* + * Production encodes fixed-width vector rows at the observed row + * maximum, not necessarily the conservative header-derived width. + * Compacting here avoids unnecessary whole-row fallback while + * keeping byte-identical BCF output. 
+ */ if (!vcf_format_row_can_compact(&row_ops[j])) goto fallback; vcf_format_compact_row_op(mem, nsamples, &row_ops[j], max_counts[j]); @@ -4575,6 +4508,10 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, return 0; fallback: + /* + * Only v->indiv is mutated by this executor before success is known. All + * scratch data lives in h->mem and can be overwritten by the fallback parse. + */ v->indiv.l = indiv_l0; return -4; error: @@ -4606,6 +4543,11 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, if (!plan) goto fallback; if (!vcf_format_fast_guard_enabled(&plan->general_guard)) { + /* + * If this FORMAT string repeatedly fails row-local validation, stop + * probing it for a short cooldown. This protects mixed or pathological + * files from paying fast-path setup cost on every record. + */ vcf_format_plan_stats.fallback++; return -3; } @@ -4628,247 +4570,28 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, return -3; } -static int vcf_plan_phase_widths(const bcf_hdr_t *h, const vcf_format_plan_t *plan, - kstring_t *s, char *q, int *pgt_w, int *pid_w) -{ - const char *cur = q + 1, *end = s->s + s->l; - int sample, nsamples = bcf_hdr_nsamples(h); - - *pgt_w = 0; - *pid_w = 0; - for (sample = 0; sample < nsamples && cur < end; sample++) { - if (vcf_plan_skip_field(&cur, ':') < 0) - return -1; - if (plan->has_ab && vcf_plan_skip_field(&cur, ':') < 0) - return -1; - if (vcf_plan_skip_field(&cur, ':') < 0) - return -1; - if (vcf_plan_skip_field(&cur, ':') < 0) - return -1; - if (vcf_plan_skip_field(&cur, ':') < 0) - return -1; - if (vcf_plan_measure_string(&cur, ':', pgt_w) < 0) - return -1; - if (vcf_plan_measure_string(&cur, ':', pid_w) < 0) - return -1; - while (cur < end && *cur && *cur != '\t') - cur++; - if (*cur == '\t') - cur++; - } - if (sample != nsamples) - return -1; - // The generic FORMAT max-length pass includes the preceding ':' in - // non-GT string widths, 
leaving one byte of padding per sample. - (*pgt_w)++; - (*pid_w)++; - return 0; -} - static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { - vcf_format_plan_t *plan = NULL; - kstring_t *mem; - int nsamples, ad_w, pl_w, sample, nwords, pgt_w = 0, pid_w = 0; - int max_ad_count = 0, max_pl_count = 0; - vcf_plan_int_range_t ad_range, dp_range, gq_range, pl_range; - size_t ad_off, dp_off, gq_off, pgt_off = 0, pid_off = 0, pl_off, total_bytes; - size_t indiv_l0, gt8_off, ab_le_off = 0; - uint8_t *gt8, *ab_le = NULL; - int32_t *ad, *dp, *gq, *pl; - char *pgt = NULL, *pid = NULL; - const char *cur, *end; int plan_mode; - indiv_l0 = v->indiv.l; - plan_mode = vcf_format_plan_mode(); - if (!plan_mode) - return -3; - vcf_format_plan_stats.attempts++; - if (h->keep_samples) - goto fallback; - - if (plan_mode == 2) - return vcf_parse_format_general_planned(s, h, v, p, q); - plan = vcf_format_plan_get(h, p); - if (!plan) - return vcf_parse_format_general_planned(s, h, v, p, q); - if (!vcf_format_fast_guard_enabled(&plan->guard)) - return vcf_parse_format_general_planned(s, h, v, p, q); - - nsamples = bcf_hdr_nsamples(h); - if (!nsamples) - return 0; - if (v->n_allele < 1 || v->n_allele > 8) - goto fallback; - ad_w = v->n_allele; - pl_w = v->n_allele * (v->n_allele + 1) / 2; - if (pl_w < 1 || pl_w > 36) - goto fallback; - if (plan->has_phase && vcf_plan_phase_widths(h, plan, s, q, &pgt_w, &pid_w) < 0) - goto fallback; - vcf_plan_int_range_init(&ad_range); - vcf_plan_int_range_init(&dp_range); - vcf_plan_int_range_init(&gq_range); - vcf_plan_int_range_init(&pl_range); - - mem = (kstring_t*)&h->mem; - mem->l = 0; - if (align_mem(mem) < 0) - return -1; - - bcf_enc_int1(&v->indiv, plan->key_gt); - if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) - return -1; - gt8_off = v->indiv.l; - v->indiv.l += (size_t)nsamples * 2; - if (plan->has_ab) { - bcf_enc_int1(&v->indiv, 
plan->key_ab); - if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) - return -1; - ab_le_off = v->indiv.l; - v->indiv.l += (size_t)nsamples * sizeof(float); - } - gt8 = (uint8_t *)v->indiv.s + gt8_off; - if (plan->has_ab) - ab_le = (uint8_t *)v->indiv.s + ab_le_off; - - total_bytes = (size_t) nsamples * (ad_w + 1 + 1 + pl_w) * sizeof(int32_t); - total_bytes += (size_t) nsamples * (pgt_w + pid_w); - if (total_bytes > INT_MAX) - return -1; - if (ks_resize(mem, mem->l + total_bytes) < 0) - return -1; - - ad_off = mem->l; mem->l += (size_t) nsamples * ad_w * sizeof(int32_t); - dp_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); - gq_off = mem->l; mem->l += (size_t) nsamples * sizeof(int32_t); - if (plan->has_phase) { - pgt_off = mem->l; mem->l += (size_t) nsamples * pgt_w; - pid_off = mem->l; mem->l += (size_t) nsamples * pid_w; - } - pl_off = mem->l; mem->l += (size_t) nsamples * pl_w * sizeof(int32_t); - - ad = (int32_t *) (mem->s + ad_off); - dp = (int32_t *) (mem->s + dp_off); - gq = (int32_t *) (mem->s + gq_off); - if (plan->has_phase) { - pgt = mem->s + pgt_off; - pid = mem->s + pid_off; - } - pl = (int32_t *) (mem->s + pl_off); - - cur = q + 1; - end = s->s + s->l; - for (sample = 0; sample < nsamples && cur < end; sample++) { - int nread; - if (vcf_plan_gt2_u8(&cur, &gt8[sample * 2]) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (plan->has_ab) { - float ab_val; - if (vcf_plan_float_value(&cur, &ab_val) < 0) - goto fallback; - float_to_le(ab_val, ab_le + (size_t)sample * sizeof(float)); - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - } - if (ad_w == 2) { - if (vcf_plan_parse_int_vector2_counted_range(&cur, &ad[sample * 2], &nread, &ad_range) < 0) - goto fallback; - } else if (vcf_plan_parse_int_vector_counted_range(&cur, &ad[sample * ad_w], ad_w, &nread, &ad_range) < 0) { - goto fallback; - } - if (max_ad_count < nread) -
max_ad_count = nread; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_int_value_range(&cur, &dp[sample], &dp_range) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_int_value_range(&cur, &gq[sample], &gq_range) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (plan->has_phase) { - if (vcf_plan_copy_string(&cur, &pgt[sample * pgt_w], pgt_w) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - if (vcf_plan_copy_string(&cur, &pid[sample * pid_w], pid_w) < 0) - goto fallback; - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - } - if (pl_w == 3) { - if (vcf_plan_parse_int_vector3_counted_range(&cur, &pl[sample * 3], &nread, &pl_range) < 0) - goto fallback; - } else if (vcf_plan_parse_int_vector_counted_range(&cur, &pl[sample * pl_w], pl_w, &nread, &pl_range) < 0) { - goto fallback; - } - if (max_pl_count < nread) - max_pl_count = nread; - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - goto fallback; - } - if (sample != nsamples) - goto fallback; - if (max_ad_count != ad_w || max_pl_count != pl_w) - goto fallback; - - v->n_fmt = plan->has_phase ? (plan->has_ab ? 8 : 7) : (plan->has_ab ? 
6 : 5); - v->n_sample = nsamples; - bcf_enc_int1(&v->indiv, plan->key_ad); - nwords = nsamples * ad_w; - if (bcf_enc_vint_known_range_special(&v->indiv, nwords, ad, ad_w, ad_range.min, ad_range.max, - ad_range.has_special) < 0) - return -1; - bcf_enc_int1(&v->indiv, plan->key_dp); - if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, dp, 1, dp_range.min, dp_range.max, - dp_range.has_special) < 0) - return -1; - bcf_enc_int1(&v->indiv, plan->key_gq); - if (bcf_enc_vint_known_range_special(&v->indiv, nsamples, gq, 1, gq_range.min, gq_range.max, - gq_range.has_special) < 0) - return -1; - if (plan->has_phase) { - bcf_enc_int1(&v->indiv, plan->key_pgt); - if (bcf_enc_size(&v->indiv, pgt_w, BCF_BT_CHAR) < 0) - return -1; - if (kputsn(pgt, (size_t) nsamples * pgt_w, &v->indiv) < 0) - return -1; - bcf_enc_int1(&v->indiv, plan->key_pid); - if (bcf_enc_size(&v->indiv, pid_w, BCF_BT_CHAR) < 0) - return -1; - if (kputsn(pid, (size_t) nsamples * pid_w, &v->indiv) < 0) - return -1; + plan_mode = vcf_format_plan_mode(); + if (!plan_mode) + return -3; + vcf_format_plan_stats.attempts++; + if (h->keep_samples) { + /* + * Sample filtering/subsetting changes FORMAT column cardinality and + * error handling in ways this MVP does not yet model. Keep it on the + * production parser until the dynamic executor has explicit support for + * selected-sample writes. + */ + vcf_format_plan_stats.fallback++; + return -3; } - bcf_enc_int1(&v->indiv, plan->key_pl); - nwords = nsamples * pl_w; - if (bcf_enc_vint_known_range_special(&v->indiv, nwords, pl, pl_w, pl_range.min, pl_range.max, - pl_range.has_special) < 0) - return -1; - - vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += nsamples; - vcf_format_fast_guard_success(&plan->guard); - return 0; -fallback: - v->indiv.l = indiv_l0; - if (plan) - vcf_format_fast_guard_fallback(&plan->guard); - vcf_format_plan_stats.fallback++; - return -3; + /* All enabled modes now use the same dynamic per-tag plan. 
*/ + return vcf_parse_format_general_planned(s, h, v, p, q); } // detect FORMAT "." @@ -5728,230 +5451,6 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p return -1; } -typedef struct { - uint64_t attempts; - uint64_t hits; - uint64_t fallback; - uint64_t tabs; -} vcf_simd_probe_stats_t; - -static vcf_simd_probe_stats_t vcf_simd_probe_stats; - -static int vcf_simd_tabs_enabled(void) -{ - static int enabled = -1; - if (enabled < 0) { - const char *env = getenv("HTS_VCF_SIMD_TABS"); - enabled = env && env[0] && strcmp(env, "0") != 0; - } - return enabled; -} - -void hts_vcf_simd_probe_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, uint64_t *tabs) -{ - if (attempts) *attempts = vcf_simd_probe_stats.attempts; - if (hits) *hits = vcf_simd_probe_stats.hits; - if (fallback) *fallback = vcf_simd_probe_stats.fallback; - if (tabs) *tabs = vcf_simd_probe_stats.tabs; -} - -static int vcf_find_tabs_scalar(const char *s, size_t len, - size_t *tabs, int max_tabs) -{ - int n = 0; - size_t i; - for (i = 0; i < len && n < max_tabs; i++) { - if (s[i] == '\t') - tabs[n++] = i; - } - return n; -} - -static int vcf_find_tabs_simd(const char *s, size_t len, - size_t *tabs, int max_tabs) -{ - int n = 0; - size_t i = 0; - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - const uint8x16_t tab = vdupq_n_u8('\t'); - for (; i + 16 <= len && n < max_tabs; i += 16) { - uint8x16_t bytes = vld1q_u8((const uint8_t *) s + i); - uint8x16_t eq = vceqq_u8(bytes, tab); - uint64_t lo = vgetq_lane_u64(vreinterpretq_u64_u8(eq), 0); - uint64_t hi = vgetq_lane_u64(vreinterpretq_u64_u8(eq), 1); - uint32_t mask = 0; - int j; - - for (j = 0; j < 8; j++) mask |= ((lo >> (j * 8)) & 0x80) ? 1u << j : 0; - for (j = 0; j < 8; j++) mask |= ((hi >> (j * 8)) & 0x80) ? 
1u << (j + 8) : 0; - - while (mask && n < max_tabs) { - unsigned bit = (unsigned) __builtin_ctz(mask); - tabs[n++] = i + bit; - mask &= mask - 1; - } - } -#elif defined(__AVX2__) - const __m256i tab = _mm256_set1_epi8('\t'); - for (; i + 32 <= len && n < max_tabs; i += 32) { - __m256i bytes = _mm256_loadu_si256((const __m256i *) (s + i)); - uint32_t mask = (uint32_t) _mm256_movemask_epi8(_mm256_cmpeq_epi8(bytes, tab)); - while (mask && n < max_tabs) { - unsigned bit = (unsigned) __builtin_ctz(mask); - tabs[n++] = i + bit; - mask &= mask - 1; - } - } -#endif - - for (; i < len && n < max_tabs; i++) { - if (s[i] == '\t') - tabs[n++] = i; - } - - return n; -} - -#define VCF_NOT_DOT_FIELD(p) (memcmp((p), ".\0", 2)) - -static int vcf_parse_simd_tabs(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) -{ - int ret = -2, overflow = 0, ntabs, i; - size_t tabs[9], line_end; - char *base, *p, *q, *r, *t; - kstring_t *str; - khint_t k; - vdict_t *d; - - if (!vcf_simd_tabs_enabled()) - return -3; - - vcf_simd_probe_stats.attempts++; - if (!s || !h || !v || !(s->s)) - return -3; - if (ks_resize(s, s->l + 4) < 0) - return -2; - - base = s->s; - line_end = s->l; - ntabs = vcf_find_tabs_simd(base, line_end, tabs, 9); - vcf_simd_probe_stats.tabs += ntabs; - if (ntabs < 7) { - vcf_simd_probe_stats.fallback++; - return -3; - } - - s->s[s->l + 0] = 0; - s->s[s->l + 1] = 0; - s->s[s->l + 2] = 0; - s->s[s->l + 3] = 0; - - bcf_clear1(v); - str = &v->shared; - for (i = 0; i < 7; i++) - base[tabs[i]] = 0; - - p = base; - d = (vdict_t*)h->dict[BCF_DT_CTG]; - k = kh_get(vdict, d, p); - if (k == kh_end(d)) { - hts_log_warning("Contig '%s' is not defined in the header. 
(Quick workaround: index the file with tabix.)", p); - v->errcode = BCF_ERR_CTG_UNDEF; - if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { - hts_log_error("Could not add dummy header for contig '%s'", p); - v->errcode |= BCF_ERR_CTG_INVALID; - goto err; - } - } - v->rid = kh_val(d, k).id; - - p = base + tabs[0] + 1; - overflow = 0; - t = p; - v->pos = hts_str2uint(p, &p, 62, &overflow); - if (overflow) { - hts_log_error("Position value '%s' is too large", t); - goto err; - } else if (*p) { - hts_log_error("Could not parse the position '%s'", t); - goto err; - } else { - v->pos -= 1; - } - if (v->pos >= INT32_MAX) - v->unpacked |= BCF_IS_64BIT; - - p = base + tabs[1] + 1; - q = base + tabs[2]; - if (VCF_NOT_DOT_FIELD(p)) bcf_enc_vchar(str, q - p, p); - else bcf_enc_size(str, 0, BCF_BT_CHAR); - - p = base + tabs[2] + 1; - q = base + tabs[3]; - bcf_enc_vchar(str, q - p, p); - v->n_allele = 1, v->rlen = q - p; - - p = base + tabs[3] + 1; - q = base + tabs[4]; - if (VCF_NOT_DOT_FIELD(p)) { - for (r = t = p;; ++r) { - if (*r == ',' || *r == 0) { - if (v->n_allele == UINT16_MAX) { - hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, - bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_LIMITS; - goto err; - } - bcf_enc_vchar(str, r - t, t); - t = r + 1; - ++v->n_allele; - } - if (r == q) break; - } - } - - p = base + tabs[4] + 1; - if (VCF_NOT_DOT_FIELD(p)) v->qual = atof(p); - else bcf_float_set_missing(v->qual); - if (v->max_unpack && !(v->max_unpack>>1)) goto end; - - p = base + tabs[5] + 1; - q = base + tabs[6]; - if (VCF_NOT_DOT_FIELD(p)) { - if (vcf_parse_filter(str, h, v, p, q)) - goto err; - } else bcf_enc_vint(str, 0, 0, -1); - if (v->max_unpack && !(v->max_unpack>>2)) goto end; - - p = base + tabs[6] + 1; - q = ntabs > 7 ? 
base + tabs[7] : base + line_end; - if (ntabs > 7) - *q = 0; - if (VCF_NOT_DOT_FIELD(p)) { - if (vcf_parse_info(str, h, v, p, q)) - goto err; - } - if (v->max_unpack && !(v->max_unpack>>3)) goto end; - - if (ntabs > 7) { - p = base + tabs[7] + 1; - q = ntabs > 8 ? base + tabs[8] : base + line_end; - *q = 0; - if (vcf_parse_format(s, h, v, p, q)) - goto err; - } - - end: - v->rlen = get_rlen(h, v); - ret = 0; - vcf_simd_probe_stats.hits++; - - err: - return ret; -} - int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { int ret = -2, overflow = 0; @@ -5969,11 +5468,6 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (!s || !h || !v || !(s->s)) return ret; - ret = vcf_parse_simd_tabs(s, h, v); - if (ret != -3) - return ret; - ret = -2; - // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); From cb025b18c88d0fbcaf830755fd7dfdc07a805502 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 18:56:07 +0200 Subject: [PATCH 26/38] Add threaded bcftools FORMAT benchmarks --- bench/format-shape/README.md | 54 +++++++++++--- bench/format-shape/large/threaded-inputs.tsv | 3 + .../scripts/run_bcftools_bench.sh | 63 ++++++++++++++++ bench/format-shape/scripts/run_bench.sh | 14 +--- .../format-shape/scripts/run_thread_bench.sh | 74 +++++++++++++++++++ docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 53 +++++++++---- docs/DYNAMIC_FORMAT_PLAN_README.md | 16 +++- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 67 +++++++++++++---- docs/FORMAT_PLAN_SPEC.md | 4 +- 9 files changed, 294 insertions(+), 54 deletions(-) create mode 100644 bench/format-shape/large/threaded-inputs.tsv create mode 100755 bench/format-shape/scripts/run_bcftools_bench.sh create mode 100755 bench/format-shape/scripts/run_thread_bench.sh diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index e1d440f5e..ab544dcc3 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -14,7 +14,9 @@ 
bench/format-shape/ large/ meaningful multi-second benchmark inputs/results scripts/make_synthetic.pl deterministic synthetic VCF generator scripts/make_large_synthetic.pl - scripts/run_bench.sh baseline/plan/interp timing and cmp runner + scripts/run_bench.sh baseline/plan timing and cmp runner + scripts/run_thread_bench.sh representative threaded timing and cmp runner + scripts/run_bcftools_bench.sh representative bcftools timing runner results/ generated timing logs and BCF outputs ``` @@ -95,14 +97,26 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ `KEEP_OUTPUTS=0` still writes temporary BCF files and compares them with `cmp`, but deletes the large BCF outputs after each input is checked. -The script runs each input in three modes. `plan` and `interp` both use the -dynamic per-tag FORMAT planner; both are kept so old comparisons can still check -the `HTS_VCF_FORMAT_PLAN=1` spelling against the explicit dynamic spelling. +Run the representative threaded scaling corpus: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded \ + bench/format-shape/scripts/run_thread_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + +By default this runs unthreaded plus `-@ 2`, `-@ 4`, and `-@ 8`. Override with +`THREADS_LIST="2 4 8"` or a similar space-separated list. The current threaded +manifest intentionally uses a small representative subset of the large corpus: +one real GT-only workload and one FORMAT-heavy CCDG-like likelihood workload. + +The script runs each input in two modes. `interp` remains accepted by +`HTS_VCF_FORMAT_PLAN`, but it aliases the same dynamic parser as `plan`, so the +benchmark harness does not run it as a separate timing row. 
```text baseline: HTS_VCF_FORMAT_PLAN=0 plan: HTS_VCF_FORMAT_PLAN=1 -interp: HTS_VCF_FORMAT_PLAN=interp ``` It writes: @@ -112,7 +126,23 @@ bench/format-shape/results/timings.tsv bench/format-shape/results/checks.tsv ``` -`checks.tsv` compares plan and interp BCF output against baseline with `cmp`. +`checks.tsv` compares plan BCF output against baseline with `cmp`. +The threaded runner writes the same files under its selected output directory, +with an additional `threads` column. + +Run the same representative threaded corpus through bcftools: + +```sh +BCFTOOLS=/path/to/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ + bench/format-shape/scripts/run_bcftools_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + +This uses `bcftools view --no-version -Ob -l 0`, compares planned output against +baseline with `cmp`, and records the same `0 2 4 8` thread counts by default. +It does not report planner counters because bcftools does not expose the +`test/test_view` stats hook. ## Large Corpus @@ -123,8 +153,14 @@ bench/format-shape/results/checks.tsv - eight generated 2,048-sample synthetic FORMAT workloads: CCDG-like likelihood, reordered likelihood, multiallelic likelihood, float/string FORMAT, variable phase-string widths, row-local likelihood - fallbacks, GT-first wrong-order likelihood-like rows, and two-string - float rows. + fallbacks, GT-first wrong-order likelihood-like rows, and two-string + float rows. + +`large/threaded-inputs.tsv` currently selects two representative inputs from the +same corpus for `-@` scaling checks: + +- full 1000 Genomes chr22 genotype VCF, +- large CCDG-like synthetic likelihood VCF. To refresh only the newer cache-regression synthetic files without rewriting the older large VCFs: @@ -142,4 +178,4 @@ bench/format-shape/large/results/timings.tsv bench/format-shape/large/results/checks.tsv ``` -All plan and interp outputs in that run compared byte-identical to baseline. 
+All plan outputs in that run compared byte-identical to baseline. diff --git a/bench/format-shape/large/threaded-inputs.tsv b/bench/format-shape/large/threaded-inputs.tsv new file mode 100644 index 000000000..104541474 --- /dev/null +++ b/bench/format-shape/large/threaded-inputs.tsv @@ -0,0 +1,3 @@ +name path source +1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF; real GT-only scaling case +large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likelihood_2048s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 2,048 samples; FORMAT-heavy scaling case diff --git a/bench/format-shape/scripts/run_bcftools_bench.sh b/bench/format-shape/scripts/run_bcftools_bench.sh new file mode 100755 index 000000000..15b374d3c --- /dev/null +++ b/bench/format-shape/scripts/run_bcftools_bench.sh @@ -0,0 +1,63 @@ +#!/bin/sh +set -eu + +bcftools=${BCFTOOLS:-bcftools} +inputs=${1:-bench/format-shape/large/threaded-inputs.tsv} +outdir=${OUTDIR:-bench/format-shape/large/results-bcftools} +keep_outputs=${KEEP_OUTPUTS:-1} +threads_list=${THREADS_LIST:-0 2 4 8} +mkdir -p "$outdir" + +timings="$outdir/timings.tsv" +checks="$outdir/checks.tsv" + +printf 'name\tthreads\tmode\treal\tuser\tsys\n' > "$timings" +printf 'name\tthreads\tcomparison\tstatus\n' > "$checks" + +tail -n +2 "$inputs" | while IFS=' ' read -r name path source +do + for threads in $threads_list + do + base_out="$outdir/$name.t$threads.baseline.bcf" + plan_out="$outdir/$name.t$threads.plan.bcf" + thread_args= + if [ "$threads" != 0 ]; then + thread_args="--threads $threads" + fi + + for mode in baseline plan + do + err="$outdir/$name.t$threads.$mode.stderr" + out="$outdir/$name.t$threads.$mode.bcf" + case "$mode" in + baseline) + env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$bcftools" view --no-version -Ob -l 0 $thread_args -o "$out" "$path" 2> "$err" + ;; + plan) + env HTS_VCF_FORMAT_PLAN=1 /usr/bin/time -p 
"$bcftools" view --no-version -Ob -l 0 $thread_args -o "$out" "$path" 2> "$err" + ;; + esac + + awk -v name="$name" -v threads="$threads" -v mode="$mode" ' + /^real / { real=$2 } + /^user / { user=$2 } + /^sys / { sys=$2 } + END { + printf "%s\t%s\t%s\t%s\t%s\t%s\n", + name, threads, mode, real+0, user+0, sys+0 + } + ' "$err" >> "$timings" + done + + if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then + printf '%s\t%s\tbaseline_vs_plan\tok\n' "$name" "$threads" >> "$checks" + else + printf '%s\t%s\tbaseline_vs_plan\tDIFF\n' "$name" "$threads" >> "$checks" + fi + if [ "$keep_outputs" = 0 ]; then + rm -f "$base_out" "$plan_out" + fi + done +done + +printf 'wrote %s and %s\n' "$timings" "$checks" diff --git a/bench/format-shape/scripts/run_bench.sh b/bench/format-shape/scripts/run_bench.sh index 7fad74863..66430d22a 100755 --- a/bench/format-shape/scripts/run_bench.sh +++ b/bench/format-shape/scripts/run_bench.sh @@ -17,9 +17,8 @@ tail -n +2 "$inputs" | while IFS=' ' read -r name path source do base_out="$outdir/$name.baseline.bcf" plan_out="$outdir/$name.plan.bcf" - interp_out="$outdir/$name.interp.bcf" - for mode in baseline plan interp + for mode in baseline plan do err="$outdir/$name.$mode.stderr" out="$outdir/$name.$mode.bcf" @@ -31,10 +30,6 @@ do env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 \ /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" ;; - interp) - env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 \ - /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" - ;; esac awk -v name="$name" -v mode="$mode" ' @@ -63,13 +58,8 @@ do else printf '%s\tbaseline_vs_plan\tDIFF\n' "$name" >> "$checks" fi - if cmp "$base_out" "$interp_out" >/dev/null 2>&1; then - printf '%s\tbaseline_vs_interp\tok\n' "$name" >> "$checks" - else - printf '%s\tbaseline_vs_interp\tDIFF\n' "$name" >> "$checks" - fi if [ "$keep_outputs" = 0 ]; then - rm -f "$base_out" "$plan_out" "$interp_out" + rm -f "$base_out" "$plan_out" fi done diff --git 
a/bench/format-shape/scripts/run_thread_bench.sh b/bench/format-shape/scripts/run_thread_bench.sh new file mode 100755 index 000000000..dc0934467 --- /dev/null +++ b/bench/format-shape/scripts/run_thread_bench.sh @@ -0,0 +1,74 @@ +#!/bin/sh +set -eu + +test_view=${TEST_VIEW:-./test/test_view} +inputs=${1:-bench/format-shape/large/threaded-inputs.tsv} +outdir=${OUTDIR:-bench/format-shape/large/results-threaded} +keep_outputs=${KEEP_OUTPUTS:-1} +threads_list=${THREADS_LIST:-0 2 4 8} +mkdir -p "$outdir" + +timings="$outdir/timings.tsv" +checks="$outdir/checks.tsv" + +printf 'name\tthreads\tmode\treal\tuser\tsys\tattempts\thits\tfallback\tparsed_samples\n' > "$timings" +printf 'name\tthreads\tcomparison\tstatus\n' > "$checks" + +tail -n +2 "$inputs" | while IFS=' ' read -r name path source +do + for threads in $threads_list + do + base_out="$outdir/$name.t$threads.baseline.bcf" + plan_out="$outdir/$name.t$threads.plan.bcf" + thread_args= + if [ "$threads" != 0 ]; then + thread_args="-@ $threads" + fi + + for mode in baseline plan + do + err="$outdir/$name.t$threads.$mode.stderr" + out="$outdir/$name.t$threads.$mode.bcf" + case "$mode" in + baseline) + env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$test_view" -b -l 0 $thread_args "$path" > "$out" 2> "$err" + ;; + plan) + env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 \ + /usr/bin/time -p "$test_view" -b -l 0 $thread_args "$path" > "$out" 2> "$err" + ;; + esac + + awk -v name="$name" -v threads="$threads" -v mode="$mode" ' + /^real / { real=$2 } + /^user / { user=$2 } + /^sys / { sys=$2 } + /^vcf-format-plan / { + for (i=1; i<=NF; i++) { + split($i, kv, "=") + if (kv[1] == "attempts") attempts=kv[2] + else if (kv[1] == "hits") hits=kv[2] + else if (kv[1] == "fallback") fallback=kv[2] + else if (kv[1] == "parsed_samples") parsed=kv[2] + } + } + END { + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + name, threads, mode, real+0, user+0, sys+0, + attempts+0, hits+0, fallback+0, parsed+0 + } + ' "$err" >> 
"$timings" + done + + if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then + printf '%s\t%s\tbaseline_vs_plan\tok\n' "$name" "$threads" >> "$checks" + else + printf '%s\t%s\tbaseline_vs_plan\tDIFF\n' "$name" "$threads" >> "$checks" + fi + if [ "$keep_outputs" = 0 ]; then + rm -f "$base_out" "$plan_out" + fi + done +done + +printf 'wrote %s and %s\n' "$timings" "$checks" diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md index 161111ec1..9c5f8cc94 100644 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md @@ -25,7 +25,7 @@ target and can be materially faster than baseline. ```sh HTS_VCF_FORMAT_PLAN=0 # production parser HTS_VCF_FORMAT_PLAN=1 # dynamic per-tag planner, then production fallback -HTS_VCF_FORMAT_PLAN=interp # same dynamic planner; retained for comparisons +HTS_VCF_FORMAT_PLAN=interp # same dynamic planner; manual alias HTS_VCF_FORMAT_PLAN_STATS=1 # print planner counters from test/test_view ``` @@ -61,25 +61,48 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ All planned outputs compared byte-identical to baseline. 
-| Input | Baseline user | Dynamic `1` user | Dynamic `interp` user | Hits/fallback | -|---|---:|---:|---:|---:| -| CCDG 10k | 2.62 s | 2.25 s | 2.24 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 26.05 s | 7.98 s | 8.01 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.24 s | 3.78 s | 3.77 s | 20,000 / 0 | -| Large reordered likelihood | 3.00 s | 2.42 s | 2.44 s | 20,000 / 0 | -| Large multiallelic likelihood | 3.16 s | 2.73 s | 2.73 s | 16,000 / 0 | -| Large float/string | 2.93 s | 2.97 s | 2.97 s | 16,000 / 0 | -| Variable phase widths | 2.61 s | 2.50 s | 2.48 s | 12,000 / 0 | -| Mixed row-local fallbacks | 2.22 s | 1.87 s | 1.86 s | 12,000 / 0 | -| GT-first reordered negative | 1.75 s | 1.44 s | 1.45 s | 12,000 / 0 | -| Two-string float negative | 2.28 s | 2.56 s | 2.54 s | 12,000 / 0 | +| Input | Baseline user | Dynamic plan user | Hits/fallback | +|---|---:|---:|---:| +| CCDG 10k | 2.62 s | 2.25 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 26.05 s | 7.98 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.24 s | 3.78 s | 20,000 / 0 | +| Large reordered likelihood | 3.00 s | 2.42 s | 20,000 / 0 | +| Large multiallelic likelihood | 3.16 s | 2.73 s | 16,000 / 0 | +| Large float/string | 2.93 s | 2.97 s | 16,000 / 0 | +| Variable phase widths | 2.61 s | 2.50 s | 12,000 / 0 | +| Mixed row-local fallbacks | 2.22 s | 1.87 s | 12,000 / 0 | +| GT-first reordered negative | 1.75 s | 1.44 s | 12,000 / 0 | +| Two-string float negative | 2.28 s | 2.56 s | 12,000 / 0 | ## Historical Note The removed exact kernels remain useful as a performance reference in old benchmark logs, but they are no longer live code. New optimization work should -measure `HTS_VCF_FORMAT_PLAN=1` and `interp` as dynamic-only variants and should -compare both against `HTS_VCF_FORMAT_PLAN=0`. +measure `HTS_VCF_FORMAT_PLAN=1` against `HTS_VCF_FORMAT_PLAN=0`; `interp` is +only an alias for manual debugging. 
+ +## bcftools Production-Style Check + +A clean bcftools `develop` worktree was built against this htslib branch and +timed with: + +```sh +bcftools view --no-version -Ob -l 0 [--threads N] +``` + +on the representative threaded manifest. All planned outputs compared +byte-identical to baseline. + +| Input | Threads | Baseline real | Plan real | Speedup | +|---|---:|---:|---:|---:| +| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | +| 1000G chr22 full GT | 2 | 26.59 s | 6.99 s | 3.80x | +| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | +| 1000G chr22 full GT | 8 | 26.62 s | 6.96 s | 3.82x | +| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | +| Large CCDG-like synthetic | 2 | 3.46 s | 3.01 s | 1.15x | +| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | +| Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | ## Next Work diff --git a/docs/DYNAMIC_FORMAT_PLAN_README.md b/docs/DYNAMIC_FORMAT_PLAN_README.md index 37c57a31b..b2f042736 100644 --- a/docs/DYNAMIC_FORMAT_PLAN_README.md +++ b/docs/DYNAMIC_FORMAT_PLAN_README.md @@ -65,9 +65,19 @@ comparisons between production parsing and planned parsing on edge-case FORMAT fixtures. The larger benchmark corpus lives under `bench/format-shape/large/`. The -benchmark script runs `baseline`, `plan`, and `interp` modes. `plan` is -`HTS_VCF_FORMAT_PLAN=1`; `interp` is the explicit dynamic spelling. After the -dynamic-only trim both enabled modes use the same dynamic executor. +benchmark script runs `baseline` and `plan` modes. `plan` is +`HTS_VCF_FORMAT_PLAN=1`; `interp` and `general` remain accepted aliases, but +they use the same dynamic executor and are not emitted as separate timing rows. + +Threaded scaling checks use `bench/format-shape/scripts/run_thread_bench.sh`. +That runner exercises representative large inputs with unthreaded execution plus +`test/test_view -@ 2`, `-@ 4`, and `-@ 8`, and writes a `threads` column in its +timing output. 
+ +Production-style checks use `bench/format-shape/scripts/run_bcftools_bench.sh` +with a bcftools binary built against this htslib tree. It runs +`bcftools view --no-version -Ob -l 0` in baseline and planned modes over the same +representative threaded manifest. Latest documented results are in `docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md`. diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md index 5b91b8b81..1c78387b0 100644 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md @@ -30,7 +30,6 @@ Latest 10k CCDG timing from the large-corpus run: |---|---:| | Baseline | 2.62 s | | Dynamic `1` | 2.25 s | -| Dynamic `interp` | 2.24 s | Older sections below record the experimental path through exact kernels and a dynamic shape tier. Those paths have been removed from live code; the final @@ -555,15 +554,57 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ All output comparisons remained byte-identical to baseline. 
-| Input | Baseline user | Dynamic `1` user | Dynamic `interp` user | Hits/fallback | -|---|---:|---:|---:|---:| -| CCDG 10k | 2.62 s | 2.25 s | 2.24 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 26.05 s | 7.98 s | 8.01 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.24 s | 3.78 s | 3.77 s | 20,000 / 0 | -| Large reordered likelihood | 3.00 s | 2.42 s | 2.44 s | 20,000 / 0 | -| Large multiallelic likelihood | 3.16 s | 2.73 s | 2.73 s | 16,000 / 0 | -| Large float/string | 2.93 s | 2.97 s | 2.97 s | 16,000 / 0 | -| Variable phase widths | 2.61 s | 2.50 s | 2.48 s | 12,000 / 0 | -| Mixed row-local fallbacks | 2.22 s | 1.87 s | 1.86 s | 12,000 / 0 | -| GT-first reordered negative | 1.75 s | 1.44 s | 1.45 s | 12,000 / 0 | -| Two-string float negative | 2.28 s | 2.56 s | 2.54 s | 12,000 / 0 | +| Input | Baseline user | Dynamic plan user | Hits/fallback | +|---|---:|---:|---:| +| CCDG 10k | 2.62 s | 2.25 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 26.05 s | 7.98 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.24 s | 3.78 s | 20,000 / 0 | +| Large reordered likelihood | 3.00 s | 2.42 s | 20,000 / 0 | +| Large multiallelic likelihood | 3.16 s | 2.73 s | 16,000 / 0 | +| Large float/string | 2.93 s | 2.97 s | 16,000 / 0 | +| Variable phase widths | 2.61 s | 2.50 s | 12,000 / 0 | +| Mixed row-local fallbacks | 2.22 s | 1.87 s | 12,000 / 0 | +| GT-first reordered negative | 1.75 s | 1.44 s | 12,000 / 0 | +| Two-string float negative | 2.28 s | 2.56 s | 12,000 / 0 | + +## 2026-04-29 bcftools Production-Style Timing + +Built a clean bcftools `develop` worktree at: + +```text +/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan +``` + +using this htslib worktree via: + +```sh +make HTSDIR=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/htslib-vcf-avx-sanity bcftools +``` + +The timing run used `bcftools view --no-version -Ob -l 0` over the threaded +representative manifest: + +```sh 
+BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ + bench/format-shape/scripts/run_bcftools_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + +All planned outputs compared byte-identical to baseline. + +| Input | Threads | Baseline real | Plan real | Speedup | Baseline user | Plan user | +|---|---:|---:|---:|---:|---:|---:| +| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | 25.94 s | 8.05 s | +| 1000G chr22 full GT | 2 | 26.59 s | 6.99 s | 3.80x | 28.82 s | 9.04 s | +| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | 28.83 s | 9.08 s | +| 1000G chr22 full GT | 8 | 26.62 s | 6.96 s | 3.82x | 28.71 s | 9.38 s | +| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | 4.11 s | 3.66 s | +| Large CCDG-like synthetic | 2 | 3.46 s | 3.01 s | 1.15x | 4.50 s | 4.06 s | +| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | 4.51 s | 4.09 s | +| Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | 4.50 s | 4.05 s | + +Takeaway: in a bcftools conversion path, the dynamic FORMAT parser gives a large +production-visible win for GT-only sample-rich VCFs. On likelihood-heavy rows it +still helps, but output/input threading and remaining generic FORMAT work limit +the total wall-clock gain to roughly 12-15% in this representative run. diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md index 42d34f014..0636a0bd4 100644 --- a/docs/FORMAT_PLAN_SPEC.md +++ b/docs/FORMAT_PLAN_SPEC.md @@ -23,8 +23,8 @@ interp/general same dynamic per-tag planner, then production fallback ``` All enabled spellings now use the same implementation. The benchmark harness -may still run both `1` and `interp` as separate modes, but they are intended to -match except for normal timing noise. +reports only `HTS_VCF_FORMAT_PLAN=1` as `plan`; `interp`/`general` remain useful +manual aliases but are not distinct timing modes. 
The planned parser has four stages: From ec63fbb3ea2e0175480e4829b037d87cb6bac4e1 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 20:40:42 +0200 Subject: [PATCH 27/38] Consolidate FORMAT plan documentation --- bench/format-shape/README.md | 7 + docs/CCDG_FORMAT_PLAN_BENCHMARK.md | 114 ---- docs/DYNAMIC_FORMAT_PLAN_README.md | 83 --- ...YNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md | 610 ------------------ docs/FORMAT_PLAN_CURRENT.md | 184 ++++++ docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 185 ++++++ docs/FORMAT_PLAN_OVERVIEW.md | 90 +++ docs/FORMAT_PLAN_SPEC.md | 125 ---- 8 files changed, 466 insertions(+), 932 deletions(-) delete mode 100644 docs/CCDG_FORMAT_PLAN_BENCHMARK.md delete mode 100644 docs/DYNAMIC_FORMAT_PLAN_README.md delete mode 100644 docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md create mode 100644 docs/FORMAT_PLAN_CURRENT.md create mode 100644 docs/FORMAT_PLAN_EXPERIMENT_LOG.md create mode 100644 docs/FORMAT_PLAN_OVERVIEW.md delete mode 100644 docs/FORMAT_PLAN_SPEC.md diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index ab544dcc3..da1bf0fe2 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -4,6 +4,13 @@ This directory is a local test and benchmark corpus for the experimental VCF FORMAT planner in `vcf.c`. It is intentionally kept under the repository worktree instead of `/tmp` so the inputs survive restarts. +The canonical feature docs are: + +- `docs/FORMAT_PLAN_OVERVIEW.md` for the high-level feature summary; +- `docs/FORMAT_PLAN_CURRENT.md` for the current implementation and benchmark + tables; +- `docs/FORMAT_PLAN_EXPERIMENT_LOG.md` for the historical experiment log. 
+ ## Layout ```text diff --git a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md b/docs/CCDG_FORMAT_PLAN_BENCHMARK.md deleted file mode 100644 index 9c5f8cc94..000000000 --- a/docs/CCDG_FORMAT_PLAN_BENCHMARK.md +++ /dev/null @@ -1,114 +0,0 @@ -# CCDG FORMAT Plan Benchmark Checkpoint - -Date: 2026-04-29 - -Branch: `codex/vcf-avx-sanity` - -This file is a checkpoint for the CCDG-oriented FORMAT parser work. Earlier -versions of the branch used handwritten exact CCDG kernels; those kernels have -now been removed. The current production candidate is dynamic-only. - -## Current Takeaway - -The dynamic FORMAT planner is byte-correct on the CCDG subset and larger FORMAT -benchmark corpus. It is no longer a narrow full-string kernel: it compiles each -FORMAT tag from header metadata and uses one composable executor for supported -tags. - -On the CCDG 10k subset, the dynamic-only path is faster than baseline but slower -than the historical handwritten exact kernels. On GT-only and several -reordered/synthetic workloads, the dynamic path is much closer to the previous -target and can be materially faster than baseline. - -## Modes - -```sh -HTS_VCF_FORMAT_PLAN=0 # production parser -HTS_VCF_FORMAT_PLAN=1 # dynamic per-tag planner, then production fallback -HTS_VCF_FORMAT_PLAN=interp # same dynamic planner; manual alias -HTS_VCF_FORMAT_PLAN_STATS=1 # print planner counters from test/test_view -``` - -## CCDG Data - -Source file: - -```text -/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -The 10k subset contains 10,000 variant records and 3,202 samples. The observed -FORMAT distribution is: - -| Records | FORMAT | -|---:|---| -| 4,681 | `GT:AB:AD:DP:GQ:PL` | -| 3,774 | `GT:AB:AD:DP:GQ:PGT:PID:PL` | -| 813 | `GT:AD:DP:GQ:PL` | -| 732 | `GT:AD:DP:GQ:PGT:PID:PL` | - -The current dynamic planner can compile these layouts from tag metadata rather -than matching the whole FORMAT string. 
- -## Latest Large-Corpus Result - -The most recent post-trim run used: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All planned outputs compared byte-identical to baseline. - -| Input | Baseline user | Dynamic plan user | Hits/fallback | -|---|---:|---:|---:| -| CCDG 10k | 2.62 s | 2.25 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 26.05 s | 7.98 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.24 s | 3.78 s | 20,000 / 0 | -| Large reordered likelihood | 3.00 s | 2.42 s | 20,000 / 0 | -| Large multiallelic likelihood | 3.16 s | 2.73 s | 16,000 / 0 | -| Large float/string | 2.93 s | 2.97 s | 16,000 / 0 | -| Variable phase widths | 2.61 s | 2.50 s | 12,000 / 0 | -| Mixed row-local fallbacks | 2.22 s | 1.87 s | 12,000 / 0 | -| GT-first reordered negative | 1.75 s | 1.44 s | 12,000 / 0 | -| Two-string float negative | 2.28 s | 2.56 s | 12,000 / 0 | - -## Historical Note - -The removed exact kernels remain useful as a performance reference in old -benchmark logs, but they are no longer live code. New optimization work should -measure `HTS_VCF_FORMAT_PLAN=1` against `HTS_VCF_FORMAT_PLAN=0`; `interp` is -only an alias for manual debugging. - -## bcftools Production-Style Check - -A clean bcftools `develop` worktree was built against this htslib branch and -timed with: - -```sh -bcftools view --no-version -Ob -l 0 [--threads N] -``` - -on the representative threaded manifest. All planned outputs compared -byte-identical to baseline. 
- -| Input | Threads | Baseline real | Plan real | Speedup | -|---|---:|---:|---:|---:| -| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | -| 1000G chr22 full GT | 2 | 26.59 s | 6.99 s | 3.80x | -| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | -| 1000G chr22 full GT | 8 | 26.62 s | 6.96 s | 3.82x | -| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | -| Large CCDG-like synthetic | 2 | 3.46 s | 3.01 s | 1.15x | -| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | -| Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | - -## Next Work - -- Reduce the CCDG fallback rate without introducing full-string special cases. -- Add selected-sample support so `keep_samples` does not force production - fallback. -- Lower per-op dispatch and scratch-buffer overhead on likelihood-shaped rows. -- Keep expanding edge fixtures when a new supported FORMAT tag or width pattern - is added. diff --git a/docs/DYNAMIC_FORMAT_PLAN_README.md b/docs/DYNAMIC_FORMAT_PLAN_README.md deleted file mode 100644 index b2f042736..000000000 --- a/docs/DYNAMIC_FORMAT_PLAN_README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Dynamic FORMAT Plan - -This branch adds an optional dynamic fast path for parsing VCF `FORMAT` sample -columns. The goal is to speed up common, header-described FORMAT layouts -without hardcoding exact full FORMAT strings such as `GT:AD:DP:GQ:PL`. - -## How It Works - -When `HTS_VCF_FORMAT_PLAN` is enabled, `vcf_parse_format()` first tries to -compile the record's literal FORMAT string into a small list of per-tag -operations. Compilation uses the active header, so the plan records each tag's -header key, type, declared number, and whether the row needs width measurement. - -The executor then parses samples with that op list and writes BCF's transposed -FORMAT layout. If compilation or row-local validation fails, it returns to the -existing production parser for the whole FORMAT column. 
- -Supported environment values: - -- unset or `0`: use the production parser only. -- `1`, `interp`, or `general`: use the dynamic plan, with production fallback. - -The old exact FORMAT kernels and optional SIMD tab-scanning front-end have been -removed. All enabled spellings now route through the same dynamic path. - -## Supported Cases - -The fast path is tag-composable rather than full-string-specialized. It can -handle subsets, reordered fields, and supersets when each tag is described by -header metadata that the executor supports. - -Currently supported FORMAT tag shapes: - -- `GT` declared as `Type=String,Number=1`, with simple diploid encodings on the - fast path. -- Integer fields with fixed `Number=N`, `Number=A`, `Number=R`, `Number=G`, or - bounded measured `Number=.` row widths. -- Float fields with the same number models as integer fields. -- String fields declared as `Type=String,Number=1`, measured per row. - -Examples that can use the dynamic path include `GT:AD`, `GT:AD:DP:PL`, -`GT:AB:AD:DP:GQ:PGT:PID:PL`, and reordered/superset layouts with additional -supported tags. - -## Fallback Behavior - -Fallback is intentionally whole-row for the MVP. The dynamic parser does not -mix optimized handling for some tags with production handling for other tags in -the same FORMAT column. This keeps BCF layout, warning behavior, and error -recovery aligned with the existing parser when a row is unusual. - -Known fallback cases include: - -- sample subsetting via `keep_samples`; -- undefined FORMAT tags that require production header repair; -- unsupported header types or number models; -- malformed sample separators or unexpected sample cardinality; -- row-local widths above the current bounded fast-path limit; -- GT encodings outside the simple fast-path representation. 
- -## Tests And Benchmarks - -Focused validation lives in `test/test_format_plan.sh`, including byte-for-byte -comparisons between production parsing and planned parsing on edge-case FORMAT -fixtures. - -The larger benchmark corpus lives under `bench/format-shape/large/`. The -benchmark script runs `baseline` and `plan` modes. `plan` is -`HTS_VCF_FORMAT_PLAN=1`; `interp` and `general` remain accepted aliases, but -they use the same dynamic executor and are not emitted as separate timing rows. - -Threaded scaling checks use `bench/format-shape/scripts/run_thread_bench.sh`. -That runner exercises representative large inputs with unthreaded execution plus -`test/test_view -@ 2`, `-@ 4`, and `-@ 8`, and writes a `threads` column in its -timing output. - -Production-style checks use `bench/format-shape/scripts/run_bcftools_bench.sh` -with a bcftools binary built against this htslib tree. It runs -`bcftools view --no-version -Ob -l 0` in baseline and planned modes over the same -representative threaded manifest. - -Latest documented results are in -`docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md`. diff --git a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md b/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md deleted file mode 100644 index 1c78387b0..000000000 --- a/docs/DYNAMIC_FORMAT_SHAPE_EXECUTOR_SCRATCHPAD.md +++ /dev/null @@ -1,610 +0,0 @@ -# Dynamic FORMAT Shape Executor Scratchpad - -Date: 2026-04-29 - -Branch: `codex/vcf-avx-sanity` - -## Goal - -Make the general-purpose VCF FORMAT planned parser as fast as possible without -matching on field names. The planner should stay general and tag-composable, -while suspicious rows continue to fall back to the production parser. - -The production htslib parser remains the source of truth. Any optimized path -must either emit byte-identical BCF or return `-3` and fall back. 
- -## Current Baseline - -Current modes after the dynamic-only production trim: - -```sh -HTS_VCF_FORMAT_PLAN=0 # existing generic parser -HTS_VCF_FORMAT_PLAN=1 # dynamic planner, then production fallback -HTS_VCF_FORMAT_PLAN=interp # same dynamic planner, explicit spelling -HTS_VCF_FORMAT_PLAN_STATS=1 # counters from test/test_view -``` - -Latest 10k CCDG timing from the large-corpus run: - -| Mode | User time | -|---|---:| -| Baseline | 2.62 s | -| Dynamic `1` | 2.25 s | - -Older sections below record the experimental path through exact kernels and a -dynamic shape tier. Those paths have been removed from live code; the final -section is the current production-trim state. - -## Working Hypothesis - -The exact kernels are faster because they do less work in the sample loop: - -- no per-field switch dispatch for every sample, -- fewer scratch-buffer passes, -- direct writes into final BCF payloads for cheap fixed fields, -- integer min/max tracking is carried directly into BCF integer width selection, -- CCDG `PGT/PID` string handling is tailored instead of fully generic, -- fallback checks are simple and close to the parse step. - -The dynamic planner should keep general discovery, but execute as a compact -fixed-shape kernel after resolving header metadata and row-local widths. - -## Design Direction - -Compile the literal FORMAT column plus header metadata into a plan as today, then -derive a shape descriptor from the row and header: - -```text -GT2, FLOAT1, INT_R, INT1, INT1, STR1, STR1, INT_G -``` - -This shape says what to parse, not which tag names are present. Field IDs, -header types, and BCF keys still come from the generic plan. 
- -The executor should be monomorphic for common shapes: - -- `GT2 + fixed numeric fields` -- `GT2 + FLOAT1 + fixed numeric fields` -- `GT2 + fixed numeric fields + fixed strings` -- `GT dynamic + fixed numeric fields` -- measured-width fallback for strings or sparse/non-fixed rows - -The important constraint is to move per-field dispatch out of the per-sample hot -loop wherever possible. - -## Correctness Rules - -- Do not run planned parsing when `h->keep_samples` is active. -- Fall back on duplicate FORMAT tags, undefined tags, unsupported header types, - malformed rows, unsupported GT shape, or integer/float behavior that cannot be - made byte-identical. -- Preserve htslib GT semantics: haploid GT, missing alleles, multidigit allele - indexes, phased/unphased state, and VCF 4.4 prefix phasing. -- Preserve vector-end padding and string zero-padding. -- Save and roll back `v->indiv.l` before any direct final-buffer write that may - fall back. -- Keep exact kernels available as an oracle until dynamic shape execution closes - the gap. - -## Implementation Plan - -1. Add shape classification to the dynamic general plan path. - - Use existing `vcf_format_row_op_t` data where possible. - - Recognize fixed-width rows derived from `Number=1`, `Number=R`, - `Number=G`, and fixed `Number=N`. - - Reject rows needing measured widths unless handled by a specific executor. - -2. Add a first generic fixed-shape executor for CCDG-equivalent structures. - - No tag-name matching. - - Require leading `GT2`. - - Support any mix/order of fixed INT/REAL fields with widths 1, R, G, or - small fixed N. - - Initially support `Number=1` strings with measured max width so `PGT/PID` - can stay on the planned path. - -3. Reduce hot-loop dispatch. - - Precompute field offsets, widths, sizes, and parse actions. - - Prefer executor-family loops over `switch (op->kind)` per field per sample. - - Specialize common width parse helpers for 1, 2, 3, and small fixed widths. - -4. 
Direct-write final BCF output when safe. - - Continue direct `GT2` int8 output. - - Track integer ranges while parsing and use known-range encoding. - - For floats, serialize directly when field width is fixed. - - For strings, write final char payload after width is known. - -5. Instrument fallback reasons and executor choices. - - Add temporary counters or debug logging gated by env vars if useful. - - Track shape hits, shape fallbacks, strict hits, measured fallback hits. - -6. Benchmark and iterate. - - Correctness: `./test/test_format_plan.sh` - - CCDG subset: `/tmp/ccdg_chr22_10k.vcf.gz` - - Compare baseline, exact, and dynamic-only output with `cmp`. - - Primary target: dynamic-only `HTS_VCF_FORMAT_PLAN=interp` approaching - exact-mode time on VCF.gz -> uncompressed BCF. - -## Test Data - -Full source: - -```text -/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -Benchmark subset: - -```text -/tmp/ccdg_chr22_10k.vcf.gz -/tmp/ccdg_chr22_10k.bcf -``` - -Correctness fixture: - -```text -test/format-plan-edge.vcf -``` - -## Current Scratch Notes - -- `HTS_VCF_FORMAT_PLAN=1` and `HTS_VCF_FORMAT_PLAN=interp` now exercise the same - dynamic executor. -- Historical exact-kernel numbers remain useful only as a performance reference - in older benchmark notes. -- Avoid hardcoding `AD`, `PL`, `DP`, `GQ`, `AB`, `PGT`, or `PID`; use their - header-derived type/number/width instead. -- CCDG-like FORMAT distributions are still the first target because they provide - a real, repeatable workload and a clear oracle. - -## 2026-04-29 Iteration Notes - -Implemented the first dynamic likelihood-shape executor in `vcf.c`. - -What changed: - -- Added a temporary shape counter path in `test/test_view`; it was later removed - with the dynamic-only production trim. 
-- Relaxed strict string handling so `Type=String,Number=1` FORMAT fields can be - handled by planned parsing with row-local byte-width measurement. -- Added a shape-specific width derivation for CCDG-like layouts where `AD` may - be declared as `Number=.` in the header but the row shape proves the observed - width is `n_allele`. -- Added a straight-line dynamic executor for: - -```text -GT2, optional FLOAT1, INT[n_allele], INT1, INT1, -optional STR1, optional STR1, INT[n_allele * (n_allele + 1) / 2] -``` - -This executor is selected by FORMAT type/order/width, not by tag names. It -still validates observed AD/PL counts and falls back on mismatch. - -Latest 10k CCDG VCF.gz -> uncompressed BCF single-run timings on the rebuilt -worktree: - -| Mode | Wall | User | Notes | -|---|---:|---:|---| -| Baseline | 2.78 s | 2.56 s | `HTS_VCF_FORMAT_PLAN=0` | -| Exact CCDG | 1.78 s | 1.61 s | exact kernels, shape hits 0 | -| Dynamic shape | 2.53 s | 1.71 s | `interp`, shape hits 10,000 | - -`cmp` passed for both dynamic-shape and exact outputs against baseline BCF. - -The important result is CPU parity is close: dynamic shape is within about 6% of -exact on user time in this run. Wall time is noisier, likely output/cache -effects, and should be rerun in a tighter benchmark loop. - -Next likely cuts: - -- Cache shape classification per `(header, FORMAT)` plan so we do less - per-record type/order checking. -- Split phase and non-phase shape executors to remove `has_phase` branches from - the sample loop. -- Consider separate `has_float` executor variants for the same reason. -- Compare a no-shape-stats build/run to estimate counter overhead, though it is - probably minor. -- Once dynamic shape is consistently at parity, demote the exact CCDG kernels to - oracle-only or remove them. - -## 2026-04-29 Large-Corpus Check - -The small public/synthetic slices were too short to provide timing signal, so -the meaningful benchmark set moved to `bench/format-shape/large/inputs.tsv`. 
-The large corpus includes the CCDG 10k subset, full 1000 Genomes chr22 -genotypes, and 2,048-sample generated FORMAT workloads. - -Latest large run used: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All exact/interp outputs compared byte-identical to baseline. Timing summary: - -| Input | Baseline user | Exact user | Dynamic interp user | Shape hits | -|---|---:|---:|---:|---:| -| CCDG 10k | 2.60 s | 1.64 s | 1.63 s | 10,000 | -| 1000G chr22 full GT | 26.76 s | 9.32 s | 9.32 s | 0 | -| Large CCDG-like synthetic | 4.10 s | 2.77 s | 2.87 s | 20,000 | -| Large reordered likelihood | 2.97 s | 2.65 s | 2.62 s | 0 | -| Large multiallelic likelihood | 3.22 s | 2.09 s | 2.05 s | 16,000 | -| Large float/string | 2.95 s | 2.84 s | 2.84 s | 0 | - -The dynamic likelihood shape path is now at parity or close enough on the -meaningful workloads. The remaining visible gap is the generated CCDG-like -phase-heavy synthetic case, where dynamic-only is about 3-4% slower than exact. -That looks acceptable for this checkpoint; the next optimization target remains -cached shape classification to remove repeated deterministic row-level checks. - -## 2026-04-29 Cached Shape Classification - -Added FORMAT-level likelihood-shape classification to the dynamic general plan. -The cache only records deterministic facts from the FORMAT/header order and -types: - -```text -GT2, optional FLOAT1, INT[n_allele], INT1, INT1, -optional STR1, optional STR1, INT[ploidy likelihood width] -``` - -Row-level facts remain uncached. Each record still validates `n_allele`, -AD/PL widths, GT syntax, observed vector counts, separators, sample count, and -phase-string widths before using the likelihood executor. 
- -The large benchmark corpus now includes four extra cache-regression workloads: - -- variable-width `PGT/PID` likelihood rows, -- likelihood rows with mixed row-local fallbacks and later positive hits, -- GT-first but wrong-order likelihood-like rows, -- non-likelihood rows with two strings plus float vectors. - -Latest full large-corpus run: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All exact and interp outputs compared byte-identical to baseline. Highlights: - -| Input | Exact user | Dynamic interp user | Dynamic shape attempts | Dynamic shape hits | -|---|---:|---:|---:|---:| -| CCDG 10k | 1.61 s | 1.60 s | 10,000 | 10,000 | -| 1000G chr22 full GT | 9.16 s | 9.11 s | 0 | 0 | -| Large CCDG-like synthetic | 2.74 s | 2.69 s | 20,000 | 20,000 | -| Large multiallelic likelihood | 2.05 s | 1.99 s | 16,000 | 16,000 | -| Variable phase widths | 2.00 s | 1.99 s | 12,000 | 12,000 | -| Mixed row-local fallbacks | 1.56 s | 1.58 s | 11,295 | 10,236 | -| GT-first reordered negative | 1.50 s | 1.50 s | 0 | 0 | -| Two-string float negative | 2.36 s | 2.32 s | 0 | 0 | - -The important negative-cache result is the full 1000G GT-only workload: -dynamic mode no longer pays 1,103,547 failed likelihood-shape probes. - -## 2026-04-29 GT-Only Fast Path - -Added a tiny general-plan executor for the common `FORMAT=GT` / diploid `GT2` -shape. This is still shape-based rather than data-set specific: - -- requires a single FORMAT op and that op must be `GT`, -- requires allele indexes that fit the existing one-digit `GT2` parser, -- falls through to the existing strict/measured paths for haploid, dynamic GT, - malformed rows, or any unsupported row-local detail. - -Also tightened the older exact-name CCDG kernels so they only claim a FORMAT -after checking the relevant header types and scalar counts. 
A new -`format-plan-header-mismatch.vcf` fixture keeps this honest by using CCDG-shaped -names with `AD` declared as a string. - -Latest full large-corpus run remained byte-identical to baseline for exact and -interp. The main win is the full 1000G chr22 GT-only workload: - -| Input | Baseline user | Exact user | Dynamic interp user | -|---|---:|---:|---:| -| 1000G chr22 full GT | 24.86 s | 5.77 s | 5.61 s | - -The previous cached-shape run was about 9.1 s user in dynamic mode on this -input, so the direct GT-only executor removes roughly 39% of the remaining -planned-parser CPU for this large real workload. - -## 2026-04-29 Multiallelic Parse Tightening - -Added two small low-risk likelihood executor refinements: - -- avoid retrying the likelihood shape executor inside the strict path when the - same row already reached the likelihood executor and failed row-local checks; -- add fixed-width integer vector parsers for AD width 4 and PL widths 6/10, - covering common triallelic and quad-allelic `Number=G` likelihood rows. - -The fixed-width parsers still use the same scalar integer parser and range -tracking, and they preserve short-vector padding and trailing-comma fallback -behavior. - -Small edge coverage now includes: - -- row-local likelihood fallback from short AD/PL in individual samples, -- missing AD/PL with another sample proving full row width, -- GT-only fast-path hits plus haploid and multidigit GT fallbacks. - -Latest full large-corpus run remained byte-identical to baseline. 
Timings were -noisier overall than the previous pass, but the important rows were: - -| Input | Exact user | Dynamic interp user | Notes | -|---|---:|---:|---| -| CCDG 10k | 1.59 s | 1.56 s | likelihood shape parity | -| 1000G chr22 full GT | 5.64 s | 5.68 s | GT-only fast path retained | -| Large multiallelic likelihood | 2.26 s | 2.07 s | dynamic ahead of exact | -| Mixed row-local fallbacks | 1.72 s | 1.74 s | byte-clean fallback path | - -## 2026-04-29 No-Special Integer Encode - -Added a conservative `has_special` bit to planned integer range tracking. The -parser now records when it has observed `bcf_int32_missing` or -`bcf_int32_vector_end`, including vector-end padding from short fixed-width -vectors. The known-range encoder uses that proof to skip per-value sentinel -checks in int8/int16 output only when the row contains no missing/vector-end -values. - -Safety rule: min/max alone never proves this. Missing and vector-end sentinels -can still select int8/int16 BCF encodings, so the fast loop is gated only by the -parser-maintained flag. - -Small edge coverage now includes integer boundary rows spanning int8/int16/int32 -choices, plus existing rows with scalar missing values, short fixed vectors, and -explicit vector missing values. - -Latest full large-corpus run: - -| Input | Exact user | Dynamic interp user | Notes | -|---|---:|---:|---| -| CCDG 10k | 1.74 s | 1.73 s | real likelihood parity | -| 1000G chr22 full GT | 6.02 s | 6.09 s | GT-only path retained | -| Large CCDG-like synthetic | 3.03 s | 2.98 s | dynamic slightly ahead | -| Large multiallelic likelihood | 2.29 s | 2.12 s | dynamic ahead | -| Mixed row-local fallbacks | 1.71 s | 1.72 s | byte-clean fallback path | - -All exact and interp outputs compared byte-identical to baseline. - -## 2026-04-29 Likelihood Row-Op Elision - -Removed `row_ops` construction from the dynamic likelihood strict path. 
The -likelihood executor now consumes cached plan indices and row-local widths -directly; generic strict still builds row ops only after the likelihood attempt -fails and it needs fixed-numeric/general parsing. - -This keeps row-local validation unchanged: - -- allele count remains limited per row, -- AD/PL counts still must prove the expected width, -- phase string widths are still measured for the current row, -- malformed GT/separators/sample counts still fall back. - -Latest full large-corpus run stayed byte-identical to baseline. Highlights: - -| Input | Exact user | Dynamic interp user | Notes | -|---|---:|---:|---| -| CCDG 10k | 1.73 s | 1.70 s | real likelihood slightly ahead | -| Large CCDG-like synthetic | 2.77 s | 2.74 s | dynamic slightly ahead | -| Large multiallelic likelihood | 2.13 s | 1.92 s | dynamic ahead | -| Variable phase widths | 2.04 s | 2.05 s | phase widths still row-local | -| Mixed row-local fallbacks | 1.58 s | 1.59 s | fallback path byte-clean | - -## Open Questions - -- How much of the gap is parse-loop dispatch versus generic encode cost? -- Can string width measurement be cached per shape region, or does row-local - width variation force a cheap scan every time? -- Is it better to build several executor families by op sequence, or one generic - fixed-shape executor with parse-function pointers? -- Do temporary fallback reason counters pay for themselves during iteration, or - should they stay under an explicit debug environment variable? - -## 2026-04-29 Composable MVP Pivot - -The planned parser has been refactored toward the MVP design: - -```text -FORMAT/header -> per-tag compiled ops -> one composable executor -> fallback -``` - -The dynamic `interp` path no longer routes through separate GT-only, -likelihood-shape, fixed-numeric, and measured-general executor ladders. 
It -builds one row-local op list from header `Type`/`Number` metadata, parses all -supported ops in FORMAT order, and falls back to the production parser for the -whole row when compile-time support or row-local validation fails. - -Supported per-tag MVP ops include: - -- `GT`, with fast `GT2` storage when the row is diploid/simple; -- `Integer` and `Float` with fixed `Number=N`, `Number=A`, `Number=R`, - `Number=G`, or bounded measured `Number=.`; -- `String,Number=1` with row-local width measurement. - -This intentionally trades the previous likelihood-family microkernel speed for -broader composability. Rows such as `GT:AD`, `GT:AD:DP:XX:PL`, reordered -numeric/string tags, and supersets with normal header-described tags can now use -the same planned executor without a full-row shape match. - -Added `test/format-plan-composable.vcf` to cover subsets, supersets, -reordered fields, measured numeric fields, strings, and a deliberate row-local -fallback. `./test/test_format_plan.sh` compares baseline, exact, and interp -outputs byte-for-byte. - -Latest full large-corpus composable MVP run: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-composable-mvp \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All exact and interp outputs compared byte-identical to baseline. 
- -| Input | Baseline user | Exact user | Dynamic interp user | Notes | -|---|---:|---:|---:|---| -| CCDG 10k | 2.61 s | 1.66 s | 2.35 s | broader MVP, some row fallback | -| 1000G chr22 full GT | 25.88 s | 8.70 s | 8.70 s | composable GT path parity with exact | -| Large CCDG-like synthetic | 4.17 s | 2.78 s | 3.84 s | lost likelihood microkernel speed | -| Large reordered likelihood | 3.01 s | 2.49 s | 2.49 s | parity; no special likelihood shape needed | -| Large multiallelic likelihood | 3.23 s | 2.20 s | 3.11 s | generic op loop slower than microkernel | -| Large float/string | 2.97 s | 3.01 s | 3.01 s | parity with exact/general | -| Variable phase widths | 2.68 s | 2.07 s | 2.54 s | string measurement still row-local | -| Mixed row-local fallbacks | 2.25 s | 1.90 s | 2.06 s | byte-clean fallback path | -| GT-first reordered negative | 1.79 s | 1.47 s | 1.45 s | composable path slightly ahead | -| Two-string float negative | 2.28 s | 2.61 s | 2.58 s | planned path slower than baseline here | - -Takeaway: the MVP architecture is much less brittle and supports tag-level -composition, but parity with the removed likelihood-shape executor will require -generic per-op optimizations or a later optional executor-generation layer. 
- -## 2026-04-29 Composable Production Hardening - -Follow-up productionizing pass: - -- tightened `GT` compile validation so the composable path only claims - `GT` when the header declares `Type=String,Number=1`; -- added `test/format-plan-gt-header-shape.vcf` to prove malformed-but-readable - `GT` headers fall back instead of being planned; -- restored direct writes for leading fixed-encoding ops (`GT2` and `FLOAT1`) - inside the composable executor; -- added a positive integer fast path before falling back to the full signed / - missing integer parser; -- routed generic `INTN` widths 4, 6, and 10 through the fixed-width counted - parsers; -- trimmed the unused dynamic likelihood-shape executor scaffolding from the - general planned path now that `interp` uses the composable executor. - -Latest full large-corpus run: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-composable-prod \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All exact and interp outputs compared byte-identical to baseline. 
- -| Input | Baseline user | Exact user | Dynamic interp user | Notes | -|---|---:|---:|---:|---| -| CCDG 10k | 2.62 s | 1.49 s | 2.29 s | partial fallback from row-local strictness | -| 1000G chr22 full GT | 26.02 s | 8.70 s | 9.09 s | GT-only composable path; noisy/slower pass | -| Large CCDG-like synthetic | 4.20 s | 2.68 s | 3.81 s | still behind old likelihood microkernel | -| Large reordered likelihood | 3.02 s | 2.47 s | 2.44 s | composable path at parity | -| Large multiallelic likelihood | 3.28 s | 1.95 s | 2.79 s | fixed-width INTN recovered part of gap | -| Large float/string | 2.99 s | 2.99 s | 2.94 s | parity/slightly ahead | -| Variable phase widths | 2.70 s | 2.01 s | 2.56 s | row-local string measurement remains cost | -| Mixed row-local fallbacks | 2.25 s | 1.76 s | 1.94 s | byte-clean fallback path | -| GT-first reordered negative | 1.77 s | 1.47 s | 1.44 s | composable path slightly ahead | -| Two-string float negative | 2.29 s | 2.55 s | 2.55 s | planned path still slower than baseline | - -## 2026-04-29 Underfilled Vector Compaction - -Added a composable per-op fallback reduction for fixed-width vector fields. If -an `INT2`/`INT3`/`INTN`/`FLOATN` op was parsed into the conservative -header-derived width, but the row's observed maximum vector count is smaller, -the executor now compacts that field's scratch buffer to the observed row width -and encodes it directly instead of falling back for the whole row. - -This keeps the fallback boundary for unsupported/malformed data, but avoids -production fallback for byte-identical rows where the production parser would -also emit a narrower BCF vector width. - -Latest full large-corpus run remained byte-identical to baseline. 
The main -effect is fallback reduction on mixed row-local cases: - -| Input | Exact user | Dynamic interp user | Dynamic hits/fallback | -|---|---:|---:|---:| -| CCDG 10k | 1.50 s | 2.28 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 9.08 s | 8.89 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 2.66 s | 3.76 s | 20,000 / 0 | -| Large multiallelic likelihood | 1.90 s | 2.78 s | 16,000 / 0 | -| Variable phase widths | 1.97 s | 2.50 s | 12,000 / 0 | -| Mixed row-local fallbacks | 1.75 s | 1.86 s | 12,000 / 0 | -| GT-first reordered negative | 1.43 s | 1.41 s | 12,000 / 0 | - -The attempted pointer-increment / reduced-bookkeeping hot-loop rewrite was -tested separately and reverted because it slowed the targeted likelihood-heavy -benchmarks despite remaining byte-correct. - -## 2026-04-29 Dynamic-Only Production Trim - -Removed the optional SIMD tab-scanning front-end and the old hardcoded exact -FORMAT kernels. The optimized FORMAT entry point is now: - -```text -HTS_VCF_FORMAT_PLAN enabled -> dynamic per-tag plan -> composable executor -> production fallback -``` - -`HTS_VCF_FORMAT_PLAN=1`, `interp`, and `general` all route through the same -dynamic executor. The benchmark harness now labels `HTS_VCF_FORMAT_PLAN=1` as -`plan`; older result directories may still contain a historical `exact` label, -but that is no longer a separate hardcoded kernel path. - -Source cleanup removed the SIMD probe/stat plumbing, SIMD intrinsics includes, -shape-stat plumbing, exact shape compiler/cache, exact phase-width pass, and -exact GT/AB/AD/DP/GQ/PL microkernel. Relative to `origin/develop`, the live -source delta after adding inline docs is 1,467 added lines in `vcf.c` plus 14 -added lines in `test/test_view.c`. - -Latest full large-corpus run: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All output comparisons remained byte-identical to baseline. 
- -| Input | Baseline user | Dynamic plan user | Hits/fallback | -|---|---:|---:|---:| -| CCDG 10k | 2.62 s | 2.25 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 26.05 s | 7.98 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.24 s | 3.78 s | 20,000 / 0 | -| Large reordered likelihood | 3.00 s | 2.42 s | 20,000 / 0 | -| Large multiallelic likelihood | 3.16 s | 2.73 s | 16,000 / 0 | -| Large float/string | 2.93 s | 2.97 s | 16,000 / 0 | -| Variable phase widths | 2.61 s | 2.50 s | 12,000 / 0 | -| Mixed row-local fallbacks | 2.22 s | 1.87 s | 12,000 / 0 | -| GT-first reordered negative | 1.75 s | 1.44 s | 12,000 / 0 | -| Two-string float negative | 2.28 s | 2.56 s | 12,000 / 0 | - -## 2026-04-29 bcftools Production-Style Timing - -Built a clean bcftools `develop` worktree at: - -```text -/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan -``` - -using this htslib worktree via: - -```sh -make HTSDIR=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/htslib-vcf-avx-sanity bcftools -``` - -The timing run used `bcftools view --no-version -Ob -l 0` over the threaded -representative manifest: - -```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ - bench/format-shape/scripts/run_bcftools_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -All planned outputs compared byte-identical to baseline. 
- -| Input | Threads | Baseline real | Plan real | Speedup | Baseline user | Plan user | -|---|---:|---:|---:|---:|---:|---:| -| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | 25.94 s | 8.05 s | -| 1000G chr22 full GT | 2 | 26.59 s | 6.99 s | 3.80x | 28.82 s | 9.04 s | -| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | 28.83 s | 9.08 s | -| 1000G chr22 full GT | 8 | 26.62 s | 6.96 s | 3.82x | 28.71 s | 9.38 s | -| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | 4.11 s | 3.66 s | -| Large CCDG-like synthetic | 2 | 3.46 s | 3.01 s | 1.15x | 4.50 s | 4.06 s | -| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | 4.51 s | 4.09 s | -| Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | 4.50 s | 4.05 s | - -Takeaway: in a bcftools conversion path, the dynamic FORMAT parser gives a large -production-visible win for GT-only sample-rich VCFs. On likelihood-heavy rows it -still helps, but output/input threading and remaining generic FORMAT work limit -the total wall-clock gain to roughly 12-15% in this representative run. diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md new file mode 100644 index 000000000..364c028ac --- /dev/null +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -0,0 +1,184 @@ +# Dynamic FORMAT Plan: Current Implementation + +This document describes the implementation currently present in `vcf.c`, the +correctness boundaries, and the latest benchmark results. + +## Entry Point + +`vcf_parse_format()` first calls `vcf_parse_format_planned()` when +`HTS_VCF_FORMAT_PLAN` is enabled. The planned path either parses the whole +FORMAT column or returns `-3`, allowing the production parser to handle the +column unchanged. + +```text +HTS_VCF_FORMAT_PLAN enabled + -> compile FORMAT/header to per-tag plan + -> resolve row-local widths + -> composable executor + -> production fallback on unsupported or suspicious rows +``` + +Enabled spellings are `1`, `interp`, and `general`; all route through the same +dynamic executor. 
+ +## Plan Compilation + +Plans are cached by header pointer plus literal FORMAT string. This is +important because VCF header IDs, declared types, and number models are +header-local. + +The compile step rejects: + +- undefined FORMAT tags; +- duplicate FORMAT tags; +- unsupported header types; +- unsupported number models; +- `GT` declarations that are not `Type=String,Number=1`. + +Undefined tags intentionally fall back to the production parser so existing +dummy-header repair and warning behavior is preserved. + +## Supported Operations + +The current executor supports: + +- `GT`, with fast `GT2` storage when the row is diploid and simple; +- integer fields with fixed `Number=N`, `Number=A`, `Number=R`, `Number=G`, or + bounded measured `Number=.` row widths; +- float fields with the same number models as integer fields; +- string fields declared as `Type=String,Number=1`, measured per row. + +Header-derived widths are resolved per record. `Number=A`, `Number=R`, and +`Number=G` depend on the current allele count. String and `Number=.` numeric +fields use a row-local measurement pass. + +## Executor + +BCF stores FORMAT data transposed by tag: all samples for FORMAT op 0, then all +samples for FORMAT op 1, and so on. The dynamic executor parses VCF samples in +sample-major order and writes that transposed BCF layout. + +Leading fixed-width ops (`GT2` and `FLOAT1`) can be written directly into `v->indiv`. +All other data is staged in header scratch memory, then encoded after sample +parsing so integer range and observed-width metadata are known. + +For fixed-width vector fields, the executor can compact underfilled rows to the +observed row maximum before BCF encoding. This avoids whole-row fallback when +the production parser would also emit a narrower byte-identical vector width. + +## Guard Policy + +Each cached dynamic plan has a small runtime guard: + +- attempts, hits, fallbacks; +- consecutive miss streak; +- temporary cooldown.
+ +An isolated fallback does not disable the fast path. A plan is paused after +eight consecutive misses, or after at least 128 attempts with more than 10% +fallbacks. After 256 skipped records, the plan probes again so later stable +regions can recover the optimized path. + +## Correctness Rules + +The planned parser must preserve these invariants: + +- no planned parsing while `h->keep_samples` is active; +- header IDs, types, and number models are resolved before execution; +- duplicate or undefined tags use the production parser; +- unsupported GT encodings force fallback; +- numeric vectors preserve observed width and vector-end padding; +- strings use observed maximum byte length and zero-pad shorter samples; +- integer and float overflow/error behavior must match production htslib or + force fallback; +- direct writes to `v->indiv` must roll back before fallback. + +Focused validation lives in `./test/test_format_plan.sh`. It compares +production parsing, `HTS_VCF_FORMAT_PLAN=1`, and the `interp` alias byte-for-byte +with `cmp`. + +## Current Source Delta + +After removing the old exact kernels and SIMD tab scanner, the live parser/test +hook delta relative to `origin/develop` is: + +| File | Added lines | +|---|---:| +| `vcf.c` | 1,467 | +| `test/test_view.c` | 14 | + +## Large Corpus Benchmark + +Command: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ + bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv +``` + +All planned outputs compared byte-identical to baseline. 
+ +| Input | Baseline user | Plan user | Hits/fallback | +|---|---:|---:|---:| +| CCDG 10k | 2.62 s | 2.25 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 26.05 s | 7.98 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.24 s | 3.78 s | 20,000 / 0 | +| Large reordered likelihood | 3.00 s | 2.42 s | 20,000 / 0 | +| Large multiallelic likelihood | 3.16 s | 2.73 s | 16,000 / 0 | +| Large float/string | 2.93 s | 2.97 s | 16,000 / 0 | +| Variable phase widths | 2.61 s | 2.50 s | 12,000 / 0 | +| Mixed row-local fallbacks | 2.22 s | 1.87 s | 12,000 / 0 | +| GT-first reordered negative | 1.75 s | 1.44 s | 12,000 / 0 | +| Two-string float negative | 2.28 s | 2.56 s | 12,000 / 0 | + +## bcftools Production-Style Benchmark + +A clean bcftools `develop` worktree was built at: + +```text +/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan +``` + +using this htslib worktree: + +```sh +make HTSDIR=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/htslib-vcf-avx-sanity bcftools +``` + +Timing command: + +```sh +BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ + bench/format-shape/scripts/run_bcftools_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + +The runner uses `bcftools view --no-version -Ob -l 0 [--threads N]`. All +planned outputs compared byte-identical to baseline. 
+ +| Input | Threads | Baseline real | Plan real | Speedup | Baseline user | Plan user | +|---|---:|---:|---:|---:|---:|---:| +| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | 25.94 s | 8.05 s | +| 1000G chr22 full GT | 2 | 26.59 s | 6.99 s | 3.80x | 28.82 s | 9.04 s | +| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | 28.83 s | 9.08 s | +| 1000G chr22 full GT | 8 | 26.62 s | 6.96 s | 3.82x | 28.71 s | 9.38 s | +| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | 4.11 s | 3.66 s | +| Large CCDG-like synthetic | 2 | 3.46 s | 3.01 s | 1.15x | 4.50 s | 4.06 s | +| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | 4.51 s | 4.09 s | +| Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | 4.50 s | 4.05 s | + +## Interpretation + +The dynamic path gives a large production-visible win for sample-rich GT-only +VCFs. On likelihood-heavy rows, it is consistently faster but still limited by +generic per-op work, string/width handling, and IO/compression costs. Some +float/string-heavy layouts remain near parity or slightly slower than baseline. + +## Remaining Work + +- Add selected-sample support so `keep_samples` does not force fallback. +- Reduce per-sample opcode dispatch in hot FORMAT layouts. +- Improve string and measured-width handling without losing byte identity. +- Consider a later executor-generation layer if generic per-op dispatch remains + the main gap to historical exact-kernel speed. diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md new file mode 100644 index 000000000..0ab5a6efd --- /dev/null +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -0,0 +1,185 @@ +# Dynamic FORMAT Plan Experiment Log + +This log records the major approaches tried while developing the dynamic FORMAT +parser, the result of each approach, and what survived into the current design. 
+ +## Starting Point + +The initial problem was that exact, hand-written FORMAT kernels were much faster +than the dynamic implementation, but exact kernels were too brittle. They only +matched a few complete FORMAT strings, such as: + +- `GT:AB:AD:DP:GQ:PL` +- `GT:AD:DP:GQ:PL` +- `GT:AB:AD:DP:GQ:PGT:PID:PL` +- `GT:AD:DP:GQ:PGT:PID:PL` + +The target became: recognize useful structure at the FORMAT-tag level, remain +general across subsets/supersets/reordered tags, and fall back to production +htslib whenever the optimized parser could not prove byte-identical output. + +## Exact CCDG Kernels + +The first high-performance path was a set of exact kernels for dominant CCDG +likelihood layouts. They proved the upper-bound target: on the 10k CCDG subset, +exact mode was roughly 1.6 s user versus 2.6 s baseline. + +Result: useful as a performance oracle, but removed from the production +candidate because exact string matching did not satisfy the generality goal. + +## Dynamic Likelihood Shape Executor + +Next, the parser used header/type/order information to recognize a likelihood +shape rather than exact tag names: + +```text +GT2, optional FLOAT1, INT[n_allele], INT1, INT1, +optional STR1, optional STR1, INT[n_allele * (n_allele + 1) / 2] +``` + +This was selected by type/order/width rather than names such as `AD` and `PL`. +It validated allele count, observed vector counts, GT syntax, separators, sample +count, and phase-string widths per row. + +Result: it closed much of the performance gap. On one 10k CCDG run, dynamic +shape was within about 6% of exact user time while remaining byte-identical. + +Why it did not survive: it reintroduced a shape-specific executor family. That +was useful evidence, but the MVP goal shifted toward one composable per-tag +executor before adding any generation/specialization layer. + +## Cached Shape Classification + +The dynamic shape attempt initially paid repeated failed probes on non-likelihood +workloads. 
Caching deterministic shape facts per `(header, FORMAT)` plan fixed +that. The full 1000G GT-only workload stopped paying over a million failed +likelihood-shape probes. + +Result: retained as a lesson for future specialization. The current composable +plan still caches by `(header, FORMAT)`. + +## GT-Only Fast Path + +A tiny `FORMAT=GT` / diploid `GT2` executor was added and gave a large speedup +on the full 1000G chr22 genotype VCF, cutting dynamic-mode user time from about +9.1 s to about 5.6 s in that intermediate architecture. + +Result: the direct `GT2` insight survived, but not as a separate GT-only +executor. The current composable executor direct-writes a leading `GT2` op when +safe. + +## Integer Parse And Encode Tightening + +Several low-risk parser/encoder refinements were tried: + +- fixed-width integer vector parsers for common AD/PL widths; +- positive integer fast path before falling back to full signed/missing parsing; +- integer range tracking with a `has_special` bit so int8/int16 encoding can skip + sentinel checks only when the parser proved no missing/vector-end values. + +Result: retained. These fit the generic per-op architecture and helped recover +some likelihood-heavy performance. + +## Likelihood Row-Op Elision + +In the shape-executor phase, row-op construction was removed from the dynamic +likelihood strict path so the executor could consume cached plan indices and +row-local widths directly. + +Result: useful for the old shape executor, but not retained once the MVP pivoted +to the composable row-op model. + +## Composable MVP Pivot + +The architecture pivoted to: + +```text +FORMAT/header -> per-tag compiled ops -> one composable executor -> fallback +``` + +The dynamic path stopped routing through separate GT-only, likelihood-shape, +fixed-numeric, and measured-general executor ladders. Instead, it builds one +row-local op list from header metadata and parses supported ops in FORMAT order. + +Result: retained.
This is the current design because it supports tag-level +composition for rows such as `GT:AD`, `GT:AD:DP:XX:PL`, reordered fields, and +supersets with normal header-described tags. + +Tradeoff: broader composability lost some of the microkernel speed from the +likelihood shape executor. + +## Production Hardening + +Several hardening passes made the composable MVP safer and faster: + +- tightened `GT` compile validation to require `Type=String,Number=1`; +- added malformed-but-readable `GT` header coverage; +- restored direct writes for leading fixed-encoding ops (`GT2`, `FLOAT1`); +- routed generic `INTN` widths 4, 6, and 10 through fixed-width counted parsers; +- removed unused dynamic likelihood-shape scaffolding; +- added underfilled vector compaction for fixed-width vector fields. + +Result: retained. The dynamic path became broader and reduced unnecessary +whole-row fallback while preserving byte-identical output. + +## Reverted Or Removed Work + +Removed: + +- exact CCDG kernels; +- dynamic likelihood-shape executor scaffolding; +- optional SIMD tab-scanning front-end; +- shape-stat benchmark plumbing; +- legacy `exact`/`interp` timing rows in the benchmark harness. + +Tested and reverted: + +- pointer-increment / reduced-bookkeeping hot-loop rewrite. It stayed + byte-correct but slowed targeted likelihood-heavy benchmarks. + +## Dynamic-Only Production Trim + +After removing exact and SIMD paths, the optimized entry became: + +```text +HTS_VCF_FORMAT_PLAN enabled -> dynamic per-tag plan -> composable executor -> production fallback +``` + +`HTS_VCF_FORMAT_PLAN=1`, `interp`, and `general` now route through the same +dynamic executor. Benchmarks label only `HTS_VCF_FORMAT_PLAN=1` as `plan`. 
+ +Large-corpus post-trim user-time highlights: + +| Input | Baseline | Plan | Result | +|---|---:|---:|---| +| CCDG 10k | 2.62 s | 2.25 s | faster, partial fallback | +| 1000G chr22 full GT | 26.05 s | 7.98 s | major win | +| Large CCDG-like synthetic | 4.24 s | 3.78 s | modest win | +| Large float/string | 2.93 s | 2.97 s | near parity/slightly slower | +| Two-string float negative | 2.28 s | 2.56 s | slower | + +## bcftools Production Check + +A clean bcftools `develop` worktree was built against this htslib branch and run +with `bcftools view --no-version -Ob -l 0 [--threads N]`. + +All planned outputs compared byte-identical to baseline. + +| Input | Threads | Baseline real | Plan real | Speedup | +|---|---:|---:|---:|---:| +| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | +| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | +| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | +| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | + +## Main Lessons + +- Tag-level composition is the right MVP boundary; exact full FORMAT strings are + too brittle. +- Whole-row fallback keeps correctness manageable, but makes one unsupported tag + enough to lose the optimized path. +- Sample-rich GT-only VCFs are the clearest production win. +- Likelihood-heavy workloads benefit, but generic per-op dispatch and string / + measured-width handling still leave performance on the table. +- Future executor generation or shape-specialized families may be worth adding + after the composable MVP is stable. diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md new file mode 100644 index 000000000..56817cbac --- /dev/null +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -0,0 +1,90 @@ +# Dynamic FORMAT Plan Overview + +This branch adds an optional fast path for parsing VCF `FORMAT` sample columns. +The goal is to speed up common, header-described FORMAT layouts without writing +one-off kernels for exact FORMAT strings such as `GT:AD:DP:GQ:PL`. 
+ +## What It Does + +When `HTS_VCF_FORMAT_PLAN` is enabled, htslib first tries to compile the record's +literal FORMAT string into a small list of per-tag operations. The plan is +driven by the active VCF header: each tag contributes its key, type, declared +number model, and whether the current row needs width measurement. + +If the row fits the supported operation set, the dynamic executor parses samples +and writes BCF's transposed FORMAT layout directly. If anything looks unsafe or +unsupported, htslib falls back to the production parser for the whole FORMAT +column. + +## Why This Shape + +The important design choice is tag-level composition. A file does not need an +exact hardcoded FORMAT string to benefit. For example, these can all share the +same dynamic machinery when their tags are described by supported header +metadata: + +- `GT:AD` +- `GT:AD:DP:PL` +- `GT:AB:AD:DP:GQ:PGT:PID:PL` +- reordered numeric/string tags +- supersets with additional supported tags + +This is deliberately more general than the earlier experimental exact kernels. +Those kernels were fast, but brittle: adding or removing one tag could miss the +optimized path entirely. + +## Where It Helps + +The feature is most useful for sample-rich VCF text input where FORMAT parsing is +a meaningful part of total runtime: + +- large `GT`-only genotype VCFs; +- likelihood-heavy VCFs with fields such as `AD`, `PL`, `DP`, `GQ`, `AB`, and + phase strings; +- conversion paths such as VCF.gz to BCF where text FORMAT parsing is exposed; +- workloads with repeated FORMAT layouts across many records. + +In the latest bcftools-style timing, the real 1000G chr22 GT workload sped up +from 27.48 s to 8.99 s unthreaded, and from 26.71 s to 6.94 s at 4 threads. +The likelihood-heavy synthetic workload improved more modestly, from 4.43 s to +3.94 s unthreaded and from 3.47 s to 3.02 s at 4 threads. + +## Drawbacks + +The MVP intentionally keeps fallback whole-row. 
It does not parse supported +tags dynamically while delegating only one unsupported tag to the production +parser. That makes correctness easier to reason about, but a single unsupported +tag or malformed row means the entire FORMAT column uses the production parser. + +Known fallback cases include: + +- sample subsetting via `keep_samples`; +- undefined FORMAT tags that require production header repair; +- unsupported header types or number models; +- duplicate FORMAT tags; +- malformed separators or unexpected sample cardinality; +- row-local widths above the bounded fast-path limit; +- GT encodings outside the simple fast-path representation. + +The path is also not always faster. Some string/float-heavy layouts are roughly +at parity or slightly slower than baseline because the dynamic path still pays +measurement, dispatch, and scratch-buffer costs. + +## User-Facing Controls + +```text +unset / 0 production parser only +1 dynamic per-tag planner, then production fallback +interp/general aliases for the same dynamic planner +``` + +The benchmark harness reports only `HTS_VCF_FORMAT_PLAN=1` as `plan`. +`interp` and `general` remain accepted aliases for manual debugging, but they are +not distinct implementations. + +## Related Docs + +- `docs/FORMAT_PLAN_CURRENT.md`: current implementation, supported shapes, + correctness rules, and benchmark tables. +- `docs/FORMAT_PLAN_EXPERIMENT_LOG.md`: chronological log of approaches tried, + results, reversions, and retained lessons. diff --git a/docs/FORMAT_PLAN_SPEC.md b/docs/FORMAT_PLAN_SPEC.md deleted file mode 100644 index 0636a0bd4..000000000 --- a/docs/FORMAT_PLAN_SPEC.md +++ /dev/null @@ -1,125 +0,0 @@ -# FORMAT Plan Parser Spec - -This document describes the current `HTS_VCF_FORMAT_PLAN` VCF FORMAT parser. -The older exact CCDG kernels and dynamic likelihood-shape tier were removed; -the optimized path is now a single dynamic per-tag planner with production -fallback. 
- -## Goal - -Keep the existing htslib FORMAT parser as the source of truth, while adding an -opportunistic fast path for repeated, header-described FORMAT layouts. The fast -path may only claim a row when it can produce byte-identical BCF. Otherwise it -must return `-3` and let the existing parser handle the whole FORMAT column. - -## Current Architecture - -`HTS_VCF_FORMAT_PLAN` controls the planned parser: - -```text -unset / 0 production parser only -1 dynamic per-tag planner, then production fallback -interp/general same dynamic per-tag planner, then production fallback -``` - -All enabled spellings now use the same implementation. The benchmark harness -reports only `HTS_VCF_FORMAT_PLAN=1` as `plan`; `interp`/`general` remain useful -manual aliases but are not distinct timing modes. - -The planned parser has four stages: - -1. Compile the literal FORMAT string and active header into a cached list of - per-tag operations. -2. Resolve row-local widths from header `Number` metadata, allele count, and a - bounded measurement pass for strings or `Number=.` numeric vectors. -3. Parse sample fields into BCF's transposed FORMAT layout with a composable - executor. -4. Fall back to the production FORMAT parser for unsupported or suspicious rows. - -## Supported Tags - -The planner is tag-composable rather than full-string-specialized. It can claim -layouts such as `GT:AD`, `GT:AD:DP:PL`, -`GT:AB:AD:DP:GQ:PGT:PID:PL`, reordered fields, and supersets when each tag has -supported header metadata. - -Supported FORMAT tag shapes: - -- `GT` declared as `Type=String,Number=1`, with simple diploid encodings on the - fast path. -- Integer fields with fixed `Number=N`, `Number=A`, `Number=R`, `Number=G`, or - bounded measured `Number=.` row widths. -- Float fields with the same number models as integer fields. -- String fields declared as `Type=String,Number=1`, measured per row. - -Unsupported tags or unsupported row-local encodings fall back whole-row. 
- -## Correctness Rules - -The planned parser must preserve these invariants: - -- No planned parsing while `h->keep_samples` is active. -- Header IDs, types, and number models are resolved before execution. -- Duplicate FORMAT tags use the production parser. -- Undefined tags use the production parser, preserving dummy-header behavior and - warnings. -- GT encoding must match htslib phasing semantics; encodings outside the simple - fast path must force fallback. -- Numeric vectors use observed or provably fixed row width and pad shorter - samples with vector-end sentinels. -- Strings use observed maximum byte length and zero-pad shorter samples. -- Integer and float overflow/error behavior must either match production htslib - or force fallback. -- Any fast path that writes directly into `v->indiv` must save the original - length and roll back before fallback. - -## Dynamic Planner - -The planner compiles the literal FORMAT string into cached opcodes keyed by -header pointer plus FORMAT text. Header-local ids and type metadata make plans -unsafe to share across headers. - -After seeing a record, it resolves the reusable op list to row-local operations -such as `GT2`, `INT1`, `INT2`, `INT3`, `INTN`, `FLOAT1`, `FLOATN`, and `STR`. -`Number=A`, `Number=R`, and `Number=G` widths come from the current allele -count. String and `Number=.` numeric widths are measured across the row before -execution. - -The executor writes BCF's transposed FORMAT layout. Leading fixed-width -`GT2`/`FLOAT1` rows can be written directly into `v->indiv`; other rows are -staged in header scratch memory and encoded after sample parsing so integer -range and observed-width metadata are known. - -## Guard Policy - -Each cached dynamic plan has a small runtime guard: - -- attempts, hits, fallbacks, -- consecutive miss streak, -- temporary cooldown. - -An isolated fallback does not disable the fast path. 
A plan is paused only -after eight consecutive misses, or after at least 128 attempts with more than -10% fallbacks. After 256 skipped records, the plan probes again so later -fixed-format regions can recover the optimized path. - -## Tests - -`./test/test_format_plan.sh` writes BCF through: - -- the production parser, -- `HTS_VCF_FORMAT_PLAN=1`, -- `HTS_VCF_FORMAT_PLAN=interp`. - -It compares the planned outputs against baseline with `cmp`. The fixtures cover -subsets, supersets, reordered fields, measured numeric fields, strings, -malformed header shapes, and deliberate row-local fallback cases. - -## Next Work - -- Add selected-sample support so `keep_samples` does not require whole-row - fallback. -- Reduce per-sample opcode dispatch in hot FORMAT layouts. -- Improve string and measured-width handling without losing byte identity. -- Consider a later executor-generation layer if generic per-op dispatch remains - the main gap to historical exact-kernel speed. From b433732fb98403f17ed1f6dc09bd944b9c5ac0f7 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 22:09:03 +0200 Subject: [PATCH 28/38] Harden dynamic FORMAT planner --- .gitignore | 1 + Makefile | 6 + bench/format-shape/README.md | 31 +- bench/format-shape/large/threaded-inputs.tsv | 12 +- .../scripts/run_bcftools_bench.sh | 15 +- docs/FORMAT_PLAN_CURRENT.md | 166 +++++++-- docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 64 ++++ docs/FORMAT_PLAN_OVERVIEW.md | 22 +- test/format-plan-cache.vcf | 61 +++ test/format-plan-profitability.vcf | 8 + test/test_format_plan.sh | 26 +- test/test_format_plan_cache.c | 130 +++++++ test/test_view.c | 11 +- vcf.c | 352 +++++++++++++++--- 14 files changed, 804 insertions(+), 101 deletions(-) create mode 100644 test/format-plan-cache.vcf create mode 100644 test/format-plan-profitability.vcf create mode 100644 test/test_format_plan_cache.c diff --git a/.gitignore b/.gitignore index 6aee57ace..d574fe03d 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ 
shlib-exports-*.txt /test/test_bgzf /test/test_expr /test/test_faidx +/test/test_format_plan_cache /test/test_index /test/test_introspection /test/test_kfunc diff --git a/Makefile b/Makefile index a6617b8a2..74b257f45 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,7 @@ BUILT_TEST_PROGRAMS = \ test/test_str2int \ test/test_time_funcs \ test/test_view \ + test/test_format_plan_cache \ test/test_index \ test/test-vcf-api \ test/test-vcf-sweep \ @@ -690,6 +691,7 @@ check test: all $(HTSCODECS_TEST_TARGETS) test/test_str2int test/test_time_funcs test/fieldarith test/fieldarith.sam + test/test_format_plan_cache test/hfile if test "x$(BUILT_PLUGINS)" != "x"; then \ HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR); \ @@ -786,6 +788,9 @@ test/test_time_funcs: test/test_time_funcs.o test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread +test/test_format_plan_cache: test/test_format_plan_cache.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_format_plan_cache.o libhts.a $(LIBS) -lpthread + test/test_index: test/test_index.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_index.o libhts.a $(LIBS) -lpthread @@ -881,6 +886,7 @@ test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_reg test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) +test/test_format_plan_cache.o: test/test_format_plan_cache.c config.h $(htslib_kstring_h) $(htslib_vcf_h) test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_kstring_h) $(htslib_kseq_h) diff --git a/bench/format-shape/README.md 
b/bench/format-shape/README.md index da1bf0fe2..59363f506 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -22,8 +22,8 @@ bench/format-shape/ scripts/make_synthetic.pl deterministic synthetic VCF generator scripts/make_large_synthetic.pl scripts/run_bench.sh baseline/plan timing and cmp runner - scripts/run_thread_bench.sh representative threaded timing and cmp runner - scripts/run_bcftools_bench.sh representative bcftools timing runner + scripts/run_thread_bench.sh threaded timing and cmp runner + scripts/run_bcftools_bench.sh bcftools threaded timing runner results/ generated timing logs and BCF outputs ``` @@ -104,7 +104,7 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ `KEEP_OUTPUTS=0` still writes temporary BCF files and compares them with `cmp`, but deletes the large BCF outputs after each input is checked. -Run the representative threaded scaling corpus: +Run the threaded scaling corpus: ```sh KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded \ @@ -113,9 +113,9 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded \ ``` By default this runs unthreaded plus `-@ 2`, `-@ 4`, and `-@ 8`. Override with -`THREADS_LIST="2 4 8"` or a similar space-separated list. The current threaded -manifest intentionally uses a small representative subset of the large corpus: -one real GT-only workload and one FORMAT-heavy CCDG-like likelihood workload. +`THREADS_LIST="2 4 8"` or a similar space-separated list. The threaded manifest +now mirrors the full large corpus so thread scaling is checked across the same +real and synthetic workload shapes as the primary benchmark. The script runs each input in two modes. `interp` remains accepted by `HTS_VCF_FORMAT_PLAN`, but it aliases the same dynamic parser as `plan`, so the @@ -137,7 +137,7 @@ bench/format-shape/results/checks.tsv The threaded runner writes the same files under its selected output directory, with an additional `threads` column. 
-Run the same representative threaded corpus through bcftools: +Run the same threaded corpus through bcftools: ```sh BCFTOOLS=/path/to/bcftools \ @@ -151,6 +151,17 @@ baseline with `cmp`, and records the same `0 2 4 8` thread counts by default. It does not report planner counters because bcftools does not expose the `test/test_view` stats hook. +To exercise selected-sample parsing, set `SAMPLE_COUNT=N`. The runner queries +the first N samples from each input with `bcftools query -l` and passes them to +`bcftools view -s`; sites-only inputs have no sample list and run unchanged. + +```sh +BCFTOOLS=/path/to/bcftools SAMPLE_COUNT=2 \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ + bench/format-shape/scripts/run_bcftools_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + ## Large Corpus `large/inputs.tsv` currently contains: @@ -163,11 +174,7 @@ It does not report planner counters because bcftools does not expose the fallbacks, GT-first wrong-order likelihood-like rows, and two-string float rows. -`large/threaded-inputs.tsv` currently selects two representative inputs from the -same corpus for `-@` scaling checks: - -- full 1000 Genomes chr22 genotype VCF, -- large CCDG-like synthetic likelihood VCF. +`large/threaded-inputs.tsv` mirrors this full corpus for `-@` scaling checks. 
To refresh only the newer cache-regression synthetic files without rewriting the older large VCFs: diff --git a/bench/format-shape/large/threaded-inputs.tsv b/bench/format-shape/large/threaded-inputs.tsv index 104541474..795882a7e 100644 --- a/bench/format-shape/large/threaded-inputs.tsv +++ b/bench/format-shape/large/threaded-inputs.tsv @@ -1,3 +1,11 @@ name path source -1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF; real GT-only scaling case -large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likelihood_2048s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 2,048 samples; FORMAT-heavy scaling case +ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset, 10k records x 3,202 samples +1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF +large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likelihood_2048s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 2,048 samples +large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples +large_multiallelic_likelihood_2048s bench/format-shape/large/synthetic/large_multiallelic_likelihood_2048s.vcf.gz synthetic multiallelic likelihood FORMAT, 16k records x 2,048 samples +large_float_string_2048s bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT, 16k records x 2,048 samples +large_phase_width_variation_2048s bench/format-shape/large/synthetic/large_phase_width_variation_2048s.vcf.gz synthetic likelihood FORMAT with variable PGT/PID widths, 12k records x 2,048 samples +large_mixed_likelihood_2048s bench/format-shape/large/synthetic/large_mixed_likelihood_2048s.vcf.gz synthetic likelihood FORMAT with row-local 
unsupported/wrong-width rows, 12k records x 2,048 samples +large_gt_first_reordered_2048s bench/format-shape/large/synthetic/large_gt_first_reordered_2048s.vcf.gz synthetic GT-first reordered non-shape likelihood FORMAT, 12k records x 2,048 samples +large_two_string_float_2048s bench/format-shape/large/synthetic/large_two_string_float_2048s.vcf.gz synthetic two-string float FORMAT, 12k records x 2,048 samples diff --git a/bench/format-shape/scripts/run_bcftools_bench.sh b/bench/format-shape/scripts/run_bcftools_bench.sh index 15b374d3c..520690690 100755 --- a/bench/format-shape/scripts/run_bcftools_bench.sh +++ b/bench/format-shape/scripts/run_bcftools_bench.sh @@ -6,6 +6,7 @@ inputs=${1:-bench/format-shape/large/threaded-inputs.tsv} outdir=${OUTDIR:-bench/format-shape/large/results-bcftools} keep_outputs=${KEEP_OUTPUTS:-1} threads_list=${THREADS_LIST:-0 2 4 8} +sample_count=${SAMPLE_COUNT:-0} mkdir -p "$outdir" timings="$outdir/timings.tsv" @@ -16,6 +17,16 @@ printf 'name\tthreads\tcomparison\tstatus\n' > "$checks" tail -n +2 "$inputs" | while IFS=' ' read -r name path source do + sample_args= + if [ "$sample_count" != 0 ]; then + samples=$("$bcftools" query -l "$path" | awk -v n="$sample_count" ' + NR <= n { if (s) s = s "," $0; else s = $0 } + END { print s } + ') + if [ -n "$samples" ]; then + sample_args="-s $samples" + fi + fi for threads in $threads_list do base_out="$outdir/$name.t$threads.baseline.bcf" @@ -31,10 +42,10 @@ do out="$outdir/$name.t$threads.$mode.bcf" case "$mode" in baseline) - env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$bcftools" view --no-version -Ob -l 0 $thread_args -o "$out" "$path" 2> "$err" + env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$bcftools" view --no-version -Ob -l 0 $thread_args $sample_args -o "$out" "$path" 2> "$err" ;; plan) - env HTS_VCF_FORMAT_PLAN=1 /usr/bin/time -p "$bcftools" view --no-version -Ob -l 0 $thread_args -o "$out" "$path" 2> "$err" + env HTS_VCF_FORMAT_PLAN=1 /usr/bin/time -p "$bcftools" view --no-version -Ob 
-l 0 $thread_args $sample_args -o "$out" "$path" 2> "$err" ;; esac diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md index 364c028ac..51aa01984 100644 --- a/docs/FORMAT_PLAN_CURRENT.md +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -12,7 +12,7 @@ column unchanged. ```text HTS_VCF_FORMAT_PLAN enabled - -> compile FORMAT/header to per-tag plan + -> fetch or compile header-owned FORMAT/header plan -> resolve row-local widths -> composable executor -> production fallback on unsupported or suspicious rows @@ -23,9 +23,21 @@ dynamic executor. ## Plan Compilation -Plans are cached by header pointer plus literal FORMAT string. This is -important because VCF header IDs, declared types, and number models are -header-local. +Plans are cached in private `bcf_hdr_aux_t` state by literal FORMAT string plus +the header's private FORMAT-plan generation. This is important because VCF +header IDs, declared types, and number models are header-local. The cache grows +from 16 entries up to 128 entries, uses heap storage for long FORMAT strings, +and also caches unsupported schemas so repeated odd rows do not repeatedly pay +compile cost. + +`bcf_hdr_sync()` clears the header-owned plan cache and increments the private +generation after header dictionaries are rebuilt. The planner also refuses to +compile while `h->dirty` is set, leaving unsynced or header-repair cases on the +production parser. + +The cache and per-plan guard counters are mutable header-owned state, like other +htslib header scratch storage. Callers should not concurrently parse through +the same `bcf_hdr_t` from multiple threads. The compile step rejects: @@ -34,6 +46,8 @@ The compile step rejects: - unsupported header types; - unsupported number models; - `GT` declarations that are not `Type=String,Number=1`. +- string-plus-float-vector schemas with too little integer-vector work to repay + the dynamic path's width-measurement cost. 
Undefined tags intentionally fall back to the production parser so existing dummy-header repair and warning behavior is preserved. @@ -47,6 +61,8 @@ The current executor supports: bounded measured `Number=.` row widths; - float fields with the same number models as integer fields; - string fields declared as `Type=String,Number=1`, measured per row. +- `bcf_hdr_set_samples()` / `keep_samples`, by scanning the original sample + columns and writing only retained samples densely into the planned BCF output. Header-derived widths are resolved per record. `Number=A`, `Number=R`, and `Number=G` depend on the current allele count. String and `Number=.` numeric @@ -83,9 +99,12 @@ regions can recover the optimized path. The planned parser must preserve these invariants: -- no planned parsing while `h->keep_samples` is active; +- no planned parsing while the header has unsynced dictionary changes; - header IDs, types, and number models are resolved before execution; +- selected-sample parsing must honor `h->keep_samples`, use `h->nsamples_ori` + for input-column scans, and set `v->n_sample` to the retained sample count; - duplicate or undefined tags use the production parser; +- unprofitable string/float-heavy schemas use the production parser; - unsupported GT encodings force fallback; - numeric vectors preserve observed width and vector-end padding; - strings use observed maximum byte length and zero-pad shorter samples; @@ -95,24 +114,34 @@ The planned parser must preserve these invariants: Focused validation lives in `./test/test_format_plan.sh`. It compares production parsing, `HTS_VCF_FORMAT_PLAN=1`, and the `interp` alias byte-for-byte -with `cmp`. +with `cmp`. The script also checks selected-sample parsing for explicit +inclusion and exclusion lists (`S1,S3`, `S2`, and `^S2`). `test/format-plan-cache.vcf` +additionally exercises more than 16 distinct FORMAT schemas and a literal FORMAT +string longer than the old fixed cache key. 
`test/test_format_plan_cache` +mutates and resyncs a header after a plan has been compiled for the same FORMAT +string, then verifies the row is planned again with the new metadata. ## Current Source Delta -After removing the old exact kernels and SIMD tab scanner, the live parser/test -hook delta relative to `origin/develop` is: +After removing the old exact kernels and SIMD tab scanner, then hardening the +dynamic cache, the live parser/test hook delta relative to `origin/develop` is: | File | Added lines | |---|---:| -| `vcf.c` | 1,467 | -| `test/test_view.c` | 14 | +| `vcf.c` | 1,703 | +| `Makefile` | 6 | +| `test/test_format_plan.sh` | 48 | +| `test/test_format_plan_cache.c` | 130 | +| `test/test_view.c` | 23 | +| `test/format-plan-cache.vcf` | 61 | +| `test/format-plan-profitability.vcf` | 8 | ## Large Corpus Benchmark Command: ```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-dynamic-trim-plan \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-profit-gate \ bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv ``` @@ -120,16 +149,43 @@ All planned outputs compared byte-identical to baseline. 
| Input | Baseline user | Plan user | Hits/fallback | |---|---:|---:|---:| -| CCDG 10k | 2.62 s | 2.25 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 26.05 s | 7.98 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.24 s | 3.78 s | 20,000 / 0 | -| Large reordered likelihood | 3.00 s | 2.42 s | 20,000 / 0 | -| Large multiallelic likelihood | 3.16 s | 2.73 s | 16,000 / 0 | -| Large float/string | 2.93 s | 2.97 s | 16,000 / 0 | -| Variable phase widths | 2.61 s | 2.50 s | 12,000 / 0 | -| Mixed row-local fallbacks | 2.22 s | 1.87 s | 12,000 / 0 | -| GT-first reordered negative | 1.75 s | 1.44 s | 12,000 / 0 | -| Two-string float negative | 2.28 s | 2.56 s | 12,000 / 0 | +| CCDG 10k | 2.47 s | 2.15 s | 8,396 / 1,604 | +| 1000G chr22 full GT | 25.25 s | 7.82 s | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.02 s | 3.64 s | 20,000 / 0 | +| Large reordered likelihood | 2.95 s | 2.40 s | 20,000 / 0 | +| Large multiallelic likelihood | 3.15 s | 2.76 s | 16,000 / 0 | +| Large float/string | 2.96 s | 2.89 s | 0 / 16,000 | +| Variable phase widths | 2.60 s | 2.46 s | 12,000 / 0 | +| Mixed row-local fallbacks | 2.19 s | 1.84 s | 12,000 / 0 | +| GT-first reordered negative | 1.72 s | 1.37 s | 12,000 / 0 | +| Two-string float negative | 2.29 s | 2.26 s | 0 / 12,000 | + +## Full Threaded Corpus Benchmark + +Command: + +```sh +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded-profit-gate \ + bench/format-shape/scripts/run_thread_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + +All 40 planned outputs compared byte-identical to baseline. Detailed timings +are in `bench/format-shape/large/results-threaded-profit-gate/timings.tsv`; the +table below summarizes real-time speedup. 
+ +| Input | 0 threads | 2 threads | 4 threads | 8 threads | +|---|---:|---:|---:|---:| +| CCDG 10k | 1.13x | 1.15x | 1.16x | 1.15x | +| 1000G chr22 full GT | 3.10x | 3.73x | 4.34x | 3.88x | +| Large CCDG-like synthetic | 1.12x | 1.14x | 1.13x | 1.13x | +| Large reordered likelihood | 1.23x | 1.33x | 1.32x | 1.29x | +| Large multiallelic likelihood | 1.16x | 1.22x | 1.22x | 1.22x | +| Large float/string | 1.01x | 0.97x | 1.04x | 1.00x | +| Variable phase widths | 1.06x | 1.10x | 1.11x | 1.09x | +| Mixed row-local fallbacks | 1.18x | 1.25x | 1.31x | 1.23x | +| GT-first reordered negative | 1.22x | 1.31x | 1.32x | 1.32x | +| Two-string float negative | 1.00x | 1.00x | 1.01x | 1.00x | ## bcftools Production-Style Benchmark @@ -154,6 +210,10 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ bench/format-shape/large/threaded-inputs.tsv ``` +`bench/format-shape/large/threaded-inputs.tsv` now mirrors the full large +corpus from `large/inputs.tsv`, so threaded runs cover all real and synthetic +workload shapes rather than only the earlier two representative rows. + The runner uses `bcftools view --no-version -Ob -l 0 [--threads N]`. All planned outputs compared byte-identical to baseline. @@ -168,6 +228,69 @@ planned outputs compared byte-identical to baseline. | Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | 4.51 s | 4.09 s | | Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | 4.50 s | 4.05 s | +## bcftools Selected-Sample Benchmark + +The same bcftools runner can select the first N samples from each input with +`SAMPLE_COUNT=N`. This exercises the `bcf_hdr_set_samples()` / `keep_samples` +path through bcftools rather than only through the test harness. 
+ +Command: + +```sh +BCFTOOLS=/path/to/bcftools \ +SAMPLE_COUNT=2 KEEP_OUTPUTS=0 \ +OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ + bench/format-shape/scripts/run_bcftools_bench.sh \ + bench/format-shape/large/threaded-inputs.tsv +``` + +All 40 planned outputs compared byte-identical to baseline. The table shows +real-time and user-time speedup for selecting two samples from every input that +has samples; sites-only inputs naturally run without `-s`. + +| Input | Threads | Real speedup | User speedup | +|---|---:|---:|---:| +| CCDG 10k | 0 | 1.12x | 1.12x | +| CCDG 10k | 2 | 1.12x | 1.11x | +| CCDG 10k | 4 | 1.13x | 1.12x | +| CCDG 10k | 8 | 1.11x | 1.10x | +| 1000G chr22 full GT | 0 | 2.71x | 2.73x | +| 1000G chr22 full GT | 2 | 2.83x | 2.44x | +| 1000G chr22 full GT | 4 | 2.94x | 2.52x | +| 1000G chr22 full GT | 8 | 3.06x | 2.61x | +| Large CCDG-like synthetic | 0 | 1.07x | 1.08x | +| Large CCDG-like synthetic | 2 | 1.10x | 1.07x | +| Large CCDG-like synthetic | 4 | 1.09x | 1.07x | +| Large CCDG-like synthetic | 8 | 1.09x | 1.07x | +| Large reordered likelihood | 0 | 1.15x | 1.17x | +| Large reordered likelihood | 2 | 1.22x | 1.15x | +| Large reordered likelihood | 4 | 1.23x | 1.17x | +| Large reordered likelihood | 8 | 1.22x | 1.16x | +| Large multiallelic likelihood | 0 | 1.13x | 1.13x | +| Large multiallelic likelihood | 2 | 1.14x | 1.11x | +| Large multiallelic likelihood | 4 | 1.16x | 1.12x | +| Large multiallelic likelihood | 8 | 1.18x | 1.13x | +| Large float/string | 0 | 1.02x | 1.01x | +| Large float/string | 2 | 0.99x | 0.99x | +| Large float/string | 4 | 1.01x | 1.00x | +| Large float/string | 8 | 0.97x | 0.98x | +| Variable phase widths | 0 | 1.04x | 1.05x | +| Variable phase widths | 2 | 1.05x | 1.05x | +| Variable phase widths | 4 | 1.05x | 1.04x | +| Variable phase widths | 8 | 1.06x | 1.05x | +| Mixed row-local fallbacks | 0 | 1.14x | 1.16x | +| Mixed row-local
fallbacks | 2 | 1.17x | 1.14x | +| Mixed row-local fallbacks | 4 | 1.18x | 1.14x | +| Mixed row-local fallbacks | 8 | 1.17x | 1.14x | +| GT-first reordered negative | 0 | 1.21x | 1.22x | +| GT-first reordered negative | 2 | 1.25x | 1.19x | +| GT-first reordered negative | 4 | 1.26x | 1.19x | +| GT-first reordered negative | 8 | 1.22x | 1.18x | +| Two-string float negative | 0 | 0.96x | 0.98x | +| Two-string float negative | 2 | 1.00x | 0.99x | +| Two-string float negative | 4 | 0.99x | 0.98x | +| Two-string float negative | 8 | 1.03x | 1.01x | + ## Interpretation The dynamic path gives a large production-visible win for sample-rich GT-only @@ -177,7 +300,6 @@ float/string-heavy layouts remain near parity or slightly slower than baseline. ## Remaining Work -- Add selected-sample support so `keep_samples` does not force fallback. - Reduce per-sample opcode dispatch in hot FORMAT layouts. - Improve string and measured-width handling without losing byte identity. - Consider a later executor-generation layer if generic per-op dispatch remains diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md index 0ab5a6efd..55eef012f 100644 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -118,6 +118,9 @@ Several hardening passes made the composable MVP safer and faster: - routed generic `INTN` widths 4, 6, and 10 through fixed-width counted parsers; - removed unused dynamic likelihood-shape scaffolding; - added underfilled vector compaction for fixed-width vector fields. +- replaced the original process-global 16-entry FORMAT plan cache with a + header-owned, generation-aware, dynamically sized cache that stores both + supported and unsupported compile results. Result: retained. The dynamic path became broader and reduced unnecessary whole-row fallback while preserving byte-identical output. 
@@ -158,6 +161,67 @@ Large-corpus post-trim user-time highlights: | Large float/string | 2.93 s | 2.97 s | near parity/slightly slower | | Two-string float negative | 2.28 s | 2.56 s | slower | +## Header-Owned Cache Hardening + +The static FORMAT plan cache was replaced with private `bcf_hdr_aux_t` state. +The hardened cache: + +- grows from 16 to 128 entries; +- stores literal FORMAT strings on the heap, so long schemas are no longer + rejected by the old fixed key buffer; +- caches unsupported compile results to avoid repeated work; +- clears on `bcf_hdr_sync()` and records a private header generation; +- declines fast planning while `h->dirty` is set. + +Result: retained. `test/format-plan-cache.vcf` now asserts 21/21 planned hits +across more than 16 distinct FORMAT schemas, including one long schema. The +new `test/test_format_plan_cache` helper verifies that a plan compiled before a +header metadata change is not reused after `bcf_hdr_sync()`. The large corpus +remained byte-identical after the rewrite, with the same broad performance +profile: 1000G chr22 GT user time at 26.06 s baseline versus 7.96 s planned, +and CCDG 10k at 2.55 s baseline versus 2.24 s planned. + +## Profitability Gate For String/Float Shapes + +The expanded threaded benchmark exposed two regressions: + +- `GT:GL:FT:DP:GQ` +- `GT:FT:PID:GL:DP` + +Both schemas were syntactically supported and had zero row-local fallback, but +they were dominated by measured strings plus `Number=G` float vectors. The +dynamic path had to measure string widths over every sample before parsing, then +still use the general float conversion path, while there were no integer vectors +to amortize that setup. + +Result: retained. The compiler now negative-caches these low-profit schemas and +sends only those FORMAT rows to the production parser. The full threaded corpus +remained byte-identical. The two-string float case improved from a consistent +slowdown, roughly 0.86-0.89x, to parity at 1.00-1.01x. 
Other integer-heavy +likelihood rows stayed on the dynamic path. + +## Selected-Sample Support + +The planner originally rejected `h->keep_samples` because sample subsetting +changes the relationship between input sample columns and output BCF sample +slots. That was conservative but would have made the optimized path invisible +for common `bcftools view -s/-S` style workflows. + +The executor now treats the input and output counts separately. It scans +`h->nsamples_ori` columns when `h->keep_samples` is active, skips unselected +columns with the header bitset, writes retained samples densely, and sets +`v->n_sample` to the retained sample count. The width-measurement pass follows +the same rule, so measured strings and variable numeric widths are based only on +the samples that will be emitted, matching production htslib's selected-sample +behavior. + +Result: retained. `test/test_format_plan.sh` now compares explicit inclusion +and exclusion sample lists byte-for-byte against production parsing. A +bcftools run selecting the first two samples from every input completed 40/40 +byte-identical comparisons. The 1000G chr22 GT workload still showed a large +real-time win, from 26.51 s to 9.77 s unthreaded and from 25.99 s to 8.84 s at +4 threads; string/float-heavy negative rows remained near parity. + ## bcftools Production Check A clean bcftools `develop` worktree was built against this htslib branch and run diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md index 56817cbac..4bc306f75 100644 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -11,10 +11,23 @@ literal FORMAT string into a small list of per-tag operations. The plan is driven by the active VCF header: each tag contributes its key, type, declared number model, and whether the current row needs width measurement. +Compiled plans live in private header-owned cache state. 
The cache is cleared +when the header dictionaries are resynchronised, and the optimized path declines +to run while the header has unsynced mutations. That keeps cached supported and +unsupported decisions tied to the exact header metadata that produced them. + If the row fits the supported operation set, the dynamic executor parses samples and writes BCF's transposed FORMAT layout directly. If anything looks unsafe or unsupported, htslib falls back to the production parser for the whole FORMAT -column. +column. The planner also keeps a small profitability gate: schemas dominated by +measured strings plus float vectors, such as `GT:FT:PID:GL:DP`, currently use +the production parser because the dynamic path's width-measurement work costs +more than it saves. + +The optimized path also supports selected-sample reads. When +`bcf_hdr_set_samples()` is active, it scans the original sample columns, skips +unretained samples, and writes the retained samples densely into the BCF FORMAT +blocks. ## Why This Shape @@ -49,6 +62,11 @@ from 27.48 s to 8.99 s unthreaded, and from 26.71 s to 6.94 s at 4 threads. The likelihood-heavy synthetic workload improved more modestly, from 4.43 s to 3.94 s unthreaded and from 3.47 s to 3.02 s at 4 threads. +With bcftools selecting the first two samples from each input, the same 1000G +GT workload improved from 26.51 s to 9.77 s unthreaded and from 25.99 s to +8.84 s at 4 threads. Selected-sample likelihood-heavy rows are still faster, +but the gains are smaller because much less FORMAT payload is emitted. + ## Drawbacks The MVP intentionally keeps fallback whole-row. It does not parse supported @@ -58,9 +76,9 @@ tag or malformed row means the entire FORMAT column uses the production parser. 
Known fallback cases include: -- sample subsetting via `keep_samples`; - undefined FORMAT tags that require production header repair; - unsupported header types or number models; +- unprofitable string/float-heavy schemas; - duplicate FORMAT tags; - malformed separators or unexpected sample cardinality; - row-local widths above the bounded fast-path limit; diff --git a/test/format-plan-cache.vcf b/test/format-plan-cache.vcf new file mode 100644 index 000000000..0e6bde16b --- /dev/null +++ b/test/format-plan-cache.vcf @@ -0,0 +1,61 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 +1 1 . A C . PASS . GT:F01 0/1:1 1/1:2 +1 2 . A C . PASS . GT:F02 0/1:2 1/1:3 +1 3 . A C . PASS . GT:F03 0/1:3 1/1:4 +1 4 . A C . PASS . GT:F04 0/1:4 1/1:5 +1 5 . A C . PASS . GT:F05 0/1:5 1/1:6 +1 6 . A C . PASS . GT:F06 0/1:6 1/1:7 +1 7 . A C . PASS . GT:F07 0/1:7 1/1:8 +1 8 . A C . PASS . GT:F08 0/1:8 1/1:9 +1 9 . A C . PASS . GT:F09 0/1:9 1/1:10 +1 10 . A C . PASS . GT:F10 0/1:10 1/1:11 +1 11 . A C . PASS . GT:F11 0/1:11 1/1:12 +1 12 . A C . PASS . GT:F12 0/1:12 1/1:13 +1 13 . A C . PASS . GT:F13 0/1:13 1/1:14 +1 14 . A C . PASS . GT:F14 0/1:14 1/1:15 +1 15 . A C . PASS . GT:F15 0/1:15 1/1:16 +1 16 . A C . PASS . GT:F16 0/1:16 1/1:17 +1 17 . A C . PASS . GT:F17 0/1:17 1/1:18 +1 18 . A C . PASS . GT:F18 0/1:18 1/1:19 +1 19 . A C . PASS . GT:F19 0/1:19 1/1:20 +1 20 . A C . PASS . GT:F20 0/1:20 1/1:21 +1 21 . A C . PASS . 
GT:LONGFORMATFIELD01:LONGFORMATFIELD02:LONGFORMATFIELD03:LONGFORMATFIELD04:LONGFORMATFIELD05:LONGFORMATFIELD06:LONGFORMATFIELD07:LONGFORMATFIELD08:LONGFORMATFIELD09:LONGFORMATFIELD10:LONGFORMATFIELD11:LONGFORMATFIELD12:LONGFORMATFIELD13:LONGFORMATFIELD14:LONGFORMATFIELD15:LONGFORMATFIELD16 0/1:1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16 1/1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17 diff --git a/test/format-plan-profitability.vcf b/test/format-plan-profitability.vcf new file mode 100644 index 000000000..13db490b1 --- /dev/null +++ b/test/format-plan-profitability.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 +1 1 . A C,G . PASS . GT:GL:FT:DP 0/1:-0.25,-0.50,-0.75,-1.00,-1.25,-1.50:PASS:12 1/2:-0.50,-0.75,-1.00,-1.25,-1.50,-1.75:LowQual:8 diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh index e247a558e..d47741777 100755 --- a/test/test_format_plan.sh +++ b/test/test_format_plan.sh @@ -2,7 +2,7 @@ set -eu test_view=${TEST_VIEW:-./test/test_view} -inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf test/format-plan-composable.vcf test/format-plan-gt-header-shape.vcf"} +inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf test/format-plan-composable.vcf test/format-plan-gt-header-shape.vcf test/format-plan-cache.vcf test/format-plan-profitability.vcf"} tmpdir=${TMPDIR:-/tmp} base=${tmpdir}/hts-format-plan-base.$$ plan=${tmpdir}/hts-format-plan-plan.$$ @@ -19,6 +19,30 @@ do env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$interp" 2> "$interp_stats" cmp "$base" "$plan" cmp "$base" "$interp" + case "$input" in + *format-plan-cache.vcf) + grep -q 'attempts=21 hits=21 fallback=0 ' "$stats" + grep -q 'attempts=21 hits=21 fallback=0 ' "$interp_stats" + ;; + *format-plan-profitability.vcf) + grep -q 'attempts=1 hits=0 fallback=1 ' "$stats" + grep -q 'attempts=1 hits=0 
fallback=1 ' "$interp_stats" + ;; + esac cat "$stats" cat "$interp_stats" done + +for samples in S1,S3 S2 ^S2 +do + for input in test/format-plan-composable.vcf test/format-plan-edge.vcf + do + env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 -s "$samples" "$input" > "$base" + env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 -s "$samples" "$input" > "$plan" 2> "$stats" + env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 -s "$samples" "$input" > "$interp" 2> "$interp_stats" + cmp "$base" "$plan" + cmp "$base" "$interp" + cat "$stats" + cat "$interp_stats" + done +done diff --git a/test/test_format_plan_cache.c b/test/test_format_plan_cache.c new file mode 100644 index 000000000..ca2263ce3 --- /dev/null +++ b/test/test_format_plan_cache.c @@ -0,0 +1,130 @@ +/* test/test_format_plan_cache.c -- FORMAT planner cache tests. + + Copyright (C) 2026 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include +#include +#include + +#include "../htslib/kstring.h" +#include "../htslib/vcf.h" + +void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, uint64_t *parsed_samples); + +static void fail(const char *msg) +{ + fprintf(stderr, "%s\n", msg); + exit(EXIT_FAILURE); +} + +#define check0(expr) do { if ((expr) != 0) fail("check failed: " #expr); } while (0) +#define check1(expr) do { if (!(expr)) fail("check failed: " #expr); } while (0) + +static void parse_line(bcf_hdr_t *hdr, bcf1_t *rec, kstring_t *line, + const char *text) +{ + ks_clear(line); + if (kputsn(text, strlen(text), line) < 0) + fail("failed to build VCF line"); + check0(vcf_parse(line, hdr, rec)); +} + +static void check_x_values(bcf_hdr_t *hdr, bcf1_t *rec, + const int32_t *expected, int n_expected) +{ + int32_t *values = NULL; + int n_values = 0, ret, i; + + ret = bcf_get_format_int32(hdr, rec, "X", &values, &n_values); + if (ret != n_expected) { + free(values); + fail("unexpected X vector length"); + } + for (i = 0; i < n_expected; i++) { + if (values[i] != expected[i]) { + free(values); + fail("unexpected X value"); + } + } + free(values); +} + +int main(void) +{ + static char header[] = + "##fileformat=VCFv4.3\n" + "##contig=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\n"; + static const int32_t x1[] = { 7 }; + static const int32_t x2[] = { 11, 13 }; + bcf_hdr_t *hdr; + bcf1_t *rec; + kstring_t line = KS_INITIALIZE; + uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; + + check0(setenv("HTS_VCF_FORMAT_PLAN", "1", 1)); + hdr = bcf_hdr_init("r"); + rec = bcf_init(); + check1(hdr); + check1(rec); + check0(bcf_hdr_parse(hdr, header)); + + parse_line(hdr, rec, &line, + "1\t1\t.\tA\tC\t.\tPASS\t.\tGT:X\t0/1:7"); + check_x_values(hdr, rec, x1, 1); + + /* + * Rebuild the same FORMAT string against changed metadata. 
A stale plan + * would still think X is Number=1 and would either fall back or encode the + * second row incorrectly. The header-owned generation must force a fresh + * compile, preserving both correctness and fast-path coverage. + */ + bcf_hdr_remove(hdr, BCF_HL_FMT, "X"); + check0(bcf_hdr_append(hdr, + "##FORMAT=")); + check0(bcf_hdr_sync(hdr)); + bcf_clear1(rec); + parse_line(hdr, rec, &line, + "1\t2\t.\tA\tC\t.\tPASS\t.\tGT:X\t0/1:11,13"); + check_x_values(hdr, rec, x2, 2); + + hts_vcf_format_plan_stats(&attempts, &hits, &fallback, &parsed_samples); + if (attempts != 2 || hits != 2 || fallback != 0 || parsed_samples != 2) { + fprintf(stderr, + "unexpected planner stats: attempts=%" PRIu64 + " hits=%" PRIu64 " fallback=%" PRIu64 + " parsed_samples=%" PRIu64 "\n", + attempts, hits, fallback, parsed_samples); + return EXIT_FAILURE; + } + + bcf_destroy(rec); + bcf_hdr_destroy(hdr); + free(line.s); + return EXIT_SUCCESS; +} diff --git a/test/test_view.c b/test/test_view.c index 9608a8661..594c1fd75 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -53,6 +53,7 @@ struct opts { int multi_reg; char *index; int min_shift; + char *samples; }; enum test_op { @@ -208,6 +209,9 @@ int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, if (!b) return 1; + if (opts->samples && bcf_hdr_set_samples(h, opts->samples, 0) < 0) + return 1; + if (!opts->benchmark && bcf_hdr_write(out, h) < 0) return 1; @@ -301,8 +305,9 @@ int main(int argc, char *argv[]) opts.multi_reg = 0; opts.index = NULL; opts.min_shift = 0; + opts.samples = NULL; - while ((c = getopt(argc, argv, "DSIt:i:bzCfFul:o:N:BZ:@:Mx:m:p:v")) >= 0) { + while ((c = getopt(argc, argv, "DSIt:i:bzCfFul:o:N:BZ:@:Mx:m:p:vs:")) >= 0) { switch (c) { case 'D': opts.flag |= READ_CRAM; break; case 'S': opts.flag |= READ_COMPRESSED; break; @@ -325,11 +330,12 @@ int main(int argc, char *argv[]) case 'x': opts.index = optarg; break; case 'm': opts.min_shift = atoi(optarg); break; case 'p': out_fn = optarg; 
break; + case 's': opts.samples = optarg; break; case 'v': hts_verbose++; break; } } if (argc == optind) { - fprintf(stderr, "Usage: test_view [-DSI] [-t fn_ref] [-i option=value] [-bC] [-l level] [-o option=value] [-N num_reads] [-B] [-Z hdr_nuls] [-@ num_threads] [-x index_fn] [-m min_shift] [-p out] [-v] || [region]\n"); + fprintf(stderr, "Usage: test_view [-DSI] [-t fn_ref] [-i option=value] [-bC] [-l level] [-o option=value] [-N num_reads] [-B] [-Z hdr_nuls] [-@ num_threads] [-x index_fn] [-m min_shift] [-p out] [-s samples] [-v] || [region]\n"); fprintf(stderr, "\n"); fprintf(stderr, "-D: read CRAM format (mode 'c')\n"); fprintf(stderr, "-S: read compressed BCF, BAM, FAI (mode 'b')\n"); @@ -352,6 +358,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "-x fn: write index to fn\n"); fprintf(stderr, "-m min_shift: specifies BAI/CSI bin size; 0 is BAI(BAM) or TBI(VCF), 14 is CSI default\n"); fprintf(stderr, "-p out_fn: output to out_fn instead of stdout\n"); + fprintf(stderr, "-s samples: select VCF samples, as a comma-separated bcf_hdr_set_samples list\n"); fprintf(stderr, "-v: increase verbosity\n"); fprintf(stderr, "The region list entries should be specified as 'reg:beg-end', with intervals of a region being disjunct and sorted by the starting coordinate.\n"); return 1; diff --git a/vcf.c b/vcf.c index c12e67792..a2732bafb 100644 --- a/vcf.c +++ b/vcf.c @@ -113,11 +113,17 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N // Note that this preserving API and ABI requires that the first element is vdict_t struct // rather than a pointer, as user programs may (and in some cases do) access the dictionary // directly as (vdict_t*)hdr->dict. 
+typedef struct vcf_format_plan_cache_t vcf_format_plan_cache_t; +static void vcf_format_plan_cache_clear(vcf_format_plan_cache_t *cache); +static void vcf_format_plan_cache_destroy(vcf_format_plan_cache_t *cache); + typedef struct { vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings + vcf_format_plan_cache_t *format_plan_cache; // Header-local FORMAT planner cache + uint64_t format_plan_gen; // Incremented when header dictionaries are resynchronised int version; //cached version uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called } @@ -344,6 +350,10 @@ int bcf_hdr_sync(bcf_hdr_t *h) free(aux->key_len); aux->key_len = NULL; } + if (aux && aux->format_plan_cache) + vcf_format_plan_cache_clear(aux->format_plan_cache); + if (aux) + aux->format_plan_gen++; h->dirty = 0; return 0; @@ -1650,6 +1660,8 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if ( !aux ) goto fail; if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } aux->key_len = NULL; + aux->format_plan_cache = NULL; + aux->format_plan_gen = 0; aux->dict = *((vdict_t*)h->dict[0]); aux->version = 0; aux->ref_count = 1; @@ -1694,6 +1706,7 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); kh_destroy(hdict, aux->gen); free(aux->key_len); // may exist for dict[0] only + vcf_format_plan_cache_destroy(aux->format_plan_cache); } kh_destroy(vdict, d); free(h->id[i]); @@ -3334,18 +3347,31 @@ typedef struct { typedef struct { /* - * Cache key is the literal FORMAT string plus header pointer. This keeps - * repeated records on the same FORMAT layout from rebuilding the per-tag - * op list while still respecting that key ids/types are header-local. 
+ * Cache key is the literal FORMAT string plus the private header + * generation. FORMAT key ids/types are header-local, so plans are owned by + * the header aux block and invalidated whenever bcf_hdr_sync() rebuilds the + * dictionaries. Unsupported plans are cached too; repeated uncommon or + * undefined FORMAT strings should pay the compile cost once, then fall back + * directly to the production parser. */ - char format[256]; - const bcf_hdr_t *hdr; + char *format; + size_t format_len; + uint64_t format_hash; + uint64_t hdr_gen; int supported; int n_ops; vcf_format_op_t ops[MAX_N_FMT]; vcf_format_fast_guard_t general_guard; } vcf_format_general_plan_t; +struct vcf_format_plan_cache_t { + vcf_format_general_plan_t *plans; + int n; + int m; + int next_evict; + uint64_t hdr_gen; +}; + typedef enum { VCF_FORMAT_ROW_GT, VCF_FORMAT_ROW_GT2, @@ -3382,18 +3408,163 @@ typedef struct { #define VCF_PLAN_ALWAYS_INLINE static inline #endif +static uint64_t vcf_format_plan_hash(const char *format, size_t len) +{ + size_t i; + uint64_t hash = 1469598103934665603ULL; + + for (i = 0; i < len; i++) { + hash ^= (unsigned char) format[i]; + hash *= 1099511628211ULL; + } + return hash; +} + +static void vcf_format_general_plan_destroy(vcf_format_general_plan_t *plan) +{ + if (!plan) + return; + free(plan->format); + memset(plan, 0, sizeof(*plan)); +} + +static void vcf_format_plan_cache_clear(vcf_format_plan_cache_t *cache) +{ + int i; + + if (!cache) + return; + for (i = 0; i < cache->n; i++) + vcf_format_general_plan_destroy(&cache->plans[i]); + cache->n = 0; + cache->next_evict = 0; +} + +static void vcf_format_plan_cache_destroy(vcf_format_plan_cache_t *cache) +{ + if (!cache) + return; + vcf_format_plan_cache_clear(cache); + free(cache->plans); + free(cache); +} + +static vcf_format_plan_cache_t *vcf_format_plan_cache_get(const bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + + if (!aux) + return NULL; + if (!aux->format_plan_cache) { + aux->format_plan_cache = 
(vcf_format_plan_cache_t *) + calloc(1, sizeof(*aux->format_plan_cache)); + if (!aux->format_plan_cache) + return NULL; + aux->format_plan_cache->hdr_gen = aux->format_plan_gen; + } + if (aux->format_plan_cache->hdr_gen != aux->format_plan_gen) { + vcf_format_plan_cache_clear(aux->format_plan_cache); + aux->format_plan_cache->hdr_gen = aux->format_plan_gen; + } + return aux->format_plan_cache; +} + +static int vcf_format_plan_cache_slot(vcf_format_plan_cache_t *cache) +{ + enum { VCF_FORMAT_PLAN_CACHE_INIT = 16, VCF_FORMAT_PLAN_CACHE_MAX = 128 }; + int i, idx, new_m; + vcf_format_general_plan_t *plans; + + if (cache->n < cache->m) + return cache->n++; + + if (cache->m < VCF_FORMAT_PLAN_CACHE_MAX) { + new_m = cache->m ? cache->m * 2 : VCF_FORMAT_PLAN_CACHE_INIT; + if (new_m > VCF_FORMAT_PLAN_CACHE_MAX) + new_m = VCF_FORMAT_PLAN_CACHE_MAX; + if ((size_t) new_m > SIZE_MAX / sizeof(*cache->plans)) + return -1; + plans = (vcf_format_general_plan_t *) + realloc(cache->plans, (size_t) new_m * sizeof(*cache->plans)); + if (!plans) + return -1; + memset(plans + cache->m, 0, + (size_t) (new_m - cache->m) * sizeof(*plans)); + cache->plans = plans; + cache->m = new_m; + return cache->n++; + } + + for (i = 0; i < cache->n; i++) { + idx = (cache->next_evict + i) % cache->n; + if (!cache->plans[idx].supported) + goto found; + } + idx = cache->next_evict; + +found: + vcf_format_general_plan_destroy(&cache->plans[idx]); + cache->next_evict = (idx + 1) % cache->n; + return idx; +} + +static int vcf_format_general_plan_profitable(const vcf_format_general_plan_t *plan) +{ + int j, string_ops = 0, float_vector_ops = 0, int_ops = 0, int_vector_ops = 0; + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + if (op->is_gt) + continue; + if (op->htype == BCF_HT_STR) { + string_ops++; + } else if (op->htype == BCF_HT_REAL) { + if (op->vl_type == BCF_VL_FIXED && op->number == 1) + ; + else + float_vector_ops++; + } else if (op->htype == BCF_HT_INT) { + 
int_ops++; + if (op->vl_type != BCF_VL_FIXED || op->number != 1) + int_vector_ops++; + } + } + + /* + * FORMAT rows with measured strings plus float vectors have to pay the + * dynamic executor's full width-measurement pass and then still use the + * general float conversion path. Without integer vectors to amortize that + * setup, production parsing has been consistently faster on the large + * corpus (for example GT:GL:FT:DP:GQ and GT:FT:PID:GL:DP). + */ + if (string_ops > 0 && float_vector_ops > 0 && + int_vector_ops == 0 && int_ops <= 2) + return 0; + return 1; +} + static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *format, + size_t format_len, uint64_t format_hash, + uint64_t hdr_gen, vcf_format_general_plan_t *plan) { - char tmp[256], *tok, *saveptr = NULL; - int i; + char *tmp, *tok, *saveptr = NULL; + int i, ret = 0; memset(plan, 0, sizeof(*plan)); - if (strlen(format) >= sizeof(plan->format)) - return 0; - strcpy(plan->format, format); - strcpy(tmp, format); - plan->hdr = h; + plan->format = (char *) malloc(format_len + 1); + tmp = (char *) malloc(format_len + 1); + if (!plan->format || !tmp) { + free(tmp); + free(plan->format); + memset(plan, 0, sizeof(*plan)); + return -1; + } + memcpy(plan->format, format, format_len + 1); + memcpy(tmp, format, format_len + 1); + plan->format_len = format_len; + plan->format_hash = format_hash; + plan->hdr_gen = hdr_gen; /* * Compile at tag granularity, not full FORMAT-shape granularity. 
This is @@ -3406,17 +3577,17 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma int key, htype; if (plan->n_ops >= MAX_N_FMT) - return 0; + goto done; key = bcf_hdr_id2int(h, BCF_DT_ID, tok); if (key < 0 || !bcf_hdr_idinfo_exists(h, BCF_HL_FMT, key)) - return 0; + goto done; for (i = 0; i < plan->n_ops; i++) if (plan->ops[i].key == key) - return 0; + goto done; htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) - return 0; + goto done; /* * Only compile tags with enough header information to reproduce the @@ -3433,17 +3604,17 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (plan->ops[plan->n_ops].is_gt) { if (htype != BCF_HT_STR || plan->ops[plan->n_ops].number != 1 || plan->ops[plan->n_ops].vl_type != BCF_VL_FIXED) - return 0; + goto done; } else { int vl = plan->ops[plan->n_ops].vl_type; if (htype == BCF_HT_STR) { if (plan->ops[plan->n_ops].number != 1) - return 0; + goto done; plan->ops[plan->n_ops].measured_width = 1; } else if (vl != BCF_VL_FIXED && vl != BCF_VL_A && vl != BCF_VL_R && vl != BCF_VL_G && vl != BCF_VL_VAR) { - return 0; + goto done; } else if (vl == BCF_VL_VAR) { plan->ops[plan->n_ops].measured_width = 1; } @@ -3452,28 +3623,68 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma } if (!plan->n_ops) - return 0; + goto done; + if (!vcf_format_general_plan_profitable(plan)) + goto done; plan->supported = 1; - return 1; + ret = 1; + +done: + free(tmp); + return ret; } static vcf_format_general_plan_t *vcf_format_general_plan_get(const bcf_hdr_t *h, const char *format) { - enum { N_GENERAL_PLAN_CACHE = 16 }; - static vcf_format_general_plan_t cache[N_GENERAL_PLAN_CACHE]; - static int ncache = 0; - int i; + bcf_hdr_aux_t *aux; + vcf_format_plan_cache_t *cache; + vcf_format_general_plan_t *plan; + size_t format_len; + uint64_t format_hash, hdr_gen; + int i, idx, ret; - for (i = 0; i < 
ncache; i++) - if (cache[i].hdr == h && strcmp(cache[i].format, format) == 0) - return cache[i].supported ? &cache[i] : NULL; + /* + * The compiler reads h->id[] and header metadata directly. If a caller has + * mutated the header but not synced it yet, the production parser is the + * only safe path because it already owns all header-repair semantics. + */ + if (h->dirty) + return NULL; + + aux = get_hdr_aux(h); + if (!aux) + return NULL; + cache = vcf_format_plan_cache_get(h); + if (!cache) + return NULL; + + format_len = strlen(format); + format_hash = vcf_format_plan_hash(format, format_len); + hdr_gen = aux->format_plan_gen; + for (i = 0; i < cache->n; i++) { + plan = &cache->plans[i]; + if (plan->format && plan->hdr_gen == hdr_gen && + plan->format_len == format_len && + plan->format_hash == format_hash && + memcmp(plan->format, format, format_len) == 0) + return plan->supported ? plan : NULL; + } - if (ncache == N_GENERAL_PLAN_CACHE) - return NULL; - vcf_format_general_plan_compile(h, format, &cache[ncache]); - return cache[ncache++].supported ? &cache[ncache-1] : NULL; + idx = vcf_format_plan_cache_slot(cache); + if (idx < 0) + return NULL; + plan = &cache->plans[idx]; + ret = vcf_format_general_plan_compile(h, format, format_len, format_hash, + hdr_gen, plan); + if (ret < 0) { + vcf_format_general_plan_destroy(plan); + if (idx == cache->n - 1) + cache->n--; + return NULL; + } + return plan->supported ? 
plan : NULL; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2_u8(const char **sp, uint8_t out[2]) @@ -4106,6 +4317,15 @@ static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan } } +static const char *vcf_format_skip_sample_column(const char *cur, const char *end) +{ + while (cur < end && *cur && *cur != '\t') + cur++; + if (cur < end && *cur == '\t') + cur++; + return cur; +} + static int vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t *v) { if (op->is_gt) @@ -4247,8 +4467,16 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *q, int *widths) { const char *cur, *end; - int has_measured = 0, sample, j, nsamples = bcf_hdr_nsamples(h); + int has_measured = 0, sample, kept = 0, j; + int nsamples = h->keep_samples ? h->nsamples_ori : bcf_hdr_nsamples(h); + int output_nsamples = bcf_hdr_nsamples(h); + /* + * With bcf_hdr_set_samples(), the text line still contains the original + * sample columns but BCF output must contain only the retained samples. The + * measurement pass therefore scans original columns and updates row-local + * widths only for samples that will be emitted. 
+ */ for (j = 0; j < plan->n_ops; j++) { const vcf_format_op_t *op = &plan->ops[j]; @@ -4275,6 +4503,10 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, cur = q + 1; end = s->s + s->l; for (sample = 0; sample < nsamples && cur < end; sample++) { + if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { + cur = vcf_format_skip_sample_column(cur, end); + continue; + } for (j = 0; j < plan->n_ops; j++) { const vcf_format_op_t *op = &plan->ops[j]; const char *field = cur; @@ -4318,8 +4550,10 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, return -4; } } + if (++kept == output_nsamples) + break; } - if (sample != nsamples) + if (kept != output_nsamples) return -4; for (j = 0; j < plan->n_ops; j++) if (plan->ops[j].measured_width) { @@ -4339,7 +4573,8 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, vcf_format_row_op_t *row_ops) { kstring_t *mem = (kstring_t*)&h->mem; - int nsamples = bcf_hdr_nsamples(h), sample, j; + int nsamples = h->keep_samples ? h->nsamples_ori : bcf_hdr_nsamples(h); + int output_nsamples = bcf_hdr_nsamples(h), sample, kept = 0, j; int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); int max_counts[MAX_N_FMT]; vcf_plan_int_range_t ranges[MAX_N_FMT]; @@ -4358,6 +4593,11 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, * GT2/FLOAT1 rows can be written directly to v->indiv; the remaining rows * are staged in h->mem so they can be parsed sample-major and encoded * op-major once row-local ranges and widths are known. + * + * If keep_samples is active, nsamples is the number of columns to scan in + * the input line and output_nsamples is the dense BCF sample count. This + * mirrors the production parser: unselected sample columns may influence + * neither emitted widths nor output cardinality. 
*/ for (j = 0; j < plan->n_ops; j++) { max_counts[j] = 0; @@ -4371,16 +4611,16 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, bcf_enc_int1(&v->indiv, op->key); if (op->kind == VCF_FORMAT_ROW_GT2) { if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * 2) < 0) + ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * 2) < 0) goto error; direct_offsets[j] = v->indiv.l; - v->indiv.l += (size_t)nsamples * 2; + v->indiv.l += (size_t)output_nsamples * 2; } else { if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)nsamples * sizeof(float)) < 0) + ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * sizeof(float)) < 0) goto error; direct_offsets[j] = v->indiv.l; - v->indiv.l += (size_t)nsamples * sizeof(float); + v->indiv.l += (size_t)output_nsamples * sizeof(float); } } @@ -4388,14 +4628,14 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, for (j = direct_ops; j < plan->n_ops; j++) { vcf_format_row_op_t *op = &row_ops[j]; - if ((uint64_t) mem->l + nsamples * (uint64_t) op->size > INT_MAX) + if ((uint64_t) mem->l + output_nsamples * (uint64_t) op->size > INT_MAX) goto error; if (align_mem(mem) < 0) goto error; op->offset = mem->l; - if (ks_resize(mem, mem->l + nsamples * (size_t) op->size) < 0) + if (ks_resize(mem, mem->l + output_nsamples * (size_t) op->size) < 0) goto error; - mem->l += nsamples * (size_t) op->size; + mem->l += output_nsamples * (size_t) op->size; } for (j = 0; j < plan->n_ops; j++) { vcf_format_row_op_t *op = &row_ops[j]; @@ -4409,9 +4649,13 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, } for (sample = 0; sample < nsamples && cur < end; sample++) { + if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { + cur = vcf_format_skip_sample_column(cur, end); + continue; + } for (j = 0; j < plan->n_ops; j++) { vcf_format_row_op_t *op = 
&row_ops[j]; - uint8_t *buf = op_base[j] + sample * op_stride[j]; + uint8_t *buf = op_base[j] + kept * op_stride[j]; int n = op->width; /* @@ -4478,8 +4722,10 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, goto fallback; } } + if (++kept == output_nsamples) + break; } - if (sample != nsamples) + if (kept != output_nsamples) goto fallback; for (j = 0; j < plan->n_ops; j++) { if (max_counts[j] <= 0 || max_counts[j] > row_ops[j].width) @@ -4493,18 +4739,18 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, */ if (!vcf_format_row_can_compact(&row_ops[j])) goto fallback; - vcf_format_compact_row_op(mem, nsamples, &row_ops[j], max_counts[j]); + vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]); } } v->n_fmt = plan->n_ops; - v->n_sample = nsamples; - if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, nsamples, + v->n_sample = output_nsamples; + if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, output_nsamples, plan->n_ops, row_ops, ranges, direct_ops) < 0) goto error; vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += nsamples; + vcf_format_plan_stats.parsed_samples += output_nsamples; return 0; fallback: @@ -4579,16 +4825,6 @@ static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (!plan_mode) return -3; vcf_format_plan_stats.attempts++; - if (h->keep_samples) { - /* - * Sample filtering/subsetting changes FORMAT column cardinality and - * error handling in ways this MVP does not yet model. Keep it on the - * production parser until the dynamic executor has explicit support for - * selected-sample writes. - */ - vcf_format_plan_stats.fallback++; - return -3; - } /* All enabled modes now use the same dynamic per-tag plan. 
*/ return vcf_parse_format_general_planned(s, h, v, p, q); From 65758b1eeb434da3fa020dc8e7b0a4f891cf6566 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 29 Apr 2026 23:11:50 +0200 Subject: [PATCH 29/38] Add broader bcftools command benchmarks --- bench/format-shape/README.md | 82 ++++++++ .../large/bcftools-command-inputs.tsv | 6 + .../large/bcftools-merge-inputs.tsv | 4 + .../scripts/run_bcftools_command_bench.sh | 176 ++++++++++++++++++ docs/FORMAT_PLAN_CURRENT.md | 63 +++++++ docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 24 +++ docs/FORMAT_PLAN_OVERVIEW.md | 8 + 7 files changed, 363 insertions(+) create mode 100644 bench/format-shape/large/bcftools-command-inputs.tsv create mode 100644 bench/format-shape/large/bcftools-merge-inputs.tsv create mode 100755 bench/format-shape/scripts/run_bcftools_command_bench.sh diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index 59363f506..a7fc781a2 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -24,6 +24,7 @@ bench/format-shape/ scripts/run_bench.sh baseline/plan timing and cmp runner scripts/run_thread_bench.sh threaded timing and cmp runner scripts/run_bcftools_bench.sh bcftools threaded timing runner + scripts/run_bcftools_command_bench.sh broader bcftools command runner results/ generated timing logs and BCF outputs ``` @@ -162,6 +163,82 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ bench/format-shape/large/threaded-inputs.tsv ``` +Run broader bcftools command shapes: + +```sh +BCFTOOLS=/path/to/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-commands \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-command-inputs.tsv +``` + +This runner is intended to be a bridge toward future tests. It runs each +command once with `HTS_VCF_FORMAT_PLAN=0` and once with +`HTS_VCF_FORMAT_PLAN=1`, then compares outputs with `cmp`. 
+ +The supported commands are listed below; all except `merge_self` run by default: + +| Command | Purpose | Output check | +|---|---|---| +| `view_bcf` | Full `bcftools view --no-version -Ob -l 0` conversion. | Binary BCF `cmp`. | +| `view_sites` | `bcftools view -G` after dropping genotypes. | Binary BCF `cmp`. | +| `query_sites` | Fixed-column query that should not benefit from FORMAT parsing. | Text `cmp`. | +| `query_format` | Query `%GT` for the first `QUERY_SAMPLE_COUNT` samples. | Text `cmp`. | +| `stats` | `bcftools stats` over the input. | Text `cmp`. | +| `filter_gt` | `bcftools view -i 'GT="alt"'` for the first `QUERY_SAMPLE_COUNT` samples. | Binary BCF `cmp`. | +| `merge_self` | `bcftools merge --no-index --force-samples` of the input with itself. | Binary BCF `cmp`. | + +`query_format`, `filter_gt`, and `merge_self` are skipped for sites-only inputs. +By default the query/filter commands select two samples +(`QUERY_SAMPLE_COUNT=2`) to avoid generating enormous text output on cohort-scale +VCFs. Override with: + +```sh +COMMANDS="query_format stats" QUERY_SAMPLE_COUNT=8 THREADS_LIST="0 4" \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-command-inputs.tsv +``` + +The runner writes: + +```text +timings.tsv name, command, threads, mode, real/user/sys +checks.tsv baseline-vs-plan cmp status, including skipped_no_samples +commands.tsv command descriptions captured with the result directory +``` + +For CI, the likely future shape is to keep one or two tiny inputs per command +and assert `checks.tsv` has only `ok` or expected `skipped_no_samples` rows. +The large corpus should remain a performance benchmark rather than a normal +test-suite dependency. + +`merge_self` is intentionally not in the default `COMMANDS` list because it can +produce very large outputs on cohort-scale inputs.
Run it against the smaller +merge manifest: + +```sh +BCFTOOLS=/path/to/bcftools COMMANDS=merge_self \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-merge \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-merge-inputs.tsv +``` + +This is not a semantic recommendation to merge a file with itself in production; +it is a controlled benchmark shape. `--force-samples` creates distinct sample +names and `--no-index` avoids needing local tabix indexes for generated slices. + +The latest local merge run wrote: + +```text +bench/format-shape/large/results-bcftools-merge/timings.tsv +bench/format-shape/large/results-bcftools-merge/checks.tsv +``` + +All planned merge outputs compared byte-identical to baseline. The small 1000G +genotype input improved from 0.14 s to 0.10 s, the 1024-sample CCDG-like input +improved from 4.50 s to 4.33 s, and the 1024-sample float/string input was +unchanged at 2.69 s. + ## Large Corpus `large/inputs.tsv` currently contains: @@ -175,6 +252,11 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ float rows. `large/threaded-inputs.tsv` mirrors this full corpus for `-@` scaling checks. +`large/bcftools-command-inputs.tsv` is a smaller representative set for the +broader command benchmark: GT-only, real CCDG-like FORMAT, reordered FORMAT, +string/float negative control, and an INFO-heavy sites-only gnomAD slice. +`large/bcftools-merge-inputs.tsv` is smaller still, so merge output does not +explode during routine local benchmarks. 
To refresh only the newer cache-regression synthetic files without rewriting the older large VCFs: diff --git a/bench/format-shape/large/bcftools-command-inputs.tsv b/bench/format-shape/large/bcftools-command-inputs.tsv new file mode 100644 index 000000000..62f1957e9 --- /dev/null +++ b/bench/format-shape/large/bcftools-command-inputs.tsv @@ -0,0 +1,6 @@ +name path source +ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset, 10k records x 3,202 samples +1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF +large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples +large_float_string_2048s bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT negative-control shape, 16k records x 2,048 samples +gnomad_sites_chr22 bench/format-shape/public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz gnomAD v4.1 exomes chr22 sites-only INFO-heavy slice diff --git a/bench/format-shape/large/bcftools-merge-inputs.tsv b/bench/format-shape/large/bcftools-merge-inputs.tsv new file mode 100644 index 000000000..7764e0c05 --- /dev/null +++ b/bench/format-shape/large/bcftools-merge-inputs.tsv @@ -0,0 +1,4 @@ +name path source +small_1000g_genotypes bench/format-shape/public/1000g_chr22_genotypes_16050k_16150k.vcf.gz small 1000 Genomes GT slice used as a quick merge smoke case +large_ccdg_likelihood_1024s bench/format-shape/large/synthetic/large_ccdg_likelihood_1024s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 1,024 samples +large_float_string_1024s bench/format-shape/large/synthetic/large_float_string_1024s.vcf.gz synthetic float/string FORMAT negative-control shape, 16k records x 1,024 samples diff --git a/bench/format-shape/scripts/run_bcftools_command_bench.sh 
b/bench/format-shape/scripts/run_bcftools_command_bench.sh new file mode 100755 index 000000000..db84b0aa4 --- /dev/null +++ b/bench/format-shape/scripts/run_bcftools_command_bench.sh @@ -0,0 +1,176 @@ +#!/bin/sh +set -eu + +# Broader production-style command benchmark for the VCF FORMAT planner. +# +# The conversion benchmark in run_bcftools_bench.sh measures one important +# path: VCF text -> BCF output via `bcftools view`. This script intentionally +# exercises a wider set of bcftools command shapes so we can see which workflows +# actually expose FORMAT parse cost: +# +# view_bcf full VCF -> BCF conversion +# view_sites VCF -> BCF after dropping genotypes with -G +# query_sites fixed-column/INFO-oriented query +# query_format FORMAT accessor query for a small sample subset +# stats bcftools stats +# filter_gt FORMAT expression filtering for a small sample subset +# merge_self bcftools merge of the input with itself using --force-samples +# +# Each command is run twice, once with HTS_VCF_FORMAT_PLAN=0 and once with +# HTS_VCF_FORMAT_PLAN=1. Outputs are compared with cmp whenever the command is +# applicable. FORMAT commands are skipped for sites-only inputs. +# +# Keep the default THREADS_LIST narrow here. This harness multiplies inputs by +# commands by planner modes, so exhaustive thread scaling belongs in the +# dedicated threaded runner unless a specific command needs investigation. 
+ +bcftools=${BCFTOOLS:-bcftools} +inputs=${1:-bench/format-shape/large/bcftools-command-inputs.tsv} +outdir=${OUTDIR:-bench/format-shape/large/results-bcftools-commands} +keep_outputs=${KEEP_OUTPUTS:-1} +threads_list=${THREADS_LIST:-0} +commands=${COMMANDS:-view_bcf view_sites query_sites query_format stats filter_gt} +query_sample_count=${QUERY_SAMPLE_COUNT:-2} +mkdir -p "$outdir" + +timings="$outdir/timings.tsv" +checks="$outdir/checks.tsv" +cmds_out="$outdir/commands.tsv" + +printf 'name\tcommand\tthreads\tmode\treal\tuser\tsys\n' > "$timings" +printf 'name\tcommand\tthreads\tcomparison\tstatus\n' > "$checks" +printf 'command\tdescription\n' > "$cmds_out" +printf 'view_bcf\tbcftools view --no-version -Ob -l 0\n' >> "$cmds_out" +printf 'view_sites\tbcftools view --no-version -G -Ob -l 0\n' >> "$cmds_out" +printf 'query_sites\tbcftools query fixed site fields\n' >> "$cmds_out" +printf 'query_format\tbcftools query GT for first QUERY_SAMPLE_COUNT samples\n' >> "$cmds_out" +printf 'stats\tbcftools stats\n' >> "$cmds_out" +printf 'filter_gt\tbcftools view -i GT="alt" for first QUERY_SAMPLE_COUNT samples\n' >> "$cmds_out" +printf 'merge_self\tbcftools merge --no-index --force-samples of the input with itself\n' >> "$cmds_out" + +run_one() +{ + mode=$1 + command=$2 + threads=$3 + path=$4 + out=$5 + err=$6 + sample_args=$7 + thread_args= + plan=0 + + if [ "$mode" = plan ]; then + plan=1 + fi + if [ "$threads" != 0 ]; then + thread_args="--threads $threads" + fi + + case "$command" in + view_bcf) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" view --no-version -Ob -l 0 $thread_args \ + -o "$out" "$path" 2> "$err" + ;; + view_sites) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" view --no-version -G -Ob -l 0 $thread_args \ + -o "$out" "$path" 2> "$err" + ;; + query_sites) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" query -f '%CHROM\t%POS\t%REF\t%ALT\n' \ + "$path" > "$out" 2> "$err" + ;; + query_format) + env 
HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" query $sample_args -f '%CHROM\t%POS[\t%GT]\n' \ + "$path" > "$out" 2> "$err" + ;; + stats) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" stats "$path" > "$out" 2> "$err" + ;; + filter_gt) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" view --no-version -Ob -l 0 $thread_args $sample_args \ + -i 'GT="alt"' -o "$out" "$path" 2> "$err" + ;; + merge_self) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" merge --no-index --force-samples --no-version -Ob \ + $thread_args -o "$out" "$path" "$path" 2> "$err" + ;; + *) + printf 'unknown command: %s\n' "$command" >&2 + return 1 + ;; + esac +} + +tail -n +2 "$inputs" | while IFS=' ' read -r name path source +do + samples=$("$bcftools" query -l "$path" | awk -v n="$query_sample_count" ' + NR <= n { if (s) s = s "," $0; else s = $0 } + END { print s } + ') + sample_args= + if [ -n "$samples" ]; then + sample_args="-s $samples" + fi + + for command in $commands + do + case "$command" in + query_format|filter_gt|merge_self) + if [ -z "$sample_args" ]; then + for threads in $threads_list + do + printf '%s\t%s\t%s\tbaseline_vs_plan\tskipped_no_samples\n' \ + "$name" "$command" "$threads" >> "$checks" + done + continue + fi + ;; + esac + + for threads in $threads_list + do + base_out="$outdir/$name.$command.t$threads.baseline.out" + plan_out="$outdir/$name.$command.t$threads.plan.out" + + for mode in baseline plan + do + err="$outdir/$name.$command.t$threads.$mode.stderr" + out="$outdir/$name.$command.t$threads.$mode.out" + run_one "$mode" "$command" "$threads" "$path" "$out" "$err" "$sample_args" + + awk -v name="$name" -v command="$command" \ + -v threads="$threads" -v mode="$mode" ' + /^real / { real=$2 } + /^user / { user=$2 } + /^sys / { sys=$2 } + END { + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + name, command, threads, mode, + real+0, user+0, sys+0 + } + ' "$err" >> "$timings" + done + + if cmp "$base_out" "$plan_out" 
>/dev/null 2>&1; then + printf '%s\t%s\t%s\tbaseline_vs_plan\tok\n' \ + "$name" "$command" "$threads" >> "$checks" + else + printf '%s\t%s\t%s\tbaseline_vs_plan\tDIFF\n' \ + "$name" "$command" "$threads" >> "$checks" + fi + if [ "$keep_outputs" = 0 ]; then + rm -f "$base_out" "$plan_out" + fi + done + done +done + +printf 'wrote %s, %s, and %s\n' "$timings" "$checks" "$cmds_out" diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md index 51aa01984..133e4411e 100644 --- a/docs/FORMAT_PLAN_CURRENT.md +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -291,12 +291,75 @@ has samples; sites-only inputs naturally run without `-s`. | Two-string float negative | 4 | 0.99x | 0.98x | | Two-string float negative | 8 | 1.03x | 1.01x | +## bcftools Command Benchmark + +The broader command runner exercises bcftools paths that either consume FORMAT +records, discard FORMAT records, or mostly operate on site-level data: + +```sh +BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-commands \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-command-inputs.tsv +``` + +All applicable planned outputs compared byte-identical to baseline. FORMAT +commands were skipped for the sites-only gnomAD input as expected. 
+ +| Input | Command | Real speedup | User speedup | +|---|---|---:|---:| +| CCDG 10k | view_bcf | 1.11x | 1.12x | +| CCDG 10k | view_sites | 1.12x | 1.13x | +| CCDG 10k | query_format | 1.51x | 1.56x | +| CCDG 10k | filter_gt | 1.11x | 1.12x | +| 1000G chr22 full GT | view_bcf | 2.79x | 2.94x | +| 1000G chr22 full GT | view_sites | 2.98x | 3.02x | +| 1000G chr22 full GT | query_format | 1.94x | 1.94x | +| 1000G chr22 full GT | filter_gt | 1.57x | 1.58x | +| Large reordered likelihood | view_bcf | 1.21x | 1.22x | +| Large reordered likelihood | view_sites | 1.20x | 1.20x | +| Large reordered likelihood | query_format | 1.39x | 1.42x | +| Large reordered likelihood | filter_gt | 1.14x | 1.14x | +| Large float/string | view_bcf | 1.02x | 1.02x | +| Large float/string | query_format | 1.01x | 1.00x | +| gnomAD sites chr22 | view_bcf | 0.98x | 1.00x | +| gnomAD sites chr22 | query_sites | 1.00x | 1.08x | + +`query_sites` and `stats` were generally neutral because they do little or no +FORMAT work. The small negative results, such as CCDG `stats` at 0.94x real and +float/string `stats` at 0.93x real, remain worth watching as possible +planner overhead in workloads that do not benefit from FORMAT decoding. + +## bcftools Merge Benchmark + +`merge_self` is kept out of the default command list because merge output can +grow quickly. It was run against the smaller merge manifest: + +```sh +BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +COMMANDS=merge_self KEEP_OUTPUTS=0 \ +OUTDIR=bench/format-shape/large/results-bcftools-merge \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-merge-inputs.tsv +``` + +All planned merge outputs compared byte-identical to baseline.
+ +| Input | Baseline real | Plan real | Real speedup | Baseline user | Plan user | +|---|---:|---:|---:|---:|---:| +| Small 1000G genotypes | 0.14 s | 0.10 s | 1.40x | 0.13 s | 0.08 s | +| Large CCDG likelihood 1024s | 4.50 s | 4.33 s | 1.04x | 4.05 s | 3.91 s | +| Large float/string 1024s | 2.69 s | 2.69 s | 1.00x | 2.40 s | 2.41 s | + ## Interpretation The dynamic path gives a large production-visible win for sample-rich GT-only VCFs. On likelihood-heavy rows, it is consistently faster but still limited by generic per-op work, string/width handling, and IO/compression costs. Some float/string-heavy layouts remain near parity or slightly slower than baseline. +The broader bcftools command run supports the same story: commands that expose +FORMAT parsing benefit; commands dominated by site-only logic, stats, merge +bookkeeping, or compression are neutral. ## Remaining Work diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md index 55eef012f..2c98ce35a 100644 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -236,6 +236,30 @@ All planned outputs compared byte-identical to baseline. | Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | | Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | +## Broader bcftools Command Check + +Added `bench/format-shape/scripts/run_bcftools_command_bench.sh` so the branch +can exercise more than `bcftools view`. The runner currently covers full BCF +conversion, genotype-dropping conversion, site queries, small FORMAT queries, +`stats`, genotype filters, and an opt-in merge benchmark. Every command runs +once with `HTS_VCF_FORMAT_PLAN=0` and once with `HTS_VCF_FORMAT_PLAN=1`, then +compares outputs with `cmp`. + +Result: retained. All applicable planned outputs compared byte-identical to +baseline. 
FORMAT-heavy commands showed the expected gains: 1000G full GT was +2.79x faster for `view_bcf`, 2.98x faster for `view_sites`, 1.94x faster for +`query_format`, and 1.57x faster for `filter_gt`. Gains on CCDG and reordered +likelihood workloads were smaller but positive. Site-only queries and `stats` +were mostly neutral, with a few small negative rows that remain useful overhead +watchpoints. + +`bcftools merge` was tested through the opt-in `merge_self` command against a +smaller manifest to avoid excessive duplicated-sample output. All planned merge +outputs compared byte-identical to baseline. Merge was neutral-to-positive: +small 1000G genotype input improved from 0.14 s to 0.10 s, large CCDG +likelihood improved from 4.50 s to 4.33 s, and large float/string remained +unchanged at 2.69 s. + ## Main Lessons - Tag-level composition is the right MVP boundary; exact full FORMAT strings are diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md index 4bc306f75..3545617c4 100644 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -67,6 +67,14 @@ GT workload improved from 26.51 s to 9.77 s unthreaded and from 25.99 s to 8.84 s at 4 threads. Selected-sample likelihood-heavy rows are still faster, but the gains are smaller because much less FORMAT payload is emitted. +Broader bcftools commands follow the same pattern. `bcftools view`, +`bcftools query` of FORMAT values, and genotype filters benefit when they expose +sample FORMAT parsing. Site-only queries, `stats`, and `merge` are mostly +neutral because their runtime is dominated by non-FORMAT work, output writing, +or command-level bookkeeping. A controlled `bcftools merge` self-merge check +produced byte-identical output and was neutral-to-positive across the small +merge manifest. + ## Drawbacks The MVP intentionally keeps fallback whole-row.
It does not parse supported From ceda038f24c6ebe390f5aaa07fa0280657083221 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 30 Apr 2026 16:55:26 +0200 Subject: [PATCH 30/38] Harden FORMAT planner tests --- bench/format-shape/README.md | 135 +- .../large/bcftools-full-ccdg-inputs.tsv | 2 + .../large/bcftools-giab-ccdg-inputs.tsv | 6 + .../run_bcftools_command_bench_stream.sh | 164 ++ docs/FORMAT_PLAN_CURRENT.md | 283 +- docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 230 +- docs/FORMAT_PLAN_OVERVIEW.md | 34 +- test/format-plan-edge.vcf | 1 + test/format-plan-fallback.vcf | 10 + ...ility.vcf => format-plan-float-string.vcf} | 0 test/format-plan-repeated-wide-gt.vcf | 14 + test/format-plan-sample-count.vcf | 6 + test/format-plan-sample-skip.vcf | 7 + test/test.pl | 110 + test/test_format_plan.sh | 48 - test/test_format_plan_cache.c | 54 +- test/test_view.c | 32 +- vcf.c | 2501 +++++++++-------- 18 files changed, 2284 insertions(+), 1353 deletions(-) create mode 100644 bench/format-shape/large/bcftools-full-ccdg-inputs.tsv create mode 100644 bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv create mode 100644 bench/format-shape/scripts/run_bcftools_command_bench_stream.sh create mode 100644 test/format-plan-fallback.vcf rename test/{format-plan-profitability.vcf => format-plan-float-string.vcf} (100%) create mode 100644 test/format-plan-repeated-wide-gt.vcf create mode 100644 test/format-plan-sample-count.vcf create mode 100644 test/format-plan-sample-skip.vcf delete mode 100755 test/test_format_plan.sh diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index a7fc781a2..5c51d4b73 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -25,6 +25,7 @@ bench/format-shape/ scripts/run_thread_bench.sh threaded timing and cmp runner scripts/run_bcftools_bench.sh bcftools threaded timing runner scripts/run_bcftools_command_bench.sh broader bcftools command runner + scripts/run_bcftools_command_bench_stream.sh checksum-only large-output 
runner results/ generated timing logs and BCF outputs ``` @@ -37,6 +38,21 @@ commands below. keeps BCF outputs locally so `cmp` checks are inspectable, but `.gitignore` excludes those large files. +## Repo Tests + +The small correctness cases that should travel with the implementation now live +in the normal htslib test harness, not only in this benchmark directory. +`make check` runs `test_vcf_format_plan` inside `test/test.pl` plus +`test/test_format_plan_cache`. Those tests assert byte-identical planned output +at the parser-output level, selected-sample behavior, rollback after partial +planned parsing, malformed-input failure behavior, and header-cache generation +invalidation. Fallback reason counters remain local diagnostics for benchmark +analysis rather than production test assertions. + +The benchmark corpus remains for performance and production-shape coverage. It +should not become a normal test-suite dependency because several inputs are +large public VCFs or generated multi-second workloads. + ## Public Inputs The small `public/` and `synthetic/` inputs are smoke/correctness fixtures. They @@ -54,6 +70,10 @@ non-FORMAT and real-world INFO-heavy workloads. 
| `public/1000g_wgs_sites_chr22_16050k_16300k.vcf.gz` | 1000 Genomes Phase 3 WGS sites | sites-only | | `public/clinvar_grch38_chr22_16050k_20000k.vcf.gz` | ClinVar GRCh38 VCF | sites-only clinical annotations | | `public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz` | gnomAD v4.1 exomes chr22 | sites-only, INFO-heavy | +| `large/public/giab/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz` | GIAB HG002 v4.2.1 | 4,048,342-record single-sample truth-set small variants | +| `large/public/giab/HG002_GRCh38_v5.0q_smvar.vcf.gz` | GIAB HG002 v5.0q GRCh38 | 5,945,525-record single-sample small variants | +| `large/public/giab/HG002_GRCh38_v5.0q_stvar.vcf.gz` | GIAB HG002 v5.0q GRCh38 | 6,268,852-record single-sample structural variants | +| `large/public/giab/HG002_CHM13v2.0_v5.0q_smvar.vcf.gz` | GIAB HG002 v5.0q CHM13v2.0 | 5,829,374-record single-sample small variants | Source URLs used: @@ -62,8 +82,29 @@ https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_sha https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr22.vcf.bgz +https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz +https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/v5.0q/HG002_GRCh38_v5.0q_smvar.vcf.gz +https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/v5.0q/HG002_GRCh38_v5.0q_stvar.vcf.gz +https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/v5.0q/HG002_CHM13v2.0_v5.0q_smvar.vcf.gz +``` + +The parent CCDG/1000G high-coverage chr22 file for +`public/ccdg_chr22_10k.vcf.gz` is: + +```text 
+https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +``` + +It is 26.0 GiB compressed and is available locally at: + +```text +/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` +Do not run the normal output-materializing command harness on this file. A +single uncompressed BCF output reached 155 GiB before the run was interrupted. +Use the streaming checksum harness below instead. + ## Synthetic Inputs The synthetic files are generated by: @@ -118,9 +159,7 @@ By default this runs unthreaded plus `-@ 2`, `-@ 4`, and `-@ 8`. Override with now mirrors the full large corpus so thread scaling is checked across the same real and synthetic workload shapes as the primary benchmark. -The script runs each input in two modes. `interp` remains accepted by -`HTS_VCF_FORMAT_PLAN`, but it aliases the same dynamic parser as `plan`, so the -benchmark harness does not run it as a separate timing row. +The script runs each input in two modes: ```text baseline: HTS_VCF_FORMAT_PLAN=0 @@ -207,11 +246,83 @@ checks.tsv baseline-vs-plan cmp status, including skipped_no_samples commands.tsv command descriptions captured with the result directory ``` +For very large inputs, use the streaming checksum variant. 
It runs the same +command families but pipes output through `cksum` and compares checksums instead +of storing complete BCF/text outputs: + +```sh +BCFTOOLS=/path/to/bcftools \ +OUTDIR=bench/format-shape/large/results-bcftools-full-ccdg-stream \ + bash bench/format-shape/scripts/run_bcftools_command_bench_stream.sh \ + bench/format-shape/large/bcftools-full-ccdg-inputs.tsv +``` + +The full CCDG chr22 streaming run wrote: + +```text +bench/format-shape/large/results-bcftools-full-ccdg-stream/timings.tsv +bench/format-shape/large/results-bcftools-full-ccdg-stream/checks.tsv +bench/format-shape/large/results-bcftools-full-ccdg-stream/checksums.tsv +``` + +All baseline-vs-plan checksums compared `ok`. + +| Command | Baseline real | Plan real | Real speedup | Baseline user | Plan user | User speedup | +|---|---:|---:|---:|---:|---:|---:| +| `view_bcf` | 678.46 s | 562.96 s | 1.21x | 476.41 s | 377.47 s | 1.26x | +| `view_sites` | 472.27 s | 403.28 s | 1.17x | 455.70 s | 386.18 s | 1.18x | +| `query_sites` | 71.44 s | 76.78 s | 0.93x | 67.02 s | 72.00 s | 0.93x | +| `query_format` | 124.14 s | 76.88 s | 1.61x | 119.16 s | 72.27 s | 1.65x | +| `stats` | 77.45 s | 77.12 s | 1.00x | 72.86 s | 72.55 s | 1.00x | +| `filter_gt` | 531.20 s | 453.21 s | 1.17x | 512.95 s | 434.35 s | 1.18x | + For CI, the likely future shape is to keep one or two tiny inputs per command and assert `checks.tsv` has only `ok` or expected `skipped_no_samples` rows. The large corpus should remain a performance benchmark rather than a normal test-suite dependency. 
+Run the GIAB plus CCDG correctness/performance pass: + +```sh +BCFTOOLS=/path/to/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv +``` + +If using the sibling bcftools checkout in this workspace, build it against this +HTSlib checkout explicitly: + +```sh +cd ../bcftools-htslib-vcf-plan +make HTSDIR=../htslib-vcf-avx-sanity bcftools +``` + +This pass is primarily a production-shape correctness check. GIAB is +single-sample, so it does not show the large cohort speedups, but it does cover +real truth-set small-variant and structural-variant FORMAT details. The first +GIAB v5.0q run exposed a planned-path bug where `.|.` was serialized as `./.`. +The GT2 parser now preserves phased missing alleles, and the fixed rerun has +all baseline-vs-plan command outputs comparing `ok`. + +Latest hardened GIAB/CCDG command run: + +| Input | Command | Real speedup | User speedup | +|---|---|---:|---:| +| CCDG 10k | view_bcf | 1.14x | 1.14x | +| CCDG 10k | view_sites | 1.13x | 1.14x | +| CCDG 10k | query_format | 1.52x | 1.56x | +| CCDG 10k | filter_gt | 1.12x | 1.12x | +| GIAB HG002 GRCh38 v4.2.1 | view_bcf | 1.09x | 1.09x | +| GIAB HG002 GRCh38 v4.2.1 | query_format | 1.07x | 1.07x | +| GIAB HG002 GRCh38 v4.2.1 | filter_gt | 1.09x | 1.09x | +| GIAB HG002 GRCh38 v5.0q small variants | view_bcf | 1.09x | 1.09x | +| GIAB HG002 GRCh38 v5.0q small variants | query_format | 1.09x | 1.07x | +| GIAB HG002 GRCh38 v5.0q structural variants | view_bcf | 1.09x | 1.09x | +| GIAB HG002 GRCh38 v5.0q structural variants | query_format | 1.02x | 1.02x | +| GIAB HG002 CHM13 v5.0q small variants | view_bcf | 1.07x | 1.07x | +| GIAB HG002 CHM13 v5.0q small variants | query_format | 1.06x | 1.06x | + `merge_self` is intentionally not in the default `COMMANDS` list because it can produce very large outputs on cohort-scale inputs. 
Run it against the smaller merge manifest: @@ -248,8 +359,8 @@ unchanged at 2.69 s. - eight generated 2,048-sample synthetic FORMAT workloads: CCDG-like likelihood, reordered likelihood, multiallelic likelihood, float/string FORMAT, variable phase-string widths, row-local likelihood - fallbacks, GT-first wrong-order likelihood-like rows, and two-string - float rows. + fallbacks, GT-first wrong-order likelihood-like rows, and two-string + float rows. `large/threaded-inputs.tsv` mirrors this full corpus for `-@` scaling checks. `large/bcftools-command-inputs.tsv` is a smaller representative set for the @@ -270,8 +381,18 @@ SYNTHETIC_ONLY_NEW=1 \ The latest large run is summarized in: ```text -bench/format-shape/large/results/timings.tsv -bench/format-shape/large/results/checks.tsv +bench/format-shape/large/results-prod-hardening2/timings.tsv +bench/format-shape/large/results-prod-hardening2/checks.tsv ``` All plan outputs in that run compared byte-identical to baseline. + +That run includes fallback reason diagnostics. In the CCDG 10k slice, the +planner hit 9,861 of 10,000 rows; the remaining 139 rows fell back for +`string_width`, meaning their measured string field exceeded the current +256-byte planned cap. + +One rejected optimization is recorded in +`bench/format-shape/large/results-opt-nosubset-split`: splitting the all-samples +loop from the `keep_samples` loop preserved correctness but slowed the planned +rows, so that code was reverted. 
diff --git a/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv b/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv new file mode 100644 index 000000000..0dce0fd61 --- /dev/null +++ b/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv @@ -0,0 +1,2 @@ +name path source +ccdg_chr22_full /Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz local full 1000G/CCDG high-coverage chr22 VCF, 3,202 samples diff --git a/bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv b/bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv new file mode 100644 index 000000000..852e684eb --- /dev/null +++ b/bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv @@ -0,0 +1,6 @@ +name path source +ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG/1000G high-coverage chr22 slice, 10k records x 3,202 samples +giab_hg002_grch38_v421 bench/format-shape/large/public/giab/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz GIAB HG002 NIST v4.2.1 GRCh38 benchmark small variants +giab_hg002_grch38_v50q_smvar bench/format-shape/large/public/giab/HG002_GRCh38_v5.0q_smvar.vcf.gz GIAB HG002 v5.0q GRCh38 small variants +giab_hg002_grch38_v50q_stvar bench/format-shape/large/public/giab/HG002_GRCh38_v5.0q_stvar.vcf.gz GIAB HG002 v5.0q GRCh38 structural variants +giab_hg002_chm13_v50q_smvar bench/format-shape/large/public/giab/HG002_CHM13v2.0_v5.0q_smvar.vcf.gz GIAB HG002 v5.0q CHM13v2.0 small variants diff --git a/bench/format-shape/scripts/run_bcftools_command_bench_stream.sh b/bench/format-shape/scripts/run_bcftools_command_bench_stream.sh new file mode 100644 index 000000000..7f1a84438 --- /dev/null +++ b/bench/format-shape/scripts/run_bcftools_command_bench_stream.sh @@ -0,0 +1,164 @@ +#!/bin/bash +set -euo pipefail + +# Streaming variant of run_bcftools_command_bench.sh for very large VCFs. 
+# It runs the same command families but pipes command output through cksum, +# avoiding temporary BCF/text outputs that can be hundreds of GiB on full +# cohort chromosomes. Baseline and planned checksums are compared. + +bcftools=${BCFTOOLS:-bcftools} +inputs=${1:-bench/format-shape/large/bcftools-command-inputs.tsv} +outdir=${OUTDIR:-bench/format-shape/large/results-bcftools-commands-stream} +threads_list=${THREADS_LIST:-0} +commands=${COMMANDS:-view_bcf view_sites query_sites query_format stats filter_gt} +query_sample_count=${QUERY_SAMPLE_COUNT:-2} +mkdir -p "$outdir" + +timings="$outdir/timings.tsv" +checks="$outdir/checks.tsv" +cmds_out="$outdir/commands.tsv" +checksums="$outdir/checksums.tsv" + +printf 'name\tcommand\tthreads\tmode\treal\tuser\tsys\n' > "$timings" +printf 'name\tcommand\tthreads\tcomparison\tstatus\n' > "$checks" +printf 'name\tcommand\tthreads\tmode\tcksum\tbytes\n' > "$checksums" +printf 'command\tdescription\n' > "$cmds_out" +printf 'view_bcf\tbcftools view --no-version -Ob -l 0 streamed to cksum\n' >> "$cmds_out" +printf 'view_sites\tbcftools view --no-version -G -Ob -l 0 streamed to cksum\n' >> "$cmds_out" +printf 'query_sites\tbcftools query fixed site fields streamed to cksum\n' >> "$cmds_out" +printf 'query_format\tbcftools query GT for first QUERY_SAMPLE_COUNT samples streamed to cksum\n' >> "$cmds_out" +printf 'stats\tbcftools stats streamed to cksum\n' >> "$cmds_out" +printf 'filter_gt\tbcftools view -i GT="alt" for first QUERY_SAMPLE_COUNT samples streamed to cksum\n' >> "$cmds_out" +printf 'merge_self\tbcftools merge --no-index --force-samples streamed to cksum\n' >> "$cmds_out" + +run_one() +{ + local mode=$1 + local command=$2 + local threads=$3 + local path=$4 + local sum_out=$5 + local err=$6 + local sample_args=$7 + local plan=0 + local thread_args= + + if [ "$mode" = plan ]; then + plan=1 + fi + if [ "$threads" != 0 ]; then + thread_args="--threads $threads" + fi + + case "$command" in + view_bcf) + env 
HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" view --no-version -Ob -l 0 $thread_args \ + -o - "$path" 2> "$err" | cksum > "$sum_out" + ;; + view_sites) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" view --no-version -G -Ob -l 0 $thread_args \ + -o - "$path" 2> "$err" | cksum > "$sum_out" + ;; + query_sites) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" query -f '%CHROM\t%POS\t%REF\t%ALT\n' \ + "$path" 2> "$err" | cksum > "$sum_out" + ;; + query_format) + # shellcheck disable=SC2086 + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" query $sample_args -f '%CHROM\t%POS[\t%GT]\n' \ + "$path" 2> "$err" | cksum > "$sum_out" + ;; + stats) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" stats "$path" 2> "$err" | cksum > "$sum_out" + ;; + filter_gt) + # shellcheck disable=SC2086 + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" view --no-version -Ob -l 0 $thread_args \ + $sample_args -i 'GT="alt"' -o - "$path" 2> "$err" | cksum > "$sum_out" + ;; + merge_self) + env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ + "$bcftools" merge --no-index --force-samples --no-version -Ob \ + $thread_args -o - "$path" "$path" 2> "$err" | cksum > "$sum_out" + ;; + *) + printf 'unknown command: %s\n' "$command" >&2 + return 1 + ;; + esac +} + +tail -n +2 "$inputs" | while IFS=$'\t' read -r name path source +do + samples=$("$bcftools" query -l "$path" | awk -v n="$query_sample_count" ' + NR <= n { if (s) s = s "," $0; else s = $0 } + END { print s } + ') + sample_args= + if [ -n "$samples" ]; then + sample_args="-s $samples" + fi + + for command in $commands + do + case "$command" in + query_format|filter_gt|merge_self) + if [ -z "$sample_args" ]; then + for threads in $threads_list + do + printf '%s\t%s\t%s\tbaseline_vs_plan\tskipped_no_samples\n' \ + "$name" "$command" "$threads" >> "$checks" + done + continue + fi + ;; + esac + + for threads in $threads_list + do + 
base_sum="$outdir/$name.$command.t$threads.baseline.cksum" + plan_sum="$outdir/$name.$command.t$threads.plan.cksum" + + for mode in baseline plan + do + err="$outdir/$name.$command.t$threads.$mode.stderr" + sum="$outdir/$name.$command.t$threads.$mode.cksum" + run_one "$mode" "$command" "$threads" "$path" "$sum" "$err" "$sample_args" + + awk -v name="$name" -v command="$command" \ + -v threads="$threads" -v mode="$mode" ' + /^real / { real=$2 } + /^user / { user=$2 } + /^sys / { sys=$2 } + END { + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + name, command, threads, mode, + real+0, user+0, sys+0 + } + ' "$err" >> "$timings" + + awk -v name="$name" -v command="$command" \ + -v threads="$threads" -v mode="$mode" ' + { printf "%s\t%s\t%s\t%s\t%s\t%s\n", + name, command, threads, mode, $1, $2 } + ' "$sum" >> "$checksums" + done + + if cmp "$base_sum" "$plan_sum" >/dev/null 2>&1; then + printf '%s\t%s\t%s\tbaseline_vs_plan\tok\n' \ + "$name" "$command" "$threads" >> "$checks" + else + printf '%s\t%s\t%s\tbaseline_vs_plan\tDIFF\n' \ + "$name" "$command" "$threads" >> "$checks" + fi + done + done +done + +printf 'wrote %s, %s, %s, and %s\n' "$timings" "$checks" "$checksums" "$cmds_out" diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md index 133e4411e..762c0da62 100644 --- a/docs/FORMAT_PLAN_CURRENT.md +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -7,7 +7,7 @@ correctness boundaries, and the latest benchmark results. `vcf_parse_format()` first calls `vcf_parse_format_planned()` when `HTS_VCF_FORMAT_PLAN` is enabled. The planned path either parses the whole -FORMAT column or returns `-3`, allowing the production parser to handle the +FORMAT column or returns `-3`, allowing the existing generic parser to handle the column unchanged. 
```text @@ -15,11 +15,12 @@ HTS_VCF_FORMAT_PLAN enabled -> fetch or compile header-owned FORMAT/header plan -> resolve row-local widths -> composable executor - -> production fallback on unsupported or suspicious rows + -> generic fallback on unsupported or suspicious rows ``` -Enabled spellings are `1`, `interp`, and `general`; all route through the same -dynamic executor. +The only enabled spelling is `HTS_VCF_FORMAT_PLAN=1`. Unknown values are +treated as disabled so typos such as `off` or `false` do not accidentally enable +the planner. ## Plan Compilation @@ -30,10 +31,17 @@ from 16 entries up to 128 entries, uses heap storage for long FORMAT strings, and also caches unsupported schemas so repeated odd rows do not repeatedly pay compile cost. +Planner statistics are collected only when `HTS_VCF_FORMAT_PLAN_STATS=1` is +also set. Normal production parsing therefore avoids touching the process-wide +test counters. The test hook reports both aggregate attempts/hits/fallbacks and +fallback reason counters: unsupported schema, guard cooldown, numeric width, +string width, GT shape, parse failure, separator mismatch, and sample-count +mismatch. + `bcf_hdr_sync()` clears the header-owned plan cache and increments the private generation after header dictionaries are rebuilt. The planner also refuses to compile while `h->dirty` is set, leaving unsynced or header-repair cases on the -production parser. +generic parser. The cache and per-plan guard counters are mutable header-owned state, like other htslib header scratch storage. Callers should not concurrently parse through @@ -49,7 +57,7 @@ The compile step rejects: - string-plus-float-vector schemas with too little integer-vector work to repay the dynamic path's width-measurement cost. -Undefined tags intentionally fall back to the production parser so existing +Undefined tags intentionally fall back to the generic parser so existing dummy-header repair and warning behavior is preserved. 
## Supported Operations @@ -66,7 +74,10 @@ The current executor supports: Header-derived widths are resolved per record. `Number=A`, `Number=R`, and `Number=G` depend on the current allele count. String and `Number=.` numeric -fields use a row-local measurement pass. +fields use a row-local measurement pass. Numeric vectors remain capped at 64 +values per FORMAT field in the planned path. Measured strings are capped +separately at 256 bytes per row field, which keeps common phase-set annotations +on the fast path while bounding scratch-buffer and transposition work. ## Executor @@ -80,7 +91,7 @@ parsing so integer range and observed-width metadata are known. For fixed-width vector fields, the executor can compact underfilled rows to the observed row maximum before BCF encoding. This avoids whole-row fallback when -the production parser would also emit a narrower byte-identical vector width. +the generic parser would also emit a narrower byte-identical vector width. ## Guard Policy @@ -92,8 +103,14 @@ Each cached dynamic plan has a small runtime guard: An isolated fallback does not disable the fast path. A plan is paused after eight consecutive misses, or after at least 128 attempts with more than 10% -fallbacks. After 256 skipped records, the plan probes again so later stable -regions can recover the optimized path. +guard-counted fallbacks. Row-local numeric/string width misses are counted in +diagnostics but do not poison the normal guard, because those rows can be sparse +within an otherwise profitable schema. A separate dense-width guard pauses a +schema only after at least 128 width probes with more than 75% width misses; this +catches pathological over-cap schemas without disabling CCDG-like layouts where +only a small minority of rows have very long phase strings. After 256 skipped +records, the plan probes again so later stable regions can recover the optimized +path. 
## Correctness Rules @@ -103,21 +120,27 @@ The planned parser must preserve these invariants: - header IDs, types, and number models are resolved before execution; - selected-sample parsing must honor `h->keep_samples`, use `h->nsamples_ori` for input-column scans, and set `v->n_sample` to the retained sample count; -- duplicate or undefined tags use the production parser; -- unprofitable string/float-heavy schemas use the production parser; +- duplicate or undefined tags use the generic parser; +- low-profit string/float-heavy schemas use the generic parser; - unsupported GT encodings force fallback; - numeric vectors preserve observed width and vector-end padding; - strings use observed maximum byte length and zero-pad shorter samples; - integer and float overflow/error behavior must match production htslib or force fallback; +- successful planned rows run the same final FORMAT consistency check as the + generic parser via `vcf_parse_format_check7()`; - direct writes to `v->indiv` must roll back before fallback. -Focused validation lives in `./test/test_format_plan.sh`. It compares -production parsing, `HTS_VCF_FORMAT_PLAN=1`, and the `interp` alias byte-for-byte -with `cmp`. The script also checks selected-sample parsing for explicit -inclusion and exclusion lists (`S1,S3`, `S2`, and `^S2`). `test/format-plan-cache.vcf` -additionally exercises more than 16 distinct FORMAT schemas and a literal FORMAT -string longer than the old fixed cache key. `test/test_format_plan_cache` +Focused validation lives in the existing `test/test.pl` harness as +`test_vcf_format_plan`. It compares generic parsing and +`HTS_VCF_FORMAT_PLAN=1` byte-for-byte with `cmp`, and also verifies that +unrecognized control values such as `HTS_VCF_FORMAT_PLAN=off` behave like the +generic parser. 
The repo fixtures cover numeric-width and GT-shape fallback, +low-value float/string schemas, cache growth, long FORMAT strings, string-width +fallback, separator fallback, parse fallback with rollback, repeated wide GT +values, selected-sample skipping of malformed unselected columns, and +sample-count mismatch. The selected-sample checks compare explicit inclusion +and exclusion lists (`S1,S3`, `S2`, and `^S2`). `test/test_format_plan_cache` mutates and resyncs a header after a plan has been compiled for the same FORMAT string, then verifies the row is planned again with the new metadata. @@ -128,37 +151,50 @@ dynamic cache, the live parser/test hook delta relative to `origin/develop` is: | File | Added lines | |---|---:| -| `vcf.c` | 1,703 | +| `vcf.c` | 1,939 added / 164 removed | | `Makefile` | 6 | -| `test/test_format_plan.sh` | 48 | -| `test/test_format_plan_cache.c` | 130 | -| `test/test_view.c` | 23 | +| `test/test.pl` | 110 | +| `test/test_format_plan_cache.c` | 133 | +| `test/test_view.c` | 45 added / 2 removed | | `test/format-plan-cache.vcf` | 61 | -| `test/format-plan-profitability.vcf` | 8 | +| `test/format-plan-edge.vcf` | 38 | +| `test/format-plan-float-string.vcf` | 8 | +| `test/format-plan-fallback.vcf` | 10 | +| `test/format-plan-repeated-wide-gt.vcf` | 14 | +| `test/format-plan-sample-count.vcf` | 6 | +| `test/format-plan-sample-skip.vcf` | 7 | ## Large Corpus Benchmark Command: ```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-profit-gate \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-prod-hardening2 \ bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv ``` All planned outputs compared byte-identical to baseline. 
-| Input | Baseline user | Plan user | Hits/fallback | -|---|---:|---:|---:| -| CCDG 10k | 2.47 s | 2.15 s | 8,396 / 1,604 | -| 1000G chr22 full GT | 25.25 s | 7.82 s | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.02 s | 3.64 s | 20,000 / 0 | -| Large reordered likelihood | 2.95 s | 2.40 s | 20,000 / 0 | -| Large multiallelic likelihood | 3.15 s | 2.76 s | 16,000 / 0 | -| Large float/string | 2.96 s | 2.89 s | 0 / 16,000 | -| Variable phase widths | 2.60 s | 2.46 s | 12,000 / 0 | -| Mixed row-local fallbacks | 2.19 s | 1.84 s | 12,000 / 0 | -| GT-first reordered negative | 1.72 s | 1.37 s | 12,000 / 0 | -| Two-string float negative | 2.29 s | 2.26 s | 0 / 12,000 | +| Input | Baseline user | Plan user | User speedup | Hits/fallback | +|---|---:|---:|---:|---:| +| CCDG 10k | 2.47 s | 2.21 s | 1.12x | 9,861 / 139 | +| 1000G chr22 full GT | 24.61 s | 9.48 s | 2.60x | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.00 s | 3.68 s | 1.09x | 20,000 / 0 | +| Large reordered likelihood | 2.86 s | 2.42 s | 1.18x | 20,000 / 0 | +| Large multiallelic likelihood | 3.08 s | 2.67 s | 1.15x | 16,000 / 0 | +| Large float/string | 2.88 s | 2.86 s | 1.01x | 0 / 16,000 | +| Variable phase widths | 2.53 s | 2.45 s | 1.03x | 12,000 / 0 | +| Mixed row-local fallbacks | 2.14 s | 1.84 s | 1.16x | 12,000 / 0 | +| GT-first reordered | 1.68 s | 1.41 s | 1.19x | 12,000 / 0 | +| Two-string float | 2.20 s | 2.19 s | 1.00x | 0 / 12,000 | + +The CCDG 10k fallbacks are all `string_width=139`, meaning only rows with +measured string fields wider than the 256-byte planned cap use the generic +parser. The float/string control fixtures still fall back as unsupported +because the low-profit schema gate deliberately rejects those schemas. A +briefly tested consecutive-width guard regressed CCDG to 9,702 hits / 298 +fallbacks; the retained dense-width guard restores the expected sparse-fallback +profile. 
## Full Threaded Corpus Benchmark @@ -351,6 +387,181 @@ All planned merge outputs compared byte-identical to baseline. | Large CCDG likelihood 1024s | 4.50 s | 4.33 s | 1.04x | 4.05 s | 3.91 s | | Large float/string 1024s | 2.69 s | 2.69 s | 1.00x | 2.40 s | 2.41 s | +## GIAB and CCDG Command Check + +GIAB HG002 files were added as real-world single-sample correctness fixtures: +NIST v4.2.1 GRCh38 small variants, v5.0q GRCh38 small variants, v5.0q GRCh38 +structural variants, and v5.0q CHM13v2.0 small variants. The same bcftools +command suite was run against those files plus the all-sample CCDG 10k slice: + +```sh +BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening \ + bench/format-shape/scripts/run_bcftools_command_bench.sh \ + bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv +``` + +The initial GIAB v5.0q run found a correctness bug: the planned GT2 parser +encoded phased missing genotypes such as `.|.` as unphased `./.`. The parser +now preserves the phase bit for missing alleles, and `test/format-plan-edge.vcf` +has an explicit phased-missing GT row. After the fix, every command output in +this run compared byte-identical/text-identical to baseline. The table below +shows user-time speedups from the latest hardened rerun. + +| Input | Records | Samples | view_bcf | query_format | filter_gt | Notes | +|---|---:|---:|---:|---:|---:|---| +| CCDG 10k | 10,000 | 3,202 | 1.14x | 1.56x | 1.12x | Cohort FORMAT win remains visible. | +| GIAB HG002 GRCh38 v4.2.1 | 4,048,342 | 1 | 1.09x | 1.07x | 1.09x | Single-sample truth-set small variants. | +| GIAB HG002 GRCh38 v5.0q small variants | 5,945,525 | 1 | 1.09x | 1.07x | 1.07x | Includes phased missing GTs. | +| GIAB HG002 GRCh38 v5.0q structural variants | 6,268,852 | 1 | 1.09x | 1.02x | 1.08x | Structural-variant FORMAT coverage. 
| +| GIAB HG002 CHM13 v5.0q small variants | 5,829,374 | 1 | 1.07x | 1.06x | 1.16x | Alternate reference truth-set coverage. | + +The parent CCDG/1000G high-coverage chr22 file is 26.0 GiB compressed: + +```text +https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +``` + +It is available locally at: + +```text +/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +``` + +The normal command harness is unsafe for this input because one full +`view_bcf -Ob -l 0` output reached 155 GiB before the run was interrupted. The +full-file benchmark therefore used the streaming checksum harness: + +```sh +BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +OUTDIR=bench/format-shape/large/results-bcftools-full-ccdg-stream \ + bash bench/format-shape/scripts/run_bcftools_command_bench_stream.sh \ + bench/format-shape/large/bcftools-full-ccdg-inputs.tsv +``` + +All baseline-vs-plan checksums compared `ok`. + +| Command | Baseline real | Plan real | Real speedup | Baseline user | Plan user | User speedup | +|---|---:|---:|---:|---:|---:|---:| +| `view_bcf` | 678.46 s | 562.96 s | 1.21x | 476.41 s | 377.47 s | 1.26x | +| `view_sites` | 472.27 s | 403.28 s | 1.17x | 455.70 s | 386.18 s | 1.18x | +| `query_sites` | 71.44 s | 76.78 s | 0.93x | 67.02 s | 72.00 s | 0.93x | +| `query_format` | 124.14 s | 76.88 s | 1.61x | 119.16 s | 72.27 s | 1.65x | +| `stats` | 77.45 s | 77.12 s | 1.00x | 72.86 s | 72.55 s | 1.00x | +| `filter_gt` | 531.20 s | 453.21 s | 1.17x | 512.95 s | 434.35 s | 1.18x | + +## Executor Optimization Pass + +The latest optimization pass stayed within the generic per-op executor. It did +not add schema-specific kernels. 
Retained changes are: + +- skip observed-count tracking for row ops that cannot compact; +- update integer ranges directly on the common positive-integer path; +- fail over-wide measured fields during the measurement pass; +- remove nullable `nread` checks from planner-private vector helpers. + +Focused tests passed: + +```sh +make test/test_view test/test_format_plan_cache +cd test && REF_PATH=: ./test.pl -F vcf_format_plan +test/test_format_plan_cache +git diff --check +``` + +The htslib large corpus result is in +`bench/format-shape/large/results-opt-batch1b`. All planned outputs compared +byte-identical to baseline. + +| Input | Plan user | User speedup | Hits/fallback | +|---|---:|---:|---:| +| CCDG 10k | 2.20 s | 1.14x | 8,396 / 1,604 | +| 1000G chr22 full GT | 8.99 s | 2.79x | 1,103,547 / 0 | +| Large CCDG-like synthetic | 3.68 s | 1.09x | 20,000 / 0 | +| Large reordered likelihood | 2.38 s | 1.22x | 20,000 / 0 | +| Large multiallelic likelihood | 2.64 s | 1.21x | 16,000 / 0 | +| Large float/string | 2.88 s | 1.00x | 0 / 16,000 | +| Variable phase widths | 2.44 s | 1.05x | 12,000 / 0 | +| Mixed row-local fallbacks | 1.83 s | 1.20x | 12,000 / 0 | +| GT-first reordered | 1.41 s | 1.23x | 12,000 / 0 | +| Two-string float | 2.24 s | 1.00x | 0 / 12,000 | + +The `keep_samples`/all-samples loop split was tested and rejected. It preserved +correctness, but `bench/format-shape/large/results-opt-nosubset-split` was +slower across the planned rows, so the change was reverted. + +For bcftools-level validation, the sibling bcftools checkout must be built +against this checkout explicitly: + +```sh +make HTSDIR=../htslib-vcf-avx-sanity bcftools +``` + +The standard GIAB/CCDG command result is in +`bench/format-shape/large/results-bcftools-giab-ccdg-opt-batch1`; all outputs +compared `ok`. CCDG 10k user-time speedups were 1.12x for `view_bcf`, 1.55x +for `query_format`, and 1.11x for `filter_gt`. 
GIAB single-sample FORMAT query +rows were roughly 1.08-1.12x faster; site-only controls and `stats` remain +neutral/noisy as expected. + +## Fallback Diagnostics And String Width Tuning + +A later pass added fallback reason counters and split the planned width cap +into numeric and string limits: + +- numeric measured vectors remain capped at 64 values; +- measured strings are capped at 256 bytes; +- numeric/string width fallbacks are counted but do not disable the schema guard. + +A 512-byte string cap was tested first. It recovered all CCDG 10k planner +fallbacks, but the bcftools-level signal was mixed. The retained 256-byte cap +keeps almost all CCDG rows on the planned path while leaving the longest string +rows on the generic parser. + +Focused CCDG 10k htslib result at 256 bytes: + +| Metric | Value | +|---|---:| +| Baseline user | 2.43 s | +| Plan user | 2.15 s | +| Hits / fallback | 9,861 / 139 | +| Fallback reason | `string_width=139` | + +The standard GIAB/CCDG bcftools command result for the retained version is in +`bench/format-shape/large/results-bcftools-giab-ccdg-cap256`; all outputs +compared `ok`. + +| Input | `view_bcf` user | `query_format` user | `filter_gt` user | +|---|---:|---:|---:| +| CCDG 10k | 1.13x | 1.56x | 1.10x | +| GIAB HG002 GRCh38 v4.2.1 | 1.08x | 1.08x | 1.04x | +| GIAB HG002 GRCh38 v5.0q small variants | 1.13x | 1.08x | 1.03x | +| GIAB HG002 GRCh38 v5.0q structural variants | 1.11x | 1.15x | 1.04x | +| GIAB HG002 CHM13 v5.0q small variants | 1.08x | 1.07x | 1.03x | + +## Repo Test Harness Hardening + +The latest hardening pass moved the important correctness checks into the normal +htslib `test/test.pl` harness instead of leaving them only in +`bench/format-shape`. 
The `make check` coverage is intentionally black-box and +includes: + +- byte-identity checks for all small planned-path fixtures; +- generic parser vs planned parser comparisons; +- disabled-control comparisons for `HTS_VCF_FORMAT_PLAN=off`; +- a rollback row where planned parsing starts and then falls back after a DP + overflow; +- repeated unsupported wide GT values; +- selected-sample parsing where malformed unselected sample fields must be + skipped and must not affect emitted widths; +- malformed sample-count input, where both generic and planned modes must fail; +- cache-generation coverage in `test/test_format_plan_cache`. + +The planned executor now calls `vcf_parse_format_check7()` on success, so the +planned path shares the generic parser's final FORMAT cardinality check. The +fallback counters are test-only diagnostics, exposed through renamed +`*_for_test` hooks rather than API-looking `hts_*` names. + ## Interpretation The dynamic path gives a large production-visible win for sample-rich GT-only diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md index 2c98ce35a..7d3d12180 100644 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -145,11 +145,12 @@ Tested and reverted: After removing exact and SIMD paths, the optimized entry became: ```text -HTS_VCF_FORMAT_PLAN enabled -> dynamic per-tag plan -> composable executor -> production fallback +HTS_VCF_FORMAT_PLAN enabled -> dynamic per-tag plan -> composable executor -> generic fallback ``` -`HTS_VCF_FORMAT_PLAN=1`, `interp`, and `general` now route through the same -dynamic executor. Benchmarks label only `HTS_VCF_FORMAT_PLAN=1` as `plan`. +`HTS_VCF_FORMAT_PLAN=1` now routes through the dynamic executor. Older +`interp` and `general` aliases were later removed during production tightening +so unknown values do not accidentally enable the fast path. 
Large-corpus post-trim user-time highlights: @@ -195,7 +196,7 @@ still use the general float conversion path, while there were no integer vectors to amortize that setup. Result: retained. The compiler now negative-caches these low-profit schemas and -sends only those FORMAT rows to the production parser. The full threaded corpus +sends only those FORMAT rows to the generic parser. The full threaded corpus remained byte-identical. The two-string float case improved from a consistent slowdown, roughly 0.86-0.89x, to parity at 1.00-1.01x. Other integer-heavy likelihood rows stayed on the dynamic path. @@ -215,8 +216,8 @@ the same rule, so measured strings and variable numeric widths are based only on the samples that will be emitted, matching production htslib's selected-sample behavior. -Result: retained. `test/test_format_plan.sh` now compares explicit inclusion -and exclusion sample lists byte-for-byte against production parsing. A +Result: retained. The FORMAT-plan tests now compare explicit inclusion and +exclusion sample lists byte-for-byte against production parsing. A bcftools run selecting the first two samples from every input completed 40/40 byte-identical comparisons. The 1000G chr22 GT workload still showed a large real-time win, from 26.51 s to 9.77 s unthreaded and from 25.99 s to 8.84 s at @@ -260,6 +261,223 @@ small 1000G genotype input improved from 0.14 s to 0.10 s, large CCDG likelihood improved from 4.50 s to 4.33 s, and large float/string remained unchanged at 2.69 s. +## GIAB and Full CCDG Probe + +Four GIAB HG002 VCFs were pulled into `bench/format-shape/large/public/giab`: +NIST v4.2.1 GRCh38 small variants, v5.0q GRCh38 small variants, v5.0q GRCh38 +structural variants, and v5.0q CHM13v2.0 small variants. The bcftools command +suite was run on those files plus the 3,202-sample CCDG 10k slice using +`bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv`. + +First result: GIAB v5.0q exposed a real GT correctness bug. 
The planned GT2 +parser encoded `.|.` as `./.` because missing alleles were stored without the +separator phase bit. The parser now accepts simple diploid missing/digit +combinations and preserves the phase bit for `.|.`, `0|.`, and `.|0`; an +explicit edge row was added to `test/format-plan-edge.vcf`. + +After the fix, all baseline-vs-plan outputs compared `ok`. Speedups are modest +on GIAB because it is single-sample data: roughly 1.06-1.11x for `view_bcf` and +1.03-1.09x for `query_format`. CCDG 10k remained in the expected cohort range: +1.13x for `view_bcf`, 1.52x for `query_format`, and 1.10x for `filter_gt`. + +The full parent CCDG/1000G high-coverage chr22 VCF was identified as: + +```text +https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +``` + +It is 26.0 GiB compressed and was later found locally under +`/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original`. +The normal command harness materializes complete outputs, which is not practical +for this file: a single `view_bcf -Ob -l 0` baseline output reached 155 GiB +before that run was stopped. A streaming checksum harness was added so command +outputs can be validated without storing them. 
+ +The full CCDG streaming command suite completed with all baseline-vs-plan +checksums comparing `ok`: + +| Command | Baseline real | Plan real | Real speedup | Baseline user | Plan user | User speedup | +|---|---:|---:|---:|---:|---:|---:| +| `view_bcf` | 678.46 s | 562.96 s | 1.21x | 476.41 s | 377.47 s | 1.26x | +| `view_sites` | 472.27 s | 403.28 s | 1.17x | 455.70 s | 386.18 s | 1.18x | +| `query_sites` | 71.44 s | 76.78 s | 0.93x | 67.02 s | 72.00 s | 0.93x | +| `query_format` | 124.14 s | 76.88 s | 1.61x | 119.16 s | 72.27 s | 1.65x | +| `stats` | 77.45 s | 77.12 s | 1.00x | 72.86 s | 72.55 s | 1.00x | +| `filter_gt` | 531.20 s | 453.21 s | 1.17x | 512.95 s | 434.35 s | 1.18x | + +## Parser Helper Trim + +Reviewed the `vcf.c` implementation for duplicated fast-path helper code. A +first attempt collapsed the fixed-width integer vector parsers into one generic +counted loop. Correctness held, but likelihood-shaped rows regressed by roughly +10% in the focused benchmark, so that version was rejected. + +The retained refactor is intentionally narrower: remove unused non-range integer +vector helpers, remove an unused scalar helper, and centralize only the empty +integer-vector fill case. The hand-unrolled range parsers for common vector +widths remain because they are part of the measured hot path. + +Result: retained. After deleting those helpers, the final `vcf.c` diff is about +116 lines smaller than at the previous branch tip, with byte-identical outputs on +the focused GT/likelihood/string corpus. A repeat likelihood benchmark was neutral: +CCDG-like plan time improved from 4.18 s to 4.12 s, reordered likelihood was +2.66 s to 2.70 s, and multiallelic likelihood improved from 3.01 s to 2.98 s. + +## Production Tightening Review + +Three review passes focused on production-readiness: code-size risk, correctness +risk, and upstream polish.
The retained implementation changes were deliberately +low-risk: + +- `HTS_VCF_FORMAT_PLAN` now enables only on `1`; old `interp` / `general` + aliases and typo-enables were removed. +- Planner statistics are incremented only when + `HTS_VCF_FORMAT_PLAN_STATS=1`, avoiding process-global counter writes in + normal runs. +- The row-op support check was folded into row-op resolution, removing a second + pass over the FORMAT operation list. +- The row-width bound was made explicit in the planner instead of using an + inline literal. +- Tests now assert that an unknown value such as `HTS_VCF_FORMAT_PLAN=off` + behaves like the generic parser. + +Result: retained. `make test/test_view test/test_format_plan_cache`, +the FORMAT-plan parser-output checks, and `test/test_format_plan_cache` pass. +At that point, the `vcf.c` implementation was about 1,594 added lines relative to +`origin/develop`, down from the earlier 1,703-line core. + +## Generic Executor Micro-Optimizations + +The next pass targeted the generic per-op executor rather than adding new +schema-specific kernels. Retained changes: + +- skip `max_counts` maintenance for row ops that cannot compact; +- update integer min/max directly on the common positive-integer parse path; +- reject over-wide measured `Number=.` / string fields during the measurement + pass instead of after scanning the full row; +- remove nullable `nread` checks from planner-private integer vector helpers. + +Result: retained. Focused FORMAT tests passed, `git diff --check` was clean, +and the htslib large corpus in +`bench/format-shape/large/results-opt-batch1b` compared byte-identical to +baseline. 
+ +| Input | Baseline user | Plan user | User speedup | Hits/fallback | +|---|---:|---:|---:|---:| +| CCDG 10k | 2.50 s | 2.20 s | 1.14x | 8,396 / 1,604 | +| 1000G chr22 full GT | 25.08 s | 8.99 s | 2.79x | 1,103,547 / 0 | +| Large CCDG-like synthetic | 4.02 s | 3.68 s | 1.09x | 20,000 / 0 | +| Large reordered likelihood | 2.91 s | 2.38 s | 1.22x | 20,000 / 0 | +| Large multiallelic likelihood | 3.19 s | 2.64 s | 1.21x | 16,000 / 0 | +| Large float/string | 2.89 s | 2.88 s | 1.00x | 0 / 16,000 | +| Variable phase widths | 2.57 s | 2.44 s | 1.05x | 12,000 / 0 | +| Mixed row-local fallbacks | 2.20 s | 1.83 s | 1.20x | 12,000 / 0 | +| GT-first reordered | 1.73 s | 1.41 s | 1.23x | 12,000 / 0 | +| Two-string float | 2.25 s | 2.24 s | 1.00x | 0 / 12,000 | + +One broader structural attempt was rejected: splitting the all-samples loop from +the `keep_samples` loop. Correctness held, but +`bench/format-shape/large/results-opt-nosubset-split` was slower across the +planned corpus: CCDG 10k plan user time moved from 2.20 s to 2.28 s, 1000G +GT-only from 8.99 s to 9.30 s, and the likelihood-shaped synthetic rows also +regressed. That change was reverted. + +The standard bcftools GIAB/CCDG command corpus was then run against a bcftools +binary explicitly linked to this checkout with: + +```sh +make HTSDIR=../htslib-vcf-avx-sanity bcftools +``` + +All command outputs compared `ok` in +`bench/format-shape/large/results-bcftools-giab-ccdg-opt-batch1`. The command +profile stayed positive where FORMAT parsing matters and neutral/noisy where it +does not: CCDG 10k `query_format` was 1.55x faster by user time, CCDG 10k +`view_bcf` was 1.12x faster, and GIAB single-sample `query_format` rows were +roughly 1.08-1.12x faster. + +## Fallback Reason Counters And Split Width Caps + +The next regression investigation focused on CCDG rows that were falling back +because phase-set string fields exceeded the old single planned-width limit. 
+The implementation now reports fallback reasons under +`HTS_VCF_FORMAT_PLAN_STATS=1`: + +- unsupported schema; +- guard cooldown; +- numeric width; +- string width; +- GT shape; +- parse failure; +- separator mismatch; +- sample-count mismatch. + +The single width cap was split into a 64-value numeric-vector cap and a +256-byte measured-string cap. Numeric and string width fallbacks are diagnostic +only for the normal schema guard: they do not disable a schema that succeeds on +nearby rows. + +Two string caps were benchmarked. A 512-byte cap planned all CCDG 10k rows but +had a mixed bcftools-level signal. The retained 256-byte cap planned 9,861 of +10,000 CCDG rows and left the 139 longest string rows on the generic parser: + +```text +vcf-format-plan attempts=10000 hits=9861 fallback=139 parsed_samples=31574922 +vcf-format-plan-fallback unsupported=0 guard=0 numeric_width=0 string_width=139 gt_shape=0 parse=0 separator=0 sample_count=0 +``` + +Result: retained. Focused tests passed, `git diff --check` was clean, and the +htslib large corpus in `bench/format-shape/large/results-string-cap256-reasons` +compared byte-identical to baseline. CCDG 10k user time was 2.47 s baseline +versus 2.17 s planned, 1000G chr22 full GT was 24.70 s versus 9.75 s, and the +likelihood-shaped synthetic rows remained faster or neutral. + +The bcftools GIAB/CCDG command corpus in +`bench/format-shape/large/results-bcftools-giab-ccdg-cap256` also compared +byte-identical. CCDG 10k user-time speedups were 1.13x for `view_bcf`, 1.56x +for `query_format`, and 1.10x for `filter_gt`; GIAB FORMAT-query rows were +1.07-1.15x faster, while site-only controls and `stats` remained neutral/noisy. + +## Repo Test Harness Hardening + +The final hardening pass moved the important small-case checks from the +benchmark directory into the actual htslib test harness. 
The bespoke shell test +was removed; the production-facing checks now live in `test/test.pl` as +`test_vcf_format_plan`, while `test/test_format_plan_cache` remains the focused +cache-generation check. + +Retained changes: + +- successful planned rows call `vcf_parse_format_check7()`, matching the generic + parser's final FORMAT cardinality validation; +- fallback diagnostics are test-only hooks with `*_for_test` names and are + emitted only when `HTS_VCF_FORMAT_PLAN_STATS=1`; +- `test_vcf_format_plan` compares planned output against generic output + byte-for-byte, including selected-sample cases and disabled-control values + such as `HTS_VCF_FORMAT_PLAN=off`; +- new fixtures cover rollback after partial planned parsing, malformed + unselected samples under `bcf_hdr_set_samples()`, repeated wide GT values, and + malformed sample-count failures; +- dense-width guard behavior was tightened so sparse over-cap string rows do not + poison CCDG-like schemas. + +Result: retained. `make check` passed with 377/377 tests. `make +maintainer-check` was attempted but failed before the whitespace/copyright +checks because the local build invoked the C compiler on `test/usepublic.cpp` +with `-std=gnu23`. The relevant whitespace check and `git diff --check` passed +separately. + +The htslib large corpus in `bench/format-shape/large/results-prod-hardening2` +compared byte-identical to baseline. CCDG 10k held the expected 9,861 / 139 +hit/fallback split, and 1000G chr22 full GT remained the largest win at +24.61 s baseline user time versus 9.48 s planned. + +The latest bcftools GIAB/CCDG command corpus in +`bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening` also +compared byte-identical. CCDG 10k user-time speedups were 1.14x for `view_bcf`, +1.56x for `query_format`, and 1.12x for `filter_gt`; GIAB single-sample FORMAT +rows remained modestly positive, as expected. 
+ ## Main Lessons - Tag-level composition is the right MVP boundary; exact full FORMAT strings are diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md index 3545617c4..8f384a61f 100644 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -18,10 +18,10 @@ unsupported decisions tied to the exact header metadata that produced them. If the row fits the supported operation set, the dynamic executor parses samples and writes BCF's transposed FORMAT layout directly. If anything looks unsafe or -unsupported, htslib falls back to the production parser for the whole FORMAT +unsupported, htslib falls back to the generic parser for the whole FORMAT column. The planner also keeps a small profitability gate: schemas dominated by measured strings plus float vectors, such as `GT:FT:PID:GL:DP`, currently use -the production parser because the dynamic path's width-measurement work costs +the generic parser because the dynamic path's width-measurement work costs more than it saves. The optimized path also supports selected-sample reads. When @@ -29,6 +29,12 @@ The optimized path also supports selected-sample reads. When unretained samples, and writes the retained samples densely into the BCF FORMAT blocks. +Fallbacks are whole-row, but they are now classified for diagnostics when +`HTS_VCF_FORMAT_PLAN_STATS=1` is set. The current reason counters distinguish +unsupported schemas, guard cooldowns, numeric-width limits, string-width limits, +GT shape misses, parse failures, separator mismatches, and sample-count +mismatches. + ## Why This Shape The important design choice is tag-level composition. A file does not need an @@ -78,15 +84,15 @@ merge manifest. ## Drawbacks The MVP intentionally keeps fallback whole-row. It does not parse supported -tags dynamically while delegating only one unsupported tag to the production +tags dynamically while delegating only one unsupported tag to the generic parser. 
That makes correctness easier to reason about, but a single unsupported -tag or malformed row means the entire FORMAT column uses the production parser. +tag or malformed row means the entire FORMAT column uses the generic parser. Known fallback cases include: - undefined FORMAT tags that require production header repair; - unsupported header types or number models; -- unprofitable string/float-heavy schemas; +- low-profit string/float-heavy schemas; - duplicate FORMAT tags; - malformed separators or unexpected sample cardinality; - row-local widths above the bounded fast-path limit; @@ -96,17 +102,25 @@ The path is also not always faster. Some string/float-heavy layouts are roughly at parity or slightly slower than baseline because the dynamic path still pays measurement, dispatch, and scratch-buffer costs. +The current planned width limits are intentionally conservative: measured +numeric vectors are capped at 64 values, and measured strings are capped at +256 bytes. Rows above those limits use the generic parser; numeric/string width +misses do not by themselves disable the schema for later rows. + +Correctness checks for this path now live in the normal htslib test harness, not +only in the benchmark directory. `make check` runs black-box byte-identity +fixtures through `test/test.pl`, selected-sample checks, malformed-input checks, +and focused header-cache generation coverage. + ## User-Facing Controls ```text -unset / 0 production parser only -1 dynamic per-tag planner, then production fallback -interp/general aliases for the same dynamic planner +unset / 0 generic parser only +1 dynamic per-tag planner, then generic fallback ``` The benchmark harness reports only `HTS_VCF_FORMAT_PLAN=1` as `plan`. -`interp` and `general` remain accepted aliases for manual debugging, but they are -not distinct implementations. +Other values are treated as disabled. 
## Related Docs diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf index 8467573cc..085434543 100644 --- a/test/format-plan-edge.vcf +++ b/test/format-plan-edge.vcf @@ -33,5 +33,6 @@ chr22 10593000 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:12:3,4,5,6 0/0:-129,2 chr22 10593500 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:32767:3,4,5,6 0/0:-32760,32768:-32761:8,0,0,0 ./.:32767,32768:0:127,128,32767,32768 chr22 10594000 . A T 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 0|1:4,5:9:50:0|1:P1:90,0,90 0/1:3,2:5:20:0|1:10594000_A_T_LONG_PHASE_SET:20,0,200 ./.:0,0:0:.:.:.:. chr22 10595000 . A T 50 PASS . GT 0/1 1|1 ./. +chr22 10595500 . A T 50 PASS . GT .|. 0|. .|0 chr22 10596000 . A T 50 PASS . GT 0 1 . chr22 10597000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . GT 10/10 0/10 ./. diff --git a/test/format-plan-fallback.vcf b/test/format-plan-fallback.vcf new file mode 100644 index 000000000..5d082753b --- /dev/null +++ b/test/format-plan-fallback.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:PID:DP 0/1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA:12 0/0:P2:8 ./.:.:0 +1 2 . A C . PASS . GT:DP 0/1 0/0:8 ./.:0 +1 3 . A C . PASS . 
GT:QS:DP 0/1:1.5:999999999999999999999999999999 0/0:2.5:8 ./.:.:0 diff --git a/test/format-plan-profitability.vcf b/test/format-plan-float-string.vcf similarity index 100% rename from test/format-plan-profitability.vcf rename to test/format-plan-float-string.vcf diff --git a/test/format-plan-repeated-wide-gt.vcf b/test/format-plan-repeated-wide-gt.vcf new file mode 100644 index 000000000..4b1f93a1e --- /dev/null +++ b/test/format-plan-repeated-wide-gt.vcf @@ -0,0 +1,14 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 +1 1 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 2 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 3 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 4 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 5 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 6 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 7 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 8 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 9 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 10 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 diff --git a/test/format-plan-sample-count.vcf b/test/format-plan-sample-count.vcf new file mode 100644 index 000000000..b5c168c6d --- /dev/null +++ b/test/format-plan-sample-count.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 +1 1 . A C . PASS . GT:DP 0/1:7 diff --git a/test/format-plan-sample-skip.vcf b/test/format-plan-sample-skip.vcf new file mode 100644 index 000000000..3027add4a --- /dev/null +++ b/test/format-plan-sample-skip.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . 
GT:PID:DP 0/1:P1:7 0/1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA:not_an_int 0/0:P3:8 diff --git a/test/test.pl b/test/test.pl index eaa65ea30..50aee44fa 100755 --- a/test/test.pl +++ b/test/test.pl @@ -55,6 +55,7 @@ run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out'); run_test('test_vcf_various',$opts); run_test('test_vcf_44', $opts); +run_test('test_vcf_format_plan', $opts); run_test('test_bcf_sr_sort',$opts); run_test('test_bcf_sr_no_index',$opts); run_test('test_bcf_sr_range', $opts); @@ -1211,6 +1212,115 @@ sub test_vcf_44 cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf"); } +sub test_vcf_format_plan_one +{ + my ($opts, $input, $label, $extra_args) = @_; + my $base = "$$opts{tmp}/$label.base.bcf"; + my $plan = "$$opts{tmp}/$label.plan.bcf"; + my $disabled = "$$opts{tmp}/$label.disabled.bcf"; + my $test = "VCF FORMAT planner: $label"; + my $args = defined($extra_args) ? $extra_args : ""; + + print "$test:\n"; + + my $cmd = "env HTS_VCF_FORMAT_PLAN=0 $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $base"; + print "\t$cmd\n"; + my ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, "generic parser command failed\n$out"); + return; + } + + $cmd = "env HTS_VCF_FORMAT_PLAN=1 $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $plan"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, "planned parser command failed\n$out"); + return; + } + + $cmd = "cmp $base $plan"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, $out ? 
$out : "planned output differs from generic output"); + return; + } + + $cmd = "env HTS_VCF_FORMAT_PLAN=off $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $disabled"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, "disabled-mode command failed\n$out"); + return; + } + + $cmd = "cmp $base $disabled"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, $out ? $out : "disabled-mode output differs from generic output"); + return; + } + + passed($opts, $test); +} + +sub test_vcf_format_plan_failure +{ + my ($opts, $input, $label) = @_; + my $base = "$$opts{tmp}/$label.base.bcf"; + my $plan = "$$opts{tmp}/$label.plan.bcf"; + my $test = "VCF FORMAT planner expected failure: $label"; + + print "$test:\n"; + + my $cmd = "env HTS_VCF_FORMAT_PLAN=0 $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $base"; + print "\t$cmd\n"; + my ($base_ret, $base_out) = _cmd($cmd); + + $cmd = "env HTS_VCF_FORMAT_PLAN=1 $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $plan"; + print "\t$cmd\n"; + my ($plan_ret, $plan_out) = _cmd($cmd); + + if ($base_ret == 0 || $plan_ret == 0) { + failed($opts, $test, "expected both parser modes to fail, got generic=$base_ret planned=$plan_ret"); + return; + } + + passed($opts, $test); +} + +sub test_vcf_format_plan +{ + my ($opts) = @_; + + for my $input ( + "format-plan-edge.vcf", + "format-plan-header-mismatch.vcf", + "format-plan-composable.vcf", + "format-plan-gt-header-shape.vcf", + "format-plan-cache.vcf", + "format-plan-float-string.vcf", + "format-plan-fallback.vcf", + "format-plan-repeated-wide-gt.vcf") { + (my $label = $input) =~ s/\.vcf$//; + test_vcf_format_plan_one($opts, $input, $label, ""); + } + + for my $samples ("S1,S3", "S2", "^S2") { + for my $input ("format-plan-composable.vcf", "format-plan-edge.vcf") { + (my $label = "$input.$samples") =~ s/[^A-Za-z0-9_.-]/_/g; + test_vcf_format_plan_one($opts, $input, $label, "-s '$samples'"); + } + } + + 
test_vcf_format_plan_one($opts, "format-plan-sample-skip.vcf", + "format-plan-sample-skip.S1_S3", "-s S1,S3"); + test_vcf_format_plan_failure($opts, "format-plan-sample-count.vcf", + "format-plan-sample-count"); +} + sub write_multiblock_bgzf { my ($name, $frags) = @_; diff --git a/test/test_format_plan.sh b/test/test_format_plan.sh deleted file mode 100755 index d47741777..000000000 --- a/test/test_format_plan.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/sh -set -eu - -test_view=${TEST_VIEW:-./test/test_view} -inputs=${1:-"test/format-plan-edge.vcf test/format-plan-header-mismatch.vcf test/format-plan-composable.vcf test/format-plan-gt-header-shape.vcf test/format-plan-cache.vcf test/format-plan-profitability.vcf"} -tmpdir=${TMPDIR:-/tmp} -base=${tmpdir}/hts-format-plan-base.$$ -plan=${tmpdir}/hts-format-plan-plan.$$ -interp=${tmpdir}/hts-format-plan-interp.$$ -stats=${tmpdir}/hts-format-plan-stats.$$ -interp_stats=${tmpdir}/hts-format-plan-interp-stats.$$ - -trap 'rm -f "$base" "$plan" "$interp" "$stats" "$interp_stats"' EXIT HUP INT TERM - -for input in $inputs -do - env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 "$input" > "$base" - env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$plan" 2> "$stats" - env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 "$input" > "$interp" 2> "$interp_stats" - cmp "$base" "$plan" - cmp "$base" "$interp" - case "$input" in - *format-plan-cache.vcf) - grep -q 'attempts=21 hits=21 fallback=0 ' "$stats" - grep -q 'attempts=21 hits=21 fallback=0 ' "$interp_stats" - ;; - *format-plan-profitability.vcf) - grep -q 'attempts=1 hits=0 fallback=1 ' "$stats" - grep -q 'attempts=1 hits=0 fallback=1 ' "$interp_stats" - ;; - esac - cat "$stats" - cat "$interp_stats" -done - -for samples in S1,S3 S2 ^S2 -do - for input in test/format-plan-composable.vcf test/format-plan-edge.vcf - do - env HTS_VCF_FORMAT_PLAN=0 "$test_view" -b -l 0 -s "$samples" "$input" > "$base" - env 
HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 -s "$samples" "$input" > "$plan" 2> "$stats" - env HTS_VCF_FORMAT_PLAN=interp HTS_VCF_FORMAT_PLAN_STATS=1 "$test_view" -b -l 0 -s "$samples" "$input" > "$interp" 2> "$interp_stats" - cmp "$base" "$plan" - cmp "$base" "$interp" - cat "$stats" - cat "$interp_stats" - done -done diff --git a/test/test_format_plan_cache.c b/test/test_format_plan_cache.c index ca2263ce3..2575dadbf 100644 --- a/test/test_format_plan_cache.c +++ b/test/test_format_plan_cache.c @@ -22,8 +22,6 @@ DEALINGS IN THE SOFTWARE. */ #include -#include -#include #include #include #include @@ -31,9 +29,6 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/kstring.h" #include "../htslib/vcf.h" -void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, uint64_t *parsed_samples); - static void fail(const char *msg) { fprintf(stderr, "%s\n", msg); @@ -72,6 +67,31 @@ static void check_x_values(bcf_hdr_t *hdr, bcf1_t *rec, free(values); } +static void check_x_float(bcf_hdr_t *hdr, bcf1_t *rec, float expected) +{ + bcf_fmt_t *fmt; + float *values = NULL; + int n_values = 0, ret; + + check0(bcf_unpack(rec, BCF_UN_FMT)); + fmt = bcf_get_fmt(hdr, rec, "X"); + if (!fmt) + fail("missing X FORMAT field"); + if (fmt->type != BCF_BT_FLOAT || fmt->n != 1) + fail("unexpected X FORMAT storage type"); + + ret = bcf_get_format_float(hdr, rec, "X", &values, &n_values); + if (ret != 1) { + free(values); + fail("unexpected X float vector length"); + } + if (values[0] != expected) { + free(values); + fail("unexpected X float value"); + } + free(values); +} + int main(void) { static char header[] = @@ -81,11 +101,9 @@ int main(void) "##FORMAT=\n" "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\n"; static const int32_t x1[] = { 7 }; - static const int32_t x2[] = { 11, 13 }; bcf_hdr_t *hdr; bcf1_t *rec; kstring_t line = KS_INITIALIZE; - uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; 
check0(setenv("HTS_VCF_FORMAT_PLAN", "1", 1)); hdr = bcf_hdr_init("r"); @@ -100,28 +118,18 @@ int main(void) /* * Rebuild the same FORMAT string against changed metadata. A stale plan - * would still think X is Number=1 and would either fall back or encode the - * second row incorrectly. The header-owned generation must force a fresh - * compile, preserving both correctness and fast-path coverage. + * would still think X is an integer and could encode the second row with + * integer storage even though the header now declares a float. The + * header-owned generation must force a fresh compile. */ bcf_hdr_remove(hdr, BCF_HL_FMT, "X"); check0(bcf_hdr_append(hdr, - "##FORMAT=")); + "##FORMAT=")); check0(bcf_hdr_sync(hdr)); bcf_clear1(rec); parse_line(hdr, rec, &line, - "1\t2\t.\tA\tC\t.\tPASS\t.\tGT:X\t0/1:11,13"); - check_x_values(hdr, rec, x2, 2); - - hts_vcf_format_plan_stats(&attempts, &hits, &fallback, &parsed_samples); - if (attempts != 2 || hits != 2 || fallback != 0 || parsed_samples != 2) { - fprintf(stderr, - "unexpected planner stats: attempts=%" PRIu64 - " hits=%" PRIu64 " fallback=%" PRIu64 - " parsed_samples=%" PRIu64 "\n", - attempts, hits, fallback, parsed_samples); - return EXIT_FAILURE; - } + "1\t2\t.\tA\tC\t.\tPASS\t.\tGT:X\t0/1:2"); + check_x_float(hdr, rec, 2.0f); bcf_destroy(rec); bcf_hdr_destroy(hdr); diff --git a/test/test_view.c b/test/test_view.c index 594c1fd75..08cf53bbe 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -37,9 +37,17 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "../htslib/vcf.h" #include "../htslib/hts_log.h" -extern void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, - uint64_t *parsed_samples); +extern void vcf_format_plan_stats_for_test(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, + uint64_t *parsed_samples); +extern void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, + uint64_t *guard, + uint64_t *numeric_width, + uint64_t *string_width, + uint64_t *gt_shape, + uint64_t *parse, + uint64_t *separator, + uint64_t *sample_count); struct opts { char *fn_ref; @@ -442,14 +450,28 @@ int main(int argc, char *argv[]) if (p.pool) hts_tpool_destroy(p.pool); - if (getenv("HTS_VCF_FORMAT_PLAN_STATS")) { + const char *format_plan_stats = getenv("HTS_VCF_FORMAT_PLAN_STATS"); + if (format_plan_stats && strcmp(format_plan_stats, "1") == 0) { uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; - hts_vcf_format_plan_stats(&attempts, &hits, &fallback, &parsed_samples); + uint64_t unsupported = 0, guard = 0; + uint64_t numeric_width = 0, string_width = 0, gt_shape = 0, parse = 0; + uint64_t separator = 0, sample_count = 0; + vcf_format_plan_stats_for_test(&attempts, &hits, &fallback, &parsed_samples); + vcf_format_plan_fallback_stats_for_test(&unsupported, &guard, &numeric_width, + &string_width, &gt_shape, + &parse, &separator, + &sample_count); fprintf(stderr, "vcf-format-plan attempts=%llu hits=%llu fallback=%llu parsed_samples=%llu\n", (unsigned long long) attempts, (unsigned long long) hits, (unsigned long long) fallback, (unsigned long long) parsed_samples); + fprintf(stderr, + "vcf-format-plan-fallback unsupported=%llu guard=%llu numeric_width=%llu string_width=%llu gt_shape=%llu parse=%llu separator=%llu sample_count=%llu\n", + (unsigned long long) unsupported, (unsigned long long) guard, + (unsigned long long) numeric_width, (unsigned long long) string_width, + (unsigned long long) gt_shape, (unsigned long long) parse, + (unsigned long
long) separator, (unsigned long long) sample_count); } if (fclose(stdout) != 0 && errno != EBADF) { diff --git a/vcf.c b/vcf.c index a2732bafb..f17838c37 100644 --- a/vcf.c +++ b/vcf.c @@ -3217,6 +3217,8 @@ static inline int align_mem(kstring_t *s) #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ +static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v); + typedef struct { uint64_t attempts; uint64_t hits; @@ -3226,26 +3228,36 @@ typedef struct { static vcf_format_plan_stats_t vcf_format_plan_stats; +typedef enum { + VCF_FORMAT_PLAN_FB_UNSUPPORTED = 0, + VCF_FORMAT_PLAN_FB_GUARD, + VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH, + VCF_FORMAT_PLAN_FB_STRING_WIDTH, + VCF_FORMAT_PLAN_FB_GT_SHAPE, + VCF_FORMAT_PLAN_FB_PARSE, + VCF_FORMAT_PLAN_FB_SEPARATOR, + VCF_FORMAT_PLAN_FB_SAMPLE_COUNT, + VCF_FORMAT_PLAN_FB_N +} vcf_format_plan_fallback_reason_t; + +static uint64_t vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_N]; + /* * Dynamic FORMAT fast path. * - * The production FORMAT parser below is intentionally very permissive: it can + * The existing FORMAT parser below is intentionally very permissive: it can * repair missing header declarations, deal with sample subsetting, and recover * from many odd row shapes. The fast path here only claims rows that can be * described by the existing FORMAT header metadata and parsed as a fixed list * of per-tag operations. If any compile-time or row-local invariant fails, it - * returns -3 to let the production parser handle the whole FORMAT column. + * returns -3 to let the generic parser handle the whole FORMAT column. * * HTS_VCF_FORMAT_PLAN controls the feature: - * unset/0 use production parser only - * 1/interp/general - * use the dynamic per-tag plan, with production fallback - * - * Older experimental exact kernels have been removed; all enabled spellings - * now route through the same dynamic planner/executor. 
+ * unset/0 use the generic parser only + * 1 try the dynamic per-tag plan, with generic fallback */ -void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, uint64_t *parsed_samples) +void vcf_format_plan_stats_for_test(uint64_t *attempts, uint64_t *hits, + uint64_t *fallback, uint64_t *parsed_samples) { if (attempts) *attempts = vcf_format_plan_stats.attempts; if (hits) *hits = vcf_format_plan_stats.hits; @@ -3253,153 +3265,257 @@ void hts_vcf_format_plan_stats(uint64_t *attempts, uint64_t *hits, if (parsed_samples) *parsed_samples = vcf_format_plan_stats.parsed_samples; } -static int vcf_format_plan_mode(void) +void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, + uint64_t *guard, + uint64_t *numeric_width, + uint64_t *string_width, + uint64_t *gt_shape, + uint64_t *parse, + uint64_t *separator, + uint64_t *sample_count) +{ + if (unsupported) + *unsupported = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_UNSUPPORTED]; + if (guard) + *guard = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_GUARD]; + if (numeric_width) + *numeric_width = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH]; + if (string_width) + *string_width = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_STRING_WIDTH]; + if (gt_shape) + *gt_shape = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_GT_SHAPE]; + if (parse) + *parse = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_PARSE]; + if (separator) + *separator = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SEPARATOR]; + if (sample_count) + *sample_count = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SAMPLE_COUNT]; +} + +static int vcf_format_plan_stats_enabled(void) +{ + static int enabled = -1; + + if (enabled < 0) { + const char *env = getenv("HTS_VCF_FORMAT_PLAN_STATS"); + enabled = env && strcmp(env, "1") == 0; + } + return enabled; +} + +static int vcf_format_plan_enabled(void) { - static int mode = -1; - if (mode < 0) { + static int 
enabled = -1; + + if (enabled < 0) { const char *env = getenv("HTS_VCF_FORMAT_PLAN"); - if (!env || !env[0] || strcmp(env, "0") == 0) - mode = 0; - else if (strcmp(env, "interp") == 0 || strcmp(env, "general") == 0) - mode = 2; - else - mode = 1; + enabled = env && strcmp(env, "1") == 0; } - return mode; + return enabled; } typedef struct { - uint32_t attempts; - uint32_t hits; - uint32_t fallbacks; - uint16_t miss_streak; - uint16_t cooldown; - uint8_t disabled; + uint32_t attempts; + uint32_t hits; + uint32_t fallbacks; + uint32_t width_attempts; + uint32_t width_fallbacks; + uint16_t miss_streak; + uint16_t width_miss_streak; + uint16_t cooldown; + uint8_t disabled; } vcf_format_fast_guard_t; enum { - VCF_FORMAT_FAST_DISABLE_STREAK = 8, - VCF_FORMAT_FAST_PROBE_ATTEMPTS = 128, - VCF_FORMAT_FAST_MAX_FALLBACK_PCT = 10, - VCF_FORMAT_FAST_COOLDOWN_RECORDS = 256 + VCF_FORMAT_FAST_DISABLE_STREAK = 8, + VCF_FORMAT_FAST_PROBE_ATTEMPTS = 128, + VCF_FORMAT_FAST_MAX_FALLBACK_PCT = 10, + VCF_FORMAT_FAST_WIDTH_PROBE_ATTEMPTS = 128, + VCF_FORMAT_FAST_MAX_WIDTH_FALLBACK_PCT = 75, + VCF_FORMAT_FAST_COOLDOWN_RECORDS = 256, + VCF_FORMAT_MAX_NUMERIC_WIDTH = 64, + VCF_FORMAT_MAX_STRING_WIDTH = 256 }; +static inline void vcf_format_plan_note_fallback(vcf_format_plan_fallback_reason_t reason) +{ + if (vcf_format_plan_stats_enabled()) { + vcf_format_plan_stats.fallback++; + if ((unsigned)reason < VCF_FORMAT_PLAN_FB_N) + vcf_format_plan_fallback_reasons[reason]++; + } +} + +static inline void vcf_format_plan_set_reason(vcf_format_plan_fallback_reason_t *dst, + vcf_format_plan_fallback_reason_t reason) +{ + if (dst) + *dst = reason; +} + +static inline int vcf_format_plan_width_reason(vcf_format_plan_fallback_reason_t reason) +{ + return reason == VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH || + reason == VCF_FORMAT_PLAN_FB_STRING_WIDTH; +} + +static inline int vcf_format_plan_guard_counts_reason(vcf_format_plan_fallback_reason_t reason) +{ + /* + * Row-local width limits are expected on mixed 
real-world files. They are + * tracked by a separate dense-width guard so sparse long rows do not disable + * an otherwise useful schema. + */ + return !vcf_format_plan_width_reason(reason); +} + +static inline void vcf_format_fast_guard_reset(vcf_format_fast_guard_t *guard) +{ + guard->attempts = 0; + guard->hits = 0; + guard->fallbacks = 0; + guard->width_attempts = 0; + guard->width_fallbacks = 0; + guard->miss_streak = 0; + guard->width_miss_streak = 0; + guard->disabled = 0; +} + static inline int vcf_format_fast_guard_enabled(vcf_format_fast_guard_t *guard) { - if (!guard->disabled) - return 1; - if (guard->cooldown) { - guard->cooldown--; - return 0; - } - guard->attempts = 0; - guard->hits = 0; - guard->fallbacks = 0; - guard->miss_streak = 0; - guard->disabled = 0; - return 1; + if (!guard->disabled) + return 1; + if (guard->cooldown) { + guard->cooldown--; + return 0; + } + vcf_format_fast_guard_reset(guard); + return 1; } static inline void vcf_format_fast_guard_success(vcf_format_fast_guard_t *guard) { - if (guard->attempts != UINT32_MAX) - guard->attempts++; - if (guard->hits != UINT32_MAX) - guard->hits++; - guard->miss_streak = 0; + if (guard->attempts != UINT32_MAX) + guard->attempts++; + if (guard->hits != UINT32_MAX) + guard->hits++; + if (guard->width_attempts != UINT32_MAX) + guard->width_attempts++; + guard->miss_streak = 0; + guard->width_miss_streak = 0; } static inline void vcf_format_fast_guard_fallback(vcf_format_fast_guard_t *guard) { - if (guard->attempts != UINT32_MAX) - guard->attempts++; - if (guard->fallbacks != UINT32_MAX) - guard->fallbacks++; - if (guard->miss_streak != UINT16_MAX) - guard->miss_streak++; - - if (guard->miss_streak >= VCF_FORMAT_FAST_DISABLE_STREAK) { - guard->disabled = 1; - guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; - return; - } - if (guard->attempts >= VCF_FORMAT_FAST_PROBE_ATTEMPTS && - (uint64_t) guard->fallbacks * 100 > - (uint64_t) guard->attempts * VCF_FORMAT_FAST_MAX_FALLBACK_PCT) { - 
guard->disabled = 1; - guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; - } + if (guard->attempts != UINT32_MAX) + guard->attempts++; + if (guard->fallbacks != UINT32_MAX) + guard->fallbacks++; + if (guard->miss_streak != UINT16_MAX) + guard->miss_streak++; + guard->width_miss_streak = 0; + + if (guard->miss_streak >= VCF_FORMAT_FAST_DISABLE_STREAK) { + guard->disabled = 1; + guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; + return; + } + if (guard->attempts >= VCF_FORMAT_FAST_PROBE_ATTEMPTS && + (uint64_t) guard->fallbacks * 100 > + (uint64_t) guard->attempts * VCF_FORMAT_FAST_MAX_FALLBACK_PCT) { + guard->disabled = 1; + guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; + } +} + +static inline void vcf_format_fast_guard_width_fallback(vcf_format_fast_guard_t *guard) +{ + if (guard->width_attempts != UINT32_MAX) + guard->width_attempts++; + if (guard->width_fallbacks != UINT32_MAX) + guard->width_fallbacks++; + if (guard->width_miss_streak != UINT16_MAX) + guard->width_miss_streak++; + + if (guard->width_attempts >= VCF_FORMAT_FAST_WIDTH_PROBE_ATTEMPTS && + (uint64_t) guard->width_fallbacks * 100 > + (uint64_t) guard->width_attempts * VCF_FORMAT_FAST_MAX_WIDTH_FALLBACK_PCT) { + guard->disabled = 1; + guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; + } } typedef struct { - /* - * Header-derived operation for one FORMAT tag. This is the reusable, - * record-independent part of the plan: the tag key, declared type, declared - * length model, and whether the row must measure the width before parsing. - */ - int key; - int number; - uint8_t htype; - uint8_t is_gt; - uint8_t vl_type; - uint8_t measured_width; + /* + * Header-derived operation for one FORMAT tag. This is the reusable, + * record-independent part of the plan: the tag key, declared type, declared + * length model, and whether the row must measure the width before parsing. 
+ */ + int key; + int number; + uint8_t htype; + uint8_t is_gt; + uint8_t vl_type; + uint8_t measured_width; } vcf_format_op_t; typedef struct { - /* - * Cache key is the literal FORMAT string plus the private header - * generation. FORMAT key ids/types are header-local, so plans are owned by - * the header aux block and invalidated whenever bcf_hdr_sync() rebuilds the - * dictionaries. Unsupported plans are cached too; repeated uncommon or - * undefined FORMAT strings should pay the compile cost once, then fall back - * directly to the production parser. - */ - char *format; - size_t format_len; - uint64_t format_hash; - uint64_t hdr_gen; - int supported; - int n_ops; - vcf_format_op_t ops[MAX_N_FMT]; - vcf_format_fast_guard_t general_guard; + /* + * Cache key is the literal FORMAT string plus the private header + * generation. FORMAT key ids/types are header-local, so plans are owned by + * the header aux block and invalidated whenever bcf_hdr_sync() rebuilds the + * dictionaries. Unsupported plans are cached too; repeated uncommon or + * undefined FORMAT strings should pay the compile cost once, then fall back + * directly to the generic parser. 
+ */ + char *format; + size_t format_len; + uint64_t format_hash; + uint64_t hdr_gen; + int supported; + vcf_format_plan_fallback_reason_t fallback_reason; + int n_ops; + vcf_format_op_t ops[MAX_N_FMT]; + vcf_format_fast_guard_t general_guard; } vcf_format_general_plan_t; struct vcf_format_plan_cache_t { - vcf_format_general_plan_t *plans; - int n; - int m; - int next_evict; - uint64_t hdr_gen; + vcf_format_general_plan_t *plans; + int n; + int m; + int next_evict; + uint64_t hdr_gen; }; typedef enum { - VCF_FORMAT_ROW_GT, - VCF_FORMAT_ROW_GT2, - VCF_FORMAT_ROW_INT1, - VCF_FORMAT_ROW_INT2, - VCF_FORMAT_ROW_INT3, - VCF_FORMAT_ROW_INTN, - VCF_FORMAT_ROW_FLOAT1, - VCF_FORMAT_ROW_FLOATN, - VCF_FORMAT_ROW_STR + VCF_FORMAT_ROW_GT, + VCF_FORMAT_ROW_GT2, + VCF_FORMAT_ROW_INT1, + VCF_FORMAT_ROW_INT2, + VCF_FORMAT_ROW_INT3, + VCF_FORMAT_ROW_INTN, + VCF_FORMAT_ROW_FLOAT1, + VCF_FORMAT_ROW_FLOATN, + VCF_FORMAT_ROW_STR } vcf_format_row_kind_t; typedef struct { - /* - * Row-local operation. Header Number=A/R/G and measured Number=. fields - * depend on the current record, so width/size/offset are resolved per row. - */ - int key; - int width; - int size; - int offset; - vcf_format_row_kind_t kind; + /* + * Row-local operation. Header Number=A/R/G and measured Number=. fields + * depend on the current record, so width/size/offset are resolved per row. 
+ */ + int key; + int width; + int size; + int offset; + vcf_format_row_kind_t kind; + uint8_t can_compact; } vcf_format_row_op_t; typedef struct { - int32_t min; - int32_t max; - int has_special; + int32_t min; + int32_t max; + int has_special; } vcf_plan_int_range_t; #if defined(__GNUC__) @@ -3410,137 +3526,137 @@ typedef struct { static uint64_t vcf_format_plan_hash(const char *format, size_t len) { - size_t i; - uint64_t hash = 1469598103934665603ULL; + size_t i; + uint64_t hash = 1469598103934665603ULL; - for (i = 0; i < len; i++) { - hash ^= (unsigned char) format[i]; - hash *= 1099511628211ULL; - } - return hash; + for (i = 0; i < len; i++) { + hash ^= (unsigned char) format[i]; + hash *= 1099511628211ULL; + } + return hash; } static void vcf_format_general_plan_destroy(vcf_format_general_plan_t *plan) { - if (!plan) - return; - free(plan->format); - memset(plan, 0, sizeof(*plan)); + if (!plan) + return; + free(plan->format); + memset(plan, 0, sizeof(*plan)); } static void vcf_format_plan_cache_clear(vcf_format_plan_cache_t *cache) { - int i; + int i; - if (!cache) - return; - for (i = 0; i < cache->n; i++) - vcf_format_general_plan_destroy(&cache->plans[i]); - cache->n = 0; - cache->next_evict = 0; + if (!cache) + return; + for (i = 0; i < cache->n; i++) + vcf_format_general_plan_destroy(&cache->plans[i]); + cache->n = 0; + cache->next_evict = 0; } static void vcf_format_plan_cache_destroy(vcf_format_plan_cache_t *cache) { - if (!cache) - return; - vcf_format_plan_cache_clear(cache); - free(cache->plans); - free(cache); + if (!cache) + return; + vcf_format_plan_cache_clear(cache); + free(cache->plans); + free(cache); } static vcf_format_plan_cache_t *vcf_format_plan_cache_get(const bcf_hdr_t *h) { - bcf_hdr_aux_t *aux = get_hdr_aux(h); + bcf_hdr_aux_t *aux = get_hdr_aux(h); - if (!aux) - return NULL; - if (!aux->format_plan_cache) { - aux->format_plan_cache = (vcf_format_plan_cache_t *) - calloc(1, sizeof(*aux->format_plan_cache)); - if 
(!aux->format_plan_cache) - return NULL; - aux->format_plan_cache->hdr_gen = aux->format_plan_gen; - } - if (aux->format_plan_cache->hdr_gen != aux->format_plan_gen) { - vcf_format_plan_cache_clear(aux->format_plan_cache); - aux->format_plan_cache->hdr_gen = aux->format_plan_gen; - } - return aux->format_plan_cache; + if (!aux) + return NULL; + if (!aux->format_plan_cache) { + aux->format_plan_cache = (vcf_format_plan_cache_t *) + calloc(1, sizeof(*aux->format_plan_cache)); + if (!aux->format_plan_cache) + return NULL; + aux->format_plan_cache->hdr_gen = aux->format_plan_gen; + } + if (aux->format_plan_cache->hdr_gen != aux->format_plan_gen) { + vcf_format_plan_cache_clear(aux->format_plan_cache); + aux->format_plan_cache->hdr_gen = aux->format_plan_gen; + } + return aux->format_plan_cache; } static int vcf_format_plan_cache_slot(vcf_format_plan_cache_t *cache) { - enum { VCF_FORMAT_PLAN_CACHE_INIT = 16, VCF_FORMAT_PLAN_CACHE_MAX = 128 }; - int i, idx, new_m; - vcf_format_general_plan_t *plans; - - if (cache->n < cache->m) - return cache->n++; - - if (cache->m < VCF_FORMAT_PLAN_CACHE_MAX) { - new_m = cache->m ? 
cache->m * 2 : VCF_FORMAT_PLAN_CACHE_INIT; - if (new_m > VCF_FORMAT_PLAN_CACHE_MAX) - new_m = VCF_FORMAT_PLAN_CACHE_MAX; - if ((size_t) new_m > SIZE_MAX / sizeof(*cache->plans)) - return -1; - plans = (vcf_format_general_plan_t *) - realloc(cache->plans, (size_t) new_m * sizeof(*cache->plans)); - if (!plans) - return -1; - memset(plans + cache->m, 0, - (size_t) (new_m - cache->m) * sizeof(*plans)); - cache->plans = plans; - cache->m = new_m; - return cache->n++; - } - - for (i = 0; i < cache->n; i++) { - idx = (cache->next_evict + i) % cache->n; - if (!cache->plans[idx].supported) - goto found; - } - idx = cache->next_evict; + enum { VCF_FORMAT_PLAN_CACHE_INIT = 16, VCF_FORMAT_PLAN_CACHE_MAX = 128 }; + int i, idx, new_m; + vcf_format_general_plan_t *plans; + + if (cache->n < cache->m) + return cache->n++; + + if (cache->m < VCF_FORMAT_PLAN_CACHE_MAX) { + new_m = cache->m ? cache->m * 2 : VCF_FORMAT_PLAN_CACHE_INIT; + if (new_m > VCF_FORMAT_PLAN_CACHE_MAX) + new_m = VCF_FORMAT_PLAN_CACHE_MAX; + if ((size_t) new_m > SIZE_MAX / sizeof(*cache->plans)) + return -1; + plans = (vcf_format_general_plan_t *) + realloc(cache->plans, (size_t) new_m * sizeof(*cache->plans)); + if (!plans) + return -1; + memset(plans + cache->m, 0, + (size_t) (new_m - cache->m) * sizeof(*plans)); + cache->plans = plans; + cache->m = new_m; + return cache->n++; + } + + for (i = 0; i < cache->n; i++) { + idx = (cache->next_evict + i) % cache->n; + if (!cache->plans[idx].supported) + goto found; + } + idx = cache->next_evict; found: - vcf_format_general_plan_destroy(&cache->plans[idx]); - cache->next_evict = (idx + 1) % cache->n; - return idx; + vcf_format_general_plan_destroy(&cache->plans[idx]); + cache->next_evict = (idx + 1) % cache->n; + return idx; } static int vcf_format_general_plan_profitable(const vcf_format_general_plan_t *plan) { - int j, string_ops = 0, float_vector_ops = 0, int_ops = 0, int_vector_ops = 0; - - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_op_t *op = 
&plan->ops[j]; - if (op->is_gt) - continue; - if (op->htype == BCF_HT_STR) { - string_ops++; - } else if (op->htype == BCF_HT_REAL) { - if (op->vl_type == BCF_VL_FIXED && op->number == 1) - ; - else - float_vector_ops++; - } else if (op->htype == BCF_HT_INT) { - int_ops++; - if (op->vl_type != BCF_VL_FIXED || op->number != 1) - int_vector_ops++; - } - } - - /* - * FORMAT rows with measured strings plus float vectors have to pay the - * dynamic executor's full width-measurement pass and then still use the - * general float conversion path. Without integer vectors to amortize that - * setup, production parsing has been consistently faster on the large - * corpus (for example GT:GL:FT:DP:GQ and GT:FT:PID:GL:DP). - */ - if (string_ops > 0 && float_vector_ops > 0 && - int_vector_ops == 0 && int_ops <= 2) - return 0; - return 1; + int j, string_ops = 0, float_vector_ops = 0, int_ops = 0, int_vector_ops = 0; + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + if (op->is_gt) + continue; + if (op->htype == BCF_HT_STR) { + string_ops++; + } else if (op->htype == BCF_HT_REAL) { + if (op->vl_type == BCF_VL_FIXED && op->number == 1) + ; + else + float_vector_ops++; + } else if (op->htype == BCF_HT_INT) { + int_ops++; + if (op->vl_type != BCF_VL_FIXED || op->number != 1) + int_vector_ops++; + } + } + + /* + * FORMAT rows with measured strings plus float vectors have to pay the + * dynamic executor's full width-measurement pass and then still use the + * general float conversion path. Without integer vectors to amortize that + * setup, production parsing has been consistently faster on the large + * corpus (for example GT:GL:FT:DP:GQ and GT:FT:PID:GL:DP). 
+ */ + if (string_ops > 0 && float_vector_ops > 0 && + int_vector_ops == 0 && int_ops <= 2) + return 0; + return 1; } static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *format, @@ -3548,32 +3664,33 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma uint64_t hdr_gen, vcf_format_general_plan_t *plan) { - char *tmp, *tok, *saveptr = NULL; - int i, ret = 0; - - memset(plan, 0, sizeof(*plan)); - plan->format = (char *) malloc(format_len + 1); - tmp = (char *) malloc(format_len + 1); - if (!plan->format || !tmp) { - free(tmp); - free(plan->format); - memset(plan, 0, sizeof(*plan)); - return -1; - } - memcpy(plan->format, format, format_len + 1); - memcpy(tmp, format, format_len + 1); - plan->format_len = format_len; - plan->format_hash = format_hash; - plan->hdr_gen = hdr_gen; - - /* - * Compile at tag granularity, not full FORMAT-shape granularity. This is - * what allows GT:AD, GT:AD:DP:PL, reordered fields, and supersets with - * additional header-described tags to share the same executor instead of - * needing exact string-specific kernels. - */ - for (tok = strtok_r(tmp, ":", &saveptr); tok; - tok = strtok_r(NULL, ":", &saveptr)) { + char *tmp, *tok, *saveptr = NULL; + int i, ret = 0; + + memset(plan, 0, sizeof(*plan)); + plan->format = (char *) malloc(format_len + 1); + tmp = (char *) malloc(format_len + 1); + if (!plan->format || !tmp) { + free(tmp); + free(plan->format); + memset(plan, 0, sizeof(*plan)); + return -1; + } + memcpy(plan->format, format, format_len + 1); + memcpy(tmp, format, format_len + 1); + plan->format_len = format_len; + plan->format_hash = format_hash; + plan->hdr_gen = hdr_gen; + plan->fallback_reason = VCF_FORMAT_PLAN_FB_UNSUPPORTED; + + /* + * Compile at tag granularity, not full FORMAT-shape granularity. 
This is + * what allows GT:AD, GT:AD:DP:PL, reordered fields, and supersets with + * additional header-described tags to share the same executor instead of + * needing exact string-specific kernels. + */ + for (tok = strtok_r(tmp, ":", &saveptr); tok; + tok = strtok_r(NULL, ":", &saveptr)) { int key, htype; if (plan->n_ops >= MAX_N_FMT) @@ -3585,17 +3702,17 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (plan->ops[i].key == key) goto done; - htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); - if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) - goto done; - - /* - * Only compile tags with enough header information to reproduce the - * production BCF layout. Undefined tags and exotic types intentionally - * stay on the production parser, which can emit warnings and install - * dummy header records where appropriate. - */ - plan->ops[plan->n_ops].key = key; + htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); + if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) + goto done; + + /* + * Only compile tags with enough header information to reproduce the + * production BCF layout. Undefined tags and exotic types intentionally + * stay on the generic parser, which can emit warnings and install + * dummy header records where appropriate. 
+ */ + plan->ops[plan->n_ops].key = key; plan->ops[plan->n_ops].number = bcf_hdr_id2number(h, BCF_HL_FMT, key); plan->ops[plan->n_ops].htype = htype; plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 0; @@ -3624,91 +3741,94 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (!plan->n_ops) goto done; - if (!vcf_format_general_plan_profitable(plan)) - goto done; + if (!vcf_format_general_plan_profitable(plan)) + goto done; plan->supported = 1; - ret = 1; + ret = 1; done: - free(tmp); - return ret; + free(tmp); + return ret; } static vcf_format_general_plan_t *vcf_format_general_plan_get(const bcf_hdr_t *h, - const char *format) -{ - bcf_hdr_aux_t *aux; - vcf_format_plan_cache_t *cache; - vcf_format_general_plan_t *plan; - size_t format_len; - uint64_t format_hash, hdr_gen; - int i, idx, ret; - - /* - * The compiler reads h->id[] and header metadata directly. If a caller has - * mutated the header but not synced it yet, the production parser is the - * only safe path because it already owns all header-repair semantics. - */ - if (h->dirty) - return NULL; - - aux = get_hdr_aux(h); - if (!aux) - return NULL; - cache = vcf_format_plan_cache_get(h); - if (!cache) - return NULL; - - format_len = strlen(format); - format_hash = vcf_format_plan_hash(format, format_len); - hdr_gen = aux->format_plan_gen; - for (i = 0; i < cache->n; i++) { - plan = &cache->plans[i]; - if (plan->format && plan->hdr_gen == hdr_gen && - plan->format_len == format_len && - plan->format_hash == format_hash && - memcmp(plan->format, format, format_len) == 0) - return plan->supported ? plan : NULL; - } - - idx = vcf_format_plan_cache_slot(cache); - if (idx < 0) - return NULL; - plan = &cache->plans[idx]; - ret = vcf_format_general_plan_compile(h, format, format_len, format_hash, - hdr_gen, plan); - if (ret < 0) { - vcf_format_general_plan_destroy(plan); - if (idx == cache->n - 1) - cache->n--; - return NULL; - } - return plan->supported ? 
plan : NULL; + const char *format, + vcf_format_plan_fallback_reason_t *reason) +{ + bcf_hdr_aux_t *aux; + vcf_format_plan_cache_t *cache; + vcf_format_general_plan_t *plan; + size_t format_len; + uint64_t format_hash, hdr_gen; + int i, idx, ret; + + /* + * The compiler reads h->id[] and header metadata directly. If a caller has + * mutated the header but not synced it yet, the generic parser is the + * only safe path because it already owns all header-repair semantics. + */ + if (h->dirty) + return NULL; + + aux = get_hdr_aux(h); + if (!aux) + return NULL; + cache = vcf_format_plan_cache_get(h); + if (!cache) + return NULL; + + format_len = strlen(format); + format_hash = vcf_format_plan_hash(format, format_len); + hdr_gen = aux->format_plan_gen; + for (i = 0; i < cache->n; i++) { + plan = &cache->plans[i]; + if (plan->format && plan->hdr_gen == hdr_gen && + plan->format_len == format_len && + plan->format_hash == format_hash && + memcmp(plan->format, format, format_len) == 0) { + if (!plan->supported) + vcf_format_plan_set_reason(reason, plan->fallback_reason); + return plan->supported ? plan : NULL; + } + } + + idx = vcf_format_plan_cache_slot(cache); + if (idx < 0) + return NULL; + plan = &cache->plans[idx]; + ret = vcf_format_general_plan_compile(h, format, format_len, format_hash, + hdr_gen, plan); + if (ret < 0) { + vcf_format_general_plan_destroy(plan); + if (idx == cache->n - 1) + cache->n--; + return NULL; + } + if (!plan->supported) + vcf_format_plan_set_reason(reason, plan->fallback_reason); + return plan->supported ? plan : NULL; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2_u8(const char **sp, uint8_t out[2]) { - const char *s = *sp; - int a0, a1, phased; + const char *s = *sp; + int a0, a1, phased; - if (s[0] == '.' 
&& (s[1] == '/' || s[1] == '|') && s[2] == '.') { - out[0] = 0; - out[1] = 0; - *sp = s + 3; - return 0; - } - if (!(s[0] >= '0' && s[0] <= '9') || (s[1] != '/' && s[1] != '|') || - !(s[2] >= '0' && s[2] <= '9')) - return -1; + if (((s[0] != '.' && !(s[0] >= '0' && s[0] <= '9'))) || + (s[1] != '/' && s[1] != '|') || + ((s[2] != '.' && !(s[2] >= '0' && s[2] <= '9')))) + return -1; - a0 = s[0] - '0'; - a1 = s[2] - '0'; - phased = s[1] == '|'; - out[0] = (uint8_t)(((a0 + 1) << 1) | phased); - out[1] = (uint8_t)(((a1 + 1) << 1) | phased); - *sp = s + 3; - return 0; + phased = s[1] == '|'; + a0 = s[0] == '.' ? -1 : s[0] - '0'; + a1 = s[2] == '.' ? -1 : s[2] - '0'; + out[0] = a0 < 0 ? (uint8_t) phased : + (uint8_t)(((a0 + 1) << 1) | phased); + out[1] = a1 < 0 ? (uint8_t) phased : + (uint8_t)(((a1 + 1) << 1) | phased); + *sp = s + 3; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) @@ -3748,29 +3868,37 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_init(vcf_plan_int_range_t *range) { - range->min = INT32_MAX; - range->max = INT32_MIN; - range->has_special = 0; + range->min = INT32_MAX; + range->max = INT32_MIN; + range->has_special = 0; } VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add(vcf_plan_int_range_t *range, int32_t val) { - if (val == bcf_int32_missing || val == bcf_int32_vector_end) - range->has_special = 1; - if (range->max < val) - range->max = val; - if (range->min > val && val > INT32_MIN + 1) - range->min = val; + if (val == bcf_int32_missing || val == bcf_int32_vector_end) + range->has_special = 1; + if (range->max < val) + range->max = val; + if (range->min > val && val > INT32_MIN + 1) + range->min = val; +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add_regular(vcf_plan_int_range_t *range, int32_t val) +{ + if (range->max < val) + range->max = val; + if (range->min > val) + range->min = val; } VCF_PLAN_ALWAYS_INLINE int 
vcf_plan_float_vector_count(const float *vals, int width) { - int i; + int i; - for (i = 0; i < width; i++) - if (bcf_float_is_vector_end(vals[i])) - break; - return i; + for (i = 0; i < width; i++) + if (bcf_float_is_vector_end(vals[i])) + break; + return i; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) @@ -3794,36 +3922,39 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value_range(const char **sp, int32_t *out, vcf_plan_int_range_t *range) { - const char *s = *sp; - uint32_t val = 0, cutoff = BCF_MAX_BT_INT32 / 10, cutlim = BCF_MAX_BT_INT32 % 10; - - if (*s >= '0' && *s <= '9') { - do { - uint32_t digit = *s - '0'; - if (val > cutoff || (val == cutoff && digit > cutlim)) - return -1; - val = val * 10 + digit; - s++; - } while (*s >= '0' && *s <= '9'); - *out = (int32_t)val; - *sp = s; - vcf_plan_int_range_add(range, *out); - return 0; - } - if (vcf_plan_int_value(sp, out) < 0) - return -1; - vcf_plan_int_range_add(range, *out); - return 0; -} - -static int vcf_plan_parse_int_vector_counted(const char **sp, int32_t *out, - int width, int *nread) + const char *s = *sp; + uint32_t val = 0, cutoff = BCF_MAX_BT_INT32 / 10, cutlim = BCF_MAX_BT_INT32 % 10; + + if (*s >= '0' && *s <= '9') { + do { + uint32_t digit = *s - '0'; + if (val > cutoff || (val == cutoff && digit > cutlim)) + return -1; + val = val * 10 + digit; + s++; + } while (*s >= '0' && *s <= '9'); + *out = (int32_t)val; + *sp = s; + vcf_plan_int_range_add_regular(range, *out); + return 0; + } + if (vcf_plan_int_value(sp, out) < 0) + return -1; + vcf_plan_int_range_add(range, *out); + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector_counted_range(const char **sp, + int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) { const char *s = *sp; - int i; + int i, nvals; for (i = 0; i < width; i++) { - if (vcf_plan_int_value(&s, &out[i]) < 0) + if 
(vcf_plan_int_value_range(&s, &out[i], range) < 0) return -1; if (*s != ',') { i++; @@ -3831,8 +3962,10 @@ static int vcf_plan_parse_int_vector_counted(const char **sp, int32_t *out, } s++; } - if (nread) - *nread = i; + nvals = i; + *nread = nvals; + if (i < width) + range->has_special = 1; for (; i < width; i++) out[i] = bcf_int32_vector_end; if (*s == ',') @@ -3841,380 +3974,290 @@ static int vcf_plan_parse_int_vector_counted(const char **sp, int32_t *out, return 0; } -static int vcf_plan_parse_int_vector_counted_range(const char **sp, int32_t *out, - int width, int *nread, - vcf_plan_int_range_t *range) -{ - const char *s = *sp; - int i, nvals; - - for (i = 0; i < width; i++) { - if (vcf_plan_int_value_range(&s, &out[i], range) < 0) - return -1; - if (*s != ',') { - i++; - break; - } - s++; - } - nvals = i; - if (nread) - *nread = nvals; - if (i < width) - range->has_special = 1; - for (; i < width; i++) - out[i] = bcf_int32_vector_end; - if (*s == ',') - return -1; - *sp = s; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted(const char **sp, int32_t *out, int *nread) -{ - const char *s = *sp; - - if (vcf_plan_int_value(&s, &out[0]) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - *sp = s; - if (nread) - *nread = 1; - return 0; - } - s++; - if (vcf_plan_int_value(&s, &out[1]) < 0) - return -1; - if (*s == ',') - return -1; - *sp = s; - if (nread) - *nread = 2; - return 0; -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted_range(const char **sp, int32_t *out, int *nread, vcf_plan_int_range_t *range) { - const char *s = *sp; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - *sp = s; - range->has_special = 1; - if (nread) - *nread = 1; - return 0; - } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s == ',') - return -1; - *sp = s; - if (nread) - *nread = 2; - return 0; -} - 
-VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted(const char **sp, int32_t *out, int *nread) -{ - const char *s = *sp; - - if (vcf_plan_int_value(&s, &out[0]) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - *sp = s; - if (nread) - *nread = 1; - return 0; - } - s++; - if (vcf_plan_int_value(&s, &out[1]) < 0) - return -1; - if (*s != ',') { - out[2] = bcf_int32_vector_end; - *sp = s; - if (nread) - *nread = 2; - return 0; - } - s++; - if (vcf_plan_int_value(&s, &out[2]) < 0) - return -1; - if (*s == ',') - return -1; - *sp = s; - if (nread) - *nread = 3; - return 0; + const char *s = *sp; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + *sp = s; + range->has_special = 1; + *nread = 1; + return 0; + } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s == ',') + return -1; + *sp = s; + *nread = 2; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char **sp, int32_t *out, int *nread, vcf_plan_int_range_t *range) { - const char *s = *sp; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - *sp = s; - range->has_special = 1; - if (nread) - *nread = 1; - return 0; - } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { - out[2] = bcf_int32_vector_end; - *sp = s; - range->has_special = 1; - if (nread) - *nread = 2; - return 0; - } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s == ',') - return -1; - *sp = s; - if (nread) - *nread = 3; - return 0; + const char *s = *sp; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + *sp = s; + range->has_special = 1; + *nread = 1; + 
return 0; + } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { + out[2] = bcf_int32_vector_end; + *sp = s; + range->has_special = 1; + *nread = 2; + return 0; + } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s == ',') + return -1; + *sp = s; + *nread = 3; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector4_counted_range(const char **sp, int32_t *out, int *nread, vcf_plan_int_range_t *range) { - const char *s = *sp; - int i = 4; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - out[3] = bcf_int32_vector_end; - range->has_special = 1; - i = 1; - goto done; - } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { - out[2] = bcf_int32_vector_end; - out[3] = bcf_int32_vector_end; - range->has_special = 1; - i = 2; - goto done; - } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s != ',') { - out[3] = bcf_int32_vector_end; - range->has_special = 1; - i = 3; - goto done; - } - s++; - if (vcf_plan_int_value_range(&s, &out[3], range) < 0) - return -1; - if (*s == ',') - return -1; + const char *s = *sp; + int i = 4; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { + out[1] = bcf_int32_vector_end; + out[2] = bcf_int32_vector_end; + out[3] = bcf_int32_vector_end; + range->has_special = 1; + i = 1; + goto done; + } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { + out[2] = bcf_int32_vector_end; + out[3] = bcf_int32_vector_end; + range->has_special = 1; + i = 2; + goto done; + } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s != ',') { + out[3] = bcf_int32_vector_end; + range->has_special = 1; + i = 3; + goto done; + } + s++; + if (vcf_plan_int_value_range(&s, &out[3], range) 
< 0) + return -1; + if (*s == ',') + return -1; done: - *sp = s; - if (nread) - *nread = i; - return 0; + *sp = s; + *nread = i; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector6_counted_range(const char **sp, int32_t *out, int *nread, vcf_plan_int_range_t *range) { - const char *s = *sp; - int i = 6, j; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { i = 1; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { i = 2; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s != ',') { i = 3; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[3], range) < 0) - return -1; - if (*s != ',') { i = 4; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[4], range) < 0) - return -1; - if (*s != ',') { i = 5; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[5], range) < 0) - return -1; - if (*s == ',') - return -1; - goto done; + const char *s = *sp; + int i = 6, j; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { i = 1; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { i = 2; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s != ',') { i = 3; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[3], range) < 0) + return -1; + if (*s != ',') { i = 4; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[4], range) < 0) + return -1; + if (*s != ',') { i = 5; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[5], range) < 0) + return -1; + if (*s == ',') + return -1; + goto done; fill: - range->has_special = 1; - for (j = i; j < 6; j++) - out[j] = bcf_int32_vector_end; + range->has_special = 1; + for (j = i; j < 6; j++) + out[j] = bcf_int32_vector_end; done: - *sp = s; - if (nread) - *nread = i; - return 0; + *sp = s; + *nread = i; + 
return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector10_counted_range(const char **sp, int32_t *out, int *nread, vcf_plan_int_range_t *range) { - const char *s = *sp; - int i = 10, j; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { i = 1; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { i = 2; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s != ',') { i = 3; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[3], range) < 0) - return -1; - if (*s != ',') { i = 4; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[4], range) < 0) - return -1; - if (*s != ',') { i = 5; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[5], range) < 0) - return -1; - if (*s != ',') { i = 6; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[6], range) < 0) - return -1; - if (*s != ',') { i = 7; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[7], range) < 0) - return -1; - if (*s != ',') { i = 8; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[8], range) < 0) - return -1; - if (*s != ',') { i = 9; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[9], range) < 0) - return -1; - if (*s == ',') - return -1; - goto done; + const char *s = *sp; + int i = 10, j; + + if (vcf_plan_int_value_range(&s, &out[0], range) < 0) + return -1; + if (*s != ',') { i = 1; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[1], range) < 0) + return -1; + if (*s != ',') { i = 2; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[2], range) < 0) + return -1; + if (*s != ',') { i = 3; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[3], range) < 0) + return -1; + if (*s != ',') { i = 4; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[4], range) < 0) + return -1; + if (*s != ',') { i = 5; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, 
&out[5], range) < 0) + return -1; + if (*s != ',') { i = 6; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[6], range) < 0) + return -1; + if (*s != ',') { i = 7; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[7], range) < 0) + return -1; + if (*s != ',') { i = 8; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[8], range) < 0) + return -1; + if (*s != ',') { i = 9; goto fill; } + s++; + if (vcf_plan_int_value_range(&s, &out[9], range) < 0) + return -1; + if (*s == ',') + return -1; + goto done; fill: - range->has_special = 1; - for (j = i; j < 10; j++) - out[j] = bcf_int32_vector_end; + range->has_special = 1; + for (j = i; j < 10; j++) + out[j] = bcf_int32_vector_end; done: - *sp = s; - if (nread) - *nread = i; - return 0; + *sp = s; + *nread = i; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_expect_sep(const char **sp, int sep) { - if (**sp != sep) - return -1; - (*sp)++; - return 0; + if (**sp != sep) + return -1; + (*sp)++; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int width) { - const char *s = *sp, *t = s; - int l; + const char *s = *sp, *t = s; + int l; - while (*t && *t != ':' && *t != '\t') - t++; - l = t - s; - if (l > width) - return -1; - memcpy(out, s, l); - if (l < width) - memset(out + l, 0, width - l); - *sp = t; - return 0; + while (*t && *t != ':' && *t != '\t') + t++; + l = t - s; + if (l > width) + return -1; + memcpy(out, s, l); + if (l < width) + memset(out + l, 0, width - l); + *sp = t; + return 0; } static int vcf_plan_parse_float_vector_dynamic(const char **sp, float *out, int width) { - const char *s = *sp; - int i = 0; - - if (*s == ':' || *s == '\t' || *s == '\0') { - bcf_float_set_missing(out[i++]); - } else { - for (;;) { - if (i >= width || vcf_plan_float_value(&s, &out[i]) < 0) - return -1; - i++; - if (*s != ',') - break; - s++; - } - } - for (; i < width; i++) - bcf_float_set_vector_end(out[i]); - *sp = s; - return 0; -} + const char *s = *sp; 
+ int i = 0; -VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible(const char **sp, int32_t *out) -{ - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - *out = bcf_int32_missing; - return 0; - } - return vcf_plan_int_value(sp, out); + if (*s == ':' || *s == '\t' || *s == '\0') { + bcf_float_set_missing(out[i++]); + } else { + for (;;) { + if (i >= width || vcf_plan_float_value(&s, &out[i]) < 0) + return -1; + i++; + if (*s != ',') + break; + s++; + } + } + for (; i < width; i++) + bcf_float_set_vector_end(out[i]); + *sp = s; + return 0; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible_range(const char **sp, int32_t *out, vcf_plan_int_range_t *range) { - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - *out = bcf_int32_missing; - vcf_plan_int_range_add(range, *out); - return 0; - } - return vcf_plan_int_value_range(sp, out, range); + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + *out = bcf_int32_missing; + vcf_plan_int_range_add(range, *out); + return 0; + } + return vcf_plan_int_value_range(sp, out, range); } VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float *out) { - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - bcf_float_set_missing(*out); - return 0; - } - return vcf_plan_float_value(sp, out); + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + bcf_float_set_missing(*out); + return 0; + } + return vcf_plan_float_value(sp, out); +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_fill_missing_int_vector(int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) +{ + int i; + + out[0] = bcf_int32_missing; + vcf_plan_int_range_add(range, out[0]); + for (i = 1; i < width; i++) + out[i] = bcf_int32_vector_end; + range->has_special = 1; + *nread = 1; } VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(const char **sp, @@ -4222,15 +4265,11 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(con int *nread, vcf_plan_int_range_t *range) { - if 
(**sp == ':' || **sp == '\t' || **sp == '\0') { - out[0] = bcf_int32_missing; - out[1] = bcf_int32_vector_end; - vcf_plan_int_range_add(range, out[0]); - if (nread) - *nread = 1; - return 0; - } - return vcf_plan_parse_int_vector2_counted_range(sp, out, nread, range); + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + vcf_plan_fill_missing_int_vector(out, 2, nread, range); + return 0; + } + return vcf_plan_parse_int_vector2_counted_range(sp, out, nread, range); } VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted_range(const char **sp, @@ -4238,16 +4277,11 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted_range(con int *nread, vcf_plan_int_range_t *range) { - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - out[0] = bcf_int32_missing; - out[1] = bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - vcf_plan_int_range_add(range, out[0]); - if (nread) - *nread = 1; - return 0; - } - return vcf_plan_parse_int_vector3_counted_range(sp, out, nread, range); + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + vcf_plan_fill_missing_int_vector(out, 3, nread, range); + return 0; + } + return vcf_plan_parse_int_vector3_counted_range(sp, out, nread, range); } static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, @@ -4256,95 +4290,97 @@ static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, int *nread, vcf_plan_int_range_t *range) { - int i; - - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - out[0] = bcf_int32_missing; - vcf_plan_int_range_add(range, out[0]); - for (i = 1; i < width; i++) - out[i] = bcf_int32_vector_end; - range->has_special = 1; - if (nread) - *nread = 1; - return 0; - } - switch (width) { - case 4: - return vcf_plan_parse_int_vector4_counted_range(sp, out, nread, range); - case 6: - return vcf_plan_parse_int_vector6_counted_range(sp, out, nread, range); - case 10: - return vcf_plan_parse_int_vector10_counted_range(sp, out, nread, range); - default: - 
break; - } - return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); -} - -static void vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, - bcf1_t *v, int *widths, - vcf_format_row_op_t *row_ops) -{ - int j; - - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_op_t *op = &plan->ops[j]; - vcf_format_row_op_t *row = &row_ops[j]; - - row->key = op->key; - row->width = widths[j] > 0 ? widths[j] : 1; - row->offset = 0; - if (op->is_gt) { - row->kind = row->width == 2 && v->n_allele <= 10 ? VCF_FORMAT_ROW_GT2 : VCF_FORMAT_ROW_GT; - row->size = row->kind == VCF_FORMAT_ROW_GT2 ? 2 : row->width * (int)sizeof(int32_t); - } else if (op->htype == BCF_HT_INT) { - if (row->width == 1) - row->kind = VCF_FORMAT_ROW_INT1; - else if (row->width == 2) - row->kind = VCF_FORMAT_ROW_INT2; - else if (row->width == 3) - row->kind = VCF_FORMAT_ROW_INT3; - else - row->kind = VCF_FORMAT_ROW_INTN; - row->size = row->width * (int)sizeof(int32_t); - } else if (op->htype == BCF_HT_REAL) { - row->kind = row->width == 1 ? 
VCF_FORMAT_ROW_FLOAT1 : VCF_FORMAT_ROW_FLOATN; - row->size = row->width * (int)sizeof(float); - } else { - row->kind = VCF_FORMAT_ROW_STR; - row->size = row->width; - } - } + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + vcf_plan_fill_missing_int_vector(out, width, nread, range); + return 0; + } + switch (width) { + case 4: + return vcf_plan_parse_int_vector4_counted_range(sp, out, nread, range); + case 6: + return vcf_plan_parse_int_vector6_counted_range(sp, out, nread, range); + case 10: + return vcf_plan_parse_int_vector10_counted_range(sp, out, nread, range); + default: + break; + } + return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); +} + +static int vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, + bcf1_t *v, int *widths, + vcf_format_row_op_t *row_ops, + vcf_format_plan_fallback_reason_t *reason) +{ + int j; + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + vcf_format_row_op_t *row = &row_ops[j]; + + row->key = op->key; + row->width = widths[j] > 0 ? widths[j] : 1; + row->offset = 0; + if (op->is_gt) { + if (row->width != 2 || v->n_allele > 10) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_GT_SHAPE); + return -4; + } + row->kind = VCF_FORMAT_ROW_GT2; + row->size = 2; + } else if (op->htype == BCF_HT_INT) { + if (row->width == 1) + row->kind = VCF_FORMAT_ROW_INT1; + else if (row->width == 2) + row->kind = VCF_FORMAT_ROW_INT2; + else if (row->width == 3) + row->kind = VCF_FORMAT_ROW_INT3; + else + row->kind = VCF_FORMAT_ROW_INTN; + row->size = row->width * (int)sizeof(int32_t); + } else if (op->htype == BCF_HT_REAL) { + row->kind = row->width == 1 ? 
VCF_FORMAT_ROW_FLOAT1 : VCF_FORMAT_ROW_FLOATN; + row->size = row->width * (int)sizeof(float); + } else { + row->kind = VCF_FORMAT_ROW_STR; + row->size = row->width; + } + row->can_compact = row->kind == VCF_FORMAT_ROW_INT2 || + row->kind == VCF_FORMAT_ROW_INT3 || + row->kind == VCF_FORMAT_ROW_INTN || + row->kind == VCF_FORMAT_ROW_FLOATN; + } + return 0; } static const char *vcf_format_skip_sample_column(const char *cur, const char *end) { - while (cur < end && *cur && *cur != '\t') - cur++; - if (cur < end && *cur == '\t') - cur++; - return cur; + while (cur < end && *cur && *cur != '\t') + cur++; + if (cur < end && *cur == '\t') + cur++; + return cur; } static int vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t *v) { - if (op->is_gt) - return 2; - if (op->htype == BCF_HT_STR) - return 0; - - switch (op->vl_type) { - case BCF_VL_FIXED: - return op->number > 0 ? op->number : 0; - case BCF_VL_A: - return v->n_allele > 1 ? v->n_allele - 1 : 0; - case BCF_VL_R: - return v->n_allele; - case BCF_VL_G: - return v->n_allele * (v->n_allele + 1) / 2; - default: - return 0; - } + if (op->is_gt) + return 2; + if (op->htype == BCF_HT_STR) + return 0; + + switch (op->vl_type) { + case BCF_VL_FIXED: + return op->number > 0 ? op->number : 0; + case BCF_VL_A: + return v->n_allele > 1 ? 
v->n_allele - 1 : 0; + case BCF_VL_R: + return v->n_allele; + case BCF_VL_G: + return v->n_allele * (v->n_allele + 1) / 2; + default: + return 0; + } } static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt); @@ -4355,479 +4391,508 @@ static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring const vcf_plan_int_range_t *ranges, int first_op) { - int j; - - for (j = first_op; j < n_ops; j++) { - const vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = (uint8_t*)mem->s + op->offset; - - bcf_enc_int1(dst, op->key); - if (op->kind == VCF_FORMAT_ROW_GT2) { - if (vcf_enc_gt2_u8(dst, nsamples, buf) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_STR) { - if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) - return -1; - if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { - if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) - return -1; - if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) - return -1; - } else if (op->kind == VCF_FORMAT_ROW_INT1 || - op->kind == VCF_FORMAT_ROW_INT2 || - op->kind == VCF_FORMAT_ROW_INT3 || - op->kind == VCF_FORMAT_ROW_INTN) { - if (bcf_enc_vint_known_range_special(dst, nsamples * op->width, (int32_t *)buf, - op->width, ranges[j].min, ranges[j].max, - ranges[j].has_special) < 0) - return -1; - } else { - if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) - return -1; - } - } - return 0; -} + int j; -static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt) -{ - int n = nsamples * 2; + for (j = first_op; j < n_ops; j++) { + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset; - if (bcf_enc_size(dst, 2, BCF_BT_INT8) < 0) - return -1; - return kputsn((const char *)gt, n, dst) < 0 ? 
-1 : 0; + bcf_enc_int1(dst, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (vcf_enc_gt2_u8(dst, nsamples, buf) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_STR) { + if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) + return -1; + if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { + if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) + return -1; + if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) + return -1; + } else if (op->kind == VCF_FORMAT_ROW_INT1 || + op->kind == VCF_FORMAT_ROW_INT2 || + op->kind == VCF_FORMAT_ROW_INT3 || + op->kind == VCF_FORMAT_ROW_INTN) { + if (bcf_enc_vint_known_range_special(dst, nsamples * op->width, (int32_t *)buf, + op->width, ranges[j].min, ranges[j].max, + ranges[j].has_special) < 0) + return -1; + } else { + if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) + return -1; + } + } + return 0; } -static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int n_ops) +static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt) { - int j; + int n = nsamples * 2; - for (j = 0; j < n_ops; j++) { - if (row_ops[j].kind != VCF_FORMAT_ROW_GT2 && - row_ops[j].kind != VCF_FORMAT_ROW_FLOAT1) - break; - } - return j; + if (bcf_enc_size(dst, 2, BCF_BT_INT8) < 0) + return -1; + return kputsn((const char *)gt, n, dst) < 0 ? 
-1 : 0; } -static int vcf_format_general_composable_supported(const vcf_format_row_op_t *row_ops, - int n_ops) +static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int n_ops) { - int j; - - for (j = 0; j < n_ops; j++) { - switch (row_ops[j].kind) { - case VCF_FORMAT_ROW_GT2: - case VCF_FORMAT_ROW_INT1: - case VCF_FORMAT_ROW_INT2: - case VCF_FORMAT_ROW_INT3: - case VCF_FORMAT_ROW_INTN: - case VCF_FORMAT_ROW_FLOAT1: - case VCF_FORMAT_ROW_FLOATN: - case VCF_FORMAT_ROW_STR: - break; - default: - return 0; - } - } - return 1; -} + int j; -static int vcf_format_row_can_compact(const vcf_format_row_op_t *op) -{ - return op->kind == VCF_FORMAT_ROW_INT2 || - op->kind == VCF_FORMAT_ROW_INT3 || - op->kind == VCF_FORMAT_ROW_INTN || - op->kind == VCF_FORMAT_ROW_FLOATN; + for (j = 0; j < n_ops; j++) { + if (row_ops[j].kind != VCF_FORMAT_ROW_GT2 && + row_ops[j].kind != VCF_FORMAT_ROW_FLOAT1) + break; + } + return j; } static void vcf_format_compact_row_op(kstring_t *mem, int nsamples, vcf_format_row_op_t *op, int width) { - size_t elem_size = op->kind == VCF_FORMAT_ROW_FLOATN ? sizeof(float) : sizeof(int32_t); - size_t old_stride = (size_t) op->width * elem_size; - size_t new_stride = (size_t) width * elem_size; - char *base = mem->s + op->offset; - int sample; + size_t elem_size = op->kind == VCF_FORMAT_ROW_FLOATN ? sizeof(float) : sizeof(int32_t); + size_t old_stride = (size_t) op->width * elem_size; + size_t new_stride = (size_t) width * elem_size; + char *base = mem->s + op->offset; + int sample; - for (sample = 1; sample < nsamples; sample++) - memmove(base + sample * new_stride, base + sample * old_stride, new_stride); - op->width = width; - op->size = (int)new_stride; - if (op->kind == VCF_FORMAT_ROW_INT2 || op->kind == VCF_FORMAT_ROW_INT3) - op->kind = width == 1 ? VCF_FORMAT_ROW_INT1 : - width == 2 ? VCF_FORMAT_ROW_INT2 : - width == 3 ? 
VCF_FORMAT_ROW_INT3 : VCF_FORMAT_ROW_INTN; + for (sample = 1; sample < nsamples; sample++) + memmove(base + sample * new_stride, base + sample * old_stride, new_stride); + op->width = width; + op->size = (int)new_stride; + if (op->kind == VCF_FORMAT_ROW_INT2 || op->kind == VCF_FORMAT_ROW_INT3) + op->kind = width == 1 ? VCF_FORMAT_ROW_INT1 : + width == 2 ? VCF_FORMAT_ROW_INT2 : + width == 3 ? VCF_FORMAT_ROW_INT3 : VCF_FORMAT_ROW_INTN; } static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const vcf_format_general_plan_t *plan, - bcf1_t *v, char *q, int *widths) -{ - const char *cur, *end; - int has_measured = 0, sample, kept = 0, j; - int nsamples = h->keep_samples ? h->nsamples_ori : bcf_hdr_nsamples(h); - int output_nsamples = bcf_hdr_nsamples(h); - - /* - * With bcf_hdr_set_samples(), the text line still contains the original - * sample columns but BCF output must contain only the retained samples. The - * measurement pass therefore scans original columns and updates row-local - * widths only for samples that will be emitted. - */ - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_op_t *op = &plan->ops[j]; - - if (op->measured_width) { - /* - * Strings and Number=. numeric vectors need a first pass so the - * transposed FORMAT storage has one row-local stride. The bound is - * deliberately small; wide or malformed records fall back whole-row - * to the production parser rather than growing a second general - * allocator here. 
- */ - widths[j] = 0; - has_measured = 1; - } else { - widths[j] = vcf_format_general_expected_width(op, v); - if (widths[j] <= 0 || widths[j] > 64) - return -4; - } - } - - if (!has_measured) - return 0; - - cur = q + 1; - end = s->s + s->l; - for (sample = 0; sample < nsamples && cur < end; sample++) { - if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { - cur = vcf_format_skip_sample_column(cur, end); - continue; - } - for (j = 0; j < plan->n_ops; j++) { - const vcf_format_op_t *op = &plan->ops[j]; - const char *field = cur; - int w = 1; - - /* - * This pass validates the sample field separators at the same time - * as measuring widths. A single unexpected ':' or tab position is - * enough to reject the fast path, preserving production behavior for - * odd FORMAT/sample cardinality cases. - */ - while (cur < end && *cur && *cur != ':' && *cur != '\t') { - if (op->measured_width && - (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) && - *cur == ',') - w++; - cur++; - } - if (op->measured_width && !op->is_gt && op->htype == BCF_HT_STR) { - w = cur - field; - if (j > 0) - w++; - if (w <= 0) - w = 1; - } - if (op->measured_width) { - if (widths[j] < w) - widths[j] = w; - } - - if (j + 1 < plan->n_ops) { - if (*cur != ':') - return -4; - cur++; - } else { - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - return -4; - } - } - if (++kept == output_nsamples) - break; - } - if (kept != output_nsamples) - return -4; - for (j = 0; j < plan->n_ops; j++) - if (plan->ops[j].measured_width) { - if (widths[j] <= 0) - widths[j] = 1; - if (widths[j] > 64) - return -4; - } - - return 0; + bcf1_t *v, char *q, int *widths, + vcf_format_plan_fallback_reason_t *reason) +{ + const char *cur, *end; + int has_measured = 0, sample, kept = 0, j; + int nsamples = h->keep_samples ? 
h->nsamples_ori : bcf_hdr_nsamples(h); + int output_nsamples = bcf_hdr_nsamples(h); + + /* + * With bcf_hdr_set_samples(), the text line still contains the original + * sample columns but BCF output must contain only the retained samples. The + * measurement pass therefore scans original columns and updates row-local + * widths only for samples that will be emitted. + */ + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + + if (op->measured_width) { + /* + * Strings and Number=. numeric vectors need a first pass so the + * transposed FORMAT storage has one row-local stride. Numeric + * vectors keep the conservative width cap because they multiply + * parsing, padding, and integer-width decisions. Strings are allowed + * a larger bounded width because they are copied as bytes and are + * common in phase-set annotations. + */ + widths[j] = 0; + has_measured = 1; + } else { + widths[j] = vcf_format_general_expected_width(op, v); + if (widths[j] <= 0 || widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } + } + } + + if (!has_measured) + return 0; + + cur = q + 1; + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { + cur = vcf_format_skip_sample_column(cur, end); + continue; + } + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + const char *field = cur; + int w = 1; + + /* + * This pass validates the sample field separators at the same time + * as measuring widths. A single unexpected ':' or tab position is + * enough to reject the fast path, preserving production behavior for + * odd FORMAT/sample cardinality cases. 
+ */ + while (cur < end && *cur && *cur != ':' && *cur != '\t') { + if (op->measured_width && + (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) && + *cur == ',') { + w++; + if (w > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } + } + cur++; + } + if (op->measured_width && !op->is_gt && op->htype == BCF_HT_STR) { + w = cur - field; + if (j > 0) + w++; + if (w <= 0) + w = 1; + if (w > VCF_FORMAT_MAX_STRING_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + return -4; + } + } + if (op->measured_width) { + if (widths[j] < w) + widths[j] = w; + } + + if (j + 1 < plan->n_ops) { + if (*cur != ':') { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + return -4; + } + cur++; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + return -4; + } + } + } + if (++kept == output_nsamples) + break; + } + if (kept != output_nsamples) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SAMPLE_COUNT); + return -4; + } + for (j = 0; j < plan->n_ops; j++) + if (plan->ops[j].measured_width) { + if (widths[j] <= 0) + widths[j] = 1; + if (plan->ops[j].htype == BCF_HT_STR) { + if (widths[j] > VCF_FORMAT_MAX_STRING_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + return -4; + } + } else if (widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } + } + + return 0; } static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, char *q, - vcf_format_row_op_t *row_ops) -{ - kstring_t *mem = (kstring_t*)&h->mem; - int nsamples = h->keep_samples ? 
h->nsamples_ori : bcf_hdr_nsamples(h); - int output_nsamples = bcf_hdr_nsamples(h), sample, kept = 0, j; - int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); - int max_counts[MAX_N_FMT]; - vcf_plan_int_range_t ranges[MAX_N_FMT]; - size_t indiv_l0 = v->indiv.l; - size_t direct_offsets[MAX_N_FMT]; - uint8_t *op_base[MAX_N_FMT]; - size_t op_stride[MAX_N_FMT]; - const char *cur = q + 1, *end = s->s + s->l; - - if (!vcf_format_general_composable_supported(row_ops, plan->n_ops)) - return -4; - - /* - * The executor writes data in BCF's transposed FORMAT layout: all samples - * for FORMAT op 0, then all samples for op 1, etc. Leading fixed-width - * GT2/FLOAT1 rows can be written directly to v->indiv; the remaining rows - * are staged in h->mem so they can be parsed sample-major and encoded - * op-major once row-local ranges and widths are known. - * - * If keep_samples is active, nsamples is the number of columns to scan in - * the input line and output_nsamples is the dense BCF sample count. This - * mirrors the production parser: unselected sample columns may influence - * neither emitted widths nor output cardinality. 
- */ - for (j = 0; j < plan->n_ops; j++) { - max_counts[j] = 0; - direct_offsets[j] = 0; - vcf_plan_int_range_init(&ranges[j]); - } - - for (j = 0; j < direct_ops; j++) { - vcf_format_row_op_t *op = &row_ops[j]; - - bcf_enc_int1(&v->indiv, op->key); - if (op->kind == VCF_FORMAT_ROW_GT2) { - if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * 2) < 0) - goto error; - direct_offsets[j] = v->indiv.l; - v->indiv.l += (size_t)output_nsamples * 2; - } else { - if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || - ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * sizeof(float)) < 0) - goto error; - direct_offsets[j] = v->indiv.l; - v->indiv.l += (size_t)output_nsamples * sizeof(float); - } - } - - mem->l = 0; - for (j = direct_ops; j < plan->n_ops; j++) { - vcf_format_row_op_t *op = &row_ops[j]; - - if ((uint64_t) mem->l + output_nsamples * (uint64_t) op->size > INT_MAX) - goto error; - if (align_mem(mem) < 0) - goto error; - op->offset = mem->l; - if (ks_resize(mem, mem->l + output_nsamples * (size_t) op->size) < 0) - goto error; - mem->l += output_nsamples * (size_t) op->size; - } - for (j = 0; j < plan->n_ops; j++) { - vcf_format_row_op_t *op = &row_ops[j]; - if (j < direct_ops) { - op_base[j] = (uint8_t *)v->indiv.s + direct_offsets[j]; - op_stride[j] = op->kind == VCF_FORMAT_ROW_GT2 ? 2 : (size_t)op->size; - } else { - op_base[j] = (uint8_t *)mem->s + op->offset; - op_stride[j] = (size_t)op->size; - } - } - - for (sample = 0; sample < nsamples && cur < end; sample++) { - if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { - cur = vcf_format_skip_sample_column(cur, end); - continue; - } - for (j = 0; j < plan->n_ops; j++) { - vcf_format_row_op_t *op = &row_ops[j]; - uint8_t *buf = op_base[j] + kept * op_stride[j]; - int n = op->width; - - /* - * Each op parser consumes exactly one sample subfield and leaves cur - * on the following ':' or tab. 
Values that require production-only - * handling, such as non-simple GT encodings, return -4 via fallback. - */ - switch (op->kind) { - case VCF_FORMAT_ROW_GT2: - if (vcf_plan_gt2_u8(&cur, buf) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INT1: - if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INT2: - if (vcf_plan_parse_int_vector2_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INT3: - if (vcf_plan_parse_int_vector3_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_INTN: - if (vcf_plan_parse_int_vector_flexible_counted_range(&cur, (int32_t *)buf, - op->width, &n, &ranges[j]) < 0) - goto fallback; - break; - case VCF_FORMAT_ROW_FLOAT1: - if (j < direct_ops) { - float f; - if (vcf_plan_float_scalar_flexible(&cur, &f) < 0) - goto fallback; - float_to_le(f, buf); - } else if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) { - goto fallback; - } - break; - case VCF_FORMAT_ROW_FLOATN: - if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) - goto fallback; - n = vcf_plan_float_vector_count((float *)buf, op->width); - break; - case VCF_FORMAT_ROW_STR: - if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) - goto fallback; - break; - default: - goto fallback; - } - if (max_counts[j] < n) - max_counts[j] = n; - - if (j + 1 < plan->n_ops) { - if (vcf_plan_expect_sep(&cur, ':') < 0) - goto fallback; - } else { - if (*cur == '\t') - cur++; - else if (*cur == '\0' || cur >= end) - ; - else - goto fallback; - } - } - if (++kept == output_nsamples) - break; - } - if (kept != output_nsamples) - goto fallback; - for (j = 0; j < plan->n_ops; j++) { - if (max_counts[j] <= 0 || max_counts[j] > row_ops[j].width) - goto fallback; - if (max_counts[j] < row_ops[j].width) { - /* - * Production encodes fixed-width vector rows at the 
observed row - * maximum, not necessarily the conservative header-derived width. - * Compacting here avoids unnecessary whole-row fallback while - * keeping byte-identical BCF output. - */ - if (!vcf_format_row_can_compact(&row_ops[j])) - goto fallback; - vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]); - } - } - - v->n_fmt = plan->n_ops; - v->n_sample = output_nsamples; - if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, output_nsamples, - plan->n_ops, row_ops, - ranges, direct_ops) < 0) - goto error; - vcf_format_plan_stats.hits++; - vcf_format_plan_stats.parsed_samples += output_nsamples; - return 0; + vcf_format_row_op_t *row_ops, + vcf_format_plan_fallback_reason_t *reason) +{ + kstring_t *mem = (kstring_t*)&h->mem; + int nsamples = h->keep_samples ? h->nsamples_ori : bcf_hdr_nsamples(h); + int output_nsamples = bcf_hdr_nsamples(h), sample, kept = 0, j; + int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); + int max_counts[MAX_N_FMT]; + vcf_plan_int_range_t ranges[MAX_N_FMT]; + size_t indiv_l0 = v->indiv.l; + size_t direct_offsets[MAX_N_FMT]; + uint8_t *op_base[MAX_N_FMT]; + size_t op_stride[MAX_N_FMT]; + const char *cur = q + 1, *end = s->s + s->l; + + /* + * The executor writes data in BCF's transposed FORMAT layout: all samples + * for FORMAT op 0, then all samples for op 1, etc. Leading fixed-width + * GT2/FLOAT1 rows can be written directly to v->indiv; the remaining rows + * are staged in h->mem so they can be parsed sample-major and encoded + * op-major once row-local ranges and widths are known. + * + * If keep_samples is active, nsamples is the number of columns to scan in + * the input line and output_nsamples is the dense BCF sample count. This + * mirrors the generic parser: unselected sample columns may influence + * neither emitted widths nor output cardinality. + */ + for (j = 0; j < plan->n_ops; j++) { + max_counts[j] = row_ops[j].can_compact ? 
0 : row_ops[j].width; + direct_offsets[j] = 0; + vcf_plan_int_range_init(&ranges[j]); + } + + for (j = 0; j < direct_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + + bcf_enc_int1(&v->indiv, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * 2) < 0) + goto error; + direct_offsets[j] = v->indiv.l; + v->indiv.l += (size_t)output_nsamples * 2; + } else { + if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * sizeof(float)) < 0) + goto error; + direct_offsets[j] = v->indiv.l; + v->indiv.l += (size_t)output_nsamples * sizeof(float); + } + } + + mem->l = 0; + for (j = direct_ops; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + + if ((uint64_t) mem->l + output_nsamples * (uint64_t) op->size > INT_MAX) + goto error; + if (align_mem(mem) < 0) + goto error; + op->offset = mem->l; + if (ks_resize(mem, mem->l + output_nsamples * (size_t) op->size) < 0) + goto error; + mem->l += output_nsamples * (size_t) op->size; + } + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + if (j < direct_ops) { + op_base[j] = (uint8_t *)v->indiv.s + direct_offsets[j]; + op_stride[j] = op->kind == VCF_FORMAT_ROW_GT2 ? 2 : (size_t)op->size; + } else { + op_base[j] = (uint8_t *)mem->s + op->offset; + op_stride[j] = (size_t)op->size; + } + } + + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { + cur = vcf_format_skip_sample_column(cur, end); + continue; + } + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = op_base[j] + kept * op_stride[j]; + int n = op->width; + + /* + * Each op parser consumes exactly one sample subfield and leaves cur + * on the following ':' or tab. 
Values that require production-only + * handling, such as non-simple GT encodings, return -4 via fallback. + */ + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: + if (vcf_plan_gt2_u8(&cur, buf) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_GT_SHAPE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_INT1: + if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_INT2: + if (vcf_plan_parse_int_vector2_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_INT3: + if (vcf_plan_parse_int_vector3_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_INTN: + if (vcf_plan_parse_int_vector_flexible_counted_range(&cur, (int32_t *)buf, + op->width, &n, &ranges[j]) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_FLOAT1: + if (j < direct_ops) { + float f; + if (vcf_plan_float_scalar_flexible(&cur, &f) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + float_to_le(f, buf); + } else if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_FLOATN: + if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + n = vcf_plan_float_vector_count((float *)buf, op->width); + break; + case VCF_FORMAT_ROW_STR: + if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) { + vcf_format_plan_set_reason(reason, 
VCF_FORMAT_PLAN_FB_STRING_WIDTH); + goto fallback; + } + break; + default: + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + if (row_ops[j].can_compact && max_counts[j] < n) + max_counts[j] = n; + + if (j + 1 < plan->n_ops) { + if (vcf_plan_expect_sep(&cur, ':') < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + goto fallback; + } + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + goto fallback; + } + } + } + if (++kept == output_nsamples) + break; + } + if (kept != output_nsamples) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SAMPLE_COUNT); + goto fallback; + } + for (j = 0; j < plan->n_ops; j++) { + if (!row_ops[j].can_compact) + continue; + if (max_counts[j] <= 0 || max_counts[j] > row_ops[j].width) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + goto fallback; + } + if (max_counts[j] < row_ops[j].width) { + /* + * Production encodes fixed-width vector rows at the observed row + * maximum, not necessarily the conservative header-derived width. + * Compacting here avoids unnecessary whole-row fallback while + * keeping byte-identical BCF output. + */ + vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]); + } + } + + v->n_fmt = plan->n_ops; + v->n_sample = output_nsamples; + if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, output_nsamples, + plan->n_ops, row_ops, + ranges, direct_ops) < 0) + goto error; + if (vcf_parse_format_check7(h, v) < 0) + goto error; + if (vcf_format_plan_stats_enabled()) { + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += output_nsamples; + } + return 0; fallback: - /* - * Only v->indiv is mutated by this executor before success is known. All - * scratch data lives in h->mem and can be overwritten by the fallback parse. 
- */ - v->indiv.l = indiv_l0; - return -4; + /* + * Only v->indiv is mutated by this executor before success is known. All + * scratch data lives in h->mem and can be overwritten by the fallback parse. + */ + v->indiv.l = indiv_l0; + return -4; error: - v->indiv.l = indiv_l0; - return -1; + v->indiv.l = indiv_l0; + return -1; } static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, - char *q) + char *q, + vcf_format_plan_fallback_reason_t *reason) { - int widths[MAX_N_FMT]; - vcf_format_row_op_t row_ops[MAX_N_FMT]; + int widths[MAX_N_FMT]; + vcf_format_row_op_t row_ops[MAX_N_FMT]; - if (vcf_format_general_strict_widths(s, h, plan, v, q, widths) < 0) - return -4; - vcf_format_general_resolve_ops(plan, v, widths, row_ops); - return vcf_parse_format_general_composable(s, h, v, plan, q, row_ops); + if (vcf_format_general_strict_widths(s, h, plan, v, q, widths, reason) < 0) + return -4; + if (vcf_format_general_resolve_ops(plan, v, widths, row_ops, reason) < 0) + return -4; + return vcf_parse_format_general_composable(s, h, v, plan, q, row_ops, reason); } static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { - vcf_format_general_plan_t *plan; - int nsamples, ret; - - plan = vcf_format_general_plan_get(h, p); - if (!plan) - goto fallback; - if (!vcf_format_fast_guard_enabled(&plan->general_guard)) { - /* - * If this FORMAT string repeatedly fails row-local validation, stop - * probing it for a short cooldown. This protects mixed or pathological - * files from paying fast-path setup cost on every record. 
- */ - vcf_format_plan_stats.fallback++; - return -3; - } - - nsamples = bcf_hdr_nsamples(h); - if (!nsamples) - return 0; - ret = vcf_parse_format_general_strict(s, h, v, plan, q); - if (ret == 0) { - vcf_format_fast_guard_success(&plan->general_guard); - return ret; - } - if (ret != -4) - return ret; + vcf_format_general_plan_t *plan; + vcf_format_plan_fallback_reason_t reason = VCF_FORMAT_PLAN_FB_PARSE; + int nsamples, ret; + + plan = vcf_format_general_plan_get(h, p, &reason); + if (!plan) { + reason = VCF_FORMAT_PLAN_FB_UNSUPPORTED; + goto fallback; + } + if (!vcf_format_fast_guard_enabled(&plan->general_guard)) { + /* + * If this FORMAT string repeatedly fails row-local validation, stop + * probing it for a short cooldown. This protects mixed or pathological + * files from paying fast-path setup cost on every record. + */ + vcf_format_plan_note_fallback(VCF_FORMAT_PLAN_FB_GUARD); + return -3; + } + + nsamples = bcf_hdr_nsamples(h); + if (!nsamples) + return 0; + ret = vcf_parse_format_general_strict(s, h, v, plan, q, &reason); + if (ret == 0) { + vcf_format_fast_guard_success(&plan->general_guard); + return ret; + } + if (ret != -4) + return ret; fallback: - if (plan) - vcf_format_fast_guard_fallback(&plan->general_guard); - vcf_format_plan_stats.fallback++; - return -3; + if (plan) { + if (vcf_format_plan_width_reason(reason)) + vcf_format_fast_guard_width_fallback(&plan->general_guard); + else if (vcf_format_plan_guard_counts_reason(reason)) + vcf_format_fast_guard_fallback(&plan->general_guard); + } + vcf_format_plan_note_fallback(reason); + return -3; } static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { - int plan_mode; - - plan_mode = vcf_format_plan_mode(); - if (!plan_mode) - return -3; - vcf_format_plan_stats.attempts++; + if (!vcf_format_plan_enabled()) + return -3; + if (vcf_format_plan_stats_enabled()) + vcf_format_plan_stats.attempts++; - /* All enabled modes now use the same dynamic per-tag plan. 
*/ - return vcf_parse_format_general_planned(s, h, v, p, q); + return vcf_parse_format_general_planned(s, h, v, p, q); } // detect FORMAT "." From f9e993493ffad066ee992efbc57f99c0d0e8274e Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 30 Apr 2026 17:25:43 +0200 Subject: [PATCH 31/38] remove cooldown --- docs/FORMAT_PLAN_CURRENT.md | 45 ++++------ docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 36 ++++++-- docs/FORMAT_PLAN_OVERVIEW.md | 5 +- test/test_view.c | 9 +- vcf.c | 134 +---------------------------- 5 files changed, 55 insertions(+), 174 deletions(-) diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md index 762c0da62..b16557458 100644 --- a/docs/FORMAT_PLAN_CURRENT.md +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -34,18 +34,17 @@ compile cost. Planner statistics are collected only when `HTS_VCF_FORMAT_PLAN_STATS=1` is also set. Normal production parsing therefore avoids touching the process-wide test counters. The test hook reports both aggregate attempts/hits/fallbacks and -fallback reason counters: unsupported schema, guard cooldown, numeric width, -string width, GT shape, parse failure, separator mismatch, and sample-count -mismatch. +fallback reason counters: unsupported schema, numeric width, string width, GT +shape, parse failure, separator mismatch, and sample-count mismatch. `bcf_hdr_sync()` clears the header-owned plan cache and increments the private generation after header dictionaries are rebuilt. The planner also refuses to compile while `h->dirty` is set, leaving unsynced or header-repair cases on the generic parser. -The cache and per-plan guard counters are mutable header-owned state, like other -htslib header scratch storage. Callers should not concurrently parse through -the same `bcf_hdr_t` from multiple threads. +The cache is mutable header-owned state, like other htslib header scratch +storage. Callers should not concurrently parse through the same `bcf_hdr_t` +from multiple threads. 
The compile step rejects: @@ -93,24 +92,17 @@ For fixed-width vector fields, the executor can compact underfilled rows to the observed row maximum before BCF encoding. This avoids whole-row fallback when the generic parser would also emit a narrower byte-identical vector width. -## Guard Policy +## Fallback Policy -Each cached dynamic plan has a small runtime guard: +Supported cached plans are probed on every row. If row-local validation fails, +the executor rolls back its partial `v->indiv` writes and the generic parser +handles the whole FORMAT column for that record. The fallback does not disable +or cool down the cached plan; nearby rows with the same FORMAT schema can still +take the optimized path. -- attempts, hits, fallbacks; -- consecutive miss streak; -- temporary cooldown. - -An isolated fallback does not disable the fast path. A plan is paused after -eight consecutive misses, or after at least 128 attempts with more than 10% -guard-counted fallbacks. Row-local numeric/string width misses are counted in -diagnostics but do not poison the normal guard, because those rows can be sparse -within an otherwise profitable schema. A separate dense-width guard pauses a -schema only after at least 128 width probes with more than 75% width misses; this -catches pathological over-cap schemas without disabling CCDG-like layouts where -only a small minority of rows have very long phase strings. After 256 skipped -records, the plan probes again so later stable regions can recover the optimized -path. +Compile-time unsupported schemas are still cached as unsupported, so repeated +unoptimizable FORMAT strings pay the compile/classification cost once and then +fall back directly to the generic parser. ## Correctness Rules @@ -191,10 +183,9 @@ All planned outputs compared byte-identical to baseline. The CCDG 10k fallbacks are all `string_width=139`, meaning only rows with measured string fields wider than the 256-byte planned cap use the generic parser. 
The float/string control fixtures still fall back as unsupported -because the low-profit schema gate deliberately rejects those schemas. A -briefly tested consecutive-width guard regressed CCDG to 9,702 hits / 298 -fallbacks; the retained dense-width guard restores the expected sparse-fallback -profile. +because the low-profit schema gate deliberately rejects those schemas. Briefly +tested runtime guards regressed sparse-fallback CCDG-like layouts, so the current +implementation leaves row-local fallbacks local to the record. ## Full Threaded Corpus Benchmark @@ -511,7 +502,7 @@ into numeric and string limits: - numeric measured vectors remain capped at 64 values; - measured strings are capped at 256 bytes; -- numeric/string width fallbacks are counted but do not disable the schema guard. +- numeric/string width fallbacks are counted but do not disable the cached plan. A 512-byte string cap was tested first. It recovered all CCDG 10k planner fallbacks, but the bcftools-level signal was mixed. The retained 256-byte cap diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md index 7d3d12180..e81ffcaba 100644 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -404,7 +404,6 @@ The implementation now reports fallback reasons under `HTS_VCF_FORMAT_PLAN_STATS=1`: - unsupported schema; -- guard cooldown; - numeric width; - string width; - GT shape; @@ -414,8 +413,7 @@ The implementation now reports fallback reasons under The single width cap was split into a 64-value numeric-vector cap and a 256-byte measured-string cap. Numeric and string width fallbacks are diagnostic -only for the normal schema guard: they do not disable a schema that succeeds on -nearby rows. +only: they do not disable a schema that succeeds on nearby rows. Two string caps were benchmarked. A 512-byte cap planned all CCDG 10k rows but had a mixed bcftools-level signal. 
The retained 256-byte cap planned 9,861 of @@ -423,7 +421,7 @@ had a mixed bcftools-level signal. The retained 256-byte cap planned 9,861 of ```text vcf-format-plan attempts=10000 hits=9861 fallback=139 parsed_samples=31574922 -vcf-format-plan-fallback unsupported=0 guard=0 numeric_width=0 string_width=139 gt_shape=0 parse=0 separator=0 sample_count=0 +vcf-format-plan-fallback unsupported=0 numeric_width=0 string_width=139 gt_shape=0 parse=0 separator=0 sample_count=0 ``` Result: retained. Focused tests passed, `git diff --check` was clean, and the @@ -458,8 +456,8 @@ Retained changes: - new fixtures cover rollback after partial planned parsing, malformed unselected samples under `bcf_hdr_set_samples()`, repeated wide GT values, and malformed sample-count failures; -- dense-width guard behavior was tightened so sparse over-cap string rows do not - poison CCDG-like schemas. +- row-local width fallbacks remain record-local so sparse over-cap string rows + do not poison CCDG-like schemas. Result: retained. `make check` passed with 377/377 tests. `make maintainer-check` was attempted but failed before the whitespace/copyright @@ -478,6 +476,32 @@ compared byte-identical. CCDG 10k user-time speedups were 1.14x for `view_bcf`, 1.56x for `query_format`, and 1.12x for `filter_gt`; GIAB single-sample FORMAT rows remained modestly positive, as expected. +## Runtime Cooldown Removal + +The per-plan runtime cooldown was removed after an A/B pass showed no practical +benefit on realistic workloads. The cooldown had paused a supported cached +schema after repeated row-local fallbacks, but standard corpus hit/fallback +counts were identical with and without it. The remaining protection is simpler: +compile-time unsupported schemas are negative-cached, low-profit schemas are +rejected at compile time, and row-local misses fall back only for that record. 
+ +The final no-cooldown parser corpus in +`bench/format-shape/large/results-no-cooldown-final` compared byte-identical to +baseline. Representative planned user times: + +| Input | Baseline user | Planned user | Hits / fallback | +|---|---:|---:|---:| +| CCDG 10k | 2.46 s | 2.16 s | 9,861 / 139 | +| 1000G chr22 full GT | 24.50 s | 9.34 s | 1,103,547 / 0 | +| Large reordered likelihood | 2.89 s | 2.42 s | 20,000 / 0 | +| Large float/string negative | 2.88 s | 2.86 s | 0 / 16,000 | +| Mixed row-local fallbacks | 2.14 s | 1.83 s | 12,000 / 0 | +| Two-string float negative | 2.21 s | 2.22 s | 0 / 12,000 | + +Result: retained. Focused planner tests passed, `test/test_format_plan_cache` +passed, all large parser-corpus outputs compared byte-identical, and +`git diff --check` was clean. + ## Main Lessons - Tag-level composition is the right MVP boundary; exact full FORMAT strings are diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md index 8f384a61f..1e210f8de 100644 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -31,9 +31,8 @@ blocks. Fallbacks are whole-row, but they are now classified for diagnostics when `HTS_VCF_FORMAT_PLAN_STATS=1` is set. The current reason counters distinguish -unsupported schemas, guard cooldowns, numeric-width limits, string-width limits, -GT shape misses, parse failures, separator mismatches, and sample-count -mismatches. +unsupported schemas, numeric-width limits, string-width limits, GT shape misses, +parse failures, separator mismatches, and sample-count mismatches. 
## Why This Shape diff --git a/test/test_view.c b/test/test_view.c index 08cf53bbe..6a78e1027 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -41,7 +41,6 @@ extern void vcf_format_plan_stats_for_test(uint64_t *attempts, uint64_t *hits, uint64_t *fallback, uint64_t *parsed_samples); extern void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, - uint64_t *guard, uint64_t *numeric_width, uint64_t *string_width, uint64_t *gt_shape, @@ -453,11 +452,11 @@ int main(int argc, char *argv[]) const char *format_plan_stats = getenv("HTS_VCF_FORMAT_PLAN_STATS"); if (format_plan_stats && strcmp(format_plan_stats, "1") == 0) { uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; - uint64_t unsupported = 0, guard = 0; + uint64_t unsupported = 0; uint64_t numeric_width = 0, string_width = 0, gt_shape = 0, parse = 0; uint64_t separator = 0, sample_count = 0; vcf_format_plan_stats_for_test(&attempts, &hits, &fallback, &parsed_samples); - vcf_format_plan_fallback_stats_for_test(&unsupported, &guard, &numeric_width, + vcf_format_plan_fallback_stats_for_test(&unsupported, &numeric_width, &string_width, &gt_shape, &parse, &separator, &sample_count); @@ -467,8 +466,8 @@ int main(int argc, char *argv[]) (unsigned long long) fallback, (unsigned long long) parsed_samples); fprintf(stderr, - "vcf-format-plan-fallback unsupported=%llu guard=%llu numeric_width=%llu string_width=%llu gt_shape=%llu parse=%llu separator=%llu sample_count=%llu\n", - (unsigned long long) unsupported, (unsigned long long) guard, + "vcf-format-plan-fallback unsupported=%llu numeric_width=%llu string_width=%llu gt_shape=%llu parse=%llu separator=%llu sample_count=%llu\n", + (unsigned long long) unsupported, (unsigned long long) numeric_width, (unsigned long long) string_width, (unsigned long long) gt_shape, (unsigned long long) parse, (unsigned long long) separator, (unsigned long long) sample_count); diff --git a/vcf.c b/vcf.c index f17838c37..23144ef3e 100644 --- a/vcf.c +++ b/vcf.c 
@@ -3230,7 +3230,6 @@ static vcf_format_plan_stats_t vcf_format_plan_stats; typedef enum { VCF_FORMAT_PLAN_FB_UNSUPPORTED = 0, - VCF_FORMAT_PLAN_FB_GUARD, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH, VCF_FORMAT_PLAN_FB_STRING_WIDTH, VCF_FORMAT_PLAN_FB_GT_SHAPE, @@ -3266,7 +3265,6 @@ void vcf_format_plan_stats_for_test(uint64_t *attempts, uint64_t *hits, } void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, - uint64_t *guard, uint64_t *numeric_width, uint64_t *string_width, uint64_t *gt_shape, @@ -3276,8 +3274,6 @@ void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, { if (unsupported) *unsupported = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_UNSUPPORTED]; - if (guard) - *guard = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_GUARD]; if (numeric_width) *numeric_width = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH]; if (string_width) @@ -3314,25 +3310,7 @@ static int vcf_format_plan_enabled(void) return enabled; } -typedef struct { - uint32_t attempts; - uint32_t hits; - uint32_t fallbacks; - uint32_t width_attempts; - uint32_t width_fallbacks; - uint16_t miss_streak; - uint16_t width_miss_streak; - uint16_t cooldown; - uint8_t disabled; -} vcf_format_fast_guard_t; - enum { - VCF_FORMAT_FAST_DISABLE_STREAK = 8, - VCF_FORMAT_FAST_PROBE_ATTEMPTS = 128, - VCF_FORMAT_FAST_MAX_FALLBACK_PCT = 10, - VCF_FORMAT_FAST_WIDTH_PROBE_ATTEMPTS = 128, - VCF_FORMAT_FAST_MAX_WIDTH_FALLBACK_PCT = 75, - VCF_FORMAT_FAST_COOLDOWN_RECORDS = 256, VCF_FORMAT_MAX_NUMERIC_WIDTH = 64, VCF_FORMAT_MAX_STRING_WIDTH = 256 }; @@ -3353,98 +3331,6 @@ static inline void vcf_format_plan_set_reason(vcf_format_plan_fallback_reason_t *dst = reason; } -static inline int vcf_format_plan_width_reason(vcf_format_plan_fallback_reason_t reason) -{ - return reason == VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH || - reason == VCF_FORMAT_PLAN_FB_STRING_WIDTH; -} - -static inline int vcf_format_plan_guard_counts_reason(vcf_format_plan_fallback_reason_t reason) -{ - /* - * 
Row-local width limits are expected on mixed real-world files. They are - * tracked by a separate dense-width guard so sparse long rows do not disable - * an otherwise useful schema. - */ - return !vcf_format_plan_width_reason(reason); -} - -static inline void vcf_format_fast_guard_reset(vcf_format_fast_guard_t *guard) -{ - guard->attempts = 0; - guard->hits = 0; - guard->fallbacks = 0; - guard->width_attempts = 0; - guard->width_fallbacks = 0; - guard->miss_streak = 0; - guard->width_miss_streak = 0; - guard->disabled = 0; -} - -static inline int vcf_format_fast_guard_enabled(vcf_format_fast_guard_t *guard) -{ - if (!guard->disabled) - return 1; - if (guard->cooldown) { - guard->cooldown--; - return 0; - } - vcf_format_fast_guard_reset(guard); - return 1; -} - -static inline void vcf_format_fast_guard_success(vcf_format_fast_guard_t *guard) -{ - if (guard->attempts != UINT32_MAX) - guard->attempts++; - if (guard->hits != UINT32_MAX) - guard->hits++; - if (guard->width_attempts != UINT32_MAX) - guard->width_attempts++; - guard->miss_streak = 0; - guard->width_miss_streak = 0; -} - -static inline void vcf_format_fast_guard_fallback(vcf_format_fast_guard_t *guard) -{ - if (guard->attempts != UINT32_MAX) - guard->attempts++; - if (guard->fallbacks != UINT32_MAX) - guard->fallbacks++; - if (guard->miss_streak != UINT16_MAX) - guard->miss_streak++; - guard->width_miss_streak = 0; - - if (guard->miss_streak >= VCF_FORMAT_FAST_DISABLE_STREAK) { - guard->disabled = 1; - guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; - return; - } - if (guard->attempts >= VCF_FORMAT_FAST_PROBE_ATTEMPTS && - (uint64_t) guard->fallbacks * 100 > - (uint64_t) guard->attempts * VCF_FORMAT_FAST_MAX_FALLBACK_PCT) { - guard->disabled = 1; - guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; - } -} - -static inline void vcf_format_fast_guard_width_fallback(vcf_format_fast_guard_t *guard) -{ - if (guard->width_attempts != UINT32_MAX) - guard->width_attempts++; - if (guard->width_fallbacks != 
UINT32_MAX) - guard->width_fallbacks++; - if (guard->width_miss_streak != UINT16_MAX) - guard->width_miss_streak++; - - if (guard->width_attempts >= VCF_FORMAT_FAST_WIDTH_PROBE_ATTEMPTS && - (uint64_t) guard->width_fallbacks * 100 > - (uint64_t) guard->width_attempts * VCF_FORMAT_FAST_MAX_WIDTH_FALLBACK_PCT) { - guard->disabled = 1; - guard->cooldown = VCF_FORMAT_FAST_COOLDOWN_RECORDS; - } -} - typedef struct { /* * Header-derived operation for one FORMAT tag. This is the reusable, @@ -3476,7 +3362,6 @@ typedef struct { vcf_format_plan_fallback_reason_t fallback_reason; int n_ops; vcf_format_op_t ops[MAX_N_FMT]; - vcf_format_fast_guard_t general_guard; } vcf_format_general_plan_t; struct vcf_format_plan_cache_t { @@ -4852,34 +4737,17 @@ static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, reason = VCF_FORMAT_PLAN_FB_UNSUPPORTED; goto fallback; } - if (!vcf_format_fast_guard_enabled(&plan->general_guard)) { - /* - * If this FORMAT string repeatedly fails row-local validation, stop - * probing it for a short cooldown. This protects mixed or pathological - * files from paying fast-path setup cost on every record. 
- */ - vcf_format_plan_note_fallback(VCF_FORMAT_PLAN_FB_GUARD); - return -3; - } nsamples = bcf_hdr_nsamples(h); if (!nsamples) return 0; ret = vcf_parse_format_general_strict(s, h, v, plan, q, &reason); - if (ret == 0) { - vcf_format_fast_guard_success(&plan->general_guard); + if (ret == 0) return ret; - } if (ret != -4) return ret; fallback: - if (plan) { - if (vcf_format_plan_width_reason(reason)) - vcf_format_fast_guard_width_fallback(&plan->general_guard); - else if (vcf_format_plan_guard_counts_reason(reason)) - vcf_format_fast_guard_fallback(&plan->general_guard); - } vcf_format_plan_note_fallback(reason); return -3; } From be6eaa480713fc88bb760063d710e7a74db48318 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 30 Apr 2026 18:37:35 +0200 Subject: [PATCH 32/38] Reuse FORMAT string spans in planner --- test/format-plan-string-span.vcf | 12 +++++ test/test.pl | 70 ++++++++++++++++++++++-- vcf.c | 92 +++++++++++++++++++++++++++++--- 3 files changed, 165 insertions(+), 9 deletions(-) create mode 100644 test/format-plan-string-span.vcf diff --git a/test/format-plan-string-span.vcf b/test/format-plan-string-span.vcf new file mode 100644 index 000000000..b2861114a --- /dev/null +++ b/test/format-plan-string-span.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:FT:PID:DP 0/1:PASS:P1:12 0/0:LowQual:PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:8 ./.:.:.:0 +1 2 . A C . PASS . FT:GT:PID:DP PASS:0/1:P1:12 q10:1/1:PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:4 .:./.:.:0 +1 3 . A C . PASS . PID:HP:GT:DP P1:H1:0|1:10 .:H2:1|1:5 PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:H3:./.:0 +1 4 . A C . PASS . 
GT:PID:FT:HP:DP 0/1:SHORT:PASS:A:20 0/0:PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:LowQual:hap-two:9 ./.:.:.:.:0 diff --git a/test/test.pl b/test/test.pl index 50aee44fa..00c677a90 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1212,14 +1212,55 @@ sub test_vcf_44 cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf"); } +sub test_vcf_format_plan_check_stats +{ + my ($stats_file, $expected) = @_; + return if !defined($expected); + + open(my $fh, '<', $stats_file) + or return "failed to open planner stats file $stats_file: $!"; + local $/; + my $stats = <$fh>; + close($fh) or return "failed to close planner stats file $stats_file: $!"; + + my %observed; + if ($stats =~ /vcf-format-plan attempts=(\d+) hits=(\d+) fallback=(\d+) parsed_samples=(\d+)/) { + @observed{qw(attempts hits fallback parsed_samples)} = ($1, $2, $3, $4); + } else { + return "missing planner stats in $stats_file\n$stats"; + } + + if ($stats =~ /vcf-format-plan-fallback unsupported=(\d+) numeric_width=(\d+) string_width=(\d+) gt_shape=(\d+) parse=(\d+) separator=(\d+) sample_count=(\d+)/) { + @observed{qw(unsupported numeric_width string_width gt_shape parse separator sample_count)} + = ($1, $2, $3, $4, $5, $6, $7); + } else { + return "missing planner fallback stats in $stats_file\n$stats"; + } + + for my $key (sort keys %$expected) { + return "planner stat $key: expected $$expected{$key}, got $observed{$key}\n$stats" + if !exists($observed{$key}) || $observed{$key} != $$expected{$key}; + } + + return; +} + sub test_vcf_format_plan_one { - my ($opts, $input, $label, $extra_args) = @_; + my ($opts, $input, $label, $extra_args, $expected_stats) = @_; my $base = "$$opts{tmp}/$label.base.bcf"; my $plan = "$$opts{tmp}/$label.plan.bcf"; my $disabled = "$$opts{tmp}/$label.disabled.bcf"; + my $plan_stats = "$$opts{tmp}/$label.plan.stats"; my $test = "VCF FORMAT planner: $label"; my $args = defined($extra_args) ? 
$extra_args : ""; + my $plan_env = "HTS_VCF_FORMAT_PLAN=1"; + my $plan_stderr = ""; + + if (defined($expected_stats)) { + $plan_env .= " HTS_VCF_FORMAT_PLAN_STATS=1"; + $plan_stderr = " 2>$plan_stats"; + } print "$test:\n"; @@ -1231,7 +1272,7 @@ sub test_vcf_format_plan_one return; } - $cmd = "env HTS_VCF_FORMAT_PLAN=1 $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $plan"; + $cmd = "env $plan_env $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $plan$plan_stderr"; print "\t$cmd\n"; ($ret, $out) = _cmd($cmd); if ($ret) { @@ -1239,6 +1280,14 @@ sub test_vcf_format_plan_one return; } + if (defined($expected_stats)) { + my $stats_error = test_vcf_format_plan_check_stats($plan_stats, $expected_stats); + if ($stats_error) { + failed($opts, $test, $stats_error); + return; + } + } + $cmd = "cmp $base $plan"; print "\t$cmd\n"; ($ret, $out) = _cmd($cmd); @@ -1308,6 +1357,15 @@ sub test_vcf_format_plan test_vcf_format_plan_one($opts, $input, $label, ""); } + test_vcf_format_plan_one($opts, "format-plan-string-span.vcf", + "format-plan-string-span", "", + { attempts => 4, hits => 4, fallback => 0, + parsed_samples => 12, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + for my $samples ("S1,S3", "S2", "^S2") { for my $input ("format-plan-composable.vcf", "format-plan-edge.vcf") { (my $label = "$input.$samples") =~ s/[^A-Za-z0-9_.-]/_/g; @@ -1316,7 +1374,13 @@ sub test_vcf_format_plan } test_vcf_format_plan_one($opts, "format-plan-sample-skip.vcf", - "format-plan-sample-skip.S1_S3", "-s S1,S3"); + "format-plan-sample-skip.S1_S3", "-s S1,S3", + { attempts => 1, hits => 1, fallback => 0, + parsed_samples => 2, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); test_vcf_format_plan_failure($opts, "format-plan-sample-count.vcf", "format-plan-sample-count"); } diff --git a/vcf.c b/vcf.c index 
23144ef3e..9d7459ff4 100644 --- a/vcf.c +++ b/vcf.c @@ -3361,6 +3361,7 @@ typedef struct { int supported; vcf_format_plan_fallback_reason_t fallback_reason; int n_ops; + uint8_t has_string_spans; vcf_format_op_t ops[MAX_N_FMT]; } vcf_format_general_plan_t; @@ -3397,6 +3398,11 @@ typedef struct { uint8_t can_compact; } vcf_format_row_op_t; +typedef struct { + const char *ptr; + int len; +} vcf_format_string_span_t; + typedef struct { int32_t min; int32_t max; @@ -3613,6 +3619,7 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma if (plan->ops[plan->n_ops].number != 1) goto done; plan->ops[plan->n_ops].measured_width = 1; + plan->has_string_spans = 1; } else if (vl != BCF_VL_FIXED && vl != BCF_VL_A && vl != BCF_VL_R && vl != BCF_VL_G && vl != BCF_VL_VAR) { @@ -4087,6 +4094,21 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int return 0; } +VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string_span(const vcf_format_string_span_t *span, + const char **sp, + char *out, int width) +{ + int l = span->len; + + if (*sp != span->ptr || l > width) + return -1; + memcpy(out, span->ptr, l); + if (l < width) + memset(out + l, 0, width - l); + *sp = span->ptr + l; + return 0; +} + static int vcf_plan_parse_float_vector_dynamic(const char **sp, float *out, int width) { const char *s = *sp; @@ -4355,13 +4377,19 @@ static void vcf_format_compact_row_op(kstring_t *mem, int nsamples, static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const vcf_format_general_plan_t *plan, bcf1_t *v, char *q, int *widths, + size_t *string_span_offsets, + size_t *string_spans_end, vcf_format_plan_fallback_reason_t *reason) { + kstring_t *mem = (kstring_t*)&h->mem; const char *cur, *end; - int has_measured = 0, sample, kept = 0, j; + int has_measured = 0, has_string_spans = 0, sample, kept = 0, j; int nsamples = h->keep_samples ? 
h->nsamples_ori : bcf_hdr_nsamples(h); int output_nsamples = bcf_hdr_nsamples(h); + if (string_spans_end) + *string_spans_end = 0; + /* * With bcf_hdr_set_samples(), the text line still contains the original * sample columns but BCF output must contain only the retained samples. The @@ -4371,6 +4399,8 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, for (j = 0; j < plan->n_ops; j++) { const vcf_format_op_t *op = &plan->ops[j]; + if (string_span_offsets) + string_span_offsets[j] = (size_t)-1; if (op->measured_width) { /* * Strings and Number=. numeric vectors need a first pass so the @@ -4382,6 +4412,8 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, */ widths[j] = 0; has_measured = 1; + if (!op->is_gt && op->htype == BCF_HT_STR) + has_string_spans = 1; } else { widths[j] = vcf_format_general_expected_width(op, v); if (widths[j] <= 0 || widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { @@ -4394,6 +4426,29 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, if (!has_measured) return 0; + if (has_string_spans && string_span_offsets) { + mem->l = 0; + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + size_t bytes; + + if (!op->measured_width || op->is_gt || op->htype != BCF_HT_STR) + continue; + if (align_mem(mem) < 0) + return -1; + string_span_offsets[j] = mem->l; + bytes = (size_t) output_nsamples * sizeof(vcf_format_string_span_t); + if (output_nsamples < 0 || + output_nsamples > INT_MAX / (int)sizeof(vcf_format_string_span_t) || + (uint64_t) mem->l + (uint64_t) bytes > INT_MAX || + ks_resize(mem, mem->l + bytes) < 0) + return -1; + mem->l += bytes; + } + if (string_spans_end) + *string_spans_end = mem->l; + } + cur = q + 1; end = s->s + s->l; for (sample = 0; sample < nsamples && cur < end; sample++) { @@ -4426,6 +4481,12 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, } if (op->measured_width && !op->is_gt && 
op->htype == BCF_HT_STR) { w = cur - field; + if (string_span_offsets && string_span_offsets[j] != (size_t)-1) { + vcf_format_string_span_t *spans = + (vcf_format_string_span_t *)(mem->s + string_span_offsets[j]); + spans[kept].ptr = field; + spans[kept].len = w; + } if (j > 0) w++; if (w <= 0) @@ -4487,6 +4548,8 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, const vcf_format_general_plan_t *plan, char *q, vcf_format_row_op_t *row_ops, + const size_t *string_span_offsets, + size_t string_spans_end, vcf_format_plan_fallback_reason_t *reason) { kstring_t *mem = (kstring_t*)&h->mem; @@ -4538,7 +4601,7 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, } } - mem->l = 0; + mem->l = string_spans_end; for (j = direct_ops; j < plan->n_ops; j++) { vcf_format_row_op_t *op = &row_ops[j]; @@ -4630,7 +4693,15 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, n = vcf_plan_float_vector_count((float *)buf, op->width); break; case VCF_FORMAT_ROW_STR: - if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) { + if (string_span_offsets && string_span_offsets[j] != (size_t)-1) { + vcf_format_string_span_t *spans = + (vcf_format_string_span_t *)(mem->s + string_span_offsets[j]); + if (vcf_plan_copy_string_span(&spans[kept], &cur, + (char *)buf, op->width) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + goto fallback; + } + } else if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) { vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); goto fallback; } @@ -4717,12 +4788,21 @@ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, { int widths[MAX_N_FMT]; vcf_format_row_op_t row_ops[MAX_N_FMT]; + size_t string_span_offsets_buf[MAX_N_FMT]; + size_t *string_span_offsets = plan->has_string_spans ? 
string_span_offsets_buf : NULL; + size_t string_spans_end = 0; + int ret; - if (vcf_format_general_strict_widths(s, h, plan, v, q, widths, reason) < 0) - return -4; + ret = vcf_format_general_strict_widths(s, h, plan, v, q, widths, + string_span_offsets, + &string_spans_end, reason); + if (ret < 0) + return ret; if (vcf_format_general_resolve_ops(plan, v, widths, row_ops, reason) < 0) return -4; - return vcf_parse_format_general_composable(s, h, v, plan, q, row_ops, reason); + return vcf_parse_format_general_composable(s, h, v, plan, q, row_ops, + string_span_offsets, + string_spans_end, reason); } static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, From 0767a556cf87e1a3e148631fd3b24974d986da8a Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 30 Apr 2026 22:19:13 +0200 Subject: [PATCH 33/38] update --- bench/format-shape/.gitignore | 11 +- bench/format-shape/README.md | 22 +-- .../large/bcftools-full-ccdg-inputs.tsv | 2 +- bench/format-shape/large/results/checks.tsv | 21 --- bench/format-shape/large/results/timings.tsv | 31 ---- bench/format-shape/results/checks.tsv | 21 --- bench/format-shape/results/timings.tsv | 31 ---- docs/FORMAT_PLAN_CURRENT.md | 77 ++++----- docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 30 ++-- docs/FORMAT_PLAN_OVERVIEW.md | 10 +- test/format-plan-empty-format-tag.vcf | 6 + test/format-plan-float-vector.vcf | 13 ++ test/format-plan-malformed-fields.vcf | 11 ++ test/test.pl | 59 ++++++- vcf.c | 155 +++++++++++------- 15 files changed, 249 insertions(+), 251 deletions(-) delete mode 100644 bench/format-shape/large/results/checks.tsv delete mode 100644 bench/format-shape/large/results/timings.tsv delete mode 100644 bench/format-shape/results/checks.tsv delete mode 100644 bench/format-shape/results/timings.tsv create mode 100644 test/format-plan-empty-format-tag.vcf create mode 100644 test/format-plan-float-vector.vcf create mode 100644 test/format-plan-malformed-fields.vcf diff --git a/bench/format-shape/.gitignore 
b/bench/format-shape/.gitignore index 801993aef..b60d39628 100644 --- a/bench/format-shape/.gitignore +++ b/bench/format-shape/.gitignore @@ -4,13 +4,6 @@ synthetic/*.vcf.gz large/**/*.vcf.gz large/**/*.vcf.bgz large/**/*.tbi -large/results/*.bcf -large/results/*.stderr -large/results/*.tmp +large/results/* large/results-*/* -results/*.bcf -results/*.stderr -!results/timings.tsv -!results/checks.tsv -!large/results/timings.tsv -!large/results/checks.tsv +results/* diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md index 5c51d4b73..2959bb250 100644 --- a/bench/format-shape/README.md +++ b/bench/format-shape/README.md @@ -29,14 +29,14 @@ bench/format-shape/ results/ generated timing logs and BCF outputs ``` -The downloaded/generated VCF inputs are intentionally ignored by git to avoid -accidentally pushing large benchmark data. The manifest, scripts, docs, and -small result summaries are tracked; the local data can be regenerated from the -commands below. +The downloaded/generated VCF inputs and benchmark result files are intentionally +ignored by git to avoid accidentally pushing large local data. The manifests, +scripts, and docs are tracked; local data and timing summaries can be +regenerated from the commands below. -`results/` can be regenerated at any time and may become large. The script -keeps BCF outputs locally so `cmp` checks are inspectable, but `.gitignore` -excludes those large files. +`results/` can be regenerated at any time and may become large. The scripts +write timing/check summaries and keep BCF outputs locally so `cmp` checks are +inspectable, but `.gitignore` excludes those rerun artifacts. 
## Repo Tests @@ -95,10 +95,11 @@ The parent CCDG/1000G high-coverage chr22 file for https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` -It is 26.0 GiB compressed and is available locally at: +It is 26.0 GiB compressed. For local reruns, point the full-CCDG manifest at a +local copy such as: ```text -/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +/path/to/local/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` Do not run the normal output-materializing command harness on this file. A @@ -378,13 +379,14 @@ SYNTHETIC_ONLY_NEW=1 \ bench/format-shape/large/synthetic 2048 ``` -The latest large run is summarized in: +The latest large run used this local output directory: ```text bench/format-shape/large/results-prod-hardening2/timings.tsv bench/format-shape/large/results-prod-hardening2/checks.tsv ``` +Generated result files are ignored; the summary below is the portable record. All plan outputs in that run compared byte-identical to baseline. That run includes fallback reason diagnostics. 
In the CCDG 10k slice, the diff --git a/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv b/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv index 0dce0fd61..0c25507d6 100644 --- a/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv +++ b/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv @@ -1,2 +1,2 @@ name path source -ccdg_chr22_full /Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz local full 1000G/CCDG high-coverage chr22 VCF, 3,202 samples +ccdg_chr22_full /path/to/local/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz local full 1000G/CCDG high-coverage chr22 VCF, 3,202 samples diff --git a/bench/format-shape/large/results/checks.tsv b/bench/format-shape/large/results/checks.tsv deleted file mode 100644 index 3dfeed0c0..000000000 --- a/bench/format-shape/large/results/checks.tsv +++ /dev/null @@ -1,21 +0,0 @@ -name comparison status -ccdg_10k baseline_vs_exact ok -ccdg_10k baseline_vs_interp ok -1000g_chr22_full_genotypes baseline_vs_exact ok -1000g_chr22_full_genotypes baseline_vs_interp ok -large_ccdg_likelihood_2048s baseline_vs_exact ok -large_ccdg_likelihood_2048s baseline_vs_interp ok -large_reordered_likelihood_2048s baseline_vs_exact ok -large_reordered_likelihood_2048s baseline_vs_interp ok -large_multiallelic_likelihood_2048s baseline_vs_exact ok -large_multiallelic_likelihood_2048s baseline_vs_interp ok -large_float_string_2048s baseline_vs_exact ok -large_float_string_2048s baseline_vs_interp ok -large_phase_width_variation_2048s baseline_vs_exact ok -large_phase_width_variation_2048s baseline_vs_interp ok -large_mixed_likelihood_2048s baseline_vs_exact ok -large_mixed_likelihood_2048s baseline_vs_interp ok -large_gt_first_reordered_2048s baseline_vs_exact ok -large_gt_first_reordered_2048s baseline_vs_interp ok -large_two_string_float_2048s baseline_vs_exact ok -large_two_string_float_2048s 
baseline_vs_interp ok diff --git a/bench/format-shape/large/results/timings.tsv b/bench/format-shape/large/results/timings.tsv deleted file mode 100644 index 42a3a118e..000000000 --- a/bench/format-shape/large/results/timings.tsv +++ /dev/null @@ -1,31 +0,0 @@ -name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 2.95 2.76 0.13 0 0 0 0 0 0 0 -ccdg_10k exact 1.88 1.73 0.13 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.85 1.7 0.13 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_full_genotypes baseline 27.79 27.05 0.65 0 0 0 0 0 0 0 -1000g_chr22_full_genotypes exact 6.59 6 0.57 1103547 1103547 0 2763281688 0 0 0 -1000g_chr22_full_genotypes interp 7.18 6.51 0.6 1103547 1103547 0 2763281688 0 0 0 -large_ccdg_likelihood_2048s baseline 4.28 4.03 0.21 0 0 0 0 0 0 0 -large_ccdg_likelihood_2048s exact 2.95 2.77 0.16 20000 20000 0 40960000 0 0 0 -large_ccdg_likelihood_2048s interp 2.93 2.74 0.18 20000 20000 0 40960000 20000 20000 0 -large_reordered_likelihood_2048s baseline 3.06 2.91 0.13 0 0 0 0 0 0 0 -large_reordered_likelihood_2048s exact 2.76 2.63 0.12 20000 20000 0 40960000 0 0 0 -large_reordered_likelihood_2048s interp 2.76 2.62 0.12 20000 20000 0 40960000 0 0 0 -large_multiallelic_likelihood_2048s baseline 3.35 3.18 0.15 0 0 0 0 0 0 0 -large_multiallelic_likelihood_2048s exact 2.29 2.13 0.13 16000 16000 0 32768000 0 0 0 -large_multiallelic_likelihood_2048s interp 2.06 1.92 0.13 16000 16000 0 32768000 16000 16000 0 -large_float_string_2048s baseline 3.15 2.92 0.19 0 0 0 0 0 0 0 -large_float_string_2048s exact 3.04 2.85 0.18 16000 16000 0 32768000 0 0 0 -large_float_string_2048s interp 3 2.8 0.18 16000 16000 0 32768000 0 0 0 -large_phase_width_variation_2048s baseline 2.78 2.58 0.18 0 0 0 0 0 0 0 -large_phase_width_variation_2048s exact 2.25 2.04 0.19 12000 12000 0 24576000 0 0 0 -large_phase_width_variation_2048s interp 2.25 2.05 0.18 12000 12000 0 24576000 12000 12000 0 
-large_mixed_likelihood_2048s baseline 2.3 2.18 0.1 0 0 0 0 0 0 0 -large_mixed_likelihood_2048s exact 1.69 1.58 0.09 12000 11400 600 23347200 7355 6650 705 -large_mixed_likelihood_2048s interp 1.7 1.59 0.09 12000 12000 0 24576000 11295 10236 1059 -large_gt_first_reordered_2048s baseline 1.84 1.74 0.07 0 0 0 0 0 0 0 -large_gt_first_reordered_2048s exact 1.62 1.54 0.06 12000 12000 0 24576000 0 0 0 -large_gt_first_reordered_2048s interp 1.62 1.53 0.07 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s baseline 2.46 2.27 0.17 0 0 0 0 0 0 0 -large_two_string_float_2048s exact 2.64 2.46 0.16 12000 12000 0 24576000 0 0 0 -large_two_string_float_2048s interp 2.65 2.47 0.16 12000 12000 0 24576000 0 0 0 diff --git a/bench/format-shape/results/checks.tsv b/bench/format-shape/results/checks.tsv deleted file mode 100644 index 58c9e8f97..000000000 --- a/bench/format-shape/results/checks.tsv +++ /dev/null @@ -1,21 +0,0 @@ -name comparison status -ccdg_10k baseline_vs_exact ok -ccdg_10k baseline_vs_interp ok -1000g_chr22_genotypes baseline_vs_exact ok -1000g_chr22_genotypes baseline_vs_interp ok -1000g_wgs_sites baseline_vs_exact ok -1000g_wgs_sites baseline_vs_interp ok -clinvar_grch38_chr22 baseline_vs_exact ok -clinvar_grch38_chr22 baseline_vs_interp ok -gnomad_v4.1_exomes_sites baseline_vs_exact ok -gnomad_v4.1_exomes_sites baseline_vs_interp ok -synthetic_ccdg_likelihood baseline_vs_exact ok -synthetic_ccdg_likelihood baseline_vs_interp ok -synthetic_reordered_likelihood baseline_vs_exact ok -synthetic_reordered_likelihood baseline_vs_interp ok -synthetic_fixed_numeric baseline_vs_exact ok -synthetic_fixed_numeric baseline_vs_interp ok -synthetic_float_string baseline_vs_exact ok -synthetic_float_string baseline_vs_interp ok -synthetic_multiallelic_likelihood baseline_vs_exact ok -synthetic_multiallelic_likelihood baseline_vs_interp ok diff --git a/bench/format-shape/results/timings.tsv b/bench/format-shape/results/timings.tsv deleted file mode 100644 index 
7966f9d75..000000000 --- a/bench/format-shape/results/timings.tsv +++ /dev/null @@ -1,31 +0,0 @@ -name mode real user sys attempts hits fallback parsed_samples shape_attempts shape_hits shape_fallback -ccdg_10k baseline 2.68 2.46 0.18 0 0 0 0 0 0 0 -ccdg_10k exact 1.74 1.57 0.15 10000 10000 0 32020000 0 0 0 -ccdg_10k interp 1.85 1.66 0.16 10000 10000 0 32020000 10000 10000 0 -1000g_chr22_genotypes baseline 0.04 0.03 0 0 0 0 0 0 0 0 -1000g_chr22_genotypes exact 0.02 0.01 0 1170 1170 0 2929680 1170 0 0 -1000g_chr22_genotypes interp 0.02 0.01 0 1170 1170 0 2929680 1170 0 0 -1000g_wgs_sites baseline 0.01 0 0 0 0 0 0 0 0 0 -1000g_wgs_sites exact 0.01 0 0 0 0 0 0 0 0 0 -1000g_wgs_sites interp 0.01 0 0 0 0 0 0 0 0 0 -clinvar_grch38_chr22 baseline 0.01 0 0 0 0 0 0 0 0 0 -clinvar_grch38_chr22 exact 0.01 0 0 0 0 0 0 0 0 0 -clinvar_grch38_chr22 interp 0.01 0 0 0 0 0 0 0 0 0 -gnomad_v4.1_exomes_sites baseline 0.46 0.4 0.05 0 0 0 0 0 0 0 -gnomad_v4.1_exomes_sites exact 0.47 0.41 0.04 0 0 0 0 0 0 0 -gnomad_v4.1_exomes_sites interp 0.45 0.4 0.05 0 0 0 0 0 0 0 -synthetic_ccdg_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 -synthetic_ccdg_likelihood exact 0.01 0 0 2000 2000 0 16000 0 0 0 -synthetic_ccdg_likelihood interp 0.01 0 0 2000 2000 0 16000 2000 2000 0 -synthetic_reordered_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 -synthetic_reordered_likelihood exact 0 0 0 2000 2000 0 16000 0 0 0 -synthetic_reordered_likelihood interp 0.01 0 0 2000 2000 0 16000 0 0 0 -synthetic_fixed_numeric baseline 0.01 0 0 0 0 0 0 0 0 0 -synthetic_fixed_numeric exact 0.01 0 0 2000 2000 0 16000 2000 0 0 -synthetic_fixed_numeric interp 0.01 0 0 2000 2000 0 16000 2000 0 0 -synthetic_float_string baseline 0.01 0 0 0 0 0 0 0 0 0 -synthetic_float_string exact 0.01 0 0 2000 2000 0 16000 2000 0 0 -synthetic_float_string interp 0.01 0 0 2000 2000 0 16000 2000 0 0 -synthetic_multiallelic_likelihood baseline 0.01 0 0 0 0 0 0 0 0 0 -synthetic_multiallelic_likelihood exact 0 0 0 1200 1200 0 9600 0 0 0 
-synthetic_multiallelic_likelihood interp 0 0 0 1200 1200 0 9600 1200 1200 0 diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md index b16557458..2493b20f8 100644 --- a/docs/FORMAT_PLAN_CURRENT.md +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -53,8 +53,8 @@ The compile step rejects: - unsupported header types; - unsupported number models; - `GT` declarations that are not `Type=String,Number=1`. -- string-plus-float-vector schemas with too little integer-vector work to repay - the dynamic path's width-measurement cost. +- measured-string plus float-vector schemas that do not also have integer-vector + work for the planned executor. Undefined tags intentionally fall back to the generic parser so existing dummy-header repair and warning behavior is preserved. @@ -113,7 +113,8 @@ The planned parser must preserve these invariants: - selected-sample parsing must honor `h->keep_samples`, use `h->nsamples_ori` for input-column scans, and set `v->n_sample` to the retained sample count; - duplicate or undefined tags use the generic parser; -- low-profit string/float-heavy schemas use the generic parser; +- measured-string plus float-vector schemas without integer-vector work use the + generic parser; - unsupported GT encodings force fallback; - numeric vectors preserve observed width and vector-end padding; - strings use observed maximum byte length and zero-pad shorter samples; @@ -128,33 +129,16 @@ Focused validation lives in the existing `test/test.pl` harness as `HTS_VCF_FORMAT_PLAN=1` byte-for-byte with `cmp`, and also verifies that unrecognized control values such as `HTS_VCF_FORMAT_PLAN=off` behave like the generic parser. The repo fixtures cover numeric-width and GT-shape fallback, -low-value float/string schemas, cache growth, long FORMAT strings, string-width -fallback, separator fallback, parse fallback with rollback, repeated wide GT -values, selected-sample skipping of malformed unselected columns, and -sample-count mismatch. 
The selected-sample checks compare explicit inclusion -and exclusion lists (`S1,S3`, `S2`, and `^S2`). `test/test_format_plan_cache` -mutates and resyncs a header after a plan has been compiled for the same FORMAT -string, then verifies the row is planned again with the new metadata. - -## Current Source Delta - -After removing the old exact kernels and SIMD tab scanner, then hardening the -dynamic cache, the live parser/test hook delta relative to `origin/develop` is: - -| File | Added lines | -|---|---:| -| `vcf.c` | 1,939 added / 164 removed | -| `Makefile` | 6 | -| `test/test.pl` | 110 | -| `test/test_format_plan_cache.c` | 133 | -| `test/test_view.c` | 45 added / 2 removed | -| `test/format-plan-cache.vcf` | 61 | -| `test/format-plan-edge.vcf` | 38 | -| `test/format-plan-float-string.vcf` | 8 | -| `test/format-plan-fallback.vcf` | 10 | -| `test/format-plan-repeated-wide-gt.vcf` | 14 | -| `test/format-plan-sample-count.vcf` | 6 | -| `test/format-plan-sample-skip.vcf` | 7 | +mixed string/float schemas kept on the generic parser, cache growth, long FORMAT +strings, string-width fallback, separator fallback, parse fallback with rollback, +repeated wide GT values, float-vector compaction, selected-sample skipping of +malformed unselected columns, and sample-count mismatch. The selected-sample +checks compare explicit inclusion and exclusion lists (`S1,S3`, `S2`, and +`^S2`) and also verify retained-sample float widths do not depend on skipped +input columns. +`test/test_format_plan_cache` mutates and resyncs a header after a plan has been +compiled for the same FORMAT string, then verifies the row is planned again with +the new metadata. ## Large Corpus Benchmark @@ -183,9 +167,10 @@ All planned outputs compared byte-identical to baseline. The CCDG 10k fallbacks are all `string_width=139`, meaning only rows with measured string fields wider than the 256-byte planned cap use the generic parser. 
The float/string control fixtures still fall back as unsupported -because the low-profit schema gate deliberately rejects those schemas. Briefly -tested runtime guards regressed sparse-fallback CCDG-like layouts, so the current -implementation leaves row-local fallbacks local to the record. +because the mixed string/float shape boundary keeps those rows on the generic +parser. Briefly tested runtime guards regressed sparse-fallback CCDG-like +layouts, so the current implementation leaves row-local fallbacks local to the +record. ## Full Threaded Corpus Benchmark @@ -197,9 +182,9 @@ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded-profit-gate \ bench/format-shape/large/threaded-inputs.tsv ``` -All 40 planned outputs compared byte-identical to baseline. Detailed timings -are in `bench/format-shape/large/results-threaded-profit-gate/timings.tsv`; the -table below summarizes real-time speedup. +All 40 planned outputs compared byte-identical to baseline. Generated result +files are ignored; the table below summarizes the recorded real-time speedup +from `bench/format-shape/large/results-threaded-profit-gate`. | Input | 0 threads | 2 threads | 4 threads | 8 threads | |---|---:|---:|---:|---:| @@ -219,19 +204,19 @@ table below summarizes real-time speedup. 
A clean bcftools `develop` worktree was built at: ```text -/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan +/path/to/bcftools-htslib-vcf-plan ``` using this htslib worktree: ```sh -make HTSDIR=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/htslib-vcf-avx-sanity bcftools +make HTSDIR=/path/to/htslib bcftools ``` Timing command: ```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ bench/format-shape/scripts/run_bcftools_bench.sh \ bench/format-shape/large/threaded-inputs.tsv @@ -264,7 +249,7 @@ path through bcftools rather than only through the test harness. Command: ```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ SAMPLE_COUNT=2 KEEP_OUTPUTS=0 \ OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ bench/format-shape/scripts/run_bcftools_bench.sh \ @@ -324,7 +309,7 @@ The broader command runner exercises bcftools paths that either consume FORMAT records, discard FORMAT records, or mostly operate on site-level data: ```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-commands \ bench/format-shape/scripts/run_bcftools_command_bench.sh \ bench/format-shape/large/bcftools-command-inputs.tsv @@ -363,7 +348,7 @@ planner overhead in workloads that do not benefit from FORMAT decoding. grow quickly. 
It was run against the smaller merge manifest: ```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ COMMANDS=merge_self KEEP_OUTPUTS=0 \ OUTDIR=bench/format-shape/large/results-bcftools-merge \ bench/format-shape/scripts/run_bcftools_command_bench.sh \ @@ -386,7 +371,7 @@ structural variants, and v5.0q CHM13v2.0 small variants. The same bcftools command suite was run against those files plus the all-sample CCDG 10k slice: ```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening \ bench/format-shape/scripts/run_bcftools_command_bench.sh \ bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv @@ -413,10 +398,10 @@ The parent CCDG/1000G high-coverage chr22 file is 26.0 GiB compressed: https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` -It is available locally at: +For local reruns, point the full-CCDG manifest at a local copy such as: ```text -/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz +/path/to/local/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` The normal command harness is unsafe for this input because one full @@ -424,7 +409,7 @@ The normal command harness is unsafe for this input because one full full-file benchmark therefore used the streaming checksum harness: ```sh -BCFTOOLS=/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/bcftools-htslib-vcf-plan/bcftools \ +BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ 
OUTDIR=bench/format-shape/large/results-bcftools-full-ccdg-stream \ bash bench/format-shape/scripts/run_bcftools_command_bench_stream.sh \ bench/format-shape/large/bcftools-full-ccdg-inputs.tsv diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md index e81ffcaba..8052a40c1 100644 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -182,7 +182,7 @@ remained byte-identical after the rewrite, with the same broad performance profile: 1000G chr22 GT user time at 26.06 s baseline versus 7.96 s planned, and CCDG 10k at 2.55 s baseline versus 2.24 s planned. -## Profitability Gate For String/Float Shapes +## String/Float Shape Boundary The expanded threaded benchmark exposed two regressions: @@ -195,11 +195,12 @@ dynamic path had to measure string widths over every sample before parsing, then still use the general float conversion path, while there were no integer vectors to amortize that setup. -Result: retained. The compiler now negative-caches these low-profit schemas and -sends only those FORMAT rows to the generic parser. The full threaded corpus -remained byte-identical. The two-string float case improved from a consistent -slowdown, roughly 0.86-0.89x, to parity at 1.00-1.01x. Other integer-heavy -likelihood rows stayed on the dynamic path. +Result: retained as a conservative support boundary. The compiler now +negative-caches measured-string plus float-vector schemas that do not also have +integer-vector work, and sends those FORMAT rows to the generic parser. The +full threaded corpus remained byte-identical. The two-string float case +improved from a consistent slowdown, roughly 0.86-0.89x, to parity at +1.00-1.01x. Other integer-heavy likelihood rows stayed on the dynamic path. 
## Selected-Sample Support @@ -286,8 +287,7 @@ The full parent CCDG/1000G high-coverage chr22 VCF was identified as: https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz ``` -It is 26.0 GiB compressed and was later found locally under -`/Users/jeremiah.li/geneticoptims/inplace-htslib-refactor/data/original`. +It is 26.0 GiB compressed and requires a local copy for reruns. The normal command harness materializes complete outputs, which is not practical for this file: a single `view_bcf -Ob -l 0` baseline output reached 155 GiB before that run was stopped. A streaming checksum harness was added so command @@ -465,10 +465,11 @@ checks because the local build invoked the C compiler on `test/usepublic.cpp` with `-std=gnu23`. The relevant whitespace check and `git diff --check` passed separately. -The htslib large corpus in `bench/format-shape/large/results-prod-hardening2` -compared byte-identical to baseline. CCDG 10k held the expected 9,861 / 139 -hit/fallback split, and 1000G chr22 full GT remained the largest win at -24.61 s baseline user time versus 9.48 s planned. +The htslib large corpus run written locally under +`bench/format-shape/large/results-prod-hardening2` compared byte-identical to +baseline. The generated result files are ignored, so the recorded summary is: +CCDG 10k held the expected 9,861 / 139 hit/fallback split, and 1000G chr22 full +GT remained the largest win at 24.61 s baseline user time versus 9.48 s planned. The latest bcftools GIAB/CCDG command corpus in `bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening` also @@ -482,8 +483,9 @@ The per-plan runtime cooldown was removed after an A/B pass showed no practical benefit on realistic workloads. 
The cooldown had paused a supported cached schema after repeated row-local fallbacks, but standard corpus hit/fallback counts were identical with and without it. The remaining protection is simpler: -compile-time unsupported schemas are negative-cached, low-profit schemas are -rejected at compile time, and row-local misses fall back only for that record. +compile-time unsupported schemas are negative-cached, unsupported mixed +string/float shapes are rejected at compile time, and row-local misses fall back +only for that record. The final no-cooldown parser corpus in `bench/format-shape/large/results-no-cooldown-final` compared byte-identical to diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md index 1e210f8de..e30d1b7da 100644 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -19,10 +19,10 @@ unsupported decisions tied to the exact header metadata that produced them. If the row fits the supported operation set, the dynamic executor parses samples and writes BCF's transposed FORMAT layout directly. If anything looks unsafe or unsupported, htslib falls back to the generic parser for the whole FORMAT -column. The planner also keeps a small profitability gate: schemas dominated by -measured strings plus float vectors, such as `GT:FT:PID:GL:DP`, currently use -the generic parser because the dynamic path's width-measurement work costs -more than it saves. +column. The planner also keeps a conservative shape boundary: schemas dominated +by measured strings plus float vectors, such as `GT:FT:PID:GL:DP`, currently use +the generic parser because the dynamic path's width-measurement work costs more +than it saves. The optimized path also supports selected-sample reads. 
When `bcf_hdr_set_samples()` is active, it scans the original sample columns, skips @@ -91,7 +91,7 @@ Known fallback cases include: - undefined FORMAT tags that require production header repair; - unsupported header types or number models; -- low-profit string/float-heavy schemas; +- mixed measured-string plus float-vector schemas without integer-vector work; - duplicate FORMAT tags; - malformed separators or unexpected sample cardinality; - row-local widths above the bounded fast-path limit; diff --git a/test/format-plan-empty-format-tag.vcf b/test/format-plan-empty-format-tag.vcf new file mode 100644 index 000000000..b9ca9d63c --- /dev/null +++ b/test/format-plan-empty-format-tag.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 +1 1 . A C . PASS . GT::DP 0/1::5 diff --git a/test/format-plan-float-vector.vcf b/test/format-plan-float-vector.vcf new file mode 100644 index 000000000..a6c70e5de --- /dev/null +++ b/test/format-plan-float-vector.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:GL:QS:AB:DP 0/1:-0.1,-1.2,-3:0.5,1.5:0.50:7 0/0:0,-5,-9:0.1,0.2,0.3,0.4:.:8 ./.:.:1.25,2.5:.:0 +1 2 . A C,G . PASS . GT:GL:QS:AB:DP 1/2:-9,-8,-7,-6,-5,-4:0.1:0.25:12 0/2:-2,-3,-4,-5,-6,-7:0.2,0.3:0.75:9 ./.:.:.:.:0 +1 3 . G T . PASS . QS:GT:GL:AB:DP 0.1,0.2:0/1:-1,-2,-3:0.5:5 .:0/0:0,-10,-20:0.0:4 0.3,0.4,0.5:./.:.:.:0 +1 4 . T G . PASS . GT:GL:QS:AB:DP 0/1:.:.:.:6 0/0:.:.:.:4 ./.:.:.:.:0 +1 5 . C T . PASS . 
GT:GL:QS:AB:DP 0/1:-0.4,-0.8:0.1:0.5:3 0/0:0,-2:0.2:0.1:2 ./.:.:.:.:0 diff --git a/test/format-plan-malformed-fields.vcf b/test/format-plan-malformed-fields.vcf new file mode 100644 index 000000000..60f7a2da2 --- /dev/null +++ b/test/format-plan-malformed-fields.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 +1 1 . A C . PASS . GT:F:DP 0/1::5 +1 2 . A C . PASS . GT:GL:DP 0/1::5 +1 3 . A C . PASS . ST:DP :5 diff --git a/test/test.pl b/test/test.pl index 00c677a90..b50a6364a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1317,18 +1317,30 @@ sub test_vcf_format_plan_one sub test_vcf_format_plan_failure { - my ($opts, $input, $label) = @_; + my ($opts, $input, $label, $expected_stats) = @_; my $base = "$$opts{tmp}/$label.base.bcf"; my $plan = "$$opts{tmp}/$label.plan.bcf"; + my $plan_stats = "$$opts{tmp}/$label.plan.stats"; my $test = "VCF FORMAT planner expected failure: $label"; + my $plan_env = "HTS_VCF_FORMAT_PLAN=1"; + my $plan_stderr = ""; + + if (defined($expected_stats)) { + $plan_env .= " HTS_VCF_FORMAT_PLAN_STATS=1"; + $plan_stderr = " 2>$plan_stats"; + } print "$test:\n"; + if (!-e "$$opts{path}/$input") { + failed($opts, $test, "missing test input $$opts{path}/$input"); + return; + } my $cmd = "env HTS_VCF_FORMAT_PLAN=0 $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $base"; print "\t$cmd\n"; my ($base_ret, $base_out) = _cmd($cmd); - $cmd = "env HTS_VCF_FORMAT_PLAN=1 $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $plan"; + $cmd = "env $plan_env $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $plan$plan_stderr"; print "\t$cmd\n"; my ($plan_ret, $plan_out) = _cmd($cmd); @@ -1337,6 +1349,14 @@ sub test_vcf_format_plan_failure return; } + if (defined($expected_stats)) { + my $stats_error = test_vcf_format_plan_check_stats($plan_stats, $expected_stats); + if ($stats_error) { + failed($opts, $test, $stats_error); + return; + } + } + 
passed($opts, $test); } @@ -1366,6 +1386,33 @@ sub test_vcf_format_plan parse => 0, separator => 0, sample_count => 0 }); + test_vcf_format_plan_one($opts, "format-plan-malformed-fields.vcf", + "format-plan-malformed-fields", "", + { attempts => 3, hits => 0, fallback => 3, + parsed_samples => 0, + unsupported => 0, numeric_width => 0, + string_width => 1, gt_shape => 0, + parse => 2, separator => 0, + sample_count => 0 }); + + test_vcf_format_plan_one($opts, "format-plan-float-vector.vcf", + "format-plan-float-vector", "", + { attempts => 5, hits => 5, fallback => 0, + parsed_samples => 15, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + + test_vcf_format_plan_one($opts, "format-plan-float-vector.vcf", + "format-plan-float-vector.S1_S3", "-s S1,S3", + { attempts => 5, hits => 5, fallback => 0, + parsed_samples => 10, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + for my $samples ("S1,S3", "S2", "^S2") { for my $input ("format-plan-composable.vcf", "format-plan-edge.vcf") { (my $label = "$input.$samples") =~ s/[^A-Za-z0-9_.-]/_/g; @@ -1383,6 +1430,14 @@ sub test_vcf_format_plan sample_count => 0 }); test_vcf_format_plan_failure($opts, "format-plan-sample-count.vcf", "format-plan-sample-count"); + test_vcf_format_plan_failure($opts, "format-plan-empty-format-tag.vcf", + "format-plan-empty-format-tag", + { attempts => 1, hits => 0, fallback => 1, + parsed_samples => 0, + unsupported => 1, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); } sub write_multiblock_bgzf { diff --git a/vcf.c b/vcf.c index 9d7459ff4..59fba63b5 100644 --- a/vcf.c +++ b/vcf.c @@ -3399,6 +3399,11 @@ typedef struct { } vcf_format_row_op_t; typedef struct { + /* + * Measured string fields are scanned once up front to determine the row + * width. 
Keep the span from that pass so execution can copy bytes without + * searching for the same ':' or tab delimiter again. + */ const char *ptr; int len; } vcf_format_string_span_t; @@ -3515,37 +3520,41 @@ static int vcf_format_plan_cache_slot(vcf_format_plan_cache_t *cache) return idx; } -static int vcf_format_general_plan_profitable(const vcf_format_general_plan_t *plan) +static inline int vcf_format_op_is_vector(const vcf_format_op_t *op) +{ + return op->vl_type != BCF_VL_FIXED || op->number != 1; +} + +/* + * Return whether this FORMAT composition is inside the current planned + * executor's supported shape set. This is a support boundary, not a learned + * runtime heuristic: mixed measured-string plus float-vector rows are kept on + * the generic parser unless there is also integer-vector work for the planner + * to accelerate. + */ +static int vcf_format_general_plan_shape_supported(const vcf_format_general_plan_t *plan) { - int j, string_ops = 0, float_vector_ops = 0, int_ops = 0, int_vector_ops = 0; + int j, has_measured_string = 0, has_float_vector = 0, has_int_vector = 0; for (j = 0; j < plan->n_ops; j++) { const vcf_format_op_t *op = &plan->ops[j]; if (op->is_gt) continue; if (op->htype == BCF_HT_STR) { - string_ops++; - } else if (op->htype == BCF_HT_REAL) { - if (op->vl_type == BCF_VL_FIXED && op->number == 1) - ; - else - float_vector_ops++; - } else if (op->htype == BCF_HT_INT) { - int_ops++; - if (op->vl_type != BCF_VL_FIXED || op->number != 1) - int_vector_ops++; + has_measured_string = 1; + } else if (op->htype == BCF_HT_REAL && vcf_format_op_is_vector(op)) { + has_float_vector = 1; + } else if (op->htype == BCF_HT_INT && vcf_format_op_is_vector(op)) { + has_int_vector = 1; } } /* - * FORMAT rows with measured strings plus float vectors have to pay the - * dynamic executor's full width-measurement pass and then still use the - * general float conversion path. 
Without integer vectors to amortize that - * setup, production parsing has been consistently faster on the large - * corpus (for example GT:GL:FT:DP:GQ and GT:FT:PID:GL:DP). + * Examples intentionally left on generic: GT:GL:FT:DP:GQ and + * GT:FT:PID:GL:DP. Both are valid FORMAT schemas, but this executor has no + * cheap integer-vector encoding work to offset the measured-string pass. */ - if (string_ops > 0 && float_vector_ops > 0 && - int_vector_ops == 0 && int_ops <= 2) + if (has_measured_string && has_float_vector && !has_int_vector) return 0; return 1; } @@ -3555,7 +3564,7 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma uint64_t hdr_gen, vcf_format_general_plan_t *plan) { - char *tmp, *tok, *saveptr = NULL; + char *tmp, *tok, *format_end; int i, ret = 0; memset(plan, 0, sizeof(*plan)); @@ -3580,10 +3589,21 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma * additional header-described tags to share the same executor instead of * needing exact string-specific kernels. */ - for (tok = strtok_r(tmp, ":", &saveptr); tok; - tok = strtok_r(NULL, ":", &saveptr)) { + /* + * Keep empty FORMAT tokens visible. strtok_r() would collapse GT::DP into + * GT:DP, but the generic parser treats the empty tag as malformed. 
+ */ + format_end = tmp + format_len; + for (tok = tmp; tok <= format_end; ) { + char *next = memchr(tok, ':', (size_t)(format_end - tok)); int key, htype; + if (!next) + next = format_end; + if (next == tok) + goto done; + *next = '\0'; + if (plan->n_ops >= MAX_N_FMT) goto done; key = bcf_hdr_id2int(h, BCF_DT_ID, tok); @@ -3629,11 +3649,15 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma } } plan->n_ops++; + + if (next == format_end) + break; + tok = next + 1; } if (!plan->n_ops) goto done; - if (!vcf_format_general_plan_profitable(plan)) + if (!vcf_format_general_plan_shape_supported(plan)) goto done; plan->supported = 1; @@ -3783,16 +3807,6 @@ VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add_regular(vcf_plan_int_range_t range->min = val; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_vector_count(const float *vals, int width) -{ - int i; - - for (i = 0; i < width; i++) - if (bcf_float_is_vector_end(vals[i])) - break; - return i; -} - VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) { const char *s = *sp; @@ -4109,23 +4123,28 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string_span(const vcf_format_string_spa return 0; } -static int vcf_plan_parse_float_vector_dynamic(const char **sp, float *out, int width) +/* + * Parse a dynamic-width float vector and report the number of values seen + * before padding the rest of the row with BCF vector-end markers. Returning + * the count avoids a second pass over the encoded floats, which is also safer + * than inferring width from sentinel values after conversion. 
+ */ +static int vcf_plan_parse_float_vector_dynamic_counted(const char **sp, + float *out, int width, + int *nread) { const char *s = *sp; int i = 0; - if (*s == ':' || *s == '\t' || *s == '\0') { - bcf_float_set_missing(out[i++]); - } else { - for (;;) { - if (i >= width || vcf_plan_float_value(&s, &out[i]) < 0) - return -1; - i++; - if (*s != ',') - break; - s++; - } + for (;;) { + if (i >= width || vcf_plan_float_value(&s, &out[i]) < 0) + return -1; + i++; + if (*s != ',') + break; + s++; } + *nread = i; for (; i < width; i++) bcf_float_set_vector_end(out[i]); *sp = s; @@ -4145,10 +4164,6 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible_range(const char **sp, i VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float *out) { - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - bcf_float_set_missing(*out); - return 0; - } return vcf_plan_float_value(sp, out); } @@ -4283,8 +4298,12 @@ static int vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t * return v->n_allele > 1 ? v->n_allele - 1 : 0; case BCF_VL_R: return v->n_allele; - case BCF_VL_G: - return v->n_allele * (v->n_allele + 1) / 2; + case BCF_VL_G: { + uint64_t n = (uint64_t) v->n_allele; + uint64_t width = n * (n + 1) / 2; + + return width > INT_MAX ? INT_MAX : (int) width; + } default: return 0; } @@ -4374,6 +4393,12 @@ static void vcf_format_compact_row_op(kstring_t *mem, int nsamples, width == 3 ? VCF_FORMAT_ROW_INT3 : VCF_FORMAT_ROW_INTN; } +/* + * Resolve FORMAT widths before execution. Fixed widths come from the header + * and current allele count; Type=String and Number=. numeric rows require a + * sample scan. Returns 0 for a usable plan, -4 for generic fallback, and -1 + * for allocation failure. 
+ */ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, const vcf_format_general_plan_t *plan, bcf1_t *v, char *q, int *widths, @@ -4489,8 +4514,6 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, } if (j > 0) w++; - if (w <= 0) - w = 1; if (w > VCF_FORMAT_MAX_STRING_WIDTH) { vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); return -4; @@ -4527,22 +4550,34 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, } for (j = 0; j < plan->n_ops; j++) if (plan->ops[j].measured_width) { - if (widths[j] <= 0) - widths[j] = 1; if (plan->ops[j].htype == BCF_HT_STR) { + if (widths[j] <= 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + return -4; + } if (widths[j] > VCF_FORMAT_MAX_STRING_WIDTH) { vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); return -4; } - } else if (widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { - vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); - return -4; + } else { + if (widths[j] <= 0) + widths[j] = 1; + if (widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } } } return 0; } +/* + * Execute a row-local FORMAT plan. Parsing proceeds sample-major because that + * matches the VCF text, then staged rows are encoded op-major to match BCF + * FORMAT layout. Returns 0 on success, -4 for generic fallback, and -1 on hard + * errors after rolling back any direct writes to v->indiv. 
+ */ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, @@ -4686,11 +4721,11 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, } break; case VCF_FORMAT_ROW_FLOATN: - if (vcf_plan_parse_float_vector_dynamic(&cur, (float *)buf, op->width) < 0) { + if (vcf_plan_parse_float_vector_dynamic_counted(&cur, (float *)buf, + op->width, &n) < 0) { vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); goto fallback; } - n = vcf_plan_float_vector_count((float *)buf, op->width); break; case VCF_FORMAT_ROW_STR: if (string_span_offsets && string_span_offsets[j] != (size_t)-1) { From 52f55073bdcf3c980f71a6fdc5f542d0f6e3e462 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 30 Apr 2026 22:28:47 +0200 Subject: [PATCH 34/38] Trim benchmark corpus tooling from product branch --- bench/format-shape/.gitignore | 9 - bench/format-shape/README.md | 400 ------------- bench/format-shape/inputs.tsv | 11 - .../large/bcftools-command-inputs.tsv | 6 - .../large/bcftools-full-ccdg-inputs.tsv | 2 - .../large/bcftools-giab-ccdg-inputs.tsv | 6 - .../large/bcftools-merge-inputs.tsv | 4 - bench/format-shape/large/inputs.tsv | 11 - bench/format-shape/large/threaded-inputs.tsv | 11 - .../scripts/make_large_synthetic.pl | 237 -------- bench/format-shape/scripts/make_synthetic.pl | 131 ----- .../scripts/run_bcftools_bench.sh | 74 --- .../scripts/run_bcftools_command_bench.sh | 176 ------ .../run_bcftools_command_bench_stream.sh | 164 ------ bench/format-shape/scripts/run_bench.sh | 66 --- .../format-shape/scripts/run_thread_bench.sh | 74 --- docs/FORMAT_PLAN_CURRENT.md | 556 ------------------ docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 517 ---------------- docs/FORMAT_PLAN_OVERVIEW.md | 129 ---- 19 files changed, 2584 deletions(-) delete mode 100644 bench/format-shape/.gitignore delete mode 100644 bench/format-shape/README.md delete mode 100644 bench/format-shape/inputs.tsv delete mode 100644 
bench/format-shape/large/bcftools-command-inputs.tsv delete mode 100644 bench/format-shape/large/bcftools-full-ccdg-inputs.tsv delete mode 100644 bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv delete mode 100644 bench/format-shape/large/bcftools-merge-inputs.tsv delete mode 100644 bench/format-shape/large/inputs.tsv delete mode 100644 bench/format-shape/large/threaded-inputs.tsv delete mode 100644 bench/format-shape/scripts/make_large_synthetic.pl delete mode 100755 bench/format-shape/scripts/make_synthetic.pl delete mode 100755 bench/format-shape/scripts/run_bcftools_bench.sh delete mode 100755 bench/format-shape/scripts/run_bcftools_command_bench.sh delete mode 100644 bench/format-shape/scripts/run_bcftools_command_bench_stream.sh delete mode 100755 bench/format-shape/scripts/run_bench.sh delete mode 100755 bench/format-shape/scripts/run_thread_bench.sh delete mode 100644 docs/FORMAT_PLAN_CURRENT.md delete mode 100644 docs/FORMAT_PLAN_EXPERIMENT_LOG.md delete mode 100644 docs/FORMAT_PLAN_OVERVIEW.md diff --git a/bench/format-shape/.gitignore b/bench/format-shape/.gitignore deleted file mode 100644 index b60d39628..000000000 --- a/bench/format-shape/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -public/*.vcf.gz -public/remote-indexes/*.tbi -synthetic/*.vcf.gz -large/**/*.vcf.gz -large/**/*.vcf.bgz -large/**/*.tbi -large/results/* -large/results-*/* -results/* diff --git a/bench/format-shape/README.md b/bench/format-shape/README.md deleted file mode 100644 index 2959bb250..000000000 --- a/bench/format-shape/README.md +++ /dev/null @@ -1,400 +0,0 @@ -# VCF FORMAT Shape Benchmark Corpus - -This directory is a local test and benchmark corpus for the experimental VCF -FORMAT planner in `vcf.c`. It is intentionally kept under the repository -worktree instead of `/tmp` so the inputs survive restarts. 
- -The canonical feature docs are: - -- `docs/FORMAT_PLAN_OVERVIEW.md` for the high-level feature summary; -- `docs/FORMAT_PLAN_CURRENT.md` for the current implementation and benchmark - tables; -- `docs/FORMAT_PLAN_EXPERIMENT_LOG.md` for the historical experiment log. - -## Layout - -```text -bench/format-shape/ - inputs.tsv input manifest used by the benchmark script - public/ downloaded public VCF slices - synthetic/ generated VCFs covering targeted FORMAT shapes - large/ meaningful multi-second benchmark inputs/results - scripts/make_synthetic.pl deterministic synthetic VCF generator - scripts/make_large_synthetic.pl - scripts/run_bench.sh baseline/plan timing and cmp runner - scripts/run_thread_bench.sh threaded timing and cmp runner - scripts/run_bcftools_bench.sh bcftools threaded timing runner - scripts/run_bcftools_command_bench.sh broader bcftools command runner - scripts/run_bcftools_command_bench_stream.sh checksum-only large-output runner - results/ generated timing logs and BCF outputs -``` - -The downloaded/generated VCF inputs and benchmark result files are intentionally -ignored by git to avoid accidentally pushing large local data. The manifests, -scripts, and docs are tracked; local data and timing summaries can be -regenerated from the commands below. - -`results/` can be regenerated at any time and may become large. The scripts -write timing/check summaries and keep BCF outputs locally so `cmp` checks are -inspectable, but `.gitignore` excludes those rerun artifacts. - -## Repo Tests - -The small correctness cases that should travel with the implementation now live -in the normal htslib test harness, not only in this benchmark directory. -`make check` runs `test_vcf_format_plan` inside `test/test.pl` plus -`test/test_format_plan_cache`. 
Those tests assert byte-identical planned output -at the parser-output level, selected-sample behavior, rollback after partial -planned parsing, malformed-input failure behavior, and header-cache generation -invalidation. Fallback reason counters remain local diagnostics for benchmark -analysis rather than production test assertions. - -The benchmark corpus remains for performance and production-shape coverage. It -should not become a normal test-suite dependency because several inputs are -large public VCFs or generated multi-second workloads. - -## Public Inputs - -The small `public/` and `synthetic/` inputs are smoke/correctness fixtures. They -are not large enough to provide stable timing signal except for the CCDG 10k -subset. Use `large/inputs.tsv` for optimization decisions. - -The public files were sliced with `tabix -h URL REGION | ./bgzip -c > file`. -They are small enough to keep in the worktree but diverse enough to catch -non-FORMAT and real-world INFO-heavy workloads. - -| File | Source | Shape | -|---|---|---| -| `public/ccdg_chr22_10k.vcf.gz` | local CCDG subset | 3,202-sample CCDG likelihood FORMAT | -| `public/1000g_chr22_genotypes_16050k_16150k.vcf.gz` | 1000 Genomes Phase 3 chr22 genotypes | sample-rich `GT` FORMAT | -| `public/1000g_wgs_sites_chr22_16050k_16300k.vcf.gz` | 1000 Genomes Phase 3 WGS sites | sites-only | -| `public/clinvar_grch38_chr22_16050k_20000k.vcf.gz` | ClinVar GRCh38 VCF | sites-only clinical annotations | -| `public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz` | gnomAD v4.1 exomes chr22 | sites-only, INFO-heavy | -| `large/public/giab/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz` | GIAB HG002 v4.2.1 | 4,048,342-record single-sample truth-set small variants | -| `large/public/giab/HG002_GRCh38_v5.0q_smvar.vcf.gz` | GIAB HG002 v5.0q GRCh38 | 5,945,525-record single-sample small variants | -| `large/public/giab/HG002_GRCh38_v5.0q_stvar.vcf.gz` | GIAB HG002 v5.0q GRCh38 | 6,268,852-record single-sample structural variants 
| -| `large/public/giab/HG002_CHM13v2.0_v5.0q_smvar.vcf.gz` | GIAB HG002 v5.0q CHM13v2.0 | 5,829,374-record single-sample small variants | - -Source URLs used: - -```text -https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz -https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz -https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz -https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr22.vcf.bgz -https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/NISTv4.2.1/GRCh38/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz -https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/v5.0q/HG002_GRCh38_v5.0q_smvar.vcf.gz -https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/v5.0q/HG002_GRCh38_v5.0q_stvar.vcf.gz -https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/v5.0q/HG002_CHM13v2.0_v5.0q_smvar.vcf.gz -``` - -The parent CCDG/1000G high-coverage chr22 file for -`public/ccdg_chr22_10k.vcf.gz` is: - -```text -https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -It is 26.0 GiB compressed. For local reruns, point the full-CCDG manifest at a -local copy such as: - -```text -/path/to/local/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -Do not run the normal output-materializing command harness on this file. A -single uncompressed BCF output reached 155 GiB before the run was interrupted. -Use the streaming checksum harness below instead. 
- -## Synthetic Inputs - -The synthetic files are generated by: - -```sh -bench/format-shape/scripts/make_synthetic.pl bench/format-shape/synthetic -for f in bench/format-shape/synthetic/*.vcf; do ./bgzip -f "$f"; done -``` - -They cover: - -- CCDG-like likelihood layouts with optional `AB` and `PGT/PID`, -- reordered likelihood fields, -- fixed numeric vectors, -- float-vector plus string FORMAT fields, -- multiallelic AD/PL likelihood rows. - -## Running - -Build the tools first: - -```sh -make test/test_view tabix bgzip -``` - -Run all inputs: - -```sh -bench/format-shape/scripts/run_bench.sh -``` - -Run only the meaningful large corpus: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -`KEEP_OUTPUTS=0` still writes temporary BCF files and compares them with `cmp`, -but deletes the large BCF outputs after each input is checked. - -Run the threaded scaling corpus: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded \ - bench/format-shape/scripts/run_thread_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -By default this runs unthreaded plus `-@ 2`, `-@ 4`, and `-@ 8`. Override with -`THREADS_LIST="2 4 8"` or a similar space-separated list. The threaded manifest -now mirrors the full large corpus so thread scaling is checked across the same -real and synthetic workload shapes as the primary benchmark. - -The script runs each input in two modes: - -```text -baseline: HTS_VCF_FORMAT_PLAN=0 -plan: HTS_VCF_FORMAT_PLAN=1 -``` - -It writes: - -```text -bench/format-shape/results/timings.tsv -bench/format-shape/results/checks.tsv -``` - -`checks.tsv` compares plan BCF output against baseline with `cmp`. -The threaded runner writes the same files under its selected output directory, -with an additional `threads` column. 
- -Run the same threaded corpus through bcftools: - -```sh -BCFTOOLS=/path/to/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ - bench/format-shape/scripts/run_bcftools_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -This uses `bcftools view --no-version -Ob -l 0`, compares planned output against -baseline with `cmp`, and records the same `0 2 4 8` thread counts by default. -It does not report planner counters because bcftools does not expose the -`test/test_view` stats hook. - -To exercise selected-sample parsing, set `SAMPLE_COUNT=N`. The runner queries -the first N samples from each input with `bcftools query -l` and passes them to -`bcftools view -s`; sites-only inputs have no sample list and run unchanged. - -```sh -BCFTOOLS=/path/to/bcftools SAMPLE_COUNT=2 \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ - bench/format-shape/scripts/run_bcftools_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -Run broader bcftools command shapes: - -```sh -BCFTOOLS=/path/to/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-commands \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-command-inputs.tsv -``` - -This runner is intended to be a bridge toward future tests. It runs each -command once with `HTS_VCF_FORMAT_PLAN=0` and once with -`HTS_VCF_FORMAT_PLAN=1`, then compares outputs with `cmp`. - -The default command set is: - -| Command | Purpose | Output check | -|---|---|---| -| `view_bcf` | Full `bcftools view --no-version -Ob -l 0` conversion. | Binary BCF `cmp`. | -| `view_sites` | `bcftools view -G` after dropping genotypes. | Binary BCF `cmp`. | -| `query_sites` | Fixed-column query that should not benefit from FORMAT parsing. | Text `cmp`. | -| `query_format` | Query `%GT` for the first `QUERY_SAMPLE_COUNT` samples. | Text `cmp`. | -| `stats` | `bcftools stats` over the input. | Text `cmp`. 
| -| `filter_gt` | `bcftools view -i 'GT="alt"'` for the first `QUERY_SAMPLE_COUNT` samples. | Binary BCF `cmp`. | -| `merge_self` | `bcftools merge --no-index --force-samples` of the input with itself. | Binary BCF `cmp`. | - -`query_format`, `filter_gt`, and `merge_self` are skipped for sites-only inputs. -By default the query/filter commands select two samples -(`QUERY_SAMPLE_COUNT=2`) to avoid generating enormous text output on cohort-scale -VCFs. Override with: - -```sh -COMMANDS="query_format stats" QUERY_SAMPLE_COUNT=8 THREADS_LIST="0 4" \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-command-inputs.tsv -``` - -The runner writes: - -```text -timings.tsv name, command, threads, mode, real/user/sys -checks.tsv baseline-vs-plan cmp status, including skipped_no_samples -commands.tsv command descriptions captured with the result directory -``` - -For very large inputs, use the streaming checksum variant. It runs the same -command families but pipes output through `cksum` and compares checksums instead -of storing complete BCF/text outputs: - -```sh -BCFTOOLS=/path/to/bcftools \ -OUTDIR=bench/format-shape/large/results-bcftools-full-ccdg-stream \ - bash bench/format-shape/scripts/run_bcftools_command_bench_stream.sh \ - bench/format-shape/large/bcftools-full-ccdg-inputs.tsv -``` - -The full CCDG chr22 streaming run wrote: - -```text -bench/format-shape/large/results-bcftools-full-ccdg-stream/timings.tsv -bench/format-shape/large/results-bcftools-full-ccdg-stream/checks.tsv -bench/format-shape/large/results-bcftools-full-ccdg-stream/checksums.tsv -``` - -All baseline-vs-plan checksums compared `ok`. 
- -| Command | Baseline real | Plan real | Real speedup | Baseline user | Plan user | User speedup | -|---|---:|---:|---:|---:|---:|---:| -| `view_bcf` | 678.46 s | 562.96 s | 1.21x | 476.41 s | 377.47 s | 1.26x | -| `view_sites` | 472.27 s | 403.28 s | 1.17x | 455.70 s | 386.18 s | 1.18x | -| `query_sites` | 71.44 s | 76.78 s | 0.93x | 67.02 s | 72.00 s | 0.93x | -| `query_format` | 124.14 s | 76.88 s | 1.61x | 119.16 s | 72.27 s | 1.65x | -| `stats` | 77.45 s | 77.12 s | 1.00x | 72.86 s | 72.55 s | 1.00x | -| `filter_gt` | 531.20 s | 453.21 s | 1.17x | 512.95 s | 434.35 s | 1.18x | - -For CI, the likely future shape is to keep one or two tiny inputs per command -and assert `checks.tsv` has only `ok` or expected `skipped_no_samples` rows. -The large corpus should remain a performance benchmark rather than a normal -test-suite dependency. - -Run the GIAB plus CCDG correctness/performance pass: - -```sh -BCFTOOLS=/path/to/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv -``` - -If using the sibling bcftools checkout in this workspace, build it against this -HTSlib checkout explicitly: - -```sh -cd ../bcftools-htslib-vcf-plan -make HTSDIR=../htslib-vcf-avx-sanity bcftools -``` - -This pass is primarily a production-shape correctness check. GIAB is -single-sample, so it does not show the large cohort speedups, but it does cover -real truth-set small-variant and structural-variant FORMAT details. The first -GIAB v5.0q run exposed a planned-path bug where `.|.` was serialized as `./.`. -The GT2 parser now preserves phased missing alleles, and the fixed rerun has -all baseline-vs-plan command outputs comparing `ok`. 
- -Latest hardened GIAB/CCDG command run: - -| Input | Command | Real speedup | User speedup | -|---|---|---:|---:| -| CCDG 10k | view_bcf | 1.14x | 1.14x | -| CCDG 10k | view_sites | 1.13x | 1.14x | -| CCDG 10k | query_format | 1.52x | 1.56x | -| CCDG 10k | filter_gt | 1.12x | 1.12x | -| GIAB HG002 GRCh38 v4.2.1 | view_bcf | 1.09x | 1.09x | -| GIAB HG002 GRCh38 v4.2.1 | query_format | 1.07x | 1.07x | -| GIAB HG002 GRCh38 v4.2.1 | filter_gt | 1.09x | 1.09x | -| GIAB HG002 GRCh38 v5.0q small variants | view_bcf | 1.09x | 1.09x | -| GIAB HG002 GRCh38 v5.0q small variants | query_format | 1.09x | 1.07x | -| GIAB HG002 GRCh38 v5.0q structural variants | view_bcf | 1.09x | 1.09x | -| GIAB HG002 GRCh38 v5.0q structural variants | query_format | 1.02x | 1.02x | -| GIAB HG002 CHM13 v5.0q small variants | view_bcf | 1.07x | 1.07x | -| GIAB HG002 CHM13 v5.0q small variants | query_format | 1.06x | 1.06x | - -`merge_self` is intentionally not in the default `COMMANDS` list because it can -produce very large outputs on cohort-scale inputs. Run it against the smaller -merge manifest: - -```sh -BCFTOOLS=/path/to/bcftools COMMANDS=merge_self \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-merge \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-merge-inputs.tsv -``` - -This is not a semantic recommendation to merge a file with itself in production; -it is a controlled benchmark shape. `--force-samples` creates distinct sample -names and `--no-index` avoids needing local tabix indexes for generated slices. - -The latest local merge run wrote: - -```text -bench/format-shape/large/results-bcftools-merge/timings.tsv -bench/format-shape/large/results-bcftools-merge/checks.tsv -``` - -All planned merge outputs compared byte-identical to baseline. 
The small 1000G -genotype input improved from 0.14 s to 0.10 s, the 1024-sample CCDG-like input -improved from 4.50 s to 4.33 s, and the 1024-sample float/string input was -unchanged at 2.69 s. - -## Large Corpus - -`large/inputs.tsv` currently contains: - -- the CCDG 10k subset, -- the full 1000 Genomes chr22 Phase 3 genotype VCF, -- eight generated 2,048-sample synthetic FORMAT workloads: - CCDG-like likelihood, reordered likelihood, multiallelic likelihood, - float/string FORMAT, variable phase-string widths, row-local likelihood - fallbacks, GT-first wrong-order likelihood-like rows, and two-string - float rows. - -`large/threaded-inputs.tsv` mirrors this full corpus for `-@` scaling checks. -`large/bcftools-command-inputs.tsv` is a smaller representative set for the -broader command benchmark: GT-only, real CCDG-like FORMAT, reordered FORMAT, -string/float negative control, and an INFO-heavy sites-only gnomAD slice. -`large/bcftools-merge-inputs.tsv` is smaller still, so merge output does not -explode during routine local benchmarks. - -To refresh only the newer cache-regression synthetic files without rewriting the -older large VCFs: - -```sh -SYNTHETIC_ONLY_NEW=1 \ - bench/format-shape/scripts/make_large_synthetic.pl \ - bench/format-shape/large/synthetic 2048 -``` - -The latest large run used this local output directory: - -```text -bench/format-shape/large/results-prod-hardening2/timings.tsv -bench/format-shape/large/results-prod-hardening2/checks.tsv -``` - -Generated result files are ignored; the summary below is the portable record. -All plan outputs in that run compared byte-identical to baseline. - -That run includes fallback reason diagnostics. In the CCDG 10k slice, the -planner hit 9,861 of 10,000 rows; the remaining 139 rows fell back for -`string_width`, meaning their measured string field exceeded the current -256-byte planned cap. 
- -One rejected optimization is recorded in -`bench/format-shape/large/results-opt-nosubset-split`: splitting the all-samples -loop from the `keep_samples` loop preserved correctness but slowed the planned -rows, so that code was reverted. diff --git a/bench/format-shape/inputs.tsv b/bench/format-shape/inputs.tsv deleted file mode 100644 index f5ae75eb9..000000000 --- a/bench/format-shape/inputs.tsv +++ /dev/null @@ -1,11 +0,0 @@ -name path source -ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset -1000g_chr22_genotypes bench/format-shape/public/1000g_chr22_genotypes_16050k_16150k.vcf.gz 1000 Genomes Phase 3 chr22 genotypes slice -1000g_wgs_sites bench/format-shape/public/1000g_wgs_sites_chr22_16050k_16300k.vcf.gz 1000 Genomes Phase 3 WGS sites-only slice -clinvar_grch38_chr22 bench/format-shape/public/clinvar_grch38_chr22_16050k_20000k.vcf.gz ClinVar GRCh38 chr22 slice -gnomad_v4.1_exomes_sites bench/format-shape/public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz gnomAD v4.1 exomes sites chr22 slice -synthetic_ccdg_likelihood bench/format-shape/synthetic/synthetic_ccdg_likelihood.vcf.gz synthetic CCDG-like likelihood FORMAT -synthetic_reordered_likelihood bench/format-shape/synthetic/synthetic_reordered_likelihood.vcf.gz synthetic reordered likelihood FORMAT -synthetic_fixed_numeric bench/format-shape/synthetic/synthetic_fixed_numeric.vcf.gz synthetic fixed numeric FORMAT -synthetic_float_string bench/format-shape/synthetic/synthetic_float_string.vcf.gz synthetic float and string FORMAT -synthetic_multiallelic_likelihood bench/format-shape/synthetic/synthetic_multiallelic_likelihood.vcf.gz synthetic multiallelic likelihood FORMAT diff --git a/bench/format-shape/large/bcftools-command-inputs.tsv b/bench/format-shape/large/bcftools-command-inputs.tsv deleted file mode 100644 index 62f1957e9..000000000 --- a/bench/format-shape/large/bcftools-command-inputs.tsv +++ /dev/null @@ -1,6 +0,0 @@ -name path source -ccdg_10k 
bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset, 10k records x 3,202 samples -1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF -large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples -large_float_string_2048s bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT negative-control shape, 16k records x 2,048 samples -gnomad_sites_chr22 bench/format-shape/public/gnomad_v4.1_exomes_sites_chr22_20000k_20100k.vcf.gz gnomAD v4.1 exomes chr22 sites-only INFO-heavy slice diff --git a/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv b/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv deleted file mode 100644 index 0c25507d6..000000000 --- a/bench/format-shape/large/bcftools-full-ccdg-inputs.tsv +++ /dev/null @@ -1,2 +0,0 @@ -name path source -ccdg_chr22_full /path/to/local/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz local full 1000G/CCDG high-coverage chr22 VCF, 3,202 samples diff --git a/bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv b/bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv deleted file mode 100644 index 852e684eb..000000000 --- a/bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv +++ /dev/null @@ -1,6 +0,0 @@ -name path source -ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG/1000G high-coverage chr22 slice, 10k records x 3,202 samples -giab_hg002_grch38_v421 bench/format-shape/large/public/giab/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz GIAB HG002 NIST v4.2.1 GRCh38 benchmark small variants -giab_hg002_grch38_v50q_smvar bench/format-shape/large/public/giab/HG002_GRCh38_v5.0q_smvar.vcf.gz GIAB HG002 v5.0q GRCh38 small variants -giab_hg002_grch38_v50q_stvar 
bench/format-shape/large/public/giab/HG002_GRCh38_v5.0q_stvar.vcf.gz GIAB HG002 v5.0q GRCh38 structural variants -giab_hg002_chm13_v50q_smvar bench/format-shape/large/public/giab/HG002_CHM13v2.0_v5.0q_smvar.vcf.gz GIAB HG002 v5.0q CHM13v2.0 small variants diff --git a/bench/format-shape/large/bcftools-merge-inputs.tsv b/bench/format-shape/large/bcftools-merge-inputs.tsv deleted file mode 100644 index 7764e0c05..000000000 --- a/bench/format-shape/large/bcftools-merge-inputs.tsv +++ /dev/null @@ -1,4 +0,0 @@ -name path source -small_1000g_genotypes bench/format-shape/public/1000g_chr22_genotypes_16050k_16150k.vcf.gz small 1000 Genomes GT slice used as a quick merge smoke case -large_ccdg_likelihood_1024s bench/format-shape/large/synthetic/large_ccdg_likelihood_1024s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 1,024 samples -large_float_string_1024s bench/format-shape/large/synthetic/large_float_string_1024s.vcf.gz synthetic float/string FORMAT negative-control shape, 16k records x 1,024 samples diff --git a/bench/format-shape/large/inputs.tsv b/bench/format-shape/large/inputs.tsv deleted file mode 100644 index 795882a7e..000000000 --- a/bench/format-shape/large/inputs.tsv +++ /dev/null @@ -1,11 +0,0 @@ -name path source -ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset, 10k records x 3,202 samples -1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF -large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likelihood_2048s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 2,048 samples -large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples -large_multiallelic_likelihood_2048s bench/format-shape/large/synthetic/large_multiallelic_likelihood_2048s.vcf.gz synthetic multiallelic likelihood FORMAT, 
16k records x 2,048 samples -large_float_string_2048s bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT, 16k records x 2,048 samples -large_phase_width_variation_2048s bench/format-shape/large/synthetic/large_phase_width_variation_2048s.vcf.gz synthetic likelihood FORMAT with variable PGT/PID widths, 12k records x 2,048 samples -large_mixed_likelihood_2048s bench/format-shape/large/synthetic/large_mixed_likelihood_2048s.vcf.gz synthetic likelihood FORMAT with row-local unsupported/wrong-width rows, 12k records x 2,048 samples -large_gt_first_reordered_2048s bench/format-shape/large/synthetic/large_gt_first_reordered_2048s.vcf.gz synthetic GT-first reordered non-shape likelihood FORMAT, 12k records x 2,048 samples -large_two_string_float_2048s bench/format-shape/large/synthetic/large_two_string_float_2048s.vcf.gz synthetic two-string float FORMAT, 12k records x 2,048 samples diff --git a/bench/format-shape/large/threaded-inputs.tsv b/bench/format-shape/large/threaded-inputs.tsv deleted file mode 100644 index 795882a7e..000000000 --- a/bench/format-shape/large/threaded-inputs.tsv +++ /dev/null @@ -1,11 +0,0 @@ -name path source -ccdg_10k bench/format-shape/public/ccdg_chr22_10k.vcf.gz local CCDG subset, 10k records x 3,202 samples -1000g_chr22_full_genotypes bench/format-shape/large/public/1000g_chr22_full_genotypes.vcf.gz 1000 Genomes Phase 3 full chr22 genotype VCF -large_ccdg_likelihood_2048s bench/format-shape/large/synthetic/large_ccdg_likelihood_2048s.vcf.gz synthetic CCDG-like likelihood FORMAT, 20k records x 2,048 samples -large_reordered_likelihood_2048s bench/format-shape/large/synthetic/large_reordered_likelihood_2048s.vcf.gz synthetic reordered likelihood FORMAT, 20k records x 2,048 samples -large_multiallelic_likelihood_2048s bench/format-shape/large/synthetic/large_multiallelic_likelihood_2048s.vcf.gz synthetic multiallelic likelihood FORMAT, 16k records x 2,048 samples -large_float_string_2048s 
bench/format-shape/large/synthetic/large_float_string_2048s.vcf.gz synthetic float/string FORMAT, 16k records x 2,048 samples -large_phase_width_variation_2048s bench/format-shape/large/synthetic/large_phase_width_variation_2048s.vcf.gz synthetic likelihood FORMAT with variable PGT/PID widths, 12k records x 2,048 samples -large_mixed_likelihood_2048s bench/format-shape/large/synthetic/large_mixed_likelihood_2048s.vcf.gz synthetic likelihood FORMAT with row-local unsupported/wrong-width rows, 12k records x 2,048 samples -large_gt_first_reordered_2048s bench/format-shape/large/synthetic/large_gt_first_reordered_2048s.vcf.gz synthetic GT-first reordered non-shape likelihood FORMAT, 12k records x 2,048 samples -large_two_string_float_2048s bench/format-shape/large/synthetic/large_two_string_float_2048s.vcf.gz synthetic two-string float FORMAT, 12k records x 2,048 samples diff --git a/bench/format-shape/scripts/make_large_synthetic.pl b/bench/format-shape/scripts/make_large_synthetic.pl deleted file mode 100644 index 785242b2d..000000000 --- a/bench/format-shape/scripts/make_large_synthetic.pl +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -my $outdir = shift @ARGV or die "usage: make_large_synthetic.pl OUTDIR [NSAMPLES]\n"; -my $nsamples = shift @ARGV || 2048; -my $scale = shift @ARGV || 1; -my @samples = map { "S$_" } 1..$nsamples; - -sub header { - my ($fh) = @_; - print $fh "##fileformat=VCFv4.3\n"; - print $fh "##contig=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", join("\t", @samples), "\n"; -} - -sub open_vcf { - my ($name) = @_; - open my $fh, ">", 
"$outdir/$name.vcf" or die "$outdir/$name.vcf: $!\n"; - header($fh); - return $fh; -} - -sub genotype { - my ($i, $s, $n_alt) = @_; - return "./." if (($i + $s) % 97) == 0; - return "0/0" if (($i + $s) % 5) == 0; - return "1|1" if $n_alt == 1 && (($i + $s) % 23) == 0; - return $n_alt > 1 && (($i + $s) % 7) == 0 ? "1/2" : "0/1"; -} - -sub ad { - my ($i, $s, $n_allele) = @_; - return "." if (($i + $s) % 131) == 0; - return join(",", map { (($i * 3 + $s * 5 + $_ * 7) % 120) } 0..($n_allele - 1)); -} - -sub pl { - my ($i, $s, $n_allele) = @_; - return "." if (($i + $s) % 137) == 0; - my $n = $n_allele * ($n_allele + 1) / 2; - return join(",", map { (($i + $s + $_) * 13) % 700 } 0..($n - 1)); -} - -sub gl { - my ($i, $s, $n_allele) = @_; - return "." if (($i + $s) % 127) == 0; - my $n = $n_allele * ($n_allele + 1) / 2; - return join(",", map { sprintf("%.2f", -1 * ((($i + $s + $_) % 30) / 4.0)) } 0..($n - 1)); -} - -sub write_ccdg_like { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 21000000 + $i; - my $phase = $i % 2 == 0; - my $fmt = $phase ? "GT:AB:AD:DP:GQ:PGT:PID:PL" : "GT:AB:AD:DP:GQ:PL"; - my @vals; - for my $s (0..$#samples) { - my $gt = genotype($i, $s, 1); - my $ab = $gt eq "0/1" ? sprintf("%.2f", (($i + $s) % 90) / 100) : "."; - my $base = join(":", $gt, $ab, ad($i, $s, 2), (($i+$s)%160), (($i+$s)%99)); - if ($phase) { - push @vals, join(":", $base, ($gt =~ /\|/ ? 
$gt : "0|1"), "${pos}_A_T", pl($i, $s, 2)); - } else { - push @vals, join(":", $base, pl($i, $s, 2)); - } - } - print $fh join("\t", "chr22", $pos, ".", "A", "T", 50, "PASS", ".", $fmt, @vals), "\n"; - } - close $fh; -} - -sub write_reordered { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 22000000 + $i; - my @vals; - for my $s (0..$#samples) { - push @vals, join(":", (($i+$s)%160), (($i+$s)%99), genotype($i, $s, 1), ad($i, $s, 2), pl($i, $s, 2)); - } - print $fh join("\t", "chr22", $pos, ".", "G", "C", 50, "PASS", ".", "DP:GQ:GT:AD:PL", @vals), "\n"; - } - close $fh; -} - -sub write_multiallelic { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 23000000 + $i; - my $n_alt = 1 + ($i % 3); - my @alts = qw(C G T); - my $alt = join(",", @alts[0..($n_alt - 1)]); - my @vals; - for my $s (0..$#samples) { - push @vals, join(":", genotype($i, $s, $n_alt), ad($i, $s, $n_alt + 1), (($i+$s)%160), (($i+$s)%99), pl($i, $s, $n_alt + 1)); - } - print $fh join("\t", "chr22", $pos, ".", "A", $alt, 50, "PASS", ".", "GT:AD:DP:GQ:PL", @vals), "\n"; - } - close $fh; -} - -sub write_float_string { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 24000000 + $i; - my @vals; - for my $s (0..$#samples) { - my $ft = (($i+$s)%17) == 0 ? "LowQual" : "PASS"; - push @vals, join(":", genotype($i, $s, 2), gl($i, $s, 3), $ft, (($i+$s)%160), (($i+$s)%99)); - } - print $fh join("\t", "chr22", $pos, ".", "A", "C,G", 50, "PASS", ".", "GT:GL:FT:DP:GQ", @vals), "\n"; - } - close $fh; -} - -sub write_phase_width_variation { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 25000000 + $i; - my @vals; - for my $s (0..$#samples) { - my $gt = genotype($i, $s, 1); - my $pgt = (($i + $s) % 29) == 0 ? "." : ($gt =~ /\|/ ? 
$gt : "0|1"); - my $pid; - if (($i + $s) % 31 == 0) { - $pid = "."; - } elsif (($i + $s) % 7 == 0) { - $pid = "${pos}_${s}_A_T_LONG_PHASE_SET"; - } elsif (($i + $s) % 5 == 0) { - $pid = "${pos}_A_T"; - } else { - $pid = "P" . (($i + $s) % 97); - } - push @vals, join(":", $gt, ad($i, $s, 2), (($i+$s)%160), - (($i+$s)%99), $pgt, $pid, pl($i, $s, 2)); - } - print $fh join("\t", "chr22", $pos, ".", "A", "T", 50, "PASS", ".", - "GT:AD:DP:GQ:PGT:PID:PL", @vals), "\n"; - } - close $fh; -} - -sub write_mixed_likelihood { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 26000000 + $i; - my $n_alt = ($i % 17) == 0 ? 8 : (($i % 11) == 0 ? 2 : 1); - my @alts = qw(C G T AA AC AG AT GA); - my $alt = join(",", @alts[0..($n_alt - 1)]); - my $n_allele = $n_alt + 1; - my @vals; - for my $s (0..$#samples) { - my $ad = ad($i, $s, $n_allele); - my $pl = pl($i, $s, $n_allele); - if (($i % 19) == 0 && $ad ne ".") { - my @ad = split /,/, $ad; - pop @ad; - $ad = join(",", @ad); - } - if (($i % 23) == 0 && $pl ne ".") { - my @pl = split /,/, $pl; - pop @pl; - $pl = join(",", @pl); - } - push @vals, join(":", genotype($i, $s, $n_alt), $ad, - (($i+$s)%160), (($i+$s)%99), $pl); - } - print $fh join("\t", "chr22", $pos, ".", "A", $alt, 50, "PASS", ".", - "GT:AD:DP:GQ:PL", @vals), "\n"; - } - close $fh; -} - -sub write_gt_first_reordered { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 27000000 + $i; - my @vals; - for my $s (0..$#samples) { - push @vals, join(":", genotype($i, $s, 1), (($i+$s)%160), - ad($i, $s, 2), (($i+$s)%99), pl($i, $s, 2)); - } - print $fh join("\t", "chr22", $pos, ".", "G", "C", 50, "PASS", ".", - "GT:DP:AD:GQ:PL", @vals), "\n"; - } - close $fh; -} - -sub write_two_string_float { - my ($name, $records) = @_; - my $fh = open_vcf($name); - for my $i (1..$records) { - my $pos = 28000000 + $i; - my @vals; - for my $s (0..$#samples) { - my $ft = (($i+$s)%17) == 0 ? 
"LowQual" : "PASS"; - my $pid = (($i+$s)%13) == 0 ? "." : "PS" . (($i * 11 + $s) % 100000); - push @vals, join(":", genotype($i, $s, 2), $ft, $pid, - gl($i, $s, 3), (($i+$s)%160)); - } - print $fh join("\t", "chr22", $pos, ".", "A", "C,G", 50, "PASS", ".", - "GT:FT:PID:GL:DP", @vals), "\n"; - } - close $fh; -} - -unless ($ENV{SYNTHETIC_ONLY_NEW}) { - write_ccdg_like("large_ccdg_likelihood_${nsamples}s", 20000 * $scale); - write_reordered("large_reordered_likelihood_${nsamples}s", 20000 * $scale); - write_multiallelic("large_multiallelic_likelihood_${nsamples}s", 16000 * $scale); - write_float_string("large_float_string_${nsamples}s", 16000 * $scale); -} -write_phase_width_variation("large_phase_width_variation_${nsamples}s", 12000 * $scale); -write_mixed_likelihood("large_mixed_likelihood_${nsamples}s", 12000 * $scale); -write_gt_first_reordered("large_gt_first_reordered_${nsamples}s", 12000 * $scale); -write_two_string_float("large_two_string_float_${nsamples}s", 12000 * $scale); diff --git a/bench/format-shape/scripts/make_synthetic.pl b/bench/format-shape/scripts/make_synthetic.pl deleted file mode 100755 index 5266b4dc0..000000000 --- a/bench/format-shape/scripts/make_synthetic.pl +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -my $outdir = shift @ARGV or die "usage: make_synthetic.pl OUTDIR\n"; -my @samples = map { "S$_" } 1..8; - -sub header { - my ($fh) = @_; - print $fh "##fileformat=VCFv4.3\n"; - print $fh "##contig=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "##FORMAT=\n"; - print $fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", join("\t", @samples), "\n"; -} - -sub open_vcf { - my ($name) = @_; - open my $fh, ">", 
"$outdir/$name.vcf" or die "$outdir/$name.vcf: $!\n"; - header($fh); - return $fh; -} - -sub genotype { - my ($i, $s, $n_alt) = @_; - return "./." if (($i + $s) % 29) == 0; - return "0/0" if (($i + $s) % 5) == 0; - return "1|1" if $n_alt == 1 && (($i + $s) % 11) == 0; - return $n_alt > 1 && (($i + $s) % 7) == 0 ? "1/2" : "0/1"; -} - -sub ad { - my ($i, $s, $n_allele) = @_; - return "." if (($i + $s) % 37) == 0; - return join(",", map { (($i * 3 + $s * 5 + $_ * 7) % 40) } 0..($n_allele - 1)); -} - -sub pl { - my ($i, $s, $n_allele) = @_; - return "." if (($i + $s) % 41) == 0; - my $n = $n_allele * ($n_allele + 1) / 2; - return join(",", map { (($i + $s + $_) * 13) % 500 } 0..($n - 1)); -} - -sub gl { - my ($i, $s, $n_allele) = @_; - return "." if (($i + $s) % 31) == 0; - my $n = $n_allele * ($n_allele + 1) / 2; - return join(",", map { sprintf("%.2f", -1 * ((($i + $s + $_) % 20) / 3.0)) } 0..($n - 1)); -} - -my $fh = open_vcf("synthetic_ccdg_likelihood"); -for my $i (1..2000) { - my $pos = 20000000 + $i; - my $phase = $i % 2 == 0; - my $fmt = $phase ? "GT:AB:AD:DP:GQ:PGT:PID:PL" : "GT:AB:AD:DP:GQ:PL"; - my @vals; - for my $s (0..$#samples) { - my $gt = genotype($i, $s, 1); - my $ab = $gt eq "0/1" ? sprintf("%.2f", (($i + $s) % 90) / 100) : "."; - my $base = join(":", $gt, $ab, ad($i, $s, 2), (($i+$s)%80), (($i+$s)%99)); - if ($phase) { - push @vals, join(":", $base, ($gt =~ /\|/ ? 
$gt : "0|1"), "${pos}_A_T", pl($i, $s, 2)); - } else { - push @vals, join(":", $base, pl($i, $s, 2)); - } - } - print $fh join("\t", "chr22", $pos, ".", "A", "T", 50, "PASS", ".", $fmt, @vals), "\n"; -} -close $fh; - -$fh = open_vcf("synthetic_reordered_likelihood"); -for my $i (1..2000) { - my $pos = 20100000 + $i; - my @vals; - for my $s (0..$#samples) { - push @vals, join(":", (($i+$s)%80), (($i+$s)%99), genotype($i, $s, 1), ad($i, $s, 2), pl($i, $s, 2)); - } - print $fh join("\t", "chr22", $pos, ".", "G", "C", 50, "PASS", ".", "DP:GQ:GT:AD:PL", @vals), "\n"; -} -close $fh; - -$fh = open_vcf("synthetic_fixed_numeric"); -for my $i (1..2000) { - my $pos = 20200000 + $i; - my @vals; - for my $s (0..$#samples) { - my $hq = (($i+$s)%150) . "," . (($i+$s+9)%150); - my $sb = join(",", map { ($i + $s + $_) % 30 } 0..3); - push @vals, join(":", genotype($i, $s, 1), $hq, (($i+$s)%60), $sb); - } - print $fh join("\t", "chr22", $pos, ".", "C", "A", 50, "PASS", ".", "GT:HQ:MIN_DP:SB", @vals), "\n"; -} -close $fh; - -$fh = open_vcf("synthetic_float_string"); -for my $i (1..2000) { - my $pos = 20300000 + $i; - my @vals; - for my $s (0..$#samples) { - my $ft = (($i+$s)%13) == 0 ? 
"LowQual" : "PASS"; - push @vals, join(":", genotype($i, $s, 2), gl($i, $s, 3), $ft, (($i+$s)%80), (($i+$s)%99)); - } - print $fh join("\t", "chr22", $pos, ".", "A", "C,G", 50, "PASS", ".", "GT:GL:FT:DP:GQ", @vals), "\n"; -} -close $fh; - -$fh = open_vcf("synthetic_multiallelic_likelihood"); -for my $i (1..1200) { - my $pos = 20400000 + $i; - my $n_alt = 1 + ($i % 3); - my @alts = qw(C G T); - my $alt = join(",", @alts[0..($n_alt - 1)]); - my @vals; - for my $s (0..$#samples) { - push @vals, join(":", genotype($i, $s, $n_alt), ad($i, $s, $n_alt + 1), (($i+$s)%90), (($i+$s)%99), pl($i, $s, $n_alt + 1)); - } - print $fh join("\t", "chr22", $pos, ".", "A", $alt, 50, "PASS", ".", "GT:AD:DP:GQ:PL", @vals), "\n"; -} -close $fh; diff --git a/bench/format-shape/scripts/run_bcftools_bench.sh b/bench/format-shape/scripts/run_bcftools_bench.sh deleted file mode 100755 index 520690690..000000000 --- a/bench/format-shape/scripts/run_bcftools_bench.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/sh -set -eu - -bcftools=${BCFTOOLS:-bcftools} -inputs=${1:-bench/format-shape/large/threaded-inputs.tsv} -outdir=${OUTDIR:-bench/format-shape/large/results-bcftools} -keep_outputs=${KEEP_OUTPUTS:-1} -threads_list=${THREADS_LIST:-0 2 4 8} -sample_count=${SAMPLE_COUNT:-0} -mkdir -p "$outdir" - -timings="$outdir/timings.tsv" -checks="$outdir/checks.tsv" - -printf 'name\tthreads\tmode\treal\tuser\tsys\n' > "$timings" -printf 'name\tthreads\tcomparison\tstatus\n' > "$checks" - -tail -n +2 "$inputs" | while IFS=' ' read -r name path source -do - sample_args= - if [ "$sample_count" != 0 ]; then - samples=$("$bcftools" query -l "$path" | awk -v n="$sample_count" ' - NR <= n { if (s) s = s "," $0; else s = $0 } - END { print s } - ') - if [ -n "$samples" ]; then - sample_args="-s $samples" - fi - fi - for threads in $threads_list - do - base_out="$outdir/$name.t$threads.baseline.bcf" - plan_out="$outdir/$name.t$threads.plan.bcf" - thread_args= - if [ "$threads" != 0 ]; then - thread_args="--threads 
$threads" - fi - - for mode in baseline plan - do - err="$outdir/$name.t$threads.$mode.stderr" - out="$outdir/$name.t$threads.$mode.bcf" - case "$mode" in - baseline) - env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$bcftools" view --no-version -Ob -l 0 $thread_args $sample_args -o "$out" "$path" 2> "$err" - ;; - plan) - env HTS_VCF_FORMAT_PLAN=1 /usr/bin/time -p "$bcftools" view --no-version -Ob -l 0 $thread_args $sample_args -o "$out" "$path" 2> "$err" - ;; - esac - - awk -v name="$name" -v threads="$threads" -v mode="$mode" ' - /^real / { real=$2 } - /^user / { user=$2 } - /^sys / { sys=$2 } - END { - printf "%s\t%s\t%s\t%s\t%s\t%s\n", - name, threads, mode, real+0, user+0, sys+0 - } - ' "$err" >> "$timings" - done - - if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then - printf '%s\t%s\tbaseline_vs_plan\tok\n' "$name" "$threads" >> "$checks" - else - printf '%s\t%s\tbaseline_vs_plan\tDIFF\n' "$name" "$threads" >> "$checks" - fi - if [ "$keep_outputs" = 0 ]; then - rm -f "$base_out" "$plan_out" - fi - done -done - -printf 'wrote %s and %s\n' "$timings" "$checks" diff --git a/bench/format-shape/scripts/run_bcftools_command_bench.sh b/bench/format-shape/scripts/run_bcftools_command_bench.sh deleted file mode 100755 index db84b0aa4..000000000 --- a/bench/format-shape/scripts/run_bcftools_command_bench.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/sh -set -eu - -# Broader production-style command benchmark for the VCF FORMAT planner. -# -# The conversion benchmark in run_bcftools_bench.sh measures one important -# path: VCF text -> BCF output via `bcftools view`. 
This script intentionally -# exercises a wider set of bcftools command shapes so we can see which workflows -# actually expose FORMAT parse cost: -# -# view_bcf full VCF -> BCF conversion -# view_sites VCF -> BCF after dropping genotypes with -G -# query_sites fixed-column/INFO-oriented query -# query_format FORMAT accessor query for a small sample subset -# stats bcftools stats -# filter_gt FORMAT expression filtering for a small sample subset -# merge_self bcftools merge of the input with itself using --force-samples -# -# Each command is run twice, once with HTS_VCF_FORMAT_PLAN=0 and once with -# HTS_VCF_FORMAT_PLAN=1. Outputs are compared with cmp whenever the command is -# applicable. FORMAT commands are skipped for sites-only inputs. -# -# Keep the default THREADS_LIST narrow here. This harness multiplies inputs by -# commands by planner modes, so exhaustive thread scaling belongs in the -# dedicated threaded runner unless a specific command needs investigation. - -bcftools=${BCFTOOLS:-bcftools} -inputs=${1:-bench/format-shape/large/bcftools-command-inputs.tsv} -outdir=${OUTDIR:-bench/format-shape/large/results-bcftools-commands} -keep_outputs=${KEEP_OUTPUTS:-1} -threads_list=${THREADS_LIST:-0} -commands=${COMMANDS:-view_bcf view_sites query_sites query_format stats filter_gt} -query_sample_count=${QUERY_SAMPLE_COUNT:-2} -mkdir -p "$outdir" - -timings="$outdir/timings.tsv" -checks="$outdir/checks.tsv" -cmds_out="$outdir/commands.tsv" - -printf 'name\tcommand\tthreads\tmode\treal\tuser\tsys\n' > "$timings" -printf 'name\tcommand\tthreads\tcomparison\tstatus\n' > "$checks" -printf 'command\tdescription\n' > "$cmds_out" -printf 'view_bcf\tbcftools view --no-version -Ob -l 0\n' >> "$cmds_out" -printf 'view_sites\tbcftools view --no-version -G -Ob -l 0\n' >> "$cmds_out" -printf 'query_sites\tbcftools query fixed site fields\n' >> "$cmds_out" -printf 'query_format\tbcftools query GT for first QUERY_SAMPLE_COUNT samples\n' >> "$cmds_out" -printf 'stats\tbcftools 
stats\n' >> "$cmds_out" -printf 'filter_gt\tbcftools view -i GT="alt" for first QUERY_SAMPLE_COUNT samples\n' >> "$cmds_out" -printf 'merge_self\tbcftools merge --no-index --force-samples of the input with itself\n' >> "$cmds_out" - -run_one() -{ - mode=$1 - command=$2 - threads=$3 - path=$4 - out=$5 - err=$6 - sample_args=$7 - thread_args= - plan=0 - - if [ "$mode" = plan ]; then - plan=1 - fi - if [ "$threads" != 0 ]; then - thread_args="--threads $threads" - fi - - case "$command" in - view_bcf) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" view --no-version -Ob -l 0 $thread_args \ - -o "$out" "$path" 2> "$err" - ;; - view_sites) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" view --no-version -G -Ob -l 0 $thread_args \ - -o "$out" "$path" 2> "$err" - ;; - query_sites) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" query -f '%CHROM\t%POS\t%REF\t%ALT\n' \ - "$path" > "$out" 2> "$err" - ;; - query_format) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" query $sample_args -f '%CHROM\t%POS[\t%GT]\n' \ - "$path" > "$out" 2> "$err" - ;; - stats) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" stats "$path" > "$out" 2> "$err" - ;; - filter_gt) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" view --no-version -Ob -l 0 $thread_args $sample_args \ - -i 'GT="alt"' -o "$out" "$path" 2> "$err" - ;; - merge_self) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" merge --no-index --force-samples --no-version -Ob \ - $thread_args -o "$out" "$path" "$path" 2> "$err" - ;; - *) - printf 'unknown command: %s\n' "$command" >&2 - return 1 - ;; - esac -} - -tail -n +2 "$inputs" | while IFS=' ' read -r name path source -do - samples=$("$bcftools" query -l "$path" | awk -v n="$query_sample_count" ' - NR <= n { if (s) s = s "," $0; else s = $0 } - END { print s } - ') - sample_args= - if [ -n "$samples" ]; then - sample_args="-s $samples" - fi - - for command in 
$commands - do - case "$command" in - query_format|filter_gt|merge_self) - if [ -z "$sample_args" ]; then - for threads in $threads_list - do - printf '%s\t%s\t%s\tbaseline_vs_plan\tskipped_no_samples\n' \ - "$name" "$command" "$threads" >> "$checks" - done - continue - fi - ;; - esac - - for threads in $threads_list - do - base_out="$outdir/$name.$command.t$threads.baseline.out" - plan_out="$outdir/$name.$command.t$threads.plan.out" - - for mode in baseline plan - do - err="$outdir/$name.$command.t$threads.$mode.stderr" - out="$outdir/$name.$command.t$threads.$mode.out" - run_one "$mode" "$command" "$threads" "$path" "$out" "$err" "$sample_args" - - awk -v name="$name" -v command="$command" \ - -v threads="$threads" -v mode="$mode" ' - /^real / { real=$2 } - /^user / { user=$2 } - /^sys / { sys=$2 } - END { - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - name, command, threads, mode, - real+0, user+0, sys+0 - } - ' "$err" >> "$timings" - done - - if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then - printf '%s\t%s\t%s\tbaseline_vs_plan\tok\n' \ - "$name" "$command" "$threads" >> "$checks" - else - printf '%s\t%s\t%s\tbaseline_vs_plan\tDIFF\n' \ - "$name" "$command" "$threads" >> "$checks" - fi - if [ "$keep_outputs" = 0 ]; then - rm -f "$base_out" "$plan_out" - fi - done - done -done - -printf 'wrote %s, %s, and %s\n' "$timings" "$checks" "$cmds_out" diff --git a/bench/format-shape/scripts/run_bcftools_command_bench_stream.sh b/bench/format-shape/scripts/run_bcftools_command_bench_stream.sh deleted file mode 100644 index 7f1a84438..000000000 --- a/bench/format-shape/scripts/run_bcftools_command_bench_stream.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Streaming variant of run_bcftools_command_bench.sh for very large VCFs. -# It runs the same command families but pipes command output through cksum, -# avoiding temporary BCF/text outputs that can be hundreds of GiB on full -# cohort chromosomes. Baseline and planned checksums are compared. 
- -bcftools=${BCFTOOLS:-bcftools} -inputs=${1:-bench/format-shape/large/bcftools-command-inputs.tsv} -outdir=${OUTDIR:-bench/format-shape/large/results-bcftools-commands-stream} -threads_list=${THREADS_LIST:-0} -commands=${COMMANDS:-view_bcf view_sites query_sites query_format stats filter_gt} -query_sample_count=${QUERY_SAMPLE_COUNT:-2} -mkdir -p "$outdir" - -timings="$outdir/timings.tsv" -checks="$outdir/checks.tsv" -cmds_out="$outdir/commands.tsv" -checksums="$outdir/checksums.tsv" - -printf 'name\tcommand\tthreads\tmode\treal\tuser\tsys\n' > "$timings" -printf 'name\tcommand\tthreads\tcomparison\tstatus\n' > "$checks" -printf 'name\tcommand\tthreads\tmode\tcksum\tbytes\n' > "$checksums" -printf 'command\tdescription\n' > "$cmds_out" -printf 'view_bcf\tbcftools view --no-version -Ob -l 0 streamed to cksum\n' >> "$cmds_out" -printf 'view_sites\tbcftools view --no-version -G -Ob -l 0 streamed to cksum\n' >> "$cmds_out" -printf 'query_sites\tbcftools query fixed site fields streamed to cksum\n' >> "$cmds_out" -printf 'query_format\tbcftools query GT for first QUERY_SAMPLE_COUNT samples streamed to cksum\n' >> "$cmds_out" -printf 'stats\tbcftools stats streamed to cksum\n' >> "$cmds_out" -printf 'filter_gt\tbcftools view -i GT="alt" for first QUERY_SAMPLE_COUNT samples streamed to cksum\n' >> "$cmds_out" -printf 'merge_self\tbcftools merge --no-index --force-samples streamed to cksum\n' >> "$cmds_out" - -run_one() -{ - local mode=$1 - local command=$2 - local threads=$3 - local path=$4 - local sum_out=$5 - local err=$6 - local sample_args=$7 - local plan=0 - local thread_args= - - if [ "$mode" = plan ]; then - plan=1 - fi - if [ "$threads" != 0 ]; then - thread_args="--threads $threads" - fi - - case "$command" in - view_bcf) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" view --no-version -Ob -l 0 $thread_args \ - -o - "$path" 2> "$err" | cksum > "$sum_out" - ;; - view_sites) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" view 
--no-version -G -Ob -l 0 $thread_args \ - -o - "$path" 2> "$err" | cksum > "$sum_out" - ;; - query_sites) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" query -f '%CHROM\t%POS\t%REF\t%ALT\n' \ - "$path" 2> "$err" | cksum > "$sum_out" - ;; - query_format) - # shellcheck disable=SC2086 - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" query $sample_args -f '%CHROM\t%POS[\t%GT]\n' \ - "$path" 2> "$err" | cksum > "$sum_out" - ;; - stats) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" stats "$path" 2> "$err" | cksum > "$sum_out" - ;; - filter_gt) - # shellcheck disable=SC2086 - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" view --no-version -Ob -l 0 $thread_args \ - $sample_args -i 'GT="alt"' -o - "$path" 2> "$err" | cksum > "$sum_out" - ;; - merge_self) - env HTS_VCF_FORMAT_PLAN=$plan /usr/bin/time -p \ - "$bcftools" merge --no-index --force-samples --no-version -Ob \ - $thread_args -o - "$path" "$path" 2> "$err" | cksum > "$sum_out" - ;; - *) - printf 'unknown command: %s\n' "$command" >&2 - return 1 - ;; - esac -} - -tail -n +2 "$inputs" | while IFS=$'\t' read -r name path source -do - samples=$("$bcftools" query -l "$path" | awk -v n="$query_sample_count" ' - NR <= n { if (s) s = s "," $0; else s = $0 } - END { print s } - ') - sample_args= - if [ -n "$samples" ]; then - sample_args="-s $samples" - fi - - for command in $commands - do - case "$command" in - query_format|filter_gt|merge_self) - if [ -z "$sample_args" ]; then - for threads in $threads_list - do - printf '%s\t%s\t%s\tbaseline_vs_plan\tskipped_no_samples\n' \ - "$name" "$command" "$threads" >> "$checks" - done - continue - fi - ;; - esac - - for threads in $threads_list - do - base_sum="$outdir/$name.$command.t$threads.baseline.cksum" - plan_sum="$outdir/$name.$command.t$threads.plan.cksum" - - for mode in baseline plan - do - err="$outdir/$name.$command.t$threads.$mode.stderr" - sum="$outdir/$name.$command.t$threads.$mode.cksum" - 
run_one "$mode" "$command" "$threads" "$path" "$sum" "$err" "$sample_args" - - awk -v name="$name" -v command="$command" \ - -v threads="$threads" -v mode="$mode" ' - /^real / { real=$2 } - /^user / { user=$2 } - /^sys / { sys=$2 } - END { - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - name, command, threads, mode, - real+0, user+0, sys+0 - } - ' "$err" >> "$timings" - - awk -v name="$name" -v command="$command" \ - -v threads="$threads" -v mode="$mode" ' - { printf "%s\t%s\t%s\t%s\t%s\t%s\n", - name, command, threads, mode, $1, $2 } - ' "$sum" >> "$checksums" - done - - if cmp "$base_sum" "$plan_sum" >/dev/null 2>&1; then - printf '%s\t%s\t%s\tbaseline_vs_plan\tok\n' \ - "$name" "$command" "$threads" >> "$checks" - else - printf '%s\t%s\t%s\tbaseline_vs_plan\tDIFF\n' \ - "$name" "$command" "$threads" >> "$checks" - fi - done - done -done - -printf 'wrote %s, %s, %s, and %s\n' "$timings" "$checks" "$checksums" "$cmds_out" diff --git a/bench/format-shape/scripts/run_bench.sh b/bench/format-shape/scripts/run_bench.sh deleted file mode 100755 index 66430d22a..000000000 --- a/bench/format-shape/scripts/run_bench.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/sh -set -eu - -test_view=${TEST_VIEW:-./test/test_view} -inputs=${1:-bench/format-shape/inputs.tsv} -outdir=${OUTDIR:-bench/format-shape/results} -keep_outputs=${KEEP_OUTPUTS:-1} -mkdir -p "$outdir" - -timings="$outdir/timings.tsv" -checks="$outdir/checks.tsv" - -printf 'name\tmode\treal\tuser\tsys\tattempts\thits\tfallback\tparsed_samples\n' > "$timings" -printf 'name\tcomparison\tstatus\n' > "$checks" - -tail -n +2 "$inputs" | while IFS=' ' read -r name path source -do - base_out="$outdir/$name.baseline.bcf" - plan_out="$outdir/$name.plan.bcf" - - for mode in baseline plan - do - err="$outdir/$name.$mode.stderr" - out="$outdir/$name.$mode.bcf" - case "$mode" in - baseline) - env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" - ;; - plan) - env HTS_VCF_FORMAT_PLAN=1 
HTS_VCF_FORMAT_PLAN_STATS=1 \ - /usr/bin/time -p "$test_view" -b -l 0 "$path" > "$out" 2> "$err" - ;; - esac - - awk -v name="$name" -v mode="$mode" ' - /^real / { real=$2 } - /^user / { user=$2 } - /^sys / { sys=$2 } - /^vcf-format-plan / { - for (i=1; i<=NF; i++) { - split($i, kv, "=") - if (kv[1] == "attempts") attempts=kv[2] - else if (kv[1] == "hits") hits=kv[2] - else if (kv[1] == "fallback") fallback=kv[2] - else if (kv[1] == "parsed_samples") parsed=kv[2] - } - } - END { - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - name, mode, real+0, user+0, sys+0, - attempts+0, hits+0, fallback+0, parsed+0 - } - ' "$err" >> "$timings" - done - - if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then - printf '%s\tbaseline_vs_plan\tok\n' "$name" >> "$checks" - else - printf '%s\tbaseline_vs_plan\tDIFF\n' "$name" >> "$checks" - fi - if [ "$keep_outputs" = 0 ]; then - rm -f "$base_out" "$plan_out" - fi -done - -printf 'wrote %s and %s\n' "$timings" "$checks" diff --git a/bench/format-shape/scripts/run_thread_bench.sh b/bench/format-shape/scripts/run_thread_bench.sh deleted file mode 100755 index dc0934467..000000000 --- a/bench/format-shape/scripts/run_thread_bench.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/sh -set -eu - -test_view=${TEST_VIEW:-./test/test_view} -inputs=${1:-bench/format-shape/large/threaded-inputs.tsv} -outdir=${OUTDIR:-bench/format-shape/large/results-threaded} -keep_outputs=${KEEP_OUTPUTS:-1} -threads_list=${THREADS_LIST:-0 2 4 8} -mkdir -p "$outdir" - -timings="$outdir/timings.tsv" -checks="$outdir/checks.tsv" - -printf 'name\tthreads\tmode\treal\tuser\tsys\tattempts\thits\tfallback\tparsed_samples\n' > "$timings" -printf 'name\tthreads\tcomparison\tstatus\n' > "$checks" - -tail -n +2 "$inputs" | while IFS=' ' read -r name path source -do - for threads in $threads_list - do - base_out="$outdir/$name.t$threads.baseline.bcf" - plan_out="$outdir/$name.t$threads.plan.bcf" - thread_args= - if [ "$threads" != 0 ]; then - thread_args="-@ $threads" - fi - - 
for mode in baseline plan - do - err="$outdir/$name.t$threads.$mode.stderr" - out="$outdir/$name.t$threads.$mode.bcf" - case "$mode" in - baseline) - env HTS_VCF_FORMAT_PLAN=0 /usr/bin/time -p "$test_view" -b -l 0 $thread_args "$path" > "$out" 2> "$err" - ;; - plan) - env HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 \ - /usr/bin/time -p "$test_view" -b -l 0 $thread_args "$path" > "$out" 2> "$err" - ;; - esac - - awk -v name="$name" -v threads="$threads" -v mode="$mode" ' - /^real / { real=$2 } - /^user / { user=$2 } - /^sys / { sys=$2 } - /^vcf-format-plan / { - for (i=1; i<=NF; i++) { - split($i, kv, "=") - if (kv[1] == "attempts") attempts=kv[2] - else if (kv[1] == "hits") hits=kv[2] - else if (kv[1] == "fallback") fallback=kv[2] - else if (kv[1] == "parsed_samples") parsed=kv[2] - } - } - END { - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", - name, threads, mode, real+0, user+0, sys+0, - attempts+0, hits+0, fallback+0, parsed+0 - } - ' "$err" >> "$timings" - done - - if cmp "$base_out" "$plan_out" >/dev/null 2>&1; then - printf '%s\t%s\tbaseline_vs_plan\tok\n' "$name" "$threads" >> "$checks" - else - printf '%s\t%s\tbaseline_vs_plan\tDIFF\n' "$name" "$threads" >> "$checks" - fi - if [ "$keep_outputs" = 0 ]; then - rm -f "$base_out" "$plan_out" - fi - done -done - -printf 'wrote %s and %s\n' "$timings" "$checks" diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md deleted file mode 100644 index 2493b20f8..000000000 --- a/docs/FORMAT_PLAN_CURRENT.md +++ /dev/null @@ -1,556 +0,0 @@ -# Dynamic FORMAT Plan: Current Implementation - -This document describes the implementation currently present in `vcf.c`, the -correctness boundaries, and the latest benchmark results. - -## Entry Point - -`vcf_parse_format()` first calls `vcf_parse_format_planned()` when -`HTS_VCF_FORMAT_PLAN` is enabled. The planned path either parses the whole -FORMAT column or returns `-3`, allowing the existing generic parser to handle the -column unchanged. 
- -```text -HTS_VCF_FORMAT_PLAN enabled - -> fetch or compile header-owned FORMAT/header plan - -> resolve row-local widths - -> composable executor - -> generic fallback on unsupported or suspicious rows -``` - -The only enabled spelling is `HTS_VCF_FORMAT_PLAN=1`. Unknown values are -treated as disabled so typos such as `off` or `false` do not accidentally enable -the planner. - -## Plan Compilation - -Plans are cached in private `bcf_hdr_aux_t` state by literal FORMAT string plus -the header's private FORMAT-plan generation. This is important because VCF -header IDs, declared types, and number models are header-local. The cache grows -from 16 entries up to 128 entries, uses heap storage for long FORMAT strings, -and also caches unsupported schemas so repeated odd rows do not repeatedly pay -compile cost. - -Planner statistics are collected only when `HTS_VCF_FORMAT_PLAN_STATS=1` is -also set. Normal production parsing therefore avoids touching the process-wide -test counters. The test hook reports both aggregate attempts/hits/fallbacks and -fallback reason counters: unsupported schema, numeric width, string width, GT -shape, parse failure, separator mismatch, and sample-count mismatch. - -`bcf_hdr_sync()` clears the header-owned plan cache and increments the private -generation after header dictionaries are rebuilt. The planner also refuses to -compile while `h->dirty` is set, leaving unsynced or header-repair cases on the -generic parser. - -The cache is mutable header-owned state, like other htslib header scratch -storage. Callers should not concurrently parse through the same `bcf_hdr_t` -from multiple threads. - -The compile step rejects: - -- undefined FORMAT tags; -- duplicate FORMAT tags; -- unsupported header types; -- unsupported number models; -- `GT` declarations that are not `Type=String,Number=1`. -- measured-string plus float-vector schemas that do not also have integer-vector - work for the planned executor. 
- -Undefined tags intentionally fall back to the generic parser so existing -dummy-header repair and warning behavior is preserved. - -## Supported Operations - -The current executor supports: - -- `GT`, with fast `GT2` storage when the row is diploid and simple; -- integer fields with fixed `Number=N`, `Number=A`, `Number=R`, `Number=G`, or - bounded measured `Number=.` row widths; -- float fields with the same number models as integer fields; -- string fields declared as `Type=String,Number=1`, measured per row. -- `bcf_hdr_set_samples()` / `keep_samples`, by scanning the original sample - columns and writing only retained samples densely into the planned BCF output. - -Header-derived widths are resolved per record. `Number=A`, `Number=R`, and -`Number=G` depend on the current allele count. String and `Number=.` numeric -fields use a row-local measurement pass. Numeric vectors remain capped at 64 -values per FORMAT field in the planned path. Measured strings are capped -separately at 256 bytes per row field, which keeps common phase-set annotations -on the fast path while bounding scratch-buffer and transposition work. - -## Executor - -BCF stores FORMAT data transposed by tag: all samples for FORMAT op 0, then all -samples for FORMAT op 1, and so on. The dynamic executor parses VCF samples in -sample-major order and writes that transposed BCF layout. - -Leading fixed-width `GT2` and `FLOAT1` rows can write directly into `v->indiv`. -Other rows are staged in header scratch memory, then encoded after sample -parsing so integer range and observed-width metadata are known. - -For fixed-width vector fields, the executor can compact underfilled rows to the -observed row maximum before BCF encoding. This avoids whole-row fallback when -the generic parser would also emit a narrower byte-identical vector width. - -## Fallback Policy - -Supported cached plans are probed on every row. 
If row-local validation fails, -the executor rolls back its partial `v->indiv` writes and the generic parser -handles the whole FORMAT column for that record. The fallback does not disable -or cool down the cached plan; nearby rows with the same FORMAT schema can still -take the optimized path. - -Compile-time unsupported schemas are still cached as unsupported, so repeated -unoptimizable FORMAT strings pay the compile/classification cost once and then -fall back directly to the generic parser. - -## Correctness Rules - -The planned parser must preserve these invariants: - -- no planned parsing while the header has unsynced dictionary changes; -- header IDs, types, and number models are resolved before execution; -- selected-sample parsing must honor `h->keep_samples`, use `h->nsamples_ori` - for input-column scans, and set `v->n_sample` to the retained sample count; -- duplicate or undefined tags use the generic parser; -- measured-string plus float-vector schemas without integer-vector work use the - generic parser; -- unsupported GT encodings force fallback; -- numeric vectors preserve observed width and vector-end padding; -- strings use observed maximum byte length and zero-pad shorter samples; -- integer and float overflow/error behavior must match production htslib or - force fallback; -- successful planned rows run the same final FORMAT consistency check as the - generic parser via `vcf_parse_format_check7()`; -- direct writes to `v->indiv` must roll back before fallback. - -Focused validation lives in the existing `test/test.pl` harness as -`test_vcf_format_plan`. It compares generic parsing and -`HTS_VCF_FORMAT_PLAN=1` byte-for-byte with `cmp`, and also verifies that -unrecognized control values such as `HTS_VCF_FORMAT_PLAN=off` behave like the -generic parser. 
The repo fixtures cover numeric-width and GT-shape fallback, -mixed string/float schemas kept on the generic parser, cache growth, long FORMAT -strings, string-width fallback, separator fallback, parse fallback with rollback, -repeated wide GT values, float-vector compaction, selected-sample skipping of -malformed unselected columns, and sample-count mismatch. The selected-sample -checks compare explicit inclusion and exclusion lists (`S1,S3`, `S2`, and -`^S2`) and also verify retained-sample float widths do not depend on skipped -input columns. -`test/test_format_plan_cache` mutates and resyncs a header after a plan has been -compiled for the same FORMAT string, then verifies the row is planned again with -the new metadata. - -## Large Corpus Benchmark - -Command: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-prod-hardening2 \ - bench/format-shape/scripts/run_bench.sh bench/format-shape/large/inputs.tsv -``` - -All planned outputs compared byte-identical to baseline. - -| Input | Baseline user | Plan user | User speedup | Hits/fallback | -|---|---:|---:|---:|---:| -| CCDG 10k | 2.47 s | 2.21 s | 1.12x | 9,861 / 139 | -| 1000G chr22 full GT | 24.61 s | 9.48 s | 2.60x | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.00 s | 3.68 s | 1.09x | 20,000 / 0 | -| Large reordered likelihood | 2.86 s | 2.42 s | 1.18x | 20,000 / 0 | -| Large multiallelic likelihood | 3.08 s | 2.67 s | 1.15x | 16,000 / 0 | -| Large float/string | 2.88 s | 2.86 s | 1.01x | 0 / 16,000 | -| Variable phase widths | 2.53 s | 2.45 s | 1.03x | 12,000 / 0 | -| Mixed row-local fallbacks | 2.14 s | 1.84 s | 1.16x | 12,000 / 0 | -| GT-first reordered | 1.68 s | 1.41 s | 1.19x | 12,000 / 0 | -| Two-string float | 2.20 s | 2.19 s | 1.00x | 0 / 12,000 | - -The CCDG 10k fallbacks are all `string_width=139`, meaning only rows with -measured string fields wider than the 256-byte planned cap use the generic -parser. 
The float/string control fixtures still fall back as unsupported -because the mixed string/float shape boundary keeps those rows on the generic -parser. Briefly tested runtime guards regressed sparse-fallback CCDG-like -layouts, so the current implementation leaves row-local fallbacks local to the -record. - -## Full Threaded Corpus Benchmark - -Command: - -```sh -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-threaded-profit-gate \ - bench/format-shape/scripts/run_thread_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -All 40 planned outputs compared byte-identical to baseline. Generated result -files are ignored; the table below summarizes the recorded real-time speedup -from `bench/format-shape/large/results-threaded-profit-gate`. - -| Input | 0 threads | 2 threads | 4 threads | 8 threads | -|---|---:|---:|---:|---:| -| CCDG 10k | 1.13x | 1.15x | 1.16x | 1.15x | -| 1000G chr22 full GT | 3.10x | 3.73x | 4.34x | 3.88x | -| Large CCDG-like synthetic | 1.12x | 1.14x | 1.13x | 1.13x | -| Large reordered likelihood | 1.23x | 1.33x | 1.32x | 1.29x | -| Large multiallelic likelihood | 1.16x | 1.22x | 1.22x | 1.22x | -| Large float/string | 1.01x | 0.97x | 1.04x | 1.00x | -| Variable phase widths | 1.06x | 1.10x | 1.11x | 1.09x | -| Mixed row-local fallbacks | 1.18x | 1.25x | 1.31x | 1.23x | -| GT-first reordered negative | 1.22x | 1.31x | 1.32x | 1.32x | -| Two-string float negative | 1.00x | 1.00x | 1.01x | 1.00x | - -## bcftools Production-Style Benchmark - -A clean bcftools `develop` worktree was built at: - -```text -/path/to/bcftools-htslib-vcf-plan -``` - -using this htslib worktree: - -```sh -make HTSDIR=/path/to/htslib bcftools -``` - -Timing command: - -```sh -BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools \ - bench/format-shape/scripts/run_bcftools_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -`bench/format-shape/large/threaded-inputs.tsv` now 
mirrors the full large -corpus from `large/inputs.tsv`, so threaded runs cover all real and synthetic -workload shapes rather than only the earlier two representative rows. - -The runner uses `bcftools view --no-version -Ob -l 0 [--threads N]`. All -planned outputs compared byte-identical to baseline. - -| Input | Threads | Baseline real | Plan real | Speedup | Baseline user | Plan user | -|---|---:|---:|---:|---:|---:|---:| -| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | 25.94 s | 8.05 s | -| 1000G chr22 full GT | 2 | 26.59 s | 6.99 s | 3.80x | 28.82 s | 9.04 s | -| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | 28.83 s | 9.08 s | -| 1000G chr22 full GT | 8 | 26.62 s | 6.96 s | 3.82x | 28.71 s | 9.38 s | -| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | 4.11 s | 3.66 s | -| Large CCDG-like synthetic | 2 | 3.46 s | 3.01 s | 1.15x | 4.50 s | 4.06 s | -| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | 4.51 s | 4.09 s | -| Large CCDG-like synthetic | 8 | 3.46 s | 3.00 s | 1.15x | 4.50 s | 4.05 s | - -## bcftools Selected-Sample Benchmark - -The same bcftools runner can select the first N samples from each input with -`SAMPLE_COUNT=N`. This exercises the `bcf_hdr_set_samples()` / `keep_samples` -path through bcftools rather than only through the test harness. - -Command: - -```sh -BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ -SAMPLE_COUNT=2 KEEP_OUTPUTS=0 \ -OUTDIR=bench/format-shape/large/results-bcftools-keep2 \ - bench/format-shape/scripts/run_bcftools_bench.sh \ - bench/format-shape/large/threaded-inputs.tsv -``` - -All 40 planned outputs compared byte-identical to baseline. The table shows -real-time and user-time speedup for selecting two samples from every input that -has samples; sites-only inputs naturally run without `-s`. 
- -| Input | Threads | Real speedup | User speedup | -|---|---:|---:|---:| -| CCDG 10k | 0 | 1.12x | 1.12x | -| CCDG 10k | 2 | 1.12x | 1.11x | -| CCDG 10k | 4 | 1.13x | 1.12x | -| CCDG 10k | 8 | 1.11x | 1.10x | -| 1000G chr22 full GT | 0 | 2.71x | 2.73x | -| 1000G chr22 full GT | 2 | 2.83x | 2.44x | -| 1000G chr22 full GT | 4 | 2.94x | 2.52x | -| 1000G chr22 full GT | 8 | 3.06x | 2.61x | -| Large CCDG-like synthetic | 0 | 1.07x | 1.08x | -| Large CCDG-like synthetic | 2 | 1.10x | 1.07x | -| Large CCDG-like synthetic | 4 | 1.09x | 1.07x | -| Large CCDG-like synthetic | 8 | 1.09x | 1.07x | -| Large reordered likelihood | 0 | 1.15x | 1.17x | -| Large reordered likelihood | 2 | 1.22x | 1.15x | -| Large reordered likelihood | 4 | 1.23x | 1.17x | -| Large reordered likelihood | 8 | 1.22x | 1.16x | -| Large multiallelic likelihood | 0 | 1.13x | 1.13x | -| Large multiallelic likelihood | 2 | 1.14x | 1.11x | -| Large multiallelic likelihood | 4 | 1.16x | 1.12x | -| Large multiallelic likelihood | 8 | 1.18x | 1.13x | -| Large float/string | 0 | 1.02x | 1.01x | -| Large float/string | 2 | 0.99x | 0.99x | -| Large float/string | 4 | 1.01x | 1.00x | -| Large float/string | 8 | 0.97x | 0.98x | -| Variable phase widths | 0 | 1.04x | 1.05x | -| Variable phase widths | 2 | 1.05x | 1.05x | -| Variable phase widths | 4 | 1.05x | 1.04x | -| Variable phase widths | 8 | 1.06x | 1.05x | -| Mixed row-local fallbacks | 0 | 1.14x | 1.16x | -| Mixed row-local fallbacks | 2 | 1.17x | 1.14x | -| Mixed row-local fallbacks | 4 | 1.18x | 1.14x | -| Mixed row-local fallbacks | 8 | 1.17x | 1.14x | -| GT-first reordered negative | 0 | 1.21x | 1.22x | -| GT-first reordered negative | 2 | 1.25x | 1.19x | -| GT-first reordered negative | 4 | 1.26x | 1.19x | -| GT-first reordered negative | 8 | 1.22x | 1.18x | -| Two-string float negative | 0 | 0.96x | 0.98x | -| Two-string float negative | 2 | 1.00x | 0.99x | -| Two-string float negative | 4 | 0.99x | 0.98x | -| Two-string float negative | 8 | 1.03x | 
1.01x | - -## bcftools Command Benchmark - -The broader command runner exercises bcftools paths that either consume FORMAT -records, discard FORMAT records, or mostly operate on site-level data: - -```sh -BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-commands \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-command-inputs.tsv -``` - -All applicable planned outputs compared byte-identical to baseline. FORMAT -commands were skipped for the sites-only gnomAD input as expected. - -| Input | Command | Real speedup | User speedup | -|---|---|---:|---:| -| CCDG 10k | view_bcf | 1.11x | 1.12x | -| CCDG 10k | view_sites | 1.12x | 1.13x | -| CCDG 10k | query_format | 1.51x | 1.56x | -| CCDG 10k | filter_gt | 1.11x | 1.12x | -| 1000G chr22 full GT | view_bcf | 2.79x | 2.94x | -| 1000G chr22 full GT | view_sites | 2.98x | 3.02x | -| 1000G chr22 full GT | query_format | 1.94x | 1.94x | -| 1000G chr22 full GT | filter_gt | 1.57x | 1.58x | -| Large reordered likelihood | view_bcf | 1.21x | 1.22x | -| Large reordered likelihood | view_sites | 1.20x | 1.20x | -| Large reordered likelihood | query_format | 1.39x | 1.42x | -| Large reordered likelihood | filter_gt | 1.14x | 1.14x | -| Large float/string | view_bcf | 1.02x | 1.02x | -| Large float/string | query_format | 1.01x | 1.00x | -| gnomAD sites chr22 | view_bcf | 0.98x | 1.00x | -| gnomAD sites chr22 | query_sites | 1.00x | 1.08x | - -`query_sites` and `stats` were generally neutral because they do little or no -FORMAT work. The small negative rows, such as CCDG `stats` at 0.94x real and -float/string `stats` at 0.93x real, are still within the area to watch for -planner overhead in workloads that do not benefit from FORMAT decoding. - -## bcftools Merge Benchmark - -`merge_self` is kept out of the default command list because merge output can -grow quickly. 
It was run against the smaller merge manifest: - -```sh -BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ -COMMANDS=merge_self KEEP_OUTPUTS=0 \ -OUTDIR=bench/format-shape/large/results-bcftools-merge \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-merge-inputs.tsv -``` - -All planned merge outputs compared byte-identical to baseline. - -| Input | Baseline real | Plan real | Real speedup | Baseline user | Plan user | -|---|---:|---:|---:|---:|---:| -| Small 1000G genotypes | 0.14 s | 0.10 s | 1.40x | 0.13 s | 0.08 s | -| Large CCDG likelihood 1024s | 4.50 s | 4.33 s | 1.04x | 4.05 s | 3.91 s | -| Large float/string 1024s | 2.69 s | 2.69 s | 1.00x | 2.40 s | 2.41 s | - -## GIAB and CCDG Command Check - -GIAB HG002 files were added as real-world single-sample correctness fixtures: -NIST v4.2.1 GRCh38 small variants, v5.0q GRCh38 small variants, v5.0q GRCh38 -structural variants, and v5.0q CHM13v2.0 small variants. The same bcftools -command suite was run against those files plus the all-sample CCDG 10k slice: - -```sh -BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ -KEEP_OUTPUTS=0 OUTDIR=bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening \ - bench/format-shape/scripts/run_bcftools_command_bench.sh \ - bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv -``` - -The initial GIAB v5.0q run found a correctness bug: the planned GT2 parser -encoded phased missing genotypes such as `.|.` as unphased `./.`. The parser -now preserves the phase bit for missing alleles, and `test/format-plan-edge.vcf` -has an explicit phased-missing GT row. After the fix, every command output in -this run compared byte-identical/text-identical to baseline. The table below -shows user-time speedups from the latest hardened rerun. 
- -| Input | Records | Samples | view_bcf | query_format | filter_gt | Notes | -|---|---:|---:|---:|---:|---:|---| -| CCDG 10k | 10,000 | 3,202 | 1.14x | 1.56x | 1.12x | Cohort FORMAT win remains visible. | -| GIAB HG002 GRCh38 v4.2.1 | 4,048,342 | 1 | 1.09x | 1.07x | 1.09x | Single-sample truth-set small variants. | -| GIAB HG002 GRCh38 v5.0q small variants | 5,945,525 | 1 | 1.09x | 1.07x | 1.07x | Includes phased missing GTs. | -| GIAB HG002 GRCh38 v5.0q structural variants | 6,268,852 | 1 | 1.09x | 1.02x | 1.08x | Structural-variant FORMAT coverage. | -| GIAB HG002 CHM13 v5.0q small variants | 5,829,374 | 1 | 1.07x | 1.06x | 1.16x | Alternate reference truth-set coverage. | - -The parent CCDG/1000G high-coverage chr22 file is 26.0 GiB compressed: - -```text -https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -For local reruns, point the full-CCDG manifest at a local copy such as: - -```text -/path/to/local/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -The normal command harness is unsafe for this input because one full -`view_bcf -Ob -l 0` output reached 155 GiB before the run was interrupted. The -full-file benchmark therefore used the streaming checksum harness: - -```sh -BCFTOOLS=/path/to/bcftools-htslib-vcf-plan/bcftools \ -OUTDIR=bench/format-shape/large/results-bcftools-full-ccdg-stream \ - bash bench/format-shape/scripts/run_bcftools_command_bench_stream.sh \ - bench/format-shape/large/bcftools-full-ccdg-inputs.tsv -``` - -All baseline-vs-plan checksums compared `ok`. 
- -| Command | Baseline real | Plan real | Real speedup | Baseline user | Plan user | User speedup | -|---|---:|---:|---:|---:|---:|---:| -| `view_bcf` | 678.46 s | 562.96 s | 1.21x | 476.41 s | 377.47 s | 1.26x | -| `view_sites` | 472.27 s | 403.28 s | 1.17x | 455.70 s | 386.18 s | 1.18x | -| `query_sites` | 71.44 s | 76.78 s | 0.93x | 67.02 s | 72.00 s | 0.93x | -| `query_format` | 124.14 s | 76.88 s | 1.61x | 119.16 s | 72.27 s | 1.65x | -| `stats` | 77.45 s | 77.12 s | 1.00x | 72.86 s | 72.55 s | 1.00x | -| `filter_gt` | 531.20 s | 453.21 s | 1.17x | 512.95 s | 434.35 s | 1.18x | - -## Executor Optimization Pass - -The latest optimization pass stayed within the generic per-op executor. It did -not add schema-specific kernels. Retained changes are: - -- skip observed-count tracking for row ops that cannot compact; -- update integer ranges directly on the common positive-integer path; -- fail over-wide measured fields during the measurement pass; -- remove nullable `nread` checks from planner-private vector helpers. - -Focused tests passed: - -```sh -make test/test_view test/test_format_plan_cache -cd test && REF_PATH=: ./test.pl -F vcf_format_plan -test/test_format_plan_cache -git diff --check -``` - -The htslib large corpus result is in -`bench/format-shape/large/results-opt-batch1b`. All planned outputs compared -byte-identical to baseline. 
- -| Input | Plan user | User speedup | Hits/fallback | -|---|---:|---:|---:| -| CCDG 10k | 2.20 s | 1.14x | 8,396 / 1,604 | -| 1000G chr22 full GT | 8.99 s | 2.79x | 1,103,547 / 0 | -| Large CCDG-like synthetic | 3.68 s | 1.09x | 20,000 / 0 | -| Large reordered likelihood | 2.38 s | 1.22x | 20,000 / 0 | -| Large multiallelic likelihood | 2.64 s | 1.21x | 16,000 / 0 | -| Large float/string | 2.88 s | 1.00x | 0 / 16,000 | -| Variable phase widths | 2.44 s | 1.05x | 12,000 / 0 | -| Mixed row-local fallbacks | 1.83 s | 1.20x | 12,000 / 0 | -| GT-first reordered | 1.41 s | 1.23x | 12,000 / 0 | -| Two-string float | 2.24 s | 1.00x | 0 / 12,000 | - -The `keep_samples`/all-samples loop split was tested and rejected. It preserved -correctness, but `bench/format-shape/large/results-opt-nosubset-split` was -slower across the planned rows, so the change was reverted. - -For bcftools-level validation, the sibling bcftools checkout must be built -against this checkout explicitly: - -```sh -make HTSDIR=../htslib-vcf-avx-sanity bcftools -``` - -The standard GIAB/CCDG command result is in -`bench/format-shape/large/results-bcftools-giab-ccdg-opt-batch1`; all outputs -compared `ok`. CCDG 10k user-time speedups were 1.12x for `view_bcf`, 1.55x -for `query_format`, and 1.11x for `filter_gt`. GIAB single-sample FORMAT query -rows were roughly 1.08-1.12x faster; site-only controls and `stats` remain -neutral/noisy as expected. - -## Fallback Diagnostics And String Width Tuning - -A later pass added fallback reason counters and split the planned width cap -into numeric and string limits: - -- numeric measured vectors remain capped at 64 values; -- measured strings are capped at 256 bytes; -- numeric/string width fallbacks are counted but do not disable the cached plan. - -A 512-byte string cap was tested first. It recovered all CCDG 10k planner -fallbacks, but the bcftools-level signal was mixed. 
The retained 256-byte cap -keeps almost all CCDG rows on the planned path while leaving the longest string -rows on the generic parser. - -Focused CCDG 10k htslib result at 256 bytes: - -| Metric | Value | -|---|---:| -| Baseline user | 2.43 s | -| Plan user | 2.15 s | -| Hits / fallback | 9,861 / 139 | -| Fallback reason | `string_width=139` | - -The standard GIAB/CCDG bcftools command result for the retained version is in -`bench/format-shape/large/results-bcftools-giab-ccdg-cap256`; all outputs -compared `ok`. - -| Input | `view_bcf` user | `query_format` user | `filter_gt` user | -|---|---:|---:|---:| -| CCDG 10k | 1.13x | 1.56x | 1.10x | -| GIAB HG002 GRCh38 v4.2.1 | 1.08x | 1.08x | 1.04x | -| GIAB HG002 GRCh38 v5.0q small variants | 1.13x | 1.08x | 1.03x | -| GIAB HG002 GRCh38 v5.0q structural variants | 1.11x | 1.15x | 1.04x | -| GIAB HG002 CHM13 v5.0q small variants | 1.08x | 1.07x | 1.03x | - -## Repo Test Harness Hardening - -The latest hardening pass moved the important correctness checks into the normal -htslib `test/test.pl` harness instead of leaving them only in -`bench/format-shape`. The `make check` coverage is intentionally black-box and -includes: - -- byte-identity checks for all small planned-path fixtures; -- generic parser vs planned parser comparisons; -- disabled-control comparisons for `HTS_VCF_FORMAT_PLAN=off`; -- a rollback row where planned parsing starts and then falls back after a DP - overflow; -- repeated unsupported wide GT values; -- selected-sample parsing where malformed unselected sample fields must be - skipped and must not affect emitted widths; -- malformed sample-count input, where both generic and planned modes must fail; -- cache-generation coverage in `test/test_format_plan_cache`. - -The planned executor now calls `vcf_parse_format_check7()` on success, so the -planned path shares the generic parser's final FORMAT cardinality check. 
The -fallback counters are test-only diagnostics, exposed through renamed -`*_for_test` hooks rather than API-looking `hts_*` names. - -## Interpretation - -The dynamic path gives a large production-visible win for sample-rich GT-only -VCFs. On likelihood-heavy rows, it is consistently faster but still limited by -generic per-op work, string/width handling, and IO/compression costs. Some -float/string-heavy layouts remain near parity or slightly slower than baseline. -The broader bcftools command run supports the same story: commands that expose -FORMAT parsing benefit; commands dominated by site-only logic, stats, merge -bookkeeping, or compression are neutral. - -## Remaining Work - -- Reduce per-sample opcode dispatch in hot FORMAT layouts. -- Improve string and measured-width handling without losing byte identity. -- Consider a later executor-generation layer if generic per-op dispatch remains - the main gap to historical exact-kernel speed. diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md deleted file mode 100644 index 8052a40c1..000000000 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ /dev/null @@ -1,517 +0,0 @@ -# Dynamic FORMAT Plan Experiment Log - -This log records the major approaches tried while developing the dynamic FORMAT -parser, the result of each approach, and what survived into the current design. - -## Starting Point - -The initial problem was that exact, hand-written FORMAT kernels were much faster -than the dynamic implementation, but exact kernels were too brittle. They only -matched a few complete FORMAT strings, such as: - -- `GT:AB:AD:DP:GQ:PL` -- `GT:AD:DP:GQ:PL` -- `GT:AB:AD:DP:GQ:PGT:PID:PL` -- `GT:AD:DP:GQ:PGT:PID:PL` - -The target became: recognize useful structure at the FORMAT-tag level, remain -general across subsets/supersets/reordered tags, and fall back to production -htslib whenever the optimized parser could not prove byte-identical output. 
- -## Exact CCDG Kernels - -The first high-performance path was a set of exact kernels for dominant CCDG -likelihood layouts. They proved the upper-bound target: on the 10k CCDG subset, -exact mode was roughly 1.6 s user versus 2.6 s baseline. - -Result: useful as a performance oracle, but removed from the production -candidate because exact string matching did not satisfy the generality goal. - -## Dynamic Likelihood Shape Executor - -Next, the parser used header/type/order information to recognize a likelihood -shape rather than exact tag names: - -```text -GT2, optional FLOAT1, INT[n_allele], INT1, INT1, -optional STR1, optional STR1, INT[n_allele * (n_allele + 1) / 2] -``` - -This was selected by type/order/width rather than names such as `AD` and `PL`. -It validated allele count, observed vector counts, GT syntax, separators, sample -count, and phase-string widths per row. - -Result: it closed much of the performance gap. On one 10k CCDG run, dynamic -shape was within about 6% of exact user time while remaining byte-identical. - -Why it did not survive: it reintroduced a shape-specific executor family. That -was useful evidence, but the MVP goal shifted toward one composable per-tag -executor before adding any generation/specialization layer. - -## Cached Shape Classification - -The dynamic shape attempt initially paid repeated failed probes on non-likelihood -workloads. Caching deterministic shape facts per `(header, FORMAT)` plan fixed -that. The full 1000G GT-only workload stopped paying over a million failed -likelihood-shape probes. - -Result: retained as a lesson for future specialization. The current composable -plan still caches by `(header, FORMAT)`. - -## GT-Only Fast Path - -A tiny `FORMAT=GT` / diploid `GT2` executor was added and gave a large speedup -on the full 1000G chr22 genotype VCF, cutting dynamic-mode user time from about -9.1 s to about 5.6 s in that intermediate architecture. 
- -Result: the direct `GT2` insight survived, but not as a separate GT-only -executor. The current composable executor direct-writes leading `GT2` rows when -safe. - -## Integer Parse And Encode Tightening - -Several low-risk parser/encoder refinements were tried: - -- fixed-width integer vector parsers for common AD/PL widths; -- positive integer fast path before falling back to full signed/missing parsing; -- integer range tracking with a `has_special` bit so int8/int16 encoding can skip - sentinel checks only when the parser proved no missing/vector-end values. - -Result: retained. These fit the generic per-op architecture and helped recover -some likelihood-heavy performance. - -## Likelihood Row-Op Elision - -In the shape-executor phase, row-op construction was removed from the dynamic -likelihood strict path so the executor could consume cached plan indices and -row-local widths directly. - -Result: useful for the old shape executor, but not retained once the MVP pivoted -to the composable row-op model. - -## Composable MVP Pivot - -The architecture pivoted to: - -```text -FORMAT/header -> per-tag compiled ops -> one composable executor -> fallback -``` - -The dynamic path stopped routing through separate GT-only, likelihood-shape, -fixed-numeric, and measured-general executor ladders. Instead, it builds one -row-local op list from header metadata and parses supported ops in FORMAT order. - -Result: retained. This is the current design because it supports tag-level -composition for rows such as `GT:AD`, `GT:AD:DP:XX:PL`, reordered fields, and -supersets with normal header-described tags. - -Tradeoff: broader composability lost some of the microkernel speed from the -likelihood shape executor. 
- -## Production Hardening - -Several hardening passes made the composable MVP safer and faster: - -- tightened `GT` compile validation to require `Type=String,Number=1`; -- added malformed-but-readable `GT` header coverage; -- restored direct writes for leading fixed-encoding ops (`GT2`, `FLOAT1`); -- routed generic `INTN` widths 4, 6, and 10 through fixed-width counted parsers; -- removed unused dynamic likelihood-shape scaffolding; -- added underfilled vector compaction for fixed-width vector fields. -- replaced the original process-global 16-entry FORMAT plan cache with a - header-owned, generation-aware, dynamically sized cache that stores both - supported and unsupported compile results. - -Result: retained. The dynamic path became broader and reduced unnecessary -whole-row fallback while preserving byte-identical output. - -## Reverted Or Removed Work - -Removed: - -- exact CCDG kernels; -- dynamic likelihood-shape executor scaffolding; -- optional SIMD tab-scanning front-end; -- shape-stat benchmark plumbing; -- legacy `exact`/`interp` timing rows in the benchmark harness. - -Tested and reverted: - -- pointer-increment / reduced-bookkeeping hot-loop rewrite. It stayed - byte-correct but slowed targeted likelihood-heavy benchmarks. - -## Dynamic-Only Production Trim - -After removing exact and SIMD paths, the optimized entry became: - -```text -HTS_VCF_FORMAT_PLAN enabled -> dynamic per-tag plan -> composable executor -> generic fallback -``` - -`HTS_VCF_FORMAT_PLAN=1` now routes through the dynamic executor. Older -`interp` and `general` aliases were later removed during production tightening -so unknown values do not accidentally enable the fast path. 
- -Large-corpus post-trim user-time highlights: - -| Input | Baseline | Plan | Result | -|---|---:|---:|---| -| CCDG 10k | 2.62 s | 2.25 s | faster, partial fallback | -| 1000G chr22 full GT | 26.05 s | 7.98 s | major win | -| Large CCDG-like synthetic | 4.24 s | 3.78 s | modest win | -| Large float/string | 2.93 s | 2.97 s | near parity/slightly slower | -| Two-string float negative | 2.28 s | 2.56 s | slower | - -## Header-Owned Cache Hardening - -The static FORMAT plan cache was replaced with private `bcf_hdr_aux_t` state. -The hardened cache: - -- grows from 16 to 128 entries; -- stores literal FORMAT strings on the heap, so long schemas are no longer - rejected by the old fixed key buffer; -- caches unsupported compile results to avoid repeated work; -- clears on `bcf_hdr_sync()` and records a private header generation; -- declines fast planning while `h->dirty` is set. - -Result: retained. `test/format-plan-cache.vcf` now asserts 21/21 planned hits -across more than 16 distinct FORMAT schemas, including one long schema. The -new `test/test_format_plan_cache` helper verifies that a plan compiled before a -header metadata change is not reused after `bcf_hdr_sync()`. The large corpus -remained byte-identical after the rewrite, with the same broad performance -profile: 1000G chr22 GT user time at 26.06 s baseline versus 7.96 s planned, -and CCDG 10k at 2.55 s baseline versus 2.24 s planned. - -## String/Float Shape Boundary - -The expanded threaded benchmark exposed two regressions: - -- `GT:GL:FT:DP:GQ` -- `GT:FT:PID:GL:DP` - -Both schemas were syntactically supported and had zero row-local fallback, but -they were dominated by measured strings plus `Number=G` float vectors. The -dynamic path had to measure string widths over every sample before parsing, then -still use the general float conversion path, while there were no integer vectors -to amortize that setup. - -Result: retained as a conservative support boundary. 
The compiler now -negative-caches measured-string plus float-vector schemas that do not also have -integer-vector work, and sends those FORMAT rows to the generic parser. The -full threaded corpus remained byte-identical. The two-string float case -improved from a consistent slowdown, roughly 0.86-0.89x, to parity at -1.00-1.01x. Other integer-heavy likelihood rows stayed on the dynamic path. - -## Selected-Sample Support - -The planner originally rejected `h->keep_samples` because sample subsetting -changes the relationship between input sample columns and output BCF sample -slots. That was conservative but would have made the optimized path invisible -for common `bcftools view -s/-S` style workflows. - -The executor now treats the input and output counts separately. It scans -`h->nsamples_ori` columns when `h->keep_samples` is active, skips unselected -columns with the header bitset, writes retained samples densely, and sets -`v->n_sample` to the retained sample count. The width-measurement pass follows -the same rule, so measured strings and variable numeric widths are based only on -the samples that will be emitted, matching production htslib's selected-sample -behavior. - -Result: retained. The FORMAT-plan tests now compare explicit inclusion and -exclusion sample lists byte-for-byte against production parsing. A -bcftools run selecting the first two samples from every input completed 40/40 -byte-identical comparisons. The 1000G chr22 GT workload still showed a large -real-time win, from 26.51 s to 9.77 s unthreaded and from 25.99 s to 8.84 s at -4 threads; string/float-heavy negative rows remained near parity. - -## bcftools Production Check - -A clean bcftools `develop` worktree was built against this htslib branch and run -with `bcftools view --no-version -Ob -l 0 [--threads N]`. - -All planned outputs compared byte-identical to baseline. 
- -| Input | Threads | Baseline real | Plan real | Speedup | -|---|---:|---:|---:|---:| -| 1000G chr22 full GT | 0 | 27.48 s | 8.99 s | 3.06x | -| 1000G chr22 full GT | 4 | 26.71 s | 6.94 s | 3.85x | -| Large CCDG-like synthetic | 0 | 4.43 s | 3.94 s | 1.12x | -| Large CCDG-like synthetic | 4 | 3.47 s | 3.02 s | 1.15x | - -## Broader bcftools Command Check - -Added `bench/format-shape/scripts/run_bcftools_command_bench.sh` so the branch -can exercise more than `bcftools view`. The runner currently covers full BCF -conversion, genotype-dropping conversion, site queries, small FORMAT queries, -`stats`, genotype filters, and an opt-in merge benchmark. Every command runs -once with `HTS_VCF_FORMAT_PLAN=0` and once with `HTS_VCF_FORMAT_PLAN=1`, then -compares outputs with `cmp`. - -Result: retained. All applicable planned outputs compared byte-identical to -baseline. FORMAT-heavy commands showed the expected gains: 1000G full GT was -2.79x faster for `view_bcf`, 2.98x faster for `view_sites`, 1.94x faster for -`query_format`, and 1.57x faster for `filter_gt`. CCDG and reordered -likelihood workloads were smaller but positive. Site-only queries and `stats` -were mostly neutral, with a few small negative rows that remain useful overhead -watchpoints. - -`bcftools merge` was tested through the opt-in `merge_self` command against a -smaller manifest to avoid excessive duplicated-sample output. All planned merge -outputs compared byte-identical to baseline. Merge was neutral-to-positive: -small 1000G genotype input improved from 0.14 s to 0.10 s, large CCDG -likelihood improved from 4.50 s to 4.33 s, and large float/string remained -unchanged at 2.69 s. - -## GIAB and Full CCDG Probe - -Four GIAB HG002 VCFs were pulled into `bench/format-shape/large/public/giab`: -NIST v4.2.1 GRCh38 small variants, v5.0q GRCh38 small variants, v5.0q GRCh38 -structural variants, and v5.0q CHM13v2.0 small variants. 
The bcftools command -suite was run on those files plus the 3,202-sample CCDG 10k slice using -`bench/format-shape/large/bcftools-giab-ccdg-inputs.tsv`. - -First result: GIAB v5.0q exposed a real GT correctness bug. The planned GT2 -parser encoded `.|.` as `./.` because missing alleles were stored without the -separator phase bit. The parser now accepts simple diploid missing/digit -combinations and preserves the phase bit for `.|.`, `0|.`, and `.|0`; an -explicit edge row was added to `test/format-plan-edge.vcf`. - -After the fix, all baseline-vs-plan outputs compared `ok`. Speedups are modest -on GIAB because it is single-sample data: roughly 1.06-1.11x for `view_bcf` and -1.03-1.09x for `query_format`. CCDG 10k remained in the expected cohort range: -1.13x for `view_bcf`, 1.52x for `query_format`, and 1.10x for `filter_gt`. - -The full parent CCDG/1000G high-coverage chr22 VCF was identified as: - -```text -https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz -``` - -It is 26.0 GiB compressed and requires a local copy for reruns. -The normal command harness materializes complete outputs, which is not practical -for this file: a single `view_bcf -Ob -l 0` baseline output reached 155 GiB -before that run was stopped. A streaming checksum harness was added so command -outputs can be validated without storing them. 
- -The full CCDG streaming command suite completed with all baseline-vs-plan -checksums comparing `ok`: - -| Command | Baseline real | Plan real | Real speedup | Baseline user | Plan user | User speedup | -|---|---:|---:|---:|---:|---:|---:| -| `view_bcf` | 678.46 s | 562.96 s | 1.21x | 476.41 s | 377.47 s | 1.26x | -| `view_sites` | 472.27 s | 403.28 s | 1.17x | 455.70 s | 386.18 s | 1.18x | -| `query_sites` | 71.44 s | 76.78 s | 0.93x | 67.02 s | 72.00 s | 0.93x | -| `query_format` | 124.14 s | 76.88 s | 1.61x | 119.16 s | 72.27 s | 1.65x | -| `stats` | 77.45 s | 77.12 s | 1.00x | 72.86 s | 72.55 s | 1.00x | -| `filter_gt` | 531.20 s | 453.21 s | 1.17x | 512.95 s | 434.35 s | 1.18x | - -## Parser Helper Trim - -Reviewed the `vcf.c` implementation for duplicated fast-path helper code. A -first attempt collapsed the fixed-width integer vector parsers into one generic -counted loop. Correctness held, but likelihood-shaped rows regressed by roughly -10% in the focused benchmark, so that version was rejected. - -The retained refactor is intentionally narrower: remove unused non-range integer -vector helpers, remove an unused scalar helper, and centralize only the empty -integer-vector fill case. The hand-unrolled range parsers for common vector -widths remain because they are part of the measured hot path. - -Result: retained. The final `vcf.c` diff is about 116 fewer deleted helper -lines relative to the previous branch tip, with byte-identical outputs on the -focused GT/likelihood/string corpus. A repeat likelihood benchmark was neutral: -CCDG-like plan time improved from 4.18 s to 4.12 s, reordered likelihood was -2.66 s to 2.70 s, and multiallelic likelihood improved from 3.01 s to 2.98 s. - -## Production Tightening Review - -Three review passes focused on production-readiness: code-size risk, correctness -risk, and upstream polish. 
The retained implementation changes were deliberately -low-risk: - -- `HTS_VCF_FORMAT_PLAN` now enables only on `1`; old `interp` / `general` - aliases and typo-enables were removed. -- Planner statistics are incremented only when - `HTS_VCF_FORMAT_PLAN_STATS=1`, avoiding process-global counter writes in - normal runs. -- The row-op support check was folded into row-op resolution, removing a second - pass over the FORMAT operation list. -- The row-width bound was made explicit in the planner instead of using an - inline literal. -- Tests now assert that an unknown value such as `HTS_VCF_FORMAT_PLAN=off` - behaves like the generic parser. - -Result: retained. `make test/test_view test/test_format_plan_cache`, -the FORMAT-plan parser-output checks, and `test/test_format_plan_cache` pass. -At that point, the `vcf.c` implementation was about 1,594 added lines relative to -`origin/develop`, down from the earlier 1,703-line core. - -## Generic Executor Micro-Optimizations - -The next pass targeted the generic per-op executor rather than adding new -schema-specific kernels. Retained changes: - -- skip `max_counts` maintenance for row ops that cannot compact; -- update integer min/max directly on the common positive-integer parse path; -- reject over-wide measured `Number=.` / string fields during the measurement - pass instead of after scanning the full row; -- remove nullable `nread` checks from planner-private integer vector helpers. - -Result: retained. Focused FORMAT tests passed, `git diff --check` was clean, -and the htslib large corpus in -`bench/format-shape/large/results-opt-batch1b` compared byte-identical to -baseline. 
- -| Input | Baseline user | Plan user | User speedup | Hits/fallback | -|---|---:|---:|---:|---:| -| CCDG 10k | 2.50 s | 2.20 s | 1.14x | 8,396 / 1,604 | -| 1000G chr22 full GT | 25.08 s | 8.99 s | 2.79x | 1,103,547 / 0 | -| Large CCDG-like synthetic | 4.02 s | 3.68 s | 1.09x | 20,000 / 0 | -| Large reordered likelihood | 2.91 s | 2.38 s | 1.22x | 20,000 / 0 | -| Large multiallelic likelihood | 3.19 s | 2.64 s | 1.21x | 16,000 / 0 | -| Large float/string | 2.89 s | 2.88 s | 1.00x | 0 / 16,000 | -| Variable phase widths | 2.57 s | 2.44 s | 1.05x | 12,000 / 0 | -| Mixed row-local fallbacks | 2.20 s | 1.83 s | 1.20x | 12,000 / 0 | -| GT-first reordered | 1.73 s | 1.41 s | 1.23x | 12,000 / 0 | -| Two-string float | 2.25 s | 2.24 s | 1.00x | 0 / 12,000 | - -One broader structural attempt was rejected: splitting the all-samples loop from -the `keep_samples` loop. Correctness held, but -`bench/format-shape/large/results-opt-nosubset-split` was slower across the -planned corpus: CCDG 10k plan user time moved from 2.20 s to 2.28 s, 1000G -GT-only from 8.99 s to 9.30 s, and the likelihood-shaped synthetic rows also -regressed. That change was reverted. - -The standard bcftools GIAB/CCDG command corpus was then run against a bcftools -binary explicitly linked to this checkout with: - -```sh -make HTSDIR=../htslib-vcf-avx-sanity bcftools -``` - -All command outputs compared `ok` in -`bench/format-shape/large/results-bcftools-giab-ccdg-opt-batch1`. The command -profile stayed positive where FORMAT parsing matters and neutral/noisy where it -does not: CCDG 10k `query_format` was 1.55x faster by user time, CCDG 10k -`view_bcf` was 1.12x faster, and GIAB single-sample `query_format` rows were -roughly 1.08-1.12x faster. - -## Fallback Reason Counters And Split Width Caps - -The next regression investigation focused on CCDG rows that were falling back -because phase-set string fields exceeded the old single planned-width limit. 
-The implementation now reports fallback reasons under -`HTS_VCF_FORMAT_PLAN_STATS=1`: - -- unsupported schema; -- numeric width; -- string width; -- GT shape; -- parse failure; -- separator mismatch; -- sample-count mismatch. - -The single width cap was split into a 64-value numeric-vector cap and a -256-byte measured-string cap. Numeric and string width fallbacks are diagnostic -only: they do not disable a schema that succeeds on nearby rows. - -Two string caps were benchmarked. A 512-byte cap planned all CCDG 10k rows but -had a mixed bcftools-level signal. The retained 256-byte cap planned 9,861 of -10,000 CCDG rows and left the 139 longest string rows on the generic parser: - -```text -vcf-format-plan attempts=10000 hits=9861 fallback=139 parsed_samples=31574922 -vcf-format-plan-fallback unsupported=0 numeric_width=0 string_width=139 gt_shape=0 parse=0 separator=0 sample_count=0 -``` - -Result: retained. Focused tests passed, `git diff --check` was clean, and the -htslib large corpus in `bench/format-shape/large/results-string-cap256-reasons` -compared byte-identical to baseline. CCDG 10k user time was 2.47 s baseline -versus 2.17 s planned, 1000G chr22 full GT was 24.70 s versus 9.75 s, and the -likelihood-shaped synthetic rows remained faster or neutral. - -The bcftools GIAB/CCDG command corpus in -`bench/format-shape/large/results-bcftools-giab-ccdg-cap256` also compared -byte-identical. CCDG 10k user-time speedups were 1.13x for `view_bcf`, 1.56x -for `query_format`, and 1.10x for `filter_gt`; GIAB FORMAT-query rows were -1.07-1.15x faster, while site-only controls and `stats` remained neutral/noisy. - -## Repo Test Harness Hardening - -The final hardening pass moved the important small-case checks from the -benchmark directory into the actual htslib test harness. 
The bespoke shell test -was removed; the production-facing checks now live in `test/test.pl` as -`test_vcf_format_plan`, while `test/test_format_plan_cache` remains the focused -cache-generation check. - -Retained changes: - -- successful planned rows call `vcf_parse_format_check7()`, matching the generic - parser's final FORMAT cardinality validation; -- fallback diagnostics are test-only hooks with `*_for_test` names and are - emitted only when `HTS_VCF_FORMAT_PLAN_STATS=1`; -- `test_vcf_format_plan` compares planned output against generic output - byte-for-byte, including selected-sample cases and disabled-control values - such as `HTS_VCF_FORMAT_PLAN=off`; -- new fixtures cover rollback after partial planned parsing, malformed - unselected samples under `bcf_hdr_set_samples()`, repeated wide GT values, and - malformed sample-count failures; -- row-local width fallbacks remain record-local so sparse over-cap string rows - do not poison CCDG-like schemas. - -Result: retained. `make check` passed with 377/377 tests. `make -maintainer-check` was attempted but failed before the whitespace/copyright -checks because the local build invoked the C compiler on `test/usepublic.cpp` -with `-std=gnu23`. The relevant whitespace check and `git diff --check` passed -separately. - -The htslib large corpus run written locally under -`bench/format-shape/large/results-prod-hardening2` compared byte-identical to -baseline. The generated result files are ignored, so the recorded summary is: -CCDG 10k held the expected 9,861 / 139 hit/fallback split, and 1000G chr22 full -GT remained the largest win at 24.61 s baseline user time versus 9.48 s planned. - -The latest bcftools GIAB/CCDG command corpus in -`bench/format-shape/large/results-bcftools-giab-ccdg-prod-hardening` also -compared byte-identical. CCDG 10k user-time speedups were 1.14x for `view_bcf`, -1.56x for `query_format`, and 1.12x for `filter_gt`; GIAB single-sample FORMAT -rows remained modestly positive, as expected. 
- -## Runtime Cooldown Removal - -The per-plan runtime cooldown was removed after an A/B pass showed no practical -benefit on realistic workloads. The cooldown had paused a supported cached -schema after repeated row-local fallbacks, but standard corpus hit/fallback -counts were identical with and without it. The remaining protection is simpler: -compile-time unsupported schemas are negative-cached, unsupported mixed -string/float shapes are rejected at compile time, and row-local misses fall back -only for that record. - -The final no-cooldown parser corpus in -`bench/format-shape/large/results-no-cooldown-final` compared byte-identical to -baseline. Representative planned user times: - -| Input | Baseline user | Planned user | Hits / fallback | -|---|---:|---:|---:| -| CCDG 10k | 2.46 s | 2.16 s | 9,861 / 139 | -| 1000G chr22 full GT | 24.50 s | 9.34 s | 1,103,547 / 0 | -| Large reordered likelihood | 2.89 s | 2.42 s | 20,000 / 0 | -| Large float/string negative | 2.88 s | 2.86 s | 0 / 16,000 | -| Mixed row-local fallbacks | 2.14 s | 1.83 s | 12,000 / 0 | -| Two-string float negative | 2.21 s | 2.22 s | 0 / 12,000 | - -Result: retained. Focused planner tests passed, `test/test_format_plan_cache` -passed, all large parser-corpus outputs compared byte-identical, and -`git diff --check` was clean. - -## Main Lessons - -- Tag-level composition is the right MVP boundary; exact full FORMAT strings are - too brittle. -- Whole-row fallback keeps correctness manageable, but makes one unsupported tag - enough to lose the optimized path. -- Sample-rich GT-only VCFs are the clearest production win. -- Likelihood-heavy workloads benefit, but generic per-op dispatch and string / - measured-width handling still leave performance on the table. -- Future executor generation or shape-specialized families may be worth adding - after the composable MVP is stable. 
diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md deleted file mode 100644 index e30d1b7da..000000000 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ /dev/null @@ -1,129 +0,0 @@ -# Dynamic FORMAT Plan Overview - -This branch adds an optional fast path for parsing VCF `FORMAT` sample columns. -The goal is to speed up common, header-described FORMAT layouts without writing -one-off kernels for exact FORMAT strings such as `GT:AD:DP:GQ:PL`. - -## What It Does - -When `HTS_VCF_FORMAT_PLAN` is enabled, htslib first tries to compile the record's -literal FORMAT string into a small list of per-tag operations. The plan is -driven by the active VCF header: each tag contributes its key, type, declared -number model, and whether the current row needs width measurement. - -Compiled plans live in private header-owned cache state. The cache is cleared -when the header dictionaries are resynchronised, and the optimized path declines -to run while the header has unsynced mutations. That keeps cached supported and -unsupported decisions tied to the exact header metadata that produced them. - -If the row fits the supported operation set, the dynamic executor parses samples -and writes BCF's transposed FORMAT layout directly. If anything looks unsafe or -unsupported, htslib falls back to the generic parser for the whole FORMAT -column. The planner also keeps a conservative shape boundary: schemas dominated -by measured strings plus float vectors, such as `GT:FT:PID:GL:DP`, currently use -the generic parser because the dynamic path's width-measurement work costs more -than it saves. - -The optimized path also supports selected-sample reads. When -`bcf_hdr_set_samples()` is active, it scans the original sample columns, skips -unretained samples, and writes the retained samples densely into the BCF FORMAT -blocks. - -Fallbacks are whole-row, but they are now classified for diagnostics when -`HTS_VCF_FORMAT_PLAN_STATS=1` is set. 
The current reason counters distinguish -unsupported schemas, numeric-width limits, string-width limits, GT shape misses, -parse failures, separator mismatches, and sample-count mismatches. - -## Why This Shape - -The important design choice is tag-level composition. A file does not need an -exact hardcoded FORMAT string to benefit. For example, these can all share the -same dynamic machinery when their tags are described by supported header -metadata: - -- `GT:AD` -- `GT:AD:DP:PL` -- `GT:AB:AD:DP:GQ:PGT:PID:PL` -- reordered numeric/string tags -- supersets with additional supported tags - -This is deliberately more general than the earlier experimental exact kernels. -Those kernels were fast, but brittle: adding or removing one tag could miss the -optimized path entirely. - -## Where It Helps - -The feature is most useful for sample-rich VCF text input where FORMAT parsing is -a meaningful part of total runtime: - -- large `GT`-only genotype VCFs; -- likelihood-heavy VCFs with fields such as `AD`, `PL`, `DP`, `GQ`, `AB`, and - phase strings; -- conversion paths such as VCF.gz to BCF where text FORMAT parsing is exposed; -- workloads with repeated FORMAT layouts across many records. - -In the latest bcftools-style timing, the real 1000G chr22 GT workload sped up -from 27.48 s to 8.99 s unthreaded, and from 26.71 s to 6.94 s at 4 threads. -The likelihood-heavy synthetic workload improved more modestly, from 4.43 s to -3.94 s unthreaded and from 3.47 s to 3.02 s at 4 threads. - -With bcftools selecting the first two samples from each input, the same 1000G -GT workload improved from 26.51 s to 9.77 s unthreaded and from 25.99 s to -8.84 s at 4 threads. Selected-sample likelihood-heavy rows are still faster, -but the gains are smaller because much less FORMAT payload is emitted. - -Broader bcftools commands follow the same pattern. `bcftools view`, -`bcftools query` of FORMAT values, and genotype filters benefit when they expose -sample FORMAT parsing. 
Site-only queries, `stats`, and `merge` are mostly -neutral because their runtime is dominated by non-FORMAT work, output writing, -or command-level bookkeeping. A controlled `bcftools merge` self-merge check -produced byte-identical output and was neutral-to-positive across the small -merge manifest. - -## Drawbacks - -The MVP intentionally keeps fallback whole-row. It does not parse supported -tags dynamically while delegating only one unsupported tag to the generic -parser. That makes correctness easier to reason about, but a single unsupported -tag or malformed row means the entire FORMAT column uses the generic parser. - -Known fallback cases include: - -- undefined FORMAT tags that require production header repair; -- unsupported header types or number models; -- mixed measured-string plus float-vector schemas without integer-vector work; -- duplicate FORMAT tags; -- malformed separators or unexpected sample cardinality; -- row-local widths above the bounded fast-path limit; -- GT encodings outside the simple fast-path representation. - -The path is also not always faster. Some string/float-heavy layouts are roughly -at parity or slightly slower than baseline because the dynamic path still pays -measurement, dispatch, and scratch-buffer costs. - -The current planned width limits are intentionally conservative: measured -numeric vectors are capped at 64 values, and measured strings are capped at -256 bytes. Rows above those limits use the generic parser; numeric/string width -misses do not by themselves disable the schema for later rows. - -Correctness checks for this path now live in the normal htslib test harness, not -only in the benchmark directory. `make check` runs black-box byte-identity -fixtures through `test/test.pl`, selected-sample checks, malformed-input checks, -and focused header-cache generation coverage. 
- -## User-Facing Controls - -```text -unset / 0 generic parser only -1 dynamic per-tag planner, then generic fallback -``` - -The benchmark harness reports only `HTS_VCF_FORMAT_PLAN=1` as `plan`. -Other values are treated as disabled. - -## Related Docs - -- `docs/FORMAT_PLAN_CURRENT.md`: current implementation, supported shapes, - correctness rules, and benchmark tables. -- `docs/FORMAT_PLAN_EXPERIMENT_LOG.md`: chronological log of approaches tried, - results, reversions, and retained lessons. From 8c809baf0463d1d17216a19d3c071f9e21b96f32 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 30 Apr 2026 22:59:45 +0200 Subject: [PATCH 35/38] fix ci --- test/test_format_plan_cache.c | 8 ++- test/test_view.c | 36 ----------- vcf.c | 113 +++++++++++++++------------------- 3 files changed, 57 insertions(+), 100 deletions(-) diff --git a/test/test_format_plan_cache.c b/test/test_format_plan_cache.c index 2575dadbf..d0028e6f1 100644 --- a/test/test_format_plan_cache.c +++ b/test/test_format_plan_cache.c @@ -38,6 +38,12 @@ static void fail(const char *msg) #define check0(expr) do { if ((expr) != 0) fail("check failed: " #expr); } while (0) #define check1(expr) do { if (!(expr)) fail("check failed: " #expr); } while (0) +static int enable_format_plan(void) +{ + static char env[] = "HTS_VCF_FORMAT_PLAN=1"; + return putenv(env); +} + static void parse_line(bcf_hdr_t *hdr, bcf1_t *rec, kstring_t *line, const char *text) { @@ -105,7 +111,7 @@ int main(void) bcf1_t *rec; kstring_t line = KS_INITIALIZE; - check0(setenv("HTS_VCF_FORMAT_PLAN", "1", 1)); + check0(enable_format_plan()); hdr = bcf_hdr_init("r"); rec = bcf_init(); check1(hdr); diff --git a/test/test_view.c b/test/test_view.c index 6a78e1027..a9a0615c6 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -30,24 +30,12 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include -#include #include "../cram/cram.h" #include "../htslib/sam.h" #include "../htslib/vcf.h" #include "../htslib/hts_log.h" -extern void vcf_format_plan_stats_for_test(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, - uint64_t *parsed_samples); -extern void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, - uint64_t *numeric_width, - uint64_t *string_width, - uint64_t *gt_shape, - uint64_t *parse, - uint64_t *separator, - uint64_t *sample_count); - struct opts { char *fn_ref; int flag; @@ -449,30 +437,6 @@ int main(int argc, char *argv[]) if (p.pool) hts_tpool_destroy(p.pool); - const char *format_plan_stats = getenv("HTS_VCF_FORMAT_PLAN_STATS"); - if (format_plan_stats && strcmp(format_plan_stats, "1") == 0) { - uint64_t attempts = 0, hits = 0, fallback = 0, parsed_samples = 0; - uint64_t unsupported = 0; - uint64_t numeric_width = 0, string_width = 0, gt_shape = 0, parse = 0; - uint64_t separator = 0, sample_count = 0; - vcf_format_plan_stats_for_test(&attempts, &hits, &fallback, &parsed_samples); - vcf_format_plan_fallback_stats_for_test(&unsupported, &numeric_width, - &string_width, &gt_shape, - &parse, &separator, - &sample_count); - fprintf(stderr, - "vcf-format-plan attempts=%llu hits=%llu fallback=%llu parsed_samples=%llu\n", - (unsigned long long) attempts, (unsigned long long) hits, - (unsigned long long) fallback, - (unsigned long long) parsed_samples); - fprintf(stderr, - "vcf-format-plan-fallback unsupported=%llu numeric_width=%llu string_width=%llu gt_shape=%llu parse=%llu separator=%llu sample_count=%llu\n", - (unsigned long long) unsupported, - (unsigned long long) numeric_width, (unsigned long long) string_width, - (unsigned long long) gt_shape, (unsigned long long) parse, - (unsigned long long) separator, (unsigned long long) sample_count); - } - if (fclose(stdout) != 0 && errno != EBADF) { fprintf(stderr, "Error closing standard output.\n"); exit_code = EXIT_FAILURE; diff --git a/vcf.c b/vcf.c 
index 59fba63b5..ce3f3b988 100644 --- a/vcf.c +++ b/vcf.c @@ -3242,59 +3242,52 @@ typedef enum { static uint64_t vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_N]; /* - * Dynamic FORMAT fast path. + * Planned FORMAT parser. * - * The existing FORMAT parser below is intentionally very permissive: it can - * repair missing header declarations, deal with sample subsetting, and recover - * from many odd row shapes. The fast path here only claims rows that can be - * described by the existing FORMAT header metadata and parsed as a fixed list - * of per-tag operations. If any compile-time or row-local invariant fails, it - * returns -3 to let the generic parser handle the whole FORMAT column. + * The generic FORMAT parser below is deliberately permissive: it can repair + * missing header declarations, handle sample subsetting, and recover from many + * irregular row shapes. The planned parser only handles rows that can be + * described by existing FORMAT header metadata and parsed as a fixed list of + * per-tag operations. If any compile-time or row-local invariant fails, it + * returns -3 so the generic parser handles the whole FORMAT column. * * HTS_VCF_FORMAT_PLAN controls the feature: * unset/0 use the generic parser only - * 1 try the dynamic per-tag plan, with generic fallback + * 1 enable the planned per-tag parser, with generic fallback + * + * HTS_VCF_FORMAT_PLAN_STATS=1 emits aggregate plan counters at process exit. 
*/ -void vcf_format_plan_stats_for_test(uint64_t *attempts, uint64_t *hits, - uint64_t *fallback, uint64_t *parsed_samples) -{ - if (attempts) *attempts = vcf_format_plan_stats.attempts; - if (hits) *hits = vcf_format_plan_stats.hits; - if (fallback) *fallback = vcf_format_plan_stats.fallback; - if (parsed_samples) *parsed_samples = vcf_format_plan_stats.parsed_samples; -} - -void vcf_format_plan_fallback_stats_for_test(uint64_t *unsupported, - uint64_t *numeric_width, - uint64_t *string_width, - uint64_t *gt_shape, - uint64_t *parse, - uint64_t *separator, - uint64_t *sample_count) -{ - if (unsupported) - *unsupported = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_UNSUPPORTED]; - if (numeric_width) - *numeric_width = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH]; - if (string_width) - *string_width = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_STRING_WIDTH]; - if (gt_shape) - *gt_shape = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_GT_SHAPE]; - if (parse) - *parse = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_PARSE]; - if (separator) - *separator = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SEPARATOR]; - if (sample_count) - *sample_count = vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SAMPLE_COUNT]; +static void vcf_format_plan_report_stats(void) +{ + fprintf(stderr, + "vcf-format-plan attempts=%llu hits=%llu fallback=%llu parsed_samples=%llu\n", + (unsigned long long) vcf_format_plan_stats.attempts, + (unsigned long long) vcf_format_plan_stats.hits, + (unsigned long long) vcf_format_plan_stats.fallback, + (unsigned long long) vcf_format_plan_stats.parsed_samples); + fprintf(stderr, + "vcf-format-plan-fallback unsupported=%llu numeric_width=%llu string_width=%llu gt_shape=%llu parse=%llu separator=%llu sample_count=%llu\n", + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_UNSUPPORTED], + (unsigned long long) 
vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_STRING_WIDTH], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_GT_SHAPE], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_PARSE], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SEPARATOR], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SAMPLE_COUNT]); } static int vcf_format_plan_stats_enabled(void) { static int enabled = -1; + static int registered = 0; if (enabled < 0) { const char *env = getenv("HTS_VCF_FORMAT_PLAN_STATS"); enabled = env && strcmp(env, "1") == 0; + if (enabled && !registered) { + if (atexit(vcf_format_plan_report_stats) == 0) + registered = 1; + } } return enabled; } @@ -3350,9 +3343,9 @@ typedef struct { * Cache key is the literal FORMAT string plus the private header * generation. FORMAT key ids/types are header-local, so plans are owned by * the header aux block and invalidated whenever bcf_hdr_sync() rebuilds the - * dictionaries. Unsupported plans are cached too; repeated uncommon or - * undefined FORMAT strings should pay the compile cost once, then fall back - * directly to the generic parser. + * dictionaries. Unsupported FORMAT strings are cached too, so repeated + * fallback cases pay the compile cost once and then go directly to the + * generic parser. */ char *format; size_t format_len; @@ -3526,11 +3519,10 @@ static inline int vcf_format_op_is_vector(const vcf_format_op_t *op) } /* - * Return whether this FORMAT composition is inside the current planned - * executor's supported shape set. This is a support boundary, not a learned - * runtime heuristic: mixed measured-string plus float-vector rows are kept on - * the generic parser unless there is also integer-vector work for the planner - * to accelerate. 
+ * Return whether this FORMAT composition is within the planned executor's + * supported shape set. Mixed measured-string plus float-vector rows require an + * extra measurement pass and stay on the generic parser unless the row also + * contains integer-vector work that benefits from the planned encoding path. */ static int vcf_format_general_plan_shape_supported(const vcf_format_general_plan_t *plan) { @@ -3549,11 +3541,6 @@ static int vcf_format_general_plan_shape_supported(const vcf_format_general_plan } } - /* - * Examples intentionally left on generic: GT:GL:FT:DP:GQ and - * GT:FT:PID:GL:DP. Both are valid FORMAT schemas, but this executor has no - * cheap integer-vector encoding work to offset the measured-string pass. - */ if (has_measured_string && has_float_vector && !has_int_vector) return 0; return 1; @@ -3619,9 +3606,8 @@ static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *forma /* * Only compile tags with enough header information to reproduce the - * production BCF layout. Undefined tags and exotic types intentionally - * stay on the generic parser, which can emit warnings and install - * dummy header records where appropriate. + * generic parser's BCF layout. Undefined tags and exotic types stay on + * the generic parser, which owns warning and header-repair behavior. */ plan->ops[plan->n_ops].key = key; plan->ops[plan->n_ops].number = bcf_hdr_id2number(h, BCF_HL_FMT, key); @@ -4489,8 +4475,8 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, /* * This pass validates the sample field separators at the same time * as measuring widths. A single unexpected ':' or tab position is - * enough to reject the fast path, preserving production behavior for - * odd FORMAT/sample cardinality cases. + * enough to reject the planned path, preserving generic-parser + * behavior for odd FORMAT/sample cardinality cases. 
*/ while (cur < end && *cur && *cur != ':' && *cur != '\t') { if (op->measured_width && @@ -4672,8 +4658,9 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, /* * Each op parser consumes exactly one sample subfield and leaves cur - * on the following ':' or tab. Values that require production-only - * handling, such as non-simple GT encodings, return -4 via fallback. + * on the following ':' or tab. Values outside this executor's + * supported subset, such as non-simple GT encodings, trigger + * generic fallback. */ switch (op->kind) { case VCF_FORMAT_ROW_GT2: @@ -4780,10 +4767,10 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, } if (max_counts[j] < row_ops[j].width) { /* - * Production encodes fixed-width vector rows at the observed row - * maximum, not necessarily the conservative header-derived width. - * Compacting here avoids unnecessary whole-row fallback while - * keeping byte-identical BCF output. + * The generic parser encodes fixed-width vector rows at the + * observed row maximum, not necessarily the conservative + * header-derived width. Compacting here avoids unnecessary + * whole-row fallback while keeping byte-identical BCF output. */ vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]); } From 0999351c040793ae9763b2f9ba7550b45fd9bd11 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 1 May 2026 01:20:48 +0200 Subject: [PATCH 36/38] Document FORMAT planner modules --- vcf.c | 103 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 22 deletions(-) diff --git a/vcf.c b/vcf.c index ce3f3b988..13eb6c9ad 100644 --- a/vcf.c +++ b/vcf.c @@ -3219,6 +3219,38 @@ static inline int align_mem(kstring_t *s) static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v); +/* + * Planned FORMAT parser. 
+ * + * The generic FORMAT parser below is deliberately permissive: it can repair + * missing header declarations, handle sample subsetting, and recover from many + * irregular row shapes. The planned parser only handles rows that can be + * described by existing FORMAT header metadata and parsed as a fixed list of + * per-tag operations. If any compile-time or row-local invariant fails, it + * returns -3 so the generic parser handles the whole FORMAT column. + * + * The implementation is organized as: + * + * - controls and diagnostics; + * - cached FORMAT plan data structures; + * - plan cache management and FORMAT-string compilation; + * - small value parsers used by the sample loop; + * - row-local layout and BCF encoding helpers; + * - width measurement for variable row shapes; + * - execution and the generic-parser fallback entry point. + */ + +/* + * Planned FORMAT parser: controls and diagnostics. + * + * HTS_VCF_FORMAT_PLAN controls the feature: + * unset/0 use the generic parser only + * 1 enable the planned per-tag parser, with generic fallback + * + * HTS_VCF_FORMAT_PLAN_STATS=1 emits aggregate plan counters at process exit. + * These counters are intentionally diagnostic; normal parsing avoids touching + * them unless the stats environment variable is enabled. + */ typedef struct { uint64_t attempts; uint64_t hits; @@ -3241,22 +3273,6 @@ typedef enum { static uint64_t vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_N]; -/* - * Planned FORMAT parser. - * - * The generic FORMAT parser below is deliberately permissive: it can repair - * missing header declarations, handle sample subsetting, and recover from many - * irregular row shapes. The planned parser only handles rows that can be - * described by existing FORMAT header metadata and parsed as a fixed list of - * per-tag operations. If any compile-time or row-local invariant fails, it - * returns -3 so the generic parser handles the whole FORMAT column. 
- * - * HTS_VCF_FORMAT_PLAN controls the feature: - * unset/0 use the generic parser only - * 1 enable the planned per-tag parser, with generic fallback - * - * HTS_VCF_FORMAT_PLAN_STATS=1 emits aggregate plan counters at process exit. - */ static void vcf_format_plan_report_stats(void) { fprintf(stderr, @@ -3324,6 +3340,13 @@ static inline void vcf_format_plan_set_reason(vcf_format_plan_fallback_reason_t *dst = reason; } +/* + * Planned FORMAT parser: cached plan data model. + * + * A cached plan is the header-derived, record-independent description of the + * FORMAT string. Row operations are derived later after allele-dependent and + * measured widths are known for the current record. + */ typedef struct { /* * Header-derived operation for one FORMAT tag. This is the reusable, @@ -3407,12 +3430,14 @@ typedef struct { int has_special; } vcf_plan_int_range_t; -#if defined(__GNUC__) -#define VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) -#else -#define VCF_PLAN_ALWAYS_INLINE static inline -#endif - +/* + * Planned FORMAT parser: cache management and plan compilation. + * + * Plans are owned by bcf_hdr_aux_t because FORMAT ids, types, and Number + * declarations are header-local. The cache stores both supported and + * unsupported FORMAT strings; unsupported entries let repeated odd schemas + * fall through to the generic parser without repeatedly recompiling. + */ static uint64_t vcf_format_plan_hash(const char *format, size_t len) { size_t i; @@ -3712,6 +3737,21 @@ static vcf_format_general_plan_t *vcf_format_general_plan_get(const bcf_hdr_t *h return plan->supported ? plan : NULL; } +/* + * Planned FORMAT parser: value parsers. + * + * These helpers consume one FORMAT subfield and leave the input pointer on the + * following ':' or tab. They deliberately handle only the planned subset and + * report failure to the executor, which then rolls the whole row back to the + * generic parser. 
Integer helpers maintain the observed min/max range while + * parsing so the final BCF encoder does not need a second range scan. + */ +#if defined(__GNUC__) +#define VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) +#else +#define VCF_PLAN_ALWAYS_INLINE static inline +#endif + VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2_u8(const char **sp, uint8_t out[2]) { const char *s = *sp; @@ -4215,6 +4255,14 @@ static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); } +/* + * Planned FORMAT parser: row-local layout and encoding helpers. + * + * Once all widths are known for the current record, these helpers turn cached + * plan operations into concrete row operations, allocate/stage transposed + * FORMAT buffers, compact underfilled vectors when the generic parser would do + * the same, and encode staged rows into the final packed BCF FORMAT layout. + */ static int vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, bcf1_t *v, int *widths, vcf_format_row_op_t *row_ops, @@ -4380,6 +4428,8 @@ static void vcf_format_compact_row_op(kstring_t *mem, int nsamples, } /* + * Planned FORMAT parser: row-local width measurement. + * * Resolve FORMAT widths before execution. Fixed widths come from the header * and current allele count; Type=String and Number=. numeric rows require a * sample scan. Returns 0 for a usable plan, -4 for generic fallback, and -1 @@ -4559,6 +4609,8 @@ static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, } /* + * Planned FORMAT parser: execution. + * * Execute a row-local FORMAT plan. Parsing proceeds sample-major because that * matches the VCF text, then staged rows are encoded op-major to match BCF * FORMAT layout. 
Returns 0 on success, -4 for generic fallback, and -1 on hard @@ -4802,6 +4854,13 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, return -1; } +/* + * Planned FORMAT parser: entry points and fallback contract. + * + * vcf_parse_format_planned() is called before the generic FORMAT pipeline. It + * returns 0 after a successful planned parse, -3 when the caller should run the + * generic parser, and a hard error for allocation or consistency failures. + */ static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const vcf_format_general_plan_t *plan, From bd643182c8fa722abbc0cb89860263a90bb97020 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 7 May 2026 15:12:03 +0200 Subject: [PATCH 37/38] Simplify FORMAT planner integer vectors Collapse the width-specific integer vector row kinds and parsers into a single INTVEC executor path, preserving whole-column fallback behavior for malformed or over-width fields. Add planner docs covering the current control surface, cache and fallback contract, focused tests, and the full-corpus simplification benchmark delta. --- docs/FORMAT_PLAN_CURRENT.md | 139 ++++++++++ docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 88 +++++++ docs/FORMAT_PLAN_OVERVIEW.md | 80 ++++++ test/format-plan-malformed-fields.vcf | 2 + test/test.pl | 4 +- vcf.c | 348 ++++---------------------- 6 files changed, 363 insertions(+), 298 deletions(-) create mode 100644 docs/FORMAT_PLAN_CURRENT.md create mode 100644 docs/FORMAT_PLAN_EXPERIMENT_LOG.md create mode 100644 docs/FORMAT_PLAN_OVERVIEW.md diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md new file mode 100644 index 000000000..d1106a376 --- /dev/null +++ b/docs/FORMAT_PLAN_CURRENT.md @@ -0,0 +1,139 @@ +# FORMAT Planner Current State + +This document describes the current `HTS_VCF_FORMAT_PLAN=1` implementation +after simplifying the integer-vector executor. 
+ +## Entry Point + +`vcf_parse_format()` calls `vcf_parse_format_planned()` before the generic +FORMAT parser. The planned path returns: + +- `0` after a successful planned parse. +- `-3` when the caller should run the generic parser. +- a hard error for allocation or internal consistency failures. + +The environment gate is exact: only `HTS_VCF_FORMAT_PLAN=1` enables the planned +path. `HTS_VCF_FORMAT_PLAN_STATS=1` records attempts, hits, fallback counts, +and parsed sample counts. + +## Plan Cache + +Plans are cached per header in private `bcf_hdr_aux_t` state. Cache keys are +the literal FORMAT string plus the current private header generation. Both +supported and unsupported FORMAT strings are cached, so repeated fallback cases +avoid recompilation. The cache grows from 16 to 128 entries and then evicts +entries in a simple rotating order, preferring unsupported plans when possible. + +`bcf_hdr_sync()` clears the cache and increments the generation because FORMAT +key ids, types, and lengths are header-local. + +## Compilation + +The compiler tokenizes the FORMAT string without collapsing empty tokens, so +malformed strings like `GT::DP` still fall back exactly as the generic parser +expects. It rejects: + +- empty FORMAT tokens; +- unknown or undefined tags; +- duplicate tags; +- unsupported header types; +- unsupported length models; +- non-standard `GT` declarations. + +Supported non-GT types are integer, float, and string tags with enough header +metadata to reproduce the generic BCF layout. The support is still composition +aware: measured-string plus float-vector layouts are kept on the generic parser +unless the row also contains integer-vector work. + +## Row Kinds + +The current row executor uses six row kinds: + +- `VCF_FORMAT_ROW_GT2` +- `VCF_FORMAT_ROW_INT1` +- `VCF_FORMAT_ROW_INTVEC` +- `VCF_FORMAT_ROW_FLOAT1` +- `VCF_FORMAT_ROW_FLOATN` +- `VCF_FORMAT_ROW_STR` + +The earlier width-specific integer row kinds were intentionally removed. 
A +single integer-vector parser now handles fixed and row-local integer widths, +including the over-width comma check needed to preserve fallback behavior. + +## Width Resolution + +Header-fixed fields use the declared width as the initial row width, including +`Number=A`, `Number=R`, and `Number=G` after applying the current record allele +count. Numeric widths must fit the planner cap of 64 values. + +Measured fields perform a first pass over original sample columns. This is +required for `Number=.` numeric rows and string spans, and it is also where the +planner validates per-sample separators. If samples have been selected with +`bcf_hdr_set_samples()`, the planner scans original columns but only measures +and emits retained samples. + +Numeric measured widths are capped at 64. String widths are capped at 256. +Integer and float vector rows may then be compacted to the observed row maximum +when that is what the generic parser would encode. + +## Fallback Contract + +Any row-level parse failure falls back for the whole FORMAT column. The +planned path must not partially keep planned output after detecting malformed +text, unsupported widths, unexpected separators, unsupported GT shape, or +sample-count mismatch. + +Diagnostic fallback reasons are: + +- `unsupported` +- `numeric_width` +- `string_width` +- `gt_shape` +- `parse` +- `separator` +- `sample_count` + +## Tests + +Focused validation commands used for the current state: + +```sh +make test/test_view test/test_format_plan_cache bgzip tabix +./test/test_format_plan_cache +perl test/test.pl -F test_vcf_format_plan +git diff --check +``` + +The `test_vcf_format_plan` fragment compares generic, planned, disabled, and +unknown environment behavior, including selected samples and expected fallback +statistics. It passed 21/21 tests after the simplification. 
+ +The standalone `test/test_format_plan.sh` referenced in older branch notes is +not present in this checkout; the Perl test fragment is the active focused +correctness surface here. + +## Benchmark Snapshot + +The local untracked `bench/format-shape/**/*.vcf.gz` corpus snapshot was run +against the pre-simplification planner at +`0999351c040793ae9763b2f9ba7550b45fd9bd11` and the simplified branch. Both +used: + +```sh +HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 test/test_view -b -l 0 INPUT >/dev/null +``` + +Each input had one warmup per variant followed by three paired measured runs. +All 27 streaming BCF comparisons were byte-identical. + +Aggregate wall-clock delta: + +| Set | n | Faster | Slower | Neutral | Delta | +| --- | ---: | ---: | ---: | ---: | ---: | +| All inputs | 27 | 18 | 3 | 6 | -3.11% | +| Inputs with mean real >= 0.1s | 19 | 17 | 1 | 1 | -3.13% | +| FORMAT-hit inputs with mean real >= 0.1s | 15 | 14 | 0 | 1 | -3.48% | + +The slower cases were tiny no-FORMAT or site-only workloads with millisecond +absolute deltas. Among nontrivial FORMAT-hit workloads, the simplified +executor had no measured slowdown in this run. diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md new file mode 100644 index 000000000..77e9d95d9 --- /dev/null +++ b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md @@ -0,0 +1,88 @@ +# FORMAT Planner Experiment Log + +This branch is an experiment in making VCF FORMAT parsing faster while keeping +the default HTSlib behavior unchanged. The planner remains disabled unless +`HTS_VCF_FORMAT_PLAN=1` is set. + +## Current Result + +The current implementation uses a dynamic per-tag planner with a conservative +whole-column fallback contract. It does not rely on width-specific integer +micro-specializations. The executor now uses one `INTVEC` row kind for integer +vectors instead of separate `INT2`, `INT3`, and selected fixed-width parsers. 
+ +The simplification removed about 283 lines from `vcf.c`: + +```text +vcf.c | 300 ++++-------------------------------------------------------------- +1 file changed, 17 insertions(+), 283 deletions(-) +``` + +## Correctness Evidence + +Commands run after the simplification: + +```sh +make test/test_view test/test_format_plan_cache bgzip tabix +./test/test_format_plan_cache +perl test/test.pl -F test_vcf_format_plan +git diff --check +``` + +The Perl FORMAT-plan fragment passed 21/21 tests. A streaming `cmp` comparison +between the pre-simplification planner and simplified planner also passed for +all 27 local corpus inputs. + +## Local Corpus Simplification Benchmark + +Reference implementation: detached worktree at +`0999351c040793ae9763b2f9ba7550b45fd9bd11`. + +Compared variants: + +- `before`: planner before integer-vector simplification. +- `simplified`: `codex/simplify-format-plan-executor`. + +Both variants used: + +```sh +HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 test/test_view -b -l 0 INPUT >/dev/null +``` + +Each input had one warmup per variant followed by three paired measured runs. +The input corpus was the local untracked `bench/format-shape/**/*.vcf.gz` +snapshot present in this worktree at the time of the run. 
+ +| Set | n | Faster | Slower | Neutral | Before total | Simplified total | Delta | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| All inputs | 27 | 18 | 3 | 6 | 48.5999 | 47.0864 | -3.11% | +| Inputs with mean real >= 0.1s | 19 | 17 | 1 | 1 | 48.5033 | 46.9831 | -3.13% | +| FORMAT-hit inputs with mean real >= 0.1s | 15 | 14 | 0 | 1 | 41.5234 | 40.0798 | -3.48% | + +Representative FORMAT-heavy deltas: + +| Input | Before mean real | Simplified mean real | Delta | +| --- | ---: | ---: | ---: | +| `1000g_chr22_full_genotypes` | 10.9167 | 10.5067 | -3.76% | +| `HG002_CHM13v2.0_v5.0q_smvar` | 1.5633 | 1.3733 | -12.15% | +| `HG002_GRCh38_1_22_v4.2.1_benchmark` | 3.7733 | 3.5767 | -5.21% | +| `HG002_GRCh38_v5.0q_stvar` | 1.8800 | 1.7533 | -6.74% | +| `ccdg_chr22_10k` | 2.5467 | 2.4400 | -4.19% | +| `large_ccdg_likelihood_2048s` | 4.1833 | 4.1000 | -1.99% | +| `large_multiallelic_likelihood_2048s` | 3.0433 | 3.0133 | -0.99% | +| `large_reordered_likelihood_2048s` | 2.7767 | 2.6733 | -3.72% | + +Interpretation: the removed width-specific integer vector specializations did +not show a performance benefit on this corpus. The simpler executor was +slightly faster overall and easier to justify for review. + +## Rejected or Deferred Approaches + +Width-specific integer parsers were removed because their code-size and review +cost were not supported by the benchmark evidence. + +Runtime JIT/code generation remains deferred. It would avoid some source-level +specialization, but it would add platform, compiler, security, packaging, and +debugging complexity that is hard to justify before the non-JIT planner is +accepted and before HTSlib has a more controlled internal API boundary for VCF +record representation. 
diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md new file mode 100644 index 000000000..0cb44663c --- /dev/null +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -0,0 +1,80 @@ +# FORMAT Planner Overview + +The FORMAT planner is an optional fast path for parsing VCF sample columns into +BCF. It lives inside `vcf.c` and is disabled by default. Set +`HTS_VCF_FORMAT_PLAN=1` to try the planned path before the normal generic +FORMAT parser. + +The planner is deliberately conservative: if a FORMAT column is unsupported, +malformed, tied to a dirty header, or otherwise suspicious, it falls back for +the whole column and lets the existing parser own the result. Byte-identical +BCF output versus the generic parser is the hard correctness requirement. + +## Control Surface + +- `HTS_VCF_FORMAT_PLAN=1` enables the planned FORMAT parser. +- Unset, `0`, and any other value use the generic parser only. +- `HTS_VCF_FORMAT_PLAN_STATS=1` prints diagnostic counters at process exit + when the planner is enabled and reached. + +The feature does not add public API. Planner state is stored in private +`bcf_hdr_aux_t` data and is invalidated when `bcf_hdr_sync()` rebuilds header +dictionaries. + +## Architecture + +The planned path has four stages: + +1. Cache lookup or compilation for the literal FORMAT string under the active + header generation. +2. Row-local width resolution for `Number=A/R/G`, bounded `Number=.`, and + string fields that require scanning sample text. +3. Sample-major parsing into transposed FORMAT rows. +4. BCF row encoding using the same public record layout as the generic parser. + +Compiled plans describe tags at FORMAT-field granularity, not as exact whole +string kernels. That lets the same executor handle layouts such as `GT:AD`, +`GT:AD:DP:PL`, reordered fields, and additional supported header-described +tags without adding one hand-written parser per FORMAT string. 
+ +## Supported Shapes + +The planned executor currently covers: + +- `GT` for the current `GT2` subset: two single-character alleles or missing + values separated by `/` or `|`, such as `0/1`, `.|.`, `0|.`, and `.|0`. +- Integer scalar fields. +- Integer vector fields through a single shared `INTVEC` path. +- Float scalar and vector fields. +- Fixed `Number=1` strings. +- Header-derived `Number=A`, `Number=R`, and `Number=G` numeric widths, capped + by the planner's numeric width limit. +- Bounded measured `Number=.` numeric rows and bounded measured strings. +- Selected-sample parsing via `bcf_hdr_set_samples()`. + +The planner falls back on undefined tags, duplicate FORMAT tags, unsupported +header type/number models, dirty headers, unsupported GT shapes, malformed +separators, unsafe row widths, and unprofitable string/float-heavy layouts. +Haploid, polyploid, multi-digit-allele, and very high allele-count GT shapes +therefore stay on the generic parser. Measured-string plus float-vector +compositions also fall back unless the FORMAT row contains integer-vector work +that makes the planned path worthwhile. + +## Current Simplification + +The integer-vector executor was simplified in +`codex/simplify-format-plan-executor`. Width-specific row kinds and parsers +for `INT2`, `INT3`, and selected fixed widths were removed. The executor now +uses: + +- `GT2` +- `INT1` +- `INTVEC` +- `FLOAT1` +- `FLOATN` +- `STR` + +This removed about 283 lines from `vcf.c` and made the code less shaped like a +collection of hand-specialized kernels. Local untracked corpus-snapshot +benchmarking showed the simplified executor was slightly faster overall, with +byte-identical output against the pre-simplification planner. 
diff --git a/test/format-plan-malformed-fields.vcf b/test/format-plan-malformed-fields.vcf index 60f7a2da2..7e464651d 100644 --- a/test/format-plan-malformed-fields.vcf +++ b/test/format-plan-malformed-fields.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.3 ##contig= ##FORMAT= +##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= @@ -9,3 +10,4 @@ 1 1 . A C . PASS . GT:F:DP 0/1::5 1 2 . A C . PASS . GT:GL:DP 0/1::5 1 3 . A C . PASS . ST:DP :5 +1 4 . A C . PASS . GT:AD:DP 0/1:1,2,:5 diff --git a/test/test.pl b/test/test.pl index b50a6364a..90aab1229 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1388,11 +1388,11 @@ sub test_vcf_format_plan test_vcf_format_plan_one($opts, "format-plan-malformed-fields.vcf", "format-plan-malformed-fields", "", - { attempts => 3, hits => 0, fallback => 3, + { attempts => 4, hits => 0, fallback => 4, parsed_samples => 0, unsupported => 0, numeric_width => 0, string_width => 1, gt_shape => 0, - parse => 2, separator => 0, + parse => 3, separator => 0, sample_count => 0 }); test_vcf_format_plan_one($opts, "format-plan-float-vector.vcf", diff --git a/vcf.c b/vcf.c index 13eb6c9ad..09093cb3a 100644 --- a/vcf.c +++ b/vcf.c @@ -3390,12 +3390,9 @@ struct vcf_format_plan_cache_t { }; typedef enum { - VCF_FORMAT_ROW_GT, VCF_FORMAT_ROW_GT2, VCF_FORMAT_ROW_INT1, - VCF_FORMAT_ROW_INT2, - VCF_FORMAT_ROW_INT3, - VCF_FORMAT_ROW_INTN, + VCF_FORMAT_ROW_INTVEC, VCF_FORMAT_ROW_FLOAT1, VCF_FORMAT_ROW_FLOATN, VCF_FORMAT_ROW_STR @@ -3876,236 +3873,35 @@ VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value_range(const char **sp, int32_t *ou return 0; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector_counted_range(const char **sp, - int32_t *out, - int width, - int *nread, - vcf_plan_int_range_t *range) +static int vcf_plan_parse_int_vector_counted_range(const char **sp, + int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) { const char *s = *sp; - int i, nvals; + int i = 0; - for (i = 0; i < width; i++) { + while (i < width) { if (vcf_plan_int_value_range(&s, 
&out[i], range) < 0) return -1; - if (*s != ',') { - i++; + i++; + if (*s != ',') break; - } + /* + * Another comma after width values means the subfield has too many + * entries, including the trailing-comma form "1,2,". + */ + if (i == width) + return -1; s++; } - nvals = i; - *nread = nvals; + *nread = i; if (i < width) range->has_special = 1; for (; i < width; i++) out[i] = bcf_int32_vector_end; - if (*s == ',') - return -1; - *sp = s; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_counted_range(const char **sp, int32_t *out, int *nread, - vcf_plan_int_range_t *range) -{ - const char *s = *sp; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - *sp = s; - range->has_special = 1; - *nread = 1; - return 0; - } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s == ',') - return -1; - *sp = s; - *nread = 2; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_counted_range(const char **sp, int32_t *out, int *nread, - vcf_plan_int_range_t *range) -{ - const char *s = *sp; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { - out[1] = bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - *sp = s; - range->has_special = 1; - *nread = 1; - return 0; - } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { - out[2] = bcf_int32_vector_end; - *sp = s; - range->has_special = 1; - *nread = 2; - return 0; - } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s == ',') - return -1; - *sp = s; - *nread = 3; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector4_counted_range(const char **sp, int32_t *out, int *nread, - vcf_plan_int_range_t *range) -{ - const char *s = *sp; - int i = 4; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { - out[1] = 
bcf_int32_vector_end; - out[2] = bcf_int32_vector_end; - out[3] = bcf_int32_vector_end; - range->has_special = 1; - i = 1; - goto done; - } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { - out[2] = bcf_int32_vector_end; - out[3] = bcf_int32_vector_end; - range->has_special = 1; - i = 2; - goto done; - } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s != ',') { - out[3] = bcf_int32_vector_end; - range->has_special = 1; - i = 3; - goto done; - } - s++; - if (vcf_plan_int_value_range(&s, &out[3], range) < 0) - return -1; - if (*s == ',') - return -1; -done: - *sp = s; - *nread = i; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector6_counted_range(const char **sp, int32_t *out, int *nread, - vcf_plan_int_range_t *range) -{ - const char *s = *sp; - int i = 6, j; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { i = 1; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { i = 2; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s != ',') { i = 3; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[3], range) < 0) - return -1; - if (*s != ',') { i = 4; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[4], range) < 0) - return -1; - if (*s != ',') { i = 5; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[5], range) < 0) - return -1; - if (*s == ',') - return -1; - goto done; -fill: - range->has_special = 1; - for (j = i; j < 6; j++) - out[j] = bcf_int32_vector_end; -done: - *sp = s; - *nread = i; - return 0; -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector10_counted_range(const char **sp, int32_t *out, int *nread, - vcf_plan_int_range_t *range) -{ - const char *s = *sp; - int i = 10, j; - - if (vcf_plan_int_value_range(&s, &out[0], range) < 0) - return -1; - if (*s != ',') { i = 1; goto fill; } 
- s++; - if (vcf_plan_int_value_range(&s, &out[1], range) < 0) - return -1; - if (*s != ',') { i = 2; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[2], range) < 0) - return -1; - if (*s != ',') { i = 3; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[3], range) < 0) - return -1; - if (*s != ',') { i = 4; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[4], range) < 0) - return -1; - if (*s != ',') { i = 5; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[5], range) < 0) - return -1; - if (*s != ',') { i = 6; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[6], range) < 0) - return -1; - if (*s != ',') { i = 7; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[7], range) < 0) - return -1; - if (*s != ',') { i = 8; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[8], range) < 0) - return -1; - if (*s != ',') { i = 9; goto fill; } - s++; - if (vcf_plan_int_value_range(&s, &out[9], range) < 0) - return -1; - if (*s == ',') - return -1; - goto done; -fill: - range->has_special = 1; - for (j = i; j < 10; j++) - out[j] = bcf_int32_vector_end; -done: *sp = s; - *nread = i; return 0; } @@ -4208,30 +4004,6 @@ VCF_PLAN_ALWAYS_INLINE void vcf_plan_fill_missing_int_vector(int32_t *out, *nread = 1; } -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector2_flexible_counted_range(const char **sp, - int32_t *out, - int *nread, - vcf_plan_int_range_t *range) -{ - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - vcf_plan_fill_missing_int_vector(out, 2, nread, range); - return 0; - } - return vcf_plan_parse_int_vector2_counted_range(sp, out, nread, range); -} - -VCF_PLAN_ALWAYS_INLINE int vcf_plan_parse_int_vector3_flexible_counted_range(const char **sp, - int32_t *out, - int *nread, - vcf_plan_int_range_t *range) -{ - if (**sp == ':' || **sp == '\t' || **sp == '\0') { - vcf_plan_fill_missing_int_vector(out, 3, nread, range); - return 0; - } - return vcf_plan_parse_int_vector3_counted_range(sp, out, 
nread, range); -} - static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, int32_t *out, int width, @@ -4242,16 +4014,6 @@ static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, vcf_plan_fill_missing_int_vector(out, width, nread, range); return 0; } - switch (width) { - case 4: - return vcf_plan_parse_int_vector4_counted_range(sp, out, nread, range); - case 6: - return vcf_plan_parse_int_vector6_counted_range(sp, out, nread, range); - case 10: - return vcf_plan_parse_int_vector10_counted_range(sp, out, nread, range); - default: - break; - } return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); } @@ -4287,12 +4049,8 @@ static int vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, } else if (op->htype == BCF_HT_INT) { if (row->width == 1) row->kind = VCF_FORMAT_ROW_INT1; - else if (row->width == 2) - row->kind = VCF_FORMAT_ROW_INT2; - else if (row->width == 3) - row->kind = VCF_FORMAT_ROW_INT3; else - row->kind = VCF_FORMAT_ROW_INTN; + row->kind = VCF_FORMAT_ROW_INTVEC; row->size = row->width * (int)sizeof(int32_t); } else if (op->htype == BCF_HT_REAL) { row->kind = row->width == 1 ? 
VCF_FORMAT_ROW_FLOAT1 : VCF_FORMAT_ROW_FLOATN; @@ -4301,9 +4059,7 @@ static int vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, row->kind = VCF_FORMAT_ROW_STR; row->size = row->width; } - row->can_compact = row->kind == VCF_FORMAT_ROW_INT2 || - row->kind == VCF_FORMAT_ROW_INT3 || - row->kind == VCF_FORMAT_ROW_INTN || + row->can_compact = row->kind == VCF_FORMAT_ROW_INTVEC || row->kind == VCF_FORMAT_ROW_FLOATN; } return 0; @@ -4358,30 +4114,33 @@ static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring uint8_t *buf = (uint8_t*)mem->s + op->offset; bcf_enc_int1(dst, op->key); - if (op->kind == VCF_FORMAT_ROW_GT2) { + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: if (vcf_enc_gt2_u8(dst, nsamples, buf) < 0) return -1; - } else if (op->kind == VCF_FORMAT_ROW_STR) { + break; + case VCF_FORMAT_ROW_STR: if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) return -1; if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) return -1; - } else if (op->kind == VCF_FORMAT_ROW_FLOAT1 || op->kind == VCF_FORMAT_ROW_FLOATN) { + break; + case VCF_FORMAT_ROW_FLOAT1: + case VCF_FORMAT_ROW_FLOATN: if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) return -1; if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) return -1; - } else if (op->kind == VCF_FORMAT_ROW_INT1 || - op->kind == VCF_FORMAT_ROW_INT2 || - op->kind == VCF_FORMAT_ROW_INT3 || - op->kind == VCF_FORMAT_ROW_INTN) { + break; + case VCF_FORMAT_ROW_INT1: + case VCF_FORMAT_ROW_INTVEC: if (bcf_enc_vint_known_range_special(dst, nsamples * op->width, (int32_t *)buf, op->width, ranges[j].min, ranges[j].max, ranges[j].has_special) < 0) return -1; - } else { - if (bcf_enc_vint(dst, nsamples * op->width, (int32_t *)buf, op->width) < 0) - return -1; + break; + default: + return -1; } } return 0; @@ -4408,23 +4167,31 @@ static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int return j; } -static void 
vcf_format_compact_row_op(kstring_t *mem, int nsamples, - vcf_format_row_op_t *op, int width) +static int vcf_format_compact_row_op(kstring_t *mem, int nsamples, + vcf_format_row_op_t *op, int width) { - size_t elem_size = op->kind == VCF_FORMAT_ROW_FLOATN ? sizeof(float) : sizeof(int32_t); - size_t old_stride = (size_t) op->width * elem_size; - size_t new_stride = (size_t) width * elem_size; + size_t elem_size; + size_t old_stride, new_stride; char *base = mem->s + op->offset; int sample; + switch (op->kind) { + case VCF_FORMAT_ROW_INTVEC: + elem_size = sizeof(int32_t); + break; + case VCF_FORMAT_ROW_FLOATN: + elem_size = sizeof(float); + break; + default: + return -1; + } + old_stride = (size_t) op->width * elem_size; + new_stride = (size_t) width * elem_size; for (sample = 1; sample < nsamples; sample++) memmove(base + sample * new_stride, base + sample * old_stride, new_stride); op->width = width; op->size = (int)new_stride; - if (op->kind == VCF_FORMAT_ROW_INT2 || op->kind == VCF_FORMAT_ROW_INT3) - op->kind = width == 1 ? VCF_FORMAT_ROW_INT1 : - width == 2 ? VCF_FORMAT_ROW_INT2 : - width == 3 ? 
VCF_FORMAT_ROW_INT3 : VCF_FORMAT_ROW_INTN; + return 0; } /* @@ -4727,19 +4494,7 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, goto fallback; } break; - case VCF_FORMAT_ROW_INT2: - if (vcf_plan_parse_int_vector2_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) { - vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); - goto fallback; - } - break; - case VCF_FORMAT_ROW_INT3: - if (vcf_plan_parse_int_vector3_flexible_counted_range(&cur, (int32_t *)buf, &n, &ranges[j]) < 0) { - vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); - goto fallback; - } - break; - case VCF_FORMAT_ROW_INTN: + case VCF_FORMAT_ROW_INTVEC: if (vcf_plan_parse_int_vector_flexible_counted_range(&cur, (int32_t *)buf, op->width, &n, &ranges[j]) < 0) { vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); @@ -4824,7 +4579,8 @@ static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, * header-derived width. Compacting here avoids unnecessary * whole-row fallback while keeping byte-identical BCF output. 
*/ - vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]); + if (vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]) < 0) + goto error; } } From 7813ee3ad8aea223da168a0a7af23cf9d86b80db Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 8 May 2026 11:41:19 +0200 Subject: [PATCH 38/38] Consolidate FORMAT planner review notes --- docs/FORMAT_PLAN_CURRENT.md | 139 ------------------- docs/FORMAT_PLAN_EXPERIMENT_LOG.md | 88 ------------ docs/FORMAT_PLAN_OVERVIEW.md | 211 +++++++++++++++++++---------- 3 files changed, 139 insertions(+), 299 deletions(-) delete mode 100644 docs/FORMAT_PLAN_CURRENT.md delete mode 100644 docs/FORMAT_PLAN_EXPERIMENT_LOG.md diff --git a/docs/FORMAT_PLAN_CURRENT.md b/docs/FORMAT_PLAN_CURRENT.md deleted file mode 100644 index d1106a376..000000000 --- a/docs/FORMAT_PLAN_CURRENT.md +++ /dev/null @@ -1,139 +0,0 @@ -# FORMAT Planner Current State - -This document describes the current `HTS_VCF_FORMAT_PLAN=1` implementation -after simplifying the integer-vector executor. - -## Entry Point - -`vcf_parse_format()` calls `vcf_parse_format_planned()` before the generic -FORMAT parser. The planned path returns: - -- `0` after a successful planned parse. -- `-3` when the caller should run the generic parser. -- a hard error for allocation or internal consistency failures. - -The environment gate is exact: only `HTS_VCF_FORMAT_PLAN=1` enables the planned -path. `HTS_VCF_FORMAT_PLAN_STATS=1` records attempts, hits, fallback counts, -and parsed sample counts. - -## Plan Cache - -Plans are cached per header in private `bcf_hdr_aux_t` state. Cache keys are -the literal FORMAT string plus the current private header generation. Both -supported and unsupported FORMAT strings are cached, so repeated fallback cases -avoid recompilation. The cache grows from 16 to 128 entries and then evicts -entries in a simple rotating order, preferring unsupported plans when possible. 
- -`bcf_hdr_sync()` clears the cache and increments the generation because FORMAT -key ids, types, and lengths are header-local. - -## Compilation - -The compiler tokenizes the FORMAT string without collapsing empty tokens, so -malformed strings like `GT::DP` still fall back exactly as the generic parser -expects. It rejects: - -- empty FORMAT tokens; -- unknown or undefined tags; -- duplicate tags; -- unsupported header types; -- unsupported length models; -- non-standard `GT` declarations. - -Supported non-GT types are integer, float, and string tags with enough header -metadata to reproduce the generic BCF layout. The support is still composition -aware: measured-string plus float-vector layouts are kept on the generic parser -unless the row also contains integer-vector work. - -## Row Kinds - -The current row executor uses six row kinds: - -- `VCF_FORMAT_ROW_GT2` -- `VCF_FORMAT_ROW_INT1` -- `VCF_FORMAT_ROW_INTVEC` -- `VCF_FORMAT_ROW_FLOAT1` -- `VCF_FORMAT_ROW_FLOATN` -- `VCF_FORMAT_ROW_STR` - -The earlier width-specific integer row kinds were intentionally removed. A -single integer-vector parser now handles fixed and row-local integer widths, -including the over-width comma check needed to preserve fallback behavior. - -## Width Resolution - -Header-fixed fields use the declared width as the initial row width, including -`Number=A`, `Number=R`, and `Number=G` after applying the current record allele -count. Numeric widths must fit the planner cap of 64 values. - -Measured fields perform a first pass over original sample columns. This is -required for `Number=.` numeric rows and string spans, and it is also where the -planner validates per-sample separators. If samples have been selected with -`bcf_hdr_set_samples()`, the planner scans original columns but only measures -and emits retained samples. - -Numeric measured widths are capped at 64. String widths are capped at 256. 
-Integer and float vector rows may then be compacted to the observed row maximum -when that is what the generic parser would encode. - -## Fallback Contract - -Any row-level parse failure falls back for the whole FORMAT column. The -planned path must not partially keep planned output after detecting malformed -text, unsupported widths, unexpected separators, unsupported GT shape, or -sample-count mismatch. - -Diagnostic fallback reasons are: - -- `unsupported` -- `numeric_width` -- `string_width` -- `gt_shape` -- `parse` -- `separator` -- `sample_count` - -## Tests - -Focused validation commands used for the current state: - -```sh -make test/test_view test/test_format_plan_cache bgzip tabix -./test/test_format_plan_cache -perl test/test.pl -F test_vcf_format_plan -git diff --check -``` - -The `test_vcf_format_plan` fragment compares generic, planned, disabled, and -unknown environment behavior, including selected samples and expected fallback -statistics. It passed 21/21 tests after the simplification. - -The standalone `test/test_format_plan.sh` referenced in older branch notes is -not present in this checkout; the Perl test fragment is the active focused -correctness surface here. - -## Benchmark Snapshot - -The local untracked `bench/format-shape/**/*.vcf.gz` corpus snapshot was run -against the pre-simplification planner at -`0999351c040793ae9763b2f9ba7550b45fd9bd11` and the simplified branch. Both -used: - -```sh -HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 test/test_view -b -l 0 INPUT >/dev/null -``` - -Each input had one warmup per variant followed by three paired measured runs. -All 27 streaming BCF comparisons were byte-identical. 
- -Aggregate wall-clock delta: - -| Set | n | Faster | Slower | Neutral | Delta | -| --- | ---: | ---: | ---: | ---: | ---: | -| All inputs | 27 | 18 | 3 | 6 | -3.11% | -| Inputs with mean real >= 0.1s | 19 | 17 | 1 | 1 | -3.13% | -| FORMAT-hit inputs with mean real >= 0.1s | 15 | 14 | 0 | 1 | -3.48% | - -The slower cases were tiny no-FORMAT or site-only workloads with millisecond -absolute deltas. Among nontrivial FORMAT-hit workloads, the simplified -executor had no measured slowdown in this run. diff --git a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md b/docs/FORMAT_PLAN_EXPERIMENT_LOG.md deleted file mode 100644 index 77e9d95d9..000000000 --- a/docs/FORMAT_PLAN_EXPERIMENT_LOG.md +++ /dev/null @@ -1,88 +0,0 @@ -# FORMAT Planner Experiment Log - -This branch is an experiment in making VCF FORMAT parsing faster while keeping -the default HTSlib behavior unchanged. The planner remains disabled unless -`HTS_VCF_FORMAT_PLAN=1` is set. - -## Current Result - -The current implementation uses a dynamic per-tag planner with a conservative -whole-column fallback contract. It does not rely on width-specific integer -micro-specializations. The executor now uses one `INTVEC` row kind for integer -vectors instead of separate `INT2`, `INT3`, and selected fixed-width parsers. - -The simplification removed about 283 lines from `vcf.c`: - -```text -vcf.c | 300 ++++-------------------------------------------------------------- -1 file changed, 17 insertions(+), 283 deletions(-) -``` - -## Correctness Evidence - -Commands run after the simplification: - -```sh -make test/test_view test/test_format_plan_cache bgzip tabix -./test/test_format_plan_cache -perl test/test.pl -F test_vcf_format_plan -git diff --check -``` - -The Perl FORMAT-plan fragment passed 21/21 tests. A streaming `cmp` comparison -between the pre-simplification planner and simplified planner also passed for -all 27 local corpus inputs. 
- -## Local Corpus Simplification Benchmark - -Reference implementation: detached worktree at -`0999351c040793ae9763b2f9ba7550b45fd9bd11`. - -Compared variants: - -- `before`: planner before integer-vector simplification. -- `simplified`: `codex/simplify-format-plan-executor`. - -Both variants used: - -```sh -HTS_VCF_FORMAT_PLAN=1 HTS_VCF_FORMAT_PLAN_STATS=1 test/test_view -b -l 0 INPUT >/dev/null -``` - -Each input had one warmup per variant followed by three paired measured runs. -The input corpus was the local untracked `bench/format-shape/**/*.vcf.gz` -snapshot present in this worktree at the time of the run. - -| Set | n | Faster | Slower | Neutral | Before total | Simplified total | Delta | -| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| All inputs | 27 | 18 | 3 | 6 | 48.5999 | 47.0864 | -3.11% | -| Inputs with mean real >= 0.1s | 19 | 17 | 1 | 1 | 48.5033 | 46.9831 | -3.13% | -| FORMAT-hit inputs with mean real >= 0.1s | 15 | 14 | 0 | 1 | 41.5234 | 40.0798 | -3.48% | - -Representative FORMAT-heavy deltas: - -| Input | Before mean real | Simplified mean real | Delta | -| --- | ---: | ---: | ---: | -| `1000g_chr22_full_genotypes` | 10.9167 | 10.5067 | -3.76% | -| `HG002_CHM13v2.0_v5.0q_smvar` | 1.5633 | 1.3733 | -12.15% | -| `HG002_GRCh38_1_22_v4.2.1_benchmark` | 3.7733 | 3.5767 | -5.21% | -| `HG002_GRCh38_v5.0q_stvar` | 1.8800 | 1.7533 | -6.74% | -| `ccdg_chr22_10k` | 2.5467 | 2.4400 | -4.19% | -| `large_ccdg_likelihood_2048s` | 4.1833 | 4.1000 | -1.99% | -| `large_multiallelic_likelihood_2048s` | 3.0433 | 3.0133 | -0.99% | -| `large_reordered_likelihood_2048s` | 2.7767 | 2.6733 | -3.72% | - -Interpretation: the removed width-specific integer vector specializations did -not show a performance benefit on this corpus. The simpler executor was -slightly faster overall and easier to justify for review. 
- -## Rejected or Deferred Approaches - -Width-specific integer parsers were removed because their code-size and review -cost were not supported by the benchmark evidence. - -Runtime JIT/code generation remains deferred. It would avoid some source-level -specialization, but it would add platform, compiler, security, packaging, and -debugging complexity that is hard to justify before the non-JIT planner is -accepted and before HTSlib has a more controlled internal API boundary for VCF -record representation. diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md index 0cb44663c..5881077d0 100644 --- a/docs/FORMAT_PLAN_OVERVIEW.md +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -1,71 +1,67 @@ -# FORMAT Planner Overview - -The FORMAT planner is an optional fast path for parsing VCF sample columns into -BCF. It lives inside `vcf.c` and is disabled by default. Set -`HTS_VCF_FORMAT_PLAN=1` to try the planned path before the normal generic -FORMAT parser. - -The planner is deliberately conservative: if a FORMAT column is unsupported, -malformed, tied to a dirty header, or otherwise suspicious, it falls back for -the whole column and lets the existing parser own the result. Byte-identical -BCF output versus the generic parser is the hard correctness requirement. - -## Control Surface - -- `HTS_VCF_FORMAT_PLAN=1` enables the planned FORMAT parser. -- Unset, `0`, and any other value use the generic parser only. -- `HTS_VCF_FORMAT_PLAN_STATS=1` prints diagnostic counters at process exit - when the planner is enabled and reached. - -The feature does not add public API. Planner state is stored in private -`bcf_hdr_aux_t` data and is invalidated when `bcf_hdr_sync()` rebuilds header -dictionaries. - -## Architecture - -The planned path has four stages: - -1. Cache lookup or compilation for the literal FORMAT string under the active - header generation. -2. 
Row-local width resolution for `Number=A/R/G`, bounded `Number=.`, and - string fields that require scanning sample text. -3. Sample-major parsing into transposed FORMAT rows. -4. BCF row encoding using the same public record layout as the generic parser. - -Compiled plans describe tags at FORMAT-field granularity, not as exact whole -string kernels. That lets the same executor handle layouts such as `GT:AD`, -`GT:AD:DP:PL`, reordered fields, and additional supported header-described -tags without adding one hand-written parser per FORMAT string. - -## Supported Shapes - -The planned executor currently covers: - -- `GT` for the current `GT2` subset: two single-character alleles or missing - values separated by `/` or `|`, such as `0/1`, `.|.`, `0|.`, and `.|0`. -- Integer scalar fields. -- Integer vector fields through a single shared `INTVEC` path. -- Float scalar and vector fields. -- Fixed `Number=1` strings. -- Header-derived `Number=A`, `Number=R`, and `Number=G` numeric widths, capped - by the planner's numeric width limit. -- Bounded measured `Number=.` numeric rows and bounded measured strings. -- Selected-sample parsing via `bcf_hdr_set_samples()`. - -The planner falls back on undefined tags, duplicate FORMAT tags, unsupported -header type/number models, dirty headers, unsupported GT shapes, malformed -separators, unsafe row widths, and unprofitable string/float-heavy layouts. -Haploid, polyploid, multi-digit-allele, and very high allele-count GT shapes -therefore stay on the generic parser. Measured-string plus float-vector -compositions also fall back unless the FORMAT row contains integer-vector work -that makes the planned path worthwhile. - -## Current Simplification - -The integer-vector executor was simplified in -`codex/simplify-format-plan-executor`. Width-specific row kinds and parsers -for `INT2`, `INT3`, and selected fixed widths were removed. 
The executor now -uses: +# VCF FORMAT Planner Review Notes + +This is an interim review note for the opt-in VCF FORMAT planner in `vcf.c`. +It is intended to make the implementation easier to review in this branch; it +is not proposed as permanent user-facing documentation. + +## Purpose + +The planner is an optional fast path for parsing VCF sample `FORMAT` columns +into BCF. It is disabled by default and only runs when: + +```sh +HTS_VCF_FORMAT_PLAN=1 +``` + +Unset, `0`, or unknown values use the existing generic parser. The hard +correctness rule is byte-identical BCF output compared with the generic parser. +Any unsupported or suspicious FORMAT column falls back for the whole column. + +`HTS_VCF_FORMAT_PLAN_STATS=1` enables temporary diagnostic counters for review +and benchmarking. These environment variables are branch controls, not stable +public API. + +## Entry Point + +`vcf_parse_format()` tries `vcf_parse_format_planned()` before the generic +FORMAT parser when the environment gate is enabled. The planned parser returns +success only after it has fully emitted the FORMAT column. It returns the +fallback code before the caller invokes the generic parser when compilation, +width resolution, parsing, or row validation does not meet the supported +contract. + +The planned path never commits partial FORMAT output after detecting a row-local +failure. Rollback and fallback are part of the normal control flow. + +## Plan Cache + +Plans are stored in private `bcf_hdr_aux_t` state. Cache keys are the literal +FORMAT string plus the active private header generation. The cache stores both +supported and unsupported plans so repeated unsupported schemas avoid repeated +tokenization and metadata lookup. + +`bcf_hdr_sync()` clears cached plans and advances the generation because FORMAT +ids, types, and lengths are header-local. The planner refuses dirty headers. + +## Compilation + +Compilation works from header metadata rather than exact whole-FORMAT string +kernels. 
For each FORMAT token, the compiler records: + +- the header id; +- declared type; +- declared number model; +- whether the row needs record-local width resolution or sample-text scanning; +- the executor row kind. + +The compiler rejects empty tokens, undefined tags, duplicate tags, unsupported +types or number models, and non-standard `GT` declarations. Tokenization does +not collapse empty fields, so malformed schemas such as `GT::DP` still fall +back in a way that preserves generic-parser behavior. + +## Supported Rows + +The current executor has six row kinds: - `GT2` - `INT1` @@ -74,7 +70,78 @@ uses: - `FLOATN` - `STR` -This removed about 283 lines from `vcf.c` and made the code less shaped like a -collection of hand-specialized kernels. Local untracked corpus-snapshot -benchmarking showed the simplified executor was slightly faster overall, with -byte-identical output against the pre-simplification planner. +The earlier width-specific integer-vector row kinds were removed. A single +`INTVEC` path handles fixed and row-local integer widths, including the +over-width comma check needed to preserve fallback behavior. + +Supported shapes include: + +- simple diploid `GT` values with one-character alleles or missing values, + separated by `/` or `|`, including phased-missing forms such as `.|.`, + `0|.`, and `.|0`; +- integer and float scalar fields; +- integer and float vector fields within the planner width cap; +- numeric `Number=A`, `Number=R`, and `Number=G` widths resolved from the + current record allele count; +- bounded measured `Number=.` numeric rows; +- bounded `Type=String,Number=1` rows; +- selected-sample parsing via `bcf_hdr_set_samples()`. + +Unsupported or intentionally generic cases include undefined tags, duplicate +FORMAT tags, dirty headers, unsupported type or number declarations, unsupported +GT encodings, malformed separators, unsafe row widths, and string/float-heavy +layouts that do not benefit from planning. 
+ +## Width Resolution + +Header-fixed rows use the declared width directly, after resolving +allele-dependent widths for the current record. Numeric widths must fit the +planner cap of 64 values. + +Measured numeric and string rows perform a first pass over original sample +columns. This is required to match the generic parser's width and padding +rules. If samples are selected, the planner still scans original sample +columns but measures and emits retained samples densely. + +Strings are capped at 256 bytes in the planned path. Wider string rows fall +back for the whole FORMAT column. + +## Fallback Contract + +Fallback is expected and intentional. It happens before generic parsing when +the compiler or row executor sees unsupported structure, unsupported widths, +unexpected separators, unsupported GT shape, parse failures, or sample-count +mismatch. Allocation and internal consistency failures are hard errors instead. + +Diagnostic fallback reasons are: + +- `unsupported` +- `numeric_width` +- `string_width` +- `gt_shape` +- `parse` +- `separator` +- `sample_count` + +## Test And Benchmark Evidence + +Focused correctness checks used for this review branch: + +```sh +make test/test_view test/test_format_plan_cache bgzip tabix +./test/test_format_plan_cache +perl test/test.pl -F test_vcf_format_plan +test/maintainer/check_spaces.pl vcf.c docs/FORMAT_PLAN_OVERVIEW.md \ + test/format-plan-malformed-fields.vcf test/test.pl +git diff --check +``` + +The focused FORMAT-plan test fragment covers disabled/unknown environment +behavior, selected samples, malformed FORMAT tokens, malformed numeric fields, +phased missing GT values, cache invalidation after header metadata changes, and +fallback after partial planned parsing. + +The current public-fork PR body contains the maintainer-facing performance +summary. 
The compact benchmark artifacts live on the corpus branch +`feature/vcf-parsing-speedup-corpus`, including the current `test_view` and +bcftools summaries for commit `bd643182c8fa722abbc0cb89860263a90bb97020`.