diff --git a/.gitignore b/.gitignore index 6aee57ace..d574fe03d 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ shlib-exports-*.txt /test/test_bgzf /test/test_expr /test/test_faidx +/test/test_format_plan_cache /test/test_index /test/test_introspection /test/test_kfunc diff --git a/Makefile b/Makefile index 6eab6ca6e..163208b78 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,7 @@ BUILT_TEST_PROGRAMS = \ test/test_str2int \ test/test_time_funcs \ test/test_view \ + test/test_format_plan_cache \ test/test_index \ test/test-vcf-api \ test/test-vcf-sweep \ @@ -690,6 +691,7 @@ check test: all $(HTSCODECS_TEST_TARGETS) test/test_str2int test/test_time_funcs test/fieldarith test/fieldarith.sam + test/test_format_plan_cache test/hfile if test "x$(BUILT_PLUGINS)" != "x"; then \ HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR); \ @@ -790,6 +792,9 @@ test/test_time_funcs: test/test_time_funcs.o test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread +test/test_format_plan_cache: test/test_format_plan_cache.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_format_plan_cache.o libhts.a $(LIBS) -lpthread + test/test_index: test/test_index.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_index.o libhts.a $(LIBS) -lpthread @@ -885,6 +890,7 @@ test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_reg test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) +test/test_format_plan_cache.o: test/test_format_plan_cache.c config.h $(htslib_kstring_h) $(htslib_vcf_h) test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_kstring_h) $(htslib_kseq_h) diff --git a/docs/FORMAT_PLAN_OVERVIEW.md b/docs/FORMAT_PLAN_OVERVIEW.md new file mode 100644 index 000000000..5881077d0 --- /dev/null +++ b/docs/FORMAT_PLAN_OVERVIEW.md @@ -0,0 +1,147 @@ +# VCF FORMAT Planner Review Notes + +This is an interim review note for the opt-in VCF FORMAT planner in `vcf.c`. +It is intended to make the implementation easier to review in this branch; it +is not proposed as permanent user-facing documentation. + +## Purpose + +The planner is an optional fast path for parsing VCF sample `FORMAT` columns +into BCF. It is disabled by default and only runs when: + +```sh +HTS_VCF_FORMAT_PLAN=1 +``` + +Unset, `0`, or unknown values use the existing generic parser. The hard +correctness rule is byte-identical BCF output compared with the generic parser. +Any unsupported or suspicious FORMAT column falls back for the whole column. + +`HTS_VCF_FORMAT_PLAN_STATS=1` enables temporary diagnostic counters for review +and benchmarking. These environment variables are branch controls, not stable +public API. + +## Entry Point + +`vcf_parse_format()` tries `vcf_parse_format_planned()` before the generic +FORMAT parser when the environment gate is enabled. The planned parser returns +success only after it has fully emitted the FORMAT column. It returns the +fallback code before the caller invokes the generic parser when compilation, +width resolution, parsing, or row validation does not meet the supported +contract. + +The planned path never commits partial FORMAT output after detecting a row-local +failure. Rollback and fallback are part of the normal control flow. + +## Plan Cache + +Plans are stored in private `bcf_hdr_aux_t` state. Cache keys are the literal +FORMAT string plus the active private header generation. The cache stores both +supported and unsupported plans so repeated unsupported schemas avoid repeated +tokenization and metadata lookup. + +`bcf_hdr_sync()` clears cached plans and advances the generation because FORMAT +ids, types, and lengths are header-local. The planner refuses dirty headers. + +## Compilation + +Compilation works from header metadata rather than exact whole-FORMAT string +kernels. For each FORMAT token, the compiler records: + +- the header id; +- declared type; +- declared number model; +- whether the row needs record-local width resolution or sample-text scanning; +- the executor row kind. + +The compiler rejects empty tokens, undefined tags, duplicate tags, unsupported +types or number models, and non-standard `GT` declarations. Tokenization does +not collapse empty fields, so malformed schemas such as `GT::DP` still fall +back in a way that preserves generic-parser behavior. + +## Supported Rows + +The current executor has six row kinds: + +- `GT2` +- `INT1` +- `INTVEC` +- `FLOAT1` +- `FLOATN` +- `STR` + +The earlier width-specific integer-vector row kinds were removed. A single +`INTVEC` path handles fixed and row-local integer widths, including the +over-width comma check needed to preserve fallback behavior. + +Supported shapes include: + +- simple diploid `GT` values with one-character alleles or missing values, + separated by `/` or `|`, including phased-missing forms such as `.|.`, + `0|.`, and `.|0`; +- integer and float scalar fields; +- integer and float vector fields within the planner width cap; +- numeric `Number=A`, `Number=R`, and `Number=G` widths resolved from the + current record allele count; +- bounded measured `Number=.` numeric rows; +- bounded `Type=String,Number=1` rows; +- selected-sample parsing via `bcf_hdr_set_samples()`. + +Unsupported or intentionally generic cases include undefined tags, duplicate +FORMAT tags, dirty headers, unsupported type or number declarations, unsupported +GT encodings, malformed separators, unsafe row widths, and string/float-heavy +layouts that do not benefit from planning. + +## Width Resolution + +Header-fixed rows use the declared width directly, after resolving +allele-dependent widths for the current record. Numeric widths must fit the +planner cap of 64 values. + +Measured numeric and string rows perform a first pass over original sample +columns. This is required to match the generic parser's width and padding +rules. If samples are selected, the planner still scans original sample +columns but measures and emits retained samples densely. + +Strings are capped at 256 bytes in the planned path. Wider string rows fall +back for the whole FORMAT column. + +## Fallback Contract + +Fallback is expected and intentional. It happens before generic parsing when +the compiler or row executor sees unsupported structure, unsupported widths, +unexpected separators, unsupported GT shape, parse failures, sample-count +mismatch, or allocation/internal consistency errors. + +Diagnostic fallback reasons are: + +- `unsupported` +- `numeric_width` +- `string_width` +- `gt_shape` +- `parse` +- `separator` +- `sample_count` + +## Test And Benchmark Evidence + +Focused correctness checks used for this review branch: + +```sh +make test/test_view test/test_format_plan_cache bgzip tabix +./test/test_format_plan_cache +perl test/test.pl -F test_vcf_format_plan +test/maintainer/check_spaces.pl vcf.c docs/FORMAT_PLAN_OVERVIEW.md \ + test/format-plan-malformed-fields.vcf test/test.pl +git diff --check +``` + +The focused FORMAT-plan test fragment covers disabled/unknown environment +behavior, selected samples, malformed FORMAT tokens, malformed numeric fields, +phased missing GT values, cache invalidation after header metadata changes, and +fallback after partial planned parsing. + +The current public-fork PR body contains the maintainer-facing performance +summary. The compact benchmark artifacts live on the corpus branch +`feature/vcf-parsing-speedup-corpus`, including the current `test_view` and +bcftools summaries for commit `bd643182c8fa722abbc0cb89860263a90bb97020`. diff --git a/test/format-plan-cache.vcf b/test/format-plan-cache.vcf new file mode 100644 index 000000000..0e6bde16b --- /dev/null +++ b/test/format-plan-cache.vcf @@ -0,0 +1,61 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 +1 1 . A C . PASS . GT:F01 0/1:1 1/1:2 +1 2 . A C . PASS . GT:F02 0/1:2 1/1:3 +1 3 . A C . PASS . GT:F03 0/1:3 1/1:4 +1 4 . A C . PASS . GT:F04 0/1:4 1/1:5 +1 5 . A C . PASS . GT:F05 0/1:5 1/1:6 +1 6 . A C . PASS . GT:F06 0/1:6 1/1:7 +1 7 . A C . PASS . GT:F07 0/1:7 1/1:8 +1 8 . A C . PASS . GT:F08 0/1:8 1/1:9 +1 9 . A C . PASS . GT:F09 0/1:9 1/1:10 +1 10 . A C . PASS . GT:F10 0/1:10 1/1:11 +1 11 . A C . PASS . GT:F11 0/1:11 1/1:12 +1 12 . A C . PASS . GT:F12 0/1:12 1/1:13 +1 13 . A C . PASS . GT:F13 0/1:13 1/1:14 +1 14 . A C . PASS . GT:F14 0/1:14 1/1:15 +1 15 . A C . PASS . GT:F15 0/1:15 1/1:16 +1 16 . A C . PASS . GT:F16 0/1:16 1/1:17 +1 17 . A C . PASS . GT:F17 0/1:17 1/1:18 +1 18 . A C . PASS . GT:F18 0/1:18 1/1:19 +1 19 . A C . PASS . GT:F19 0/1:19 1/1:20 +1 20 . A C . PASS . GT:F20 0/1:20 1/1:21 +1 21 . A C . PASS . GT:LONGFORMATFIELD01:LONGFORMATFIELD02:LONGFORMATFIELD03:LONGFORMATFIELD04:LONGFORMATFIELD05:LONGFORMATFIELD06:LONGFORMATFIELD07:LONGFORMATFIELD08:LONGFORMATFIELD09:LONGFORMATFIELD10:LONGFORMATFIELD11:LONGFORMATFIELD12:LONGFORMATFIELD13:LONGFORMATFIELD14:LONGFORMATFIELD15:LONGFORMATFIELD16 0/1:1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16 1/1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17 diff --git a/test/format-plan-composable.vcf b/test/format-plan-composable.vcf new file mode 100644 index 000000000..942502853 --- /dev/null +++ b/test/format-plan-composable.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10610000 . A T 50 PASS . GT:AD 0/1:4,5 0/0:9,0 ./.:0,0 +chr22 10610010 . A T 50 PASS . GT:AD:DP:XX:PL 0/1:4,5:9:7,8:90,0,120 0/0:9,0:9:1,2:0,30,200 ./.:0,0:0:.,.:. +chr22 10610020 . A C,G 50 PASS . DP:XS:GT:XX:AD:GQ:PL 12:alpha:1/2:3,4:1,5,6:60:100,90,80,70,0,20 8:beta:0/2:5,6:4,0,4:35:80,70,60,50,40,0 0:.:./.:.,.:0,0,0:.:. +chr22 10610030 . G C 50 PASS . GT:AD:DP:GQ:PL 0/1:3,4:7:50:70,0 0/0:6,0:6:35:0,50 ./.:0,0:0:.:. +chr22 10610040 . G C 50 PASS . GT:AD:DP:VX:PL 0/1:3,4:7:1,2,3:70,0,90 0/0:6,0:6:5:0,50,120 ./.:0,0:0:.:. diff --git a/test/format-plan-edge.vcf b/test/format-plan-edge.vcf new file mode 100644 index 000000000..085434543 --- /dev/null +++ b/test/format-plan-edge.vcf @@ -0,0 +1,38 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10510061 . A T 64.12 PASS . GT:AB:AD:DP:GQ:PL 0/0:.:3,0:3:9:0,9,104 0/1:0.5:5,4:9:99:99,0,123 ./.:.:0,0:0:.:. +chr22 10510352 . AT A 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 1/1:0,5:5:15:1|1:10510352_AT_A:225,15,0 0/1:3,2:5:20:0|1:10510352_AT_A:20,0,200 ./.:0,0:0:.:.:.:. +chr22 10520000 . A C,G 50 PASS . GT:AD:DP:GQ:PL 1/2:1,4,5:10:60:100,80,70,60,0,20 0/2:3,0,2:5:30:80,70,60,50,40,0 ./.:0,0,0:0:.:. +chr22 10530000 . G A 50 PASS . GT:DP:AD:GQ:PL 0/1:7:3,4:42:99,0,120 0/0:5:5,0:15:0,15,200 ./.:0:0,0:.:. +chr22 10540000 . C T 50 PASS . GT:HQ:DP:GQ 0/1:10,20:7:40 0/0:.,.:5:50 ./.:.:0:. +chr22 10550000 . C T 50 PASS . GT:FT:DP:GQ 0/1:PASS:7:40 0/0:LowQual:5:50 ./.:.:0:. +chr22 10560000 . A C,G 50 PASS . GT:GL:DP:GQ 0/1:-0.1,-1.2,-9.9,-2.0,-3.0,-4.0:7:40 1/2:-9.9,-8.8,-7.7,-6.6,-5.5,-4.4:5:50 ./.:.:0:. +chr22 10570000 . A T 50 PASS . GT:AD:DP:GQ:PL 0:3,0:3:10:0,10,100 1:0,3:3:20:100,10,0 .:0,0:0:.:. +chr22 10580000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . GT:AD:DP:GQ:PL 10/10:0,0,0,0,0,0,0,0,0,0,7:7:20:200,190,180,170,160,150,140,130,120,110,100,90,80,70,60,50,40,30,20,10,0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460 0/10:3,0,0,0,0,0,0,0,0,0,2:5:30:0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,590,600,610,620,630,640,650 ./.:0,0,0,0,0,0,0,0,0,0,0:0:.:. +chr22 10585000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/0:.:3:10:. 0/1:.:5:20:. ./.:.:0:.:. +chr22 10586000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/1:3,4:7:50:90,0,120 0/1:3:3:20:80,0 ./.:0,0:0:.:. +chr22 10587000 . A C,G 50 PASS . GT:AD:DP:GQ:PL 1/2:1,4,5:10:60:100,80,70,60,0,20 0/2:.:5:30:. ./.:0,0,0:0:.:. +chr22 10590000 . A T 50 PASS . DP:GQ:GT:AD:PL 11:50:0/1:6,5:80,0,90 8:45:0/0:8,0:0,45,100 0:.:./.:0,0:. +chr22 10591000 . A T 50 PASS . AD:PL:GT:DP:GQ 4,3:70,0,80:0/1:7:60 9,0:0,70,120:0/0:9:50 0,0:.:./.:0:. +chr22 10592000 . A T 50 PASS . GT:DP:AB:GQ:AD:PL 0/1:12:0.42:70:7,5:90,0,100 0/0:10:0.01:60:10,0:0,60,120 ./.:0:.:.:0,0:. +chr22 10593000 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:12:3,4,5,6 0/0:-129,20:8:8,0,0,0 ./.:.,.:0:.,.,.,. +chr22 10593500 . A T 50 PASS . GT:HQ:MIN_DP:SB 0/1:127,128:32767:3,4,5,6 0/0:-32760,32768:-32761:8,0,0,0 ./.:32767,32768:0:127,128,32767,32768 +chr22 10594000 . A T 50 PASS . GT:AD:DP:GQ:PGT:PID:PL 0|1:4,5:9:50:0|1:P1:90,0,90 0/1:3,2:5:20:0|1:10594000_A_T_LONG_PHASE_SET:20,0,200 ./.:0,0:0:.:.:.:. +chr22 10595000 . A T 50 PASS . GT 0/1 1|1 ./. +chr22 10595500 . A T 50 PASS . GT .|. 0|. .|0 +chr22 10596000 . A T 50 PASS . GT 0 1 . +chr22 10597000 . A C,G,T,AA,AC,AG,AT,CA,CC,CG 50 PASS . GT 10/10 0/10 ./. diff --git a/test/format-plan-empty-format-tag.vcf b/test/format-plan-empty-format-tag.vcf new file mode 100644 index 000000000..b9ca9d63c --- /dev/null +++ b/test/format-plan-empty-format-tag.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 +1 1 . A C . PASS . GT::DP 0/1::5 diff --git a/test/format-plan-fallback.vcf b/test/format-plan-fallback.vcf new file mode 100644 index 000000000..5d082753b --- /dev/null +++ b/test/format-plan-fallback.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:PID:DP 0/1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA:12 0/0:P2:8 ./.:.:0 +1 2 . A C . PASS . GT:DP 0/1 0/0:8 ./.:0 +1 3 . A C . PASS . GT:QS:DP 0/1:1.5:999999999999999999999999999999 0/0:2.5:8 ./.:.:0 diff --git a/test/format-plan-float-string.vcf b/test/format-plan-float-string.vcf new file mode 100644 index 000000000..13db490b1 --- /dev/null +++ b/test/format-plan-float-string.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 +1 1 . A C,G . PASS . GT:GL:FT:DP 0/1:-0.25,-0.50,-0.75,-1.00,-1.25,-1.50:PASS:12 1/2:-0.50,-0.75,-1.00,-1.25,-1.50,-1.75:LowQual:8 diff --git a/test/format-plan-float-vector.vcf b/test/format-plan-float-vector.vcf new file mode 100644 index 000000000..a6c70e5de --- /dev/null +++ b/test/format-plan-float-vector.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:GL:QS:AB:DP 0/1:-0.1,-1.2,-3:0.5,1.5:0.50:7 0/0:0,-5,-9:0.1,0.2,0.3,0.4:.:8 ./.:.:1.25,2.5:.:0 +1 2 . A C,G . PASS . GT:GL:QS:AB:DP 1/2:-9,-8,-7,-6,-5,-4:0.1:0.25:12 0/2:-2,-3,-4,-5,-6,-7:0.2,0.3:0.75:9 ./.:.:.:.:0 +1 3 . G T . PASS . QS:GT:GL:AB:DP 0.1,0.2:0/1:-1,-2,-3:0.5:5 .:0/0:0,-10,-20:0.0:4 0.3,0.4,0.5:./.:.:.:0 +1 4 . T G . PASS . GT:GL:QS:AB:DP 0/1:.:.:.:6 0/0:.:.:.:4 ./.:.:.:.:0 +1 5 . C T . PASS . GT:GL:QS:AB:DP 0/1:-0.4,-0.8:0.1:0.5:3 0/0:0,-2:0.2:0.1:2 ./.:.:.:.:0 diff --git a/test/format-plan-gt-header-shape.vcf b/test/format-plan-gt-header-shape.vcf new file mode 100644 index 000000000..5e3c79e19 --- /dev/null +++ b/test/format-plan-gt-header-shape.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10620000 . A T 50 PASS . GT:DP 0/1:7 0/0:5 ./.:0 diff --git a/test/format-plan-header-mismatch.vcf b/test/format-plan-header-mismatch.vcf new file mode 100644 index 000000000..583233048 --- /dev/null +++ b/test/format-plan-header-mismatch.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +chr22 10600000 . A T 50 PASS . GT:AD:DP:GQ:PL 0/1:3,4:7:42:90,0,120 0/0:5,0:5:15:0,15,200 ./.:.:0:.:. diff --git a/test/format-plan-malformed-fields.vcf b/test/format-plan-malformed-fields.vcf new file mode 100644 index 000000000..7e464651d --- /dev/null +++ b/test/format-plan-malformed-fields.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 +1 1 . A C . PASS . GT:F:DP 0/1::5 +1 2 . A C . PASS . GT:GL:DP 0/1::5 +1 3 . A C . PASS . ST:DP :5 +1 4 . A C . PASS . GT:AD:DP 0/1:1,2,:5 diff --git a/test/format-plan-repeated-wide-gt.vcf b/test/format-plan-repeated-wide-gt.vcf new file mode 100644 index 000000000..4b1f93a1e --- /dev/null +++ b/test/format-plan-repeated-wide-gt.vcf @@ -0,0 +1,14 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 +1 1 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 2 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 3 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 4 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 5 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 6 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 7 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 8 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 9 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 +1 10 . A C,G,T,AA,AC,AG,AT,CA,CC,CG . PASS . GT 10/10 diff --git a/test/format-plan-sample-count.vcf b/test/format-plan-sample-count.vcf new file mode 100644 index 000000000..b5c168c6d --- /dev/null +++ b/test/format-plan-sample-count.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 +1 1 . A C . PASS . GT:DP 0/1:7 diff --git a/test/format-plan-sample-skip.vcf b/test/format-plan-sample-skip.vcf new file mode 100644 index 000000000..3027add4a --- /dev/null +++ b/test/format-plan-sample-skip.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:PID:DP 0/1:P1:7 0/1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA:not_an_int 0/0:P3:8 diff --git a/test/format-plan-string-span.vcf b/test/format-plan-string-span.vcf new file mode 100644 index 000000000..b2861114a --- /dev/null +++ b/test/format-plan-string-span.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +1 1 . A C . PASS . GT:FT:PID:DP 0/1:PASS:P1:12 0/0:LowQual:PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:8 ./.:.:.:0 +1 2 . A C . PASS . FT:GT:PID:DP PASS:0/1:P1:12 q10:1/1:PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:4 .:./.:.:0 +1 3 . A C . PASS . PID:HP:GT:DP P1:H1:0|1:10 .:H2:1|1:5 PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:H3:./.:0 +1 4 . A C . PASS . GT:PID:FT:HP:DP 0/1:SHORT:PASS:A:20 0/0:PHASESET_WITH_A_MEDIUM_LENGTH_IDENTIFIER:LowQual:hap-two:9 ./.:.:.:.:0 diff --git a/test/test.pl b/test/test.pl index eaa65ea30..90aab1229 100755 --- a/test/test.pl +++ b/test/test.pl @@ -55,6 +55,7 @@ run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out'); run_test('test_vcf_various',$opts); run_test('test_vcf_44', $opts); +run_test('test_vcf_format_plan', $opts); run_test('test_bcf_sr_sort',$opts); run_test('test_bcf_sr_no_index',$opts); run_test('test_bcf_sr_range', $opts); @@ -1211,6 +1212,234 @@ sub test_vcf_44 cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf"); } +sub test_vcf_format_plan_check_stats +{ + my ($stats_file, $expected) = @_; + return if !defined($expected); + + open(my $fh, '<', $stats_file) + or return "failed to open planner stats file $stats_file: $!"; + local $/; + my $stats = <$fh>; + close($fh) or return "failed to close planner stats file $stats_file: $!"; + + my %observed; + if ($stats =~ /vcf-format-plan attempts=(\d+) hits=(\d+) fallback=(\d+) parsed_samples=(\d+)/) { + @observed{qw(attempts hits fallback parsed_samples)} = ($1, $2, $3, $4); + } else { + return "missing planner stats in $stats_file\n$stats"; + } + + if ($stats =~ /vcf-format-plan-fallback unsupported=(\d+) numeric_width=(\d+) string_width=(\d+) gt_shape=(\d+) parse=(\d+) separator=(\d+) sample_count=(\d+)/) { + @observed{qw(unsupported numeric_width string_width gt_shape parse separator sample_count)} + = ($1, $2, $3, $4, $5, $6, $7); + } else { + return "missing planner fallback stats in $stats_file\n$stats"; + } + + for my $key (sort keys %$expected) { + return "planner stat $key: expected $$expected{$key}, got $observed{$key}\n$stats" + if !exists($observed{$key}) || $observed{$key} != $$expected{$key}; + } + + return; +} + +sub test_vcf_format_plan_one +{ + my ($opts, $input, $label, $extra_args, $expected_stats) = @_; + my $base = "$$opts{tmp}/$label.base.bcf"; + my $plan = "$$opts{tmp}/$label.plan.bcf"; + my $disabled = "$$opts{tmp}/$label.disabled.bcf"; + my $plan_stats = "$$opts{tmp}/$label.plan.stats"; + my $test = "VCF FORMAT planner: $label"; + my $args = defined($extra_args) ? $extra_args : ""; + my $plan_env = "HTS_VCF_FORMAT_PLAN=1"; + my $plan_stderr = ""; + + if (defined($expected_stats)) { + $plan_env .= " HTS_VCF_FORMAT_PLAN_STATS=1"; + $plan_stderr = " 2>$plan_stats"; + } + + print "$test:\n"; + + my $cmd = "env HTS_VCF_FORMAT_PLAN=0 $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $base"; + print "\t$cmd\n"; + my ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, "generic parser command failed\n$out"); + return; + } + + $cmd = "env $plan_env $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $plan$plan_stderr"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, "planned parser command failed\n$out"); + return; + } + + if (defined($expected_stats)) { + my $stats_error = test_vcf_format_plan_check_stats($plan_stats, $expected_stats); + if ($stats_error) { + failed($opts, $test, $stats_error); + return; + } + } + + $cmd = "cmp $base $plan"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, $out ? $out : "planned output differs from generic output"); + return; + } + + $cmd = "env HTS_VCF_FORMAT_PLAN=off $$opts{path}/test_view -b -l 0 $args $$opts{path}/$input > $disabled"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, "disabled-mode command failed\n$out"); + return; + } + + $cmd = "cmp $base $disabled"; + print "\t$cmd\n"; + ($ret, $out) = _cmd($cmd); + if ($ret) { + failed($opts, $test, $out ? $out : "disabled-mode output differs from generic output"); + return; + } + + passed($opts, $test); +} + +sub test_vcf_format_plan_failure +{ + my ($opts, $input, $label, $expected_stats) = @_; + my $base = "$$opts{tmp}/$label.base.bcf"; + my $plan = "$$opts{tmp}/$label.plan.bcf"; + my $plan_stats = "$$opts{tmp}/$label.plan.stats"; + my $test = "VCF FORMAT planner expected failure: $label"; + my $plan_env = "HTS_VCF_FORMAT_PLAN=1"; + my $plan_stderr = ""; + + if (defined($expected_stats)) { + $plan_env .= " HTS_VCF_FORMAT_PLAN_STATS=1"; + $plan_stderr = " 2>$plan_stats"; + } + + print "$test:\n"; + if (!-e "$$opts{path}/$input") { + failed($opts, $test, "missing test input $$opts{path}/$input"); + return; + } + + my $cmd = "env HTS_VCF_FORMAT_PLAN=0 $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $base"; + print "\t$cmd\n"; + my ($base_ret, $base_out) = _cmd($cmd); + + $cmd = "env $plan_env $$opts{path}/test_view -b -l 0 $$opts{path}/$input > $plan$plan_stderr"; + print "\t$cmd\n"; + my ($plan_ret, $plan_out) = _cmd($cmd); + + if ($base_ret == 0 || $plan_ret == 0) { + failed($opts, $test, "expected both parser modes to fail, got generic=$base_ret planned=$plan_ret"); + return; + } + + if (defined($expected_stats)) { + my $stats_error = test_vcf_format_plan_check_stats($plan_stats, $expected_stats); + if ($stats_error) { + failed($opts, $test, $stats_error); + return; + } + } + + passed($opts, $test); +} + +sub test_vcf_format_plan +{ + my ($opts) = @_; + + for my $input ( + "format-plan-edge.vcf", + "format-plan-header-mismatch.vcf", + "format-plan-composable.vcf", + "format-plan-gt-header-shape.vcf", + "format-plan-cache.vcf", + "format-plan-float-string.vcf", + "format-plan-fallback.vcf", + "format-plan-repeated-wide-gt.vcf") { + (my $label = $input) =~ s/\.vcf$//; + test_vcf_format_plan_one($opts, $input, $label, ""); + } + + test_vcf_format_plan_one($opts, "format-plan-string-span.vcf", + "format-plan-string-span", "", + { attempts => 4, hits => 4, fallback => 0, + parsed_samples => 12, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + + test_vcf_format_plan_one($opts, "format-plan-malformed-fields.vcf", + "format-plan-malformed-fields", "", + { attempts => 4, hits => 0, fallback => 4, + parsed_samples => 0, + unsupported => 0, numeric_width => 0, + string_width => 1, gt_shape => 0, + parse => 3, separator => 0, + sample_count => 0 }); + + test_vcf_format_plan_one($opts, "format-plan-float-vector.vcf", + "format-plan-float-vector", "", + { attempts => 5, hits => 5, fallback => 0, + parsed_samples => 15, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + + test_vcf_format_plan_one($opts, "format-plan-float-vector.vcf", + "format-plan-float-vector.S1_S3", "-s S1,S3", + { attempts => 5, hits => 5, fallback => 0, + parsed_samples => 10, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + + for my $samples ("S1,S3", "S2", "^S2") { + for my $input ("format-plan-composable.vcf", "format-plan-edge.vcf") { + (my $label = "$input.$samples") =~ s/[^A-Za-z0-9_.-]/_/g; + test_vcf_format_plan_one($opts, $input, $label, "-s '$samples'"); + } + } + + test_vcf_format_plan_one($opts, "format-plan-sample-skip.vcf", + "format-plan-sample-skip.S1_S3", "-s S1,S3", + { attempts => 1, hits => 1, fallback => 0, + parsed_samples => 2, + unsupported => 0, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); + test_vcf_format_plan_failure($opts, "format-plan-sample-count.vcf", + "format-plan-sample-count"); + test_vcf_format_plan_failure($opts, "format-plan-empty-format-tag.vcf", + "format-plan-empty-format-tag", + { attempts => 1, hits => 0, fallback => 1, + parsed_samples => 0, + unsupported => 1, numeric_width => 0, + string_width => 0, gt_shape => 0, + parse => 0, separator => 0, + sample_count => 0 }); +} + sub write_multiblock_bgzf { my ($name, $frags) = @_; diff --git a/test/test_format_plan_cache.c b/test/test_format_plan_cache.c new file mode 100644 index 000000000..d0028e6f1 --- /dev/null +++ b/test/test_format_plan_cache.c @@ -0,0 +1,144 @@ +/* test/test_format_plan_cache.c -- FORMAT planner cache tests. + + Copyright (C) 2026 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include + +#include "../htslib/kstring.h" +#include "../htslib/vcf.h" + +static void fail(const char *msg) +{ + fprintf(stderr, "%s\n", msg); + exit(EXIT_FAILURE); +} + +#define check0(expr) do { if ((expr) != 0) fail("check failed: " #expr); } while (0) +#define check1(expr) do { if (!(expr)) fail("check failed: " #expr); } while (0) + +static int enable_format_plan(void) +{ + static char env[] = "HTS_VCF_FORMAT_PLAN=1"; + return putenv(env); +} + +static void parse_line(bcf_hdr_t *hdr, bcf1_t *rec, kstring_t *line, + const char *text) +{ + ks_clear(line); + if (kputsn(text, strlen(text), line) < 0) + fail("failed to build VCF line"); + check0(vcf_parse(line, hdr, rec)); +} + +static void check_x_values(bcf_hdr_t *hdr, bcf1_t *rec, + const int32_t *expected, int n_expected) +{ + int32_t *values = NULL; + int n_values = 0, ret, i; + + ret = bcf_get_format_int32(hdr, rec, "X", &values, &n_values); + if (ret != n_expected) { + free(values); + fail("unexpected X vector length"); + } + for (i = 0; i < n_expected; i++) { + if (values[i] != expected[i]) { + free(values); + fail("unexpected X value"); + } + } + free(values); +} + +static void check_x_float(bcf_hdr_t *hdr, bcf1_t *rec, float expected) +{ + bcf_fmt_t *fmt; + float *values = NULL; + int n_values = 0, ret; + + check0(bcf_unpack(rec, BCF_UN_FMT)); + fmt = bcf_get_fmt(hdr, rec, "X"); + if (!fmt) + fail("missing X FORMAT field"); + if (fmt->type != BCF_BT_FLOAT || fmt->n != 1) + fail("unexpected X FORMAT storage type"); + + ret = bcf_get_format_float(hdr, rec, "X", &values, &n_values); + if (ret != 1) { + free(values); + fail("unexpected X float vector length"); + } + if (values[0] != expected) { + free(values); + fail("unexpected X float value"); + } + free(values); +} + +int main(void) +{ + static char header[] = + "##fileformat=VCFv4.3\n" + "##contig=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\n"; + static const int32_t x1[] = { 7 }; + bcf_hdr_t *hdr; + bcf1_t *rec; + kstring_t line = KS_INITIALIZE; + + check0(enable_format_plan()); + hdr = bcf_hdr_init("r"); + rec = bcf_init(); + check1(hdr); + check1(rec); + check0(bcf_hdr_parse(hdr, header)); + + parse_line(hdr, rec, &line, + "1\t1\t.\tA\tC\t.\tPASS\t.\tGT:X\t0/1:7"); + check_x_values(hdr, rec, x1, 1); + + /* + * Rebuild the same FORMAT string against changed metadata. A stale plan + * would still think X is an integer and could encode the second row with + * integer storage even though the header now declares a float. The + * header-owned generation must force a fresh compile. + */ + bcf_hdr_remove(hdr, BCF_HL_FMT, "X"); + check0(bcf_hdr_append(hdr, + "##FORMAT=")); + check0(bcf_hdr_sync(hdr)); + bcf_clear1(rec); + parse_line(hdr, rec, &line, + "1\t2\t.\tA\tC\t.\tPASS\t.\tGT:X\t0/1:2"); + check_x_float(hdr, rec, 2.0f); + + bcf_destroy(rec); + bcf_hdr_destroy(hdr); + free(line.s); + return EXIT_SUCCESS; +} diff --git a/test/test_view.c b/test/test_view.c index c899ff995..a9a0615c6 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -30,7 +30,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include "../cram/cram.h" #include "../htslib/sam.h" @@ -49,6 +48,7 @@ struct opts { int multi_reg; char *index; int min_shift; + char *samples; }; enum test_op { @@ -204,6 +204,9 @@ int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, if (!b) return 1; + if (opts->samples && bcf_hdr_set_samples(h, opts->samples, 0) < 0) + return 1; + if (!opts->benchmark && bcf_hdr_write(out, h) < 0) return 1; @@ -297,8 +300,9 @@ int main(int argc, char *argv[]) opts.multi_reg = 0; opts.index = NULL; opts.min_shift = 0; + opts.samples = NULL; - while ((c = getopt(argc, argv, "DSIt:i:bzCfFul:o:N:BZ:@:Mx:m:p:v")) >= 0) { + while ((c = getopt(argc, argv, "DSIt:i:bzCfFul:o:N:BZ:@:Mx:m:p:vs:")) >= 0) { switch (c) { case 'D': opts.flag |= READ_CRAM; break; case 'S': opts.flag |= READ_COMPRESSED; break; @@ -321,11 +325,12 @@ int main(int argc, char *argv[]) case 'x': opts.index = optarg; break; case 'm': opts.min_shift = atoi(optarg); break; case 'p': out_fn = optarg; break; + case 's': opts.samples = optarg; break; case 'v': hts_verbose++; break; } } if (argc == optind) { - fprintf(stderr, "Usage: test_view [-DSI] [-t fn_ref] [-i option=value] [-bC] [-l level] [-o option=value] [-N num_reads] [-B] [-Z hdr_nuls] [-@ num_threads] [-x index_fn] [-m min_shift] [-p out] [-v] || [region]\n"); + fprintf(stderr, "Usage: test_view [-DSI] [-t fn_ref] [-i option=value] [-bC] [-l level] [-o option=value] [-N num_reads] [-B] [-Z hdr_nuls] [-@ num_threads] [-x index_fn] [-m min_shift] [-p out] [-s samples] [-v] || [region]\n"); fprintf(stderr, "\n"); fprintf(stderr, "-D: read CRAM format (mode 'c')\n"); fprintf(stderr, "-S: read compressed BCF, BAM, FAI (mode 'b')\n"); @@ -348,6 +353,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "-x fn: write index to fn\n"); fprintf(stderr, "-m min_shift: specifies BAI/CSI bin size; 0 is BAI(BAM) or TBI(VCF), 14 is CSI default\n"); fprintf(stderr, "-p out_fn: output to out_fn instead of stdout\n"); + fprintf(stderr, "-s samples: select VCF samples, as a comma-separated bcf_hdr_set_samples list\n"); fprintf(stderr, "-v: increase verbosity\n"); fprintf(stderr, "The region list entries should be specified as 'reg:beg-end', with intervals of a region being disjunct and sorted by the starting coordinate.\n"); return 1; diff --git a/vcf.c b/vcf.c index 544fe8c01..09093cb3a 100644 --- a/vcf.c +++ b/vcf.c @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION #include "fuzz_settings.h" @@ -112,11 +113,17 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N // Note that this preserving API and ABI requires that the first element is vdict_t struct // rather than a pointer, as user programs may (and in some cases do) access the dictionary // directly as (vdict_t*)hdr->dict. +typedef struct vcf_format_plan_cache_t vcf_format_plan_cache_t; +static void vcf_format_plan_cache_clear(vcf_format_plan_cache_t *cache); +static void vcf_format_plan_cache_destroy(vcf_format_plan_cache_t *cache); + typedef struct { vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings + vcf_format_plan_cache_t *format_plan_cache; // Header-local FORMAT planner cache + uint64_t format_plan_gen; // Incremented when header dictionaries are resynchronised int version; //cached version uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called } @@ -343,6 +350,10 @@ int bcf_hdr_sync(bcf_hdr_t *h) free(aux->key_len); aux->key_len = NULL; } + if (aux && aux->format_plan_cache) + vcf_format_plan_cache_clear(aux->format_plan_cache); + if (aux) + aux->format_plan_gen++; h->dirty = 0; return 0; @@ -1649,6 +1660,8 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if ( !aux ) goto fail; if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } aux->key_len = NULL; + aux->format_plan_cache = NULL; + aux->format_plan_gen = 0; aux->dict = *((vdict_t*)h->dict[0]); aux->version = 0; aux->ref_count = 1; @@ -1693,6 +1706,7 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); kh_destroy(hdict, aux->gen); free(aux->key_len); // may exist for dict[0] only + vcf_format_plan_cache_destroy(aux->format_plan_cache); } kh_destroy(vdict, d); free(h->id[i]); @@ -2921,6 +2935,76 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; } +static int bcf_enc_vint_known_range_special(kstring_t *s, int n, int32_t *a, int wsize, + int32_t min, int32_t max, int has_special) +{ + int i; + // min/max must match bcf_enc_vint()'s scan: missing and vector-end values + // may affect max, but are excluded from min. + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_int1(s, a[0]); + } else { + if (wsize <= 0) wsize = n; + + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + if (has_special) { + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + } else { + for (i = 0; i < n; ++i, p++) + *p = a[i]; + } + s->l += n; + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + if (has_special) { + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + } else { + for (i = 0; i < n; ++i) + { + i16_to_le((int16_t)a[i], p); + p += sizeof(int16_t); + } + } + s->l += n * sizeof(int16_t); + } else { + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + i32_to_le(a[i], p); + p += sizeof(int32_t); + } + s->l += n * sizeof(int32_t); + } + } + + return 0; +} + #ifdef VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; @@ -3133,185 +3217,1648 @@ static inline int align_mem(kstring_t *s) #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ -// detect FORMAT "." -static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - const char *p, const char *q) { - const char *end = s->s + s->l; - if ( q>=end ) - { - hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_NCOLS; - return -1; - } +static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v); - v->n_fmt = 0; - if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "." - { - v->n_sample = bcf_hdr_nsamples(h); - return 1; - } +/* + * Planned FORMAT parser. + * + * The generic FORMAT parser below is deliberately permissive: it can repair + * missing header declarations, handle sample subsetting, and recover from many + * irregular row shapes. The planned parser only handles rows that can be + * described by existing FORMAT header metadata and parsed as a fixed list of + * per-tag operations. If any compile-time or row-local invariant fails, it + * returns -3 so the generic parser handles the whole FORMAT column. + * + * The implementation is organized as: + * + * - controls and diagnostics; + * - cached FORMAT plan data structures; + * - plan cache management and FORMAT-string compilation; + * - small value parsers used by the sample loop; + * - row-local layout and BCF encoding helpers; + * - width measurement for variable row shapes; + * - execution and the generic-parser fallback entry point. + */ - return 0; +/* + * Planned FORMAT parser: controls and diagnostics. + * + * HTS_VCF_FORMAT_PLAN controls the feature: + * unset/0 use the generic parser only + * 1 enable the planned per-tag parser, with generic fallback + * + * HTS_VCF_FORMAT_PLAN_STATS=1 emits aggregate plan counters at process exit. + * These counters are intentionally diagnostic; normal parsing avoids touching + * them unless the stats environment variable is enabled. + */ +typedef struct { + uint64_t attempts; + uint64_t hits; + uint64_t fallback; + uint64_t parsed_samples; +} vcf_format_plan_stats_t; + +static vcf_format_plan_stats_t vcf_format_plan_stats; + +typedef enum { + VCF_FORMAT_PLAN_FB_UNSUPPORTED = 0, + VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH, + VCF_FORMAT_PLAN_FB_STRING_WIDTH, + VCF_FORMAT_PLAN_FB_GT_SHAPE, + VCF_FORMAT_PLAN_FB_PARSE, + VCF_FORMAT_PLAN_FB_SEPARATOR, + VCF_FORMAT_PLAN_FB_SAMPLE_COUNT, + VCF_FORMAT_PLAN_FB_N +} vcf_format_plan_fallback_reason_t; + +static uint64_t vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_N]; + +static void vcf_format_plan_report_stats(void) +{ + fprintf(stderr, + "vcf-format-plan attempts=%llu hits=%llu fallback=%llu parsed_samples=%llu\n", + (unsigned long long) vcf_format_plan_stats.attempts, + (unsigned long long) vcf_format_plan_stats.hits, + (unsigned long long) vcf_format_plan_stats.fallback, + (unsigned long long) vcf_format_plan_stats.parsed_samples); + fprintf(stderr, + "vcf-format-plan-fallback unsupported=%llu numeric_width=%llu string_width=%llu gt_shape=%llu parse=%llu separator=%llu sample_count=%llu\n", + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_UNSUPPORTED], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_STRING_WIDTH], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_GT_SHAPE], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_PARSE], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SEPARATOR], + (unsigned long long) vcf_format_plan_fallback_reasons[VCF_FORMAT_PLAN_FB_SAMPLE_COUNT]); } -// get format information from the dictionary -static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - const char *p, const char *q, fmt_aux_t *fmt) { - const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - char *t; - int j; - ks_tokaux_t aux1; +static int vcf_format_plan_stats_enabled(void) +{ + static int enabled = -1; + static int registered = 0; - for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { - if (j >= MAX_N_FMT) { - v->errcode |= BCF_ERR_LIMITS; - hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle", - bcf_seqname_safe(h,v), v->pos+1); - return -1; + if (enabled < 0) { + const char *env = getenv("HTS_VCF_FORMAT_PLAN_STATS"); + enabled = env && strcmp(env, "1") == 0; + if (enabled && !registered) { + if (atexit(vcf_format_plan_report_stats) == 0) + registered = 1; } + } + return enabled; +} - *(char*)aux1.p = 0; - khint_t k = kh_get(vdict, d, t); - if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { - if ( t[0]=='.' && t[1]==0 ) - { - hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_TAG_INVALID; - return -1; - } - hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1); - kstring_t tmp = {0,0,0}; - int l; - ksprintf(&tmp, "##FORMAT=", t); - bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); - free(tmp.s); - int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; - if (res < 0) bcf_hrec_destroy(hrec); - if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); +static int vcf_format_plan_enabled(void) +{ + static int enabled = -1; - k = kh_get(vdict, d, t); - v->errcode |= BCF_ERR_TAG_UNDEF; - if (res || k == kh_end(d)) { - hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_TAG_INVALID; - return -1; - } - } - fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; - fmt[j].key = kh_val(d, k).id; - fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]); - fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; - v->n_fmt++; + if (enabled < 0) { + const char *env = getenv("HTS_VCF_FORMAT_PLAN"); + enabled = env && strcmp(env, "1") == 0; } - return 0; + return enabled; } -// compute max -static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { - int n_sample_ori = -1; - char *r = q + 1; // r: position in the format string - int l = 0, m = 1, g = 1, j; - v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles - const char *end = s->s + s->l; +enum { + VCF_FORMAT_MAX_NUMERIC_WIDTH = 64, + VCF_FORMAT_MAX_STRING_WIDTH = 256 +}; - while ( rkeep_samples ) - { - n_sample_ori++; - if ( !bit_array_test(h->keep_samples,n_sample_ori) ) - { - while ( *r!='\t' && ris_gt) g++; - break; +typedef enum { + VCF_FORMAT_ROW_GT2, + VCF_FORMAT_ROW_INT1, + VCF_FORMAT_ROW_INTVEC, + VCF_FORMAT_ROW_FLOAT1, + VCF_FORMAT_ROW_FLOATN, + VCF_FORMAT_ROW_STR +} vcf_format_row_kind_t; - case '\t': - *r = 0; // fall through +typedef struct { + /* + * Row-local operation. Header Number=A/R/G and measured Number=. fields + * depend on the current record, so width/size/offset are resolved per row. + */ + int key; + int width; + int size; + int offset; + vcf_format_row_kind_t kind; + uint8_t can_compact; +} vcf_format_row_op_t; - default: // valid due to while loop above. - case '\0': - case ':': - l = r - r_start; r_start = r; - if (f->max_m < m) f->max_m = m; - if (f->max_l < l) f->max_l = l; - if (f->is_gt && f->max_g < g) f->max_g = g; - l = 0, m = g = 1; - if ( *r==':' ) { - j++; f++; - if ( j>=v->n_fmt ) { - hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"", - h->id[BCF_DT_CTG][v->rid].key, v->pos+1); - v->errcode |= BCF_ERR_NCOLS; - return -1; - } - } else goto end_for; - break; - } - if ( r>=end ) break; - r++; - } - end_for: - v->n_sample++; - if ( v->n_sample == bcf_hdr_nsamples(h) ) break; - r++; +typedef struct { + /* + * Measured string fields are scanned once up front to determine the row + * width. Keep the span from that pass so execution can copy bytes without + * searching for the same ':' or tab delimiter again. + */ + const char *ptr; + int len; +} vcf_format_string_span_t; + +typedef struct { + int32_t min; + int32_t max; + int has_special; +} vcf_plan_int_range_t; + +/* + * Planned FORMAT parser: cache management and plan compilation. + * + * Plans are owned by bcf_hdr_aux_t because FORMAT ids, types, and Number + * declarations are header-local. The cache stores both supported and + * unsupported FORMAT strings; unsupported entries let repeated odd schemas + * fall through to the generic parser without repeatedly recompiling. + */ +static uint64_t vcf_format_plan_hash(const char *format, size_t len) +{ + size_t i; + uint64_t hash = 1469598103934665603ULL; + + for (i = 0; i < len; i++) { + hash ^= (unsigned char) format[i]; + hash *= 1099511628211ULL; } + return hash; +} - return 0; +static void vcf_format_general_plan_destroy(vcf_format_general_plan_t *plan) +{ + if (!plan) + return; + free(plan->format); + memset(plan, 0, sizeof(*plan)); } -// allocate memory for arrays -static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - const char *p, const char *q, - fmt_aux_t *fmt) { - kstring_t *mem = (kstring_t*)&h->mem; +static void vcf_format_plan_cache_clear(vcf_format_plan_cache_t *cache) +{ + int i; - int j; - for (j = 0; j < v->n_fmt; ++j) { - fmt_aux_t *f = &fmt[j]; - if ( !f->max_m ) f->max_m = 1; // omitted trailing format field + if (!cache) + return; + for (i = 0; i < cache->n; i++) + vcf_format_general_plan_destroy(&cache->plans[i]); + cache->n = 0; + cache->next_evict = 0; +} - if ((f->y>>4&0xf) == BCF_HT_STR) { - f->size = f->is_gt? f->max_g << 2 : f->max_l; - } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { - f->size = f->max_m << 2; - } else { - hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_TAG_INVALID; - return -1; +static void vcf_format_plan_cache_destroy(vcf_format_plan_cache_t *cache) +{ + if (!cache) + return; + vcf_format_plan_cache_clear(cache); + free(cache->plans); + free(cache); +} + +static vcf_format_plan_cache_t *vcf_format_plan_cache_get(const bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + + if (!aux) + return NULL; + if (!aux->format_plan_cache) { + aux->format_plan_cache = (vcf_format_plan_cache_t *) + calloc(1, sizeof(*aux->format_plan_cache)); + if (!aux->format_plan_cache) + return NULL; + aux->format_plan_cache->hdr_gen = aux->format_plan_gen; + } + if (aux->format_plan_cache->hdr_gen != aux->format_plan_gen) { + vcf_format_plan_cache_clear(aux->format_plan_cache); + aux->format_plan_cache->hdr_gen = aux->format_plan_gen; + } + return aux->format_plan_cache; +} + +static int vcf_format_plan_cache_slot(vcf_format_plan_cache_t *cache) +{ + enum { VCF_FORMAT_PLAN_CACHE_INIT = 16, VCF_FORMAT_PLAN_CACHE_MAX = 128 }; + int i, idx, new_m; + vcf_format_general_plan_t *plans; + + if (cache->n < cache->m) + return cache->n++; + + if (cache->m < VCF_FORMAT_PLAN_CACHE_MAX) { + new_m = cache->m ? cache->m * 2 : VCF_FORMAT_PLAN_CACHE_INIT; + if (new_m > VCF_FORMAT_PLAN_CACHE_MAX) + new_m = VCF_FORMAT_PLAN_CACHE_MAX; + if ((size_t) new_m > SIZE_MAX / sizeof(*cache->plans)) + return -1; + plans = (vcf_format_general_plan_t *) + realloc(cache->plans, (size_t) new_m * sizeof(*cache->plans)); + if (!plans) + return -1; + memset(plans + cache->m, 0, + (size_t) (new_m - cache->m) * sizeof(*plans)); + cache->plans = plans; + cache->m = new_m; + return cache->n++; + } + + for (i = 0; i < cache->n; i++) { + idx = (cache->next_evict + i) % cache->n; + if (!cache->plans[idx].supported) + goto found; + } + idx = cache->next_evict; + +found: + vcf_format_general_plan_destroy(&cache->plans[idx]); + cache->next_evict = (idx + 1) % cache->n; + return idx; +} + +static inline int vcf_format_op_is_vector(const vcf_format_op_t *op) +{ + return op->vl_type != BCF_VL_FIXED || op->number != 1; +} + +/* + * Return whether this FORMAT composition is within the planned executor's + * supported shape set. Mixed measured-string plus float-vector rows require an + * extra measurement pass and stay on the generic parser unless the row also + * contains integer-vector work that benefits from the planned encoding path. + */ +static int vcf_format_general_plan_shape_supported(const vcf_format_general_plan_t *plan) +{ + int j, has_measured_string = 0, has_float_vector = 0, has_int_vector = 0; + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + if (op->is_gt) + continue; + if (op->htype == BCF_HT_STR) { + has_measured_string = 1; + } else if (op->htype == BCF_HT_REAL && vcf_format_op_is_vector(op)) { + has_float_vector = 1; + } else if (op->htype == BCF_HT_INT && vcf_format_op_is_vector(op)) { + has_int_vector = 1; + } + } + + if (has_measured_string && has_float_vector && !has_int_vector) + return 0; + return 1; +} + +static int vcf_format_general_plan_compile(const bcf_hdr_t *h, const char *format, + size_t format_len, uint64_t format_hash, + uint64_t hdr_gen, + vcf_format_general_plan_t *plan) +{ + char *tmp, *tok, *format_end; + int i, ret = 0; + + memset(plan, 0, sizeof(*plan)); + plan->format = (char *) malloc(format_len + 1); + tmp = (char *) malloc(format_len + 1); + if (!plan->format || !tmp) { + free(tmp); + free(plan->format); + memset(plan, 0, sizeof(*plan)); + return -1; + } + memcpy(plan->format, format, format_len + 1); + memcpy(tmp, format, format_len + 1); + plan->format_len = format_len; + plan->format_hash = format_hash; + plan->hdr_gen = hdr_gen; + plan->fallback_reason = VCF_FORMAT_PLAN_FB_UNSUPPORTED; + + /* + * Compile at tag granularity, not full FORMAT-shape granularity. This is + * what allows GT:AD, GT:AD:DP:PL, reordered fields, and supersets with + * additional header-described tags to share the same executor instead of + * needing exact string-specific kernels. + */ + /* + * Keep empty FORMAT tokens visible. strtok_r() would collapse GT::DP into + * GT:DP, but the generic parser treats the empty tag as malformed. + */ + format_end = tmp + format_len; + for (tok = tmp; tok <= format_end; ) { + char *next = memchr(tok, ':', (size_t)(format_end - tok)); + int key, htype; + + if (!next) + next = format_end; + if (next == tok) + goto done; + *next = '\0'; + + if (plan->n_ops >= MAX_N_FMT) + goto done; + key = bcf_hdr_id2int(h, BCF_DT_ID, tok); + if (key < 0 || !bcf_hdr_idinfo_exists(h, BCF_HL_FMT, key)) + goto done; + for (i = 0; i < plan->n_ops; i++) + if (plan->ops[i].key == key) + goto done; + + htype = bcf_hdr_id2type(h, BCF_HL_FMT, key); + if (htype != BCF_HT_STR && htype != BCF_HT_INT && htype != BCF_HT_REAL) + goto done; + + /* + * Only compile tags with enough header information to reproduce the + * generic parser's BCF layout. Undefined tags and exotic types stay on + * the generic parser, which owns warning and header-repair behavior. + */ + plan->ops[plan->n_ops].key = key; + plan->ops[plan->n_ops].number = bcf_hdr_id2number(h, BCF_HL_FMT, key); + plan->ops[plan->n_ops].htype = htype; + plan->ops[plan->n_ops].is_gt = strcmp(tok, "GT") == 0; + plan->ops[plan->n_ops].vl_type = bcf_hdr_id2length(h, BCF_HL_FMT, key); + plan->ops[plan->n_ops].measured_width = 0; + if (plan->ops[plan->n_ops].is_gt) { + if (htype != BCF_HT_STR || plan->ops[plan->n_ops].number != 1 || + plan->ops[plan->n_ops].vl_type != BCF_VL_FIXED) + goto done; + } else { + int vl = plan->ops[plan->n_ops].vl_type; + if (htype == BCF_HT_STR) { + if (plan->ops[plan->n_ops].number != 1) + goto done; + plan->ops[plan->n_ops].measured_width = 1; + plan->has_string_spans = 1; + } else if (vl != BCF_VL_FIXED && vl != BCF_VL_A && + vl != BCF_VL_R && vl != BCF_VL_G && + vl != BCF_VL_VAR) { + goto done; + } else if (vl == BCF_VL_VAR) { + plan->ops[plan->n_ops].measured_width = 1; + } + } + plan->n_ops++; + + if (next == format_end) + break; + tok = next + 1; + } + + if (!plan->n_ops) + goto done; + if (!vcf_format_general_plan_shape_supported(plan)) + goto done; + + plan->supported = 1; + ret = 1; + +done: + free(tmp); + return ret; +} + +static vcf_format_general_plan_t *vcf_format_general_plan_get(const bcf_hdr_t *h, + const char *format, + vcf_format_plan_fallback_reason_t *reason) +{ + bcf_hdr_aux_t *aux; + vcf_format_plan_cache_t *cache; + vcf_format_general_plan_t *plan; + size_t format_len; + uint64_t format_hash, hdr_gen; + int i, idx, ret; + + /* + * The compiler reads h->id[] and header metadata directly. If a caller has + * mutated the header but not synced it yet, the generic parser is the + * only safe path because it already owns all header-repair semantics. + */ + if (h->dirty) + return NULL; + + aux = get_hdr_aux(h); + if (!aux) + return NULL; + cache = vcf_format_plan_cache_get(h); + if (!cache) + return NULL; + + format_len = strlen(format); + format_hash = vcf_format_plan_hash(format, format_len); + hdr_gen = aux->format_plan_gen; + for (i = 0; i < cache->n; i++) { + plan = &cache->plans[i]; + if (plan->format && plan->hdr_gen == hdr_gen && + plan->format_len == format_len && + plan->format_hash == format_hash && + memcmp(plan->format, format, format_len) == 0) { + if (!plan->supported) + vcf_format_plan_set_reason(reason, plan->fallback_reason); + return plan->supported ? plan : NULL; + } + } + + idx = vcf_format_plan_cache_slot(cache); + if (idx < 0) + return NULL; + plan = &cache->plans[idx]; + ret = vcf_format_general_plan_compile(h, format, format_len, format_hash, + hdr_gen, plan); + if (ret < 0) { + vcf_format_general_plan_destroy(plan); + if (idx == cache->n - 1) + cache->n--; + return NULL; + } + if (!plan->supported) + vcf_format_plan_set_reason(reason, plan->fallback_reason); + return plan->supported ? plan : NULL; +} + +/* + * Planned FORMAT parser: value parsers. + * + * These helpers consume one FORMAT subfield and leave the input pointer on the + * following ':' or tab. They deliberately handle only the planned subset and + * report failure to the executor, which then rolls the whole row back to the + * generic parser. Integer helpers maintain the observed min/max range while + * parsing so the final BCF encoder does not need a second range scan. + */ +#if defined(__GNUC__) +#define VCF_PLAN_ALWAYS_INLINE static inline __attribute__((always_inline)) +#else +#define VCF_PLAN_ALWAYS_INLINE static inline +#endif + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_gt2_u8(const char **sp, uint8_t out[2]) +{ + const char *s = *sp; + int a0, a1, phased; + + if (((s[0] != '.' && !(s[0] >= '0' && s[0] <= '9'))) || + (s[1] != '/' && s[1] != '|') || + ((s[2] != '.' && !(s[2] >= '0' && s[2] <= '9')))) + return -1; + + phased = s[1] == '|'; + a0 = s[0] == '.' ? -1 : s[0] - '0'; + a1 = s[2] == '.' ? -1 : s[2] - '0'; + out[0] = a0 < 0 ? (uint8_t) phased : + (uint8_t)(((a0 + 1) << 1) | phased); + out[1] = a1 < 0 ? (uint8_t) phased : + (uint8_t)(((a1 + 1) << 1) | phased); + *sp = s + 3; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value(const char **sp, int32_t *out) +{ + const char *s = *sp; + int sign = 1; + uint32_t val = 0, limit, cutoff, cutlim; + + if (*s == '.') { + *out = bcf_int32_missing; + *sp = s + 1; + return 0; + } + if (*s == '-') { + sign = -1; + s++; + } + if (!(*s >= '0' && *s <= '9')) + return -1; + limit = sign < 0 ? (uint32_t)(-(int64_t)BCF_MIN_BT_INT32) : (uint32_t)BCF_MAX_BT_INT32; + cutoff = limit / 10; + cutlim = limit % 10; + while (*s >= '0' && *s <= '9') { + uint32_t digit = *s - '0'; + if (val > cutoff || (val == cutoff && digit > cutlim)) + return -1; + val = val * 10 + digit; + s++; + } + if (sign < 0) + *out = -(int32_t)val; + else + *out = (int32_t)val; + *sp = s; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_init(vcf_plan_int_range_t *range) +{ + range->min = INT32_MAX; + range->max = INT32_MIN; + range->has_special = 0; +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add(vcf_plan_int_range_t *range, int32_t val) +{ + if (val == bcf_int32_missing || val == bcf_int32_vector_end) + range->has_special = 1; + if (range->max < val) + range->max = val; + if (range->min > val && val > INT32_MIN + 1) + range->min = val; +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_int_range_add_regular(vcf_plan_int_range_t *range, int32_t val) +{ + if (range->max < val) + range->max = val; + if (range->min > val) + range->min = val; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_value(const char **sp, float *out) +{ + const char *s = *sp; + char *end = NULL; + int failed = 0; + + if (*s == '.') { + bcf_float_set_missing(*out); + *sp = s + 1; + return 0; + } + *out = hts_str2dbl(s, &end, &failed); + if (failed || end == s) + return -1; + *sp = end; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_value_range(const char **sp, int32_t *out, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + uint32_t val = 0, cutoff = BCF_MAX_BT_INT32 / 10, cutlim = BCF_MAX_BT_INT32 % 10; + + if (*s >= '0' && *s <= '9') { + do { + uint32_t digit = *s - '0'; + if (val > cutoff || (val == cutoff && digit > cutlim)) + return -1; + val = val * 10 + digit; + s++; + } while (*s >= '0' && *s <= '9'); + *out = (int32_t)val; + *sp = s; + vcf_plan_int_range_add_regular(range, *out); + return 0; + } + if (vcf_plan_int_value(sp, out) < 0) + return -1; + vcf_plan_int_range_add(range, *out); + return 0; +} + +static int vcf_plan_parse_int_vector_counted_range(const char **sp, + int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) +{ + const char *s = *sp; + int i = 0; + + while (i < width) { + if (vcf_plan_int_value_range(&s, &out[i], range) < 0) + return -1; + i++; + if (*s != ',') + break; + /* + * Another comma after width values means the subfield has too many + * entries, including the trailing-comma form "1,2,". + */ + if (i == width) + return -1; + s++; + } + *nread = i; + if (i < width) + range->has_special = 1; + for (; i < width; i++) + out[i] = bcf_int32_vector_end; + *sp = s; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_expect_sep(const char **sp, int sep) +{ + if (**sp != sep) + return -1; + (*sp)++; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string(const char **sp, char *out, int width) +{ + const char *s = *sp, *t = s; + int l; + + while (*t && *t != ':' && *t != '\t') + t++; + l = t - s; + if (l > width) + return -1; + memcpy(out, s, l); + if (l < width) + memset(out + l, 0, width - l); + *sp = t; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_copy_string_span(const vcf_format_string_span_t *span, + const char **sp, + char *out, int width) +{ + int l = span->len; + + if (*sp != span->ptr || l > width) + return -1; + memcpy(out, span->ptr, l); + if (l < width) + memset(out + l, 0, width - l); + *sp = span->ptr + l; + return 0; +} + +/* + * Parse a dynamic-width float vector and report the number of values seen + * before padding the rest of the row with BCF vector-end markers. Returning + * the count avoids a second pass over the encoded floats, which is also safer + * than inferring width from sentinel values after conversion. + */ +static int vcf_plan_parse_float_vector_dynamic_counted(const char **sp, + float *out, int width, + int *nread) +{ + const char *s = *sp; + int i = 0; + + for (;;) { + if (i >= width || vcf_plan_float_value(&s, &out[i]) < 0) + return -1; + i++; + if (*s != ',') + break; + s++; + } + *nread = i; + for (; i < width; i++) + bcf_float_set_vector_end(out[i]); + *sp = s; + return 0; +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_int_scalar_flexible_range(const char **sp, int32_t *out, + vcf_plan_int_range_t *range) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + *out = bcf_int32_missing; + vcf_plan_int_range_add(range, *out); + return 0; + } + return vcf_plan_int_value_range(sp, out, range); +} + +VCF_PLAN_ALWAYS_INLINE int vcf_plan_float_scalar_flexible(const char **sp, float *out) +{ + return vcf_plan_float_value(sp, out); +} + +VCF_PLAN_ALWAYS_INLINE void vcf_plan_fill_missing_int_vector(int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) +{ + int i; + + out[0] = bcf_int32_missing; + vcf_plan_int_range_add(range, out[0]); + for (i = 1; i < width; i++) + out[i] = bcf_int32_vector_end; + range->has_special = 1; + *nread = 1; +} + +static int vcf_plan_parse_int_vector_flexible_counted_range(const char **sp, + int32_t *out, + int width, + int *nread, + vcf_plan_int_range_t *range) +{ + if (**sp == ':' || **sp == '\t' || **sp == '\0') { + vcf_plan_fill_missing_int_vector(out, width, nread, range); + return 0; + } + return vcf_plan_parse_int_vector_counted_range(sp, out, width, nread, range); +} + +/* + * Planned FORMAT parser: row-local layout and encoding helpers. + * + * Once all widths are known for the current record, these helpers turn cached + * plan operations into concrete row operations, allocate/stage transposed + * FORMAT buffers, compact underfilled vectors when the generic parser would do + * the same, and encode staged rows into the final packed BCF FORMAT layout. + */ +static int vcf_format_general_resolve_ops(const vcf_format_general_plan_t *plan, + bcf1_t *v, int *widths, + vcf_format_row_op_t *row_ops, + vcf_format_plan_fallback_reason_t *reason) +{ + int j; + + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + vcf_format_row_op_t *row = &row_ops[j]; + + row->key = op->key; + row->width = widths[j] > 0 ? widths[j] : 1; + row->offset = 0; + if (op->is_gt) { + if (row->width != 2 || v->n_allele > 10) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_GT_SHAPE); + return -4; + } + row->kind = VCF_FORMAT_ROW_GT2; + row->size = 2; + } else if (op->htype == BCF_HT_INT) { + if (row->width == 1) + row->kind = VCF_FORMAT_ROW_INT1; + else + row->kind = VCF_FORMAT_ROW_INTVEC; + row->size = row->width * (int)sizeof(int32_t); + } else if (op->htype == BCF_HT_REAL) { + row->kind = row->width == 1 ? VCF_FORMAT_ROW_FLOAT1 : VCF_FORMAT_ROW_FLOATN; + row->size = row->width * (int)sizeof(float); + } else { + row->kind = VCF_FORMAT_ROW_STR; + row->size = row->width; + } + row->can_compact = row->kind == VCF_FORMAT_ROW_INTVEC || + row->kind == VCF_FORMAT_ROW_FLOATN; + } + return 0; +} + +static const char *vcf_format_skip_sample_column(const char *cur, const char *end) +{ + while (cur < end && *cur && *cur != '\t') + cur++; + if (cur < end && *cur == '\t') + cur++; + return cur; +} + +static int vcf_format_general_expected_width(const vcf_format_op_t *op, bcf1_t *v) +{ + if (op->is_gt) + return 2; + if (op->htype == BCF_HT_STR) + return 0; + + switch (op->vl_type) { + case BCF_VL_FIXED: + return op->number > 0 ? op->number : 0; + case BCF_VL_A: + return v->n_allele > 1 ? v->n_allele - 1 : 0; + case BCF_VL_R: + return v->n_allele; + case BCF_VL_G: { + uint64_t n = (uint64_t) v->n_allele; + uint64_t width = n * (n + 1) / 2; + + return width > INT_MAX ? INT_MAX : (int) width; + } + default: + return 0; + } +} + +static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt); + +static int vcf_format_general_encode_row_ops_from_ranges(kstring_t *dst, kstring_t *mem, + int nsamples, int n_ops, + const vcf_format_row_op_t *row_ops, + const vcf_plan_int_range_t *ranges, + int first_op) +{ + int j; + + for (j = first_op; j < n_ops; j++) { + const vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = (uint8_t*)mem->s + op->offset; + + bcf_enc_int1(dst, op->key); + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: + if (vcf_enc_gt2_u8(dst, nsamples, buf) < 0) + return -1; + break; + case VCF_FORMAT_ROW_STR: + if (bcf_enc_size(dst, op->width, BCF_BT_CHAR) < 0) + return -1; + if (kputsn((char *)buf, nsamples * (size_t)op->width, dst) < 0) + return -1; + break; + case VCF_FORMAT_ROW_FLOAT1: + case VCF_FORMAT_ROW_FLOATN: + if (bcf_enc_size(dst, op->width, BCF_BT_FLOAT) < 0) + return -1; + if (serialize_float_array(dst, nsamples * (size_t)op->width, (float *)buf) < 0) + return -1; + break; + case VCF_FORMAT_ROW_INT1: + case VCF_FORMAT_ROW_INTVEC: + if (bcf_enc_vint_known_range_special(dst, nsamples * op->width, (int32_t *)buf, + op->width, ranges[j].min, ranges[j].max, + ranges[j].has_special) < 0) + return -1; + break; + default: + return -1; + } + } + return 0; +} + +static int vcf_enc_gt2_u8(kstring_t *dst, int nsamples, const uint8_t *gt) +{ + int n = nsamples * 2; + + if (bcf_enc_size(dst, 2, BCF_BT_INT8) < 0) + return -1; + return kputsn((const char *)gt, n, dst) < 0 ? -1 : 0; +} + +static int vcf_format_direct_prefix_len(const vcf_format_row_op_t *row_ops, int n_ops) +{ + int j; + + for (j = 0; j < n_ops; j++) { + if (row_ops[j].kind != VCF_FORMAT_ROW_GT2 && + row_ops[j].kind != VCF_FORMAT_ROW_FLOAT1) + break; + } + return j; +} + +static int vcf_format_compact_row_op(kstring_t *mem, int nsamples, + vcf_format_row_op_t *op, int width) +{ + size_t elem_size; + size_t old_stride, new_stride; + char *base = mem->s + op->offset; + int sample; + + switch (op->kind) { + case VCF_FORMAT_ROW_INTVEC: + elem_size = sizeof(int32_t); + break; + case VCF_FORMAT_ROW_FLOATN: + elem_size = sizeof(float); + break; + default: + return -1; + } + old_stride = (size_t) op->width * elem_size; + new_stride = (size_t) width * elem_size; + for (sample = 1; sample < nsamples; sample++) + memmove(base + sample * new_stride, base + sample * old_stride, new_stride); + op->width = width; + op->size = (int)new_stride; + return 0; +} + +/* + * Planned FORMAT parser: row-local width measurement. + * + * Resolve FORMAT widths before execution. Fixed widths come from the header + * and current allele count; Type=String and Number=. numeric rows require a + * sample scan. Returns 0 for a usable plan, -4 for generic fallback, and -1 + * for allocation failure. + */ +static int vcf_format_general_strict_widths(kstring_t *s, const bcf_hdr_t *h, + const vcf_format_general_plan_t *plan, + bcf1_t *v, char *q, int *widths, + size_t *string_span_offsets, + size_t *string_spans_end, + vcf_format_plan_fallback_reason_t *reason) +{ + kstring_t *mem = (kstring_t*)&h->mem; + const char *cur, *end; + int has_measured = 0, has_string_spans = 0, sample, kept = 0, j; + int nsamples = h->keep_samples ? h->nsamples_ori : bcf_hdr_nsamples(h); + int output_nsamples = bcf_hdr_nsamples(h); + + if (string_spans_end) + *string_spans_end = 0; + + /* + * With bcf_hdr_set_samples(), the text line still contains the original + * sample columns but BCF output must contain only the retained samples. The + * measurement pass therefore scans original columns and updates row-local + * widths only for samples that will be emitted. + */ + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + + if (string_span_offsets) + string_span_offsets[j] = (size_t)-1; + if (op->measured_width) { + /* + * Strings and Number=. numeric vectors need a first pass so the + * transposed FORMAT storage has one row-local stride. Numeric + * vectors keep the conservative width cap because they multiply + * parsing, padding, and integer-width decisions. Strings are allowed + * a larger bounded width because they are copied as bytes and are + * common in phase-set annotations. + */ + widths[j] = 0; + has_measured = 1; + if (!op->is_gt && op->htype == BCF_HT_STR) + has_string_spans = 1; + } else { + widths[j] = vcf_format_general_expected_width(op, v); + if (widths[j] <= 0 || widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } + } + } + + if (!has_measured) + return 0; + + if (has_string_spans && string_span_offsets) { + mem->l = 0; + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + size_t bytes; + + if (!op->measured_width || op->is_gt || op->htype != BCF_HT_STR) + continue; + if (align_mem(mem) < 0) + return -1; + string_span_offsets[j] = mem->l; + bytes = (size_t) output_nsamples * sizeof(vcf_format_string_span_t); + if (output_nsamples < 0 || + output_nsamples > INT_MAX / (int)sizeof(vcf_format_string_span_t) || + (uint64_t) mem->l + (uint64_t) bytes > INT_MAX || + ks_resize(mem, mem->l + bytes) < 0) + return -1; + mem->l += bytes; + } + if (string_spans_end) + *string_spans_end = mem->l; + } + + cur = q + 1; + end = s->s + s->l; + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { + cur = vcf_format_skip_sample_column(cur, end); + continue; + } + for (j = 0; j < plan->n_ops; j++) { + const vcf_format_op_t *op = &plan->ops[j]; + const char *field = cur; + int w = 1; + + /* + * This pass validates the sample field separators at the same time + * as measuring widths. A single unexpected ':' or tab position is + * enough to reject the planned path, preserving generic-parser + * behavior for odd FORMAT/sample cardinality cases. + */ + while (cur < end && *cur && *cur != ':' && *cur != '\t') { + if (op->measured_width && + (op->htype == BCF_HT_INT || op->htype == BCF_HT_REAL) && + *cur == ',') { + w++; + if (w > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } + } + cur++; + } + if (op->measured_width && !op->is_gt && op->htype == BCF_HT_STR) { + w = cur - field; + if (string_span_offsets && string_span_offsets[j] != (size_t)-1) { + vcf_format_string_span_t *spans = + (vcf_format_string_span_t *)(mem->s + string_span_offsets[j]); + spans[kept].ptr = field; + spans[kept].len = w; + } + if (j > 0) + w++; + if (w > VCF_FORMAT_MAX_STRING_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + return -4; + } + } + if (op->measured_width) { + if (widths[j] < w) + widths[j] = w; + } + + if (j + 1 < plan->n_ops) { + if (*cur != ':') { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + return -4; + } + cur++; + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + return -4; + } + } + } + if (++kept == output_nsamples) + break; + } + if (kept != output_nsamples) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SAMPLE_COUNT); + return -4; + } + for (j = 0; j < plan->n_ops; j++) + if (plan->ops[j].measured_width) { + if (plan->ops[j].htype == BCF_HT_STR) { + if (widths[j] <= 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + return -4; + } + if (widths[j] > VCF_FORMAT_MAX_STRING_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + return -4; + } + } else { + if (widths[j] <= 0) + widths[j] = 1; + if (widths[j] > VCF_FORMAT_MAX_NUMERIC_WIDTH) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + return -4; + } + } + } + + return 0; +} + +/* + * Planned FORMAT parser: execution. + * + * Execute a row-local FORMAT plan. Parsing proceeds sample-major because that + * matches the VCF text, then staged rows are encoded op-major to match BCF + * FORMAT layout. Returns 0 on success, -4 for generic fallback, and -1 on hard + * errors after rolling back any direct writes to v->indiv. + */ +static int vcf_parse_format_general_composable(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q, + vcf_format_row_op_t *row_ops, + const size_t *string_span_offsets, + size_t string_spans_end, + vcf_format_plan_fallback_reason_t *reason) +{ + kstring_t *mem = (kstring_t*)&h->mem; + int nsamples = h->keep_samples ? h->nsamples_ori : bcf_hdr_nsamples(h); + int output_nsamples = bcf_hdr_nsamples(h), sample, kept = 0, j; + int direct_ops = vcf_format_direct_prefix_len(row_ops, plan->n_ops); + int max_counts[MAX_N_FMT]; + vcf_plan_int_range_t ranges[MAX_N_FMT]; + size_t indiv_l0 = v->indiv.l; + size_t direct_offsets[MAX_N_FMT]; + uint8_t *op_base[MAX_N_FMT]; + size_t op_stride[MAX_N_FMT]; + const char *cur = q + 1, *end = s->s + s->l; + + /* + * The executor writes data in BCF's transposed FORMAT layout: all samples + * for FORMAT op 0, then all samples for op 1, etc. Leading fixed-width + * GT2/FLOAT1 rows can be written directly to v->indiv; the remaining rows + * are staged in h->mem so they can be parsed sample-major and encoded + * op-major once row-local ranges and widths are known. + * + * If keep_samples is active, nsamples is the number of columns to scan in + * the input line and output_nsamples is the dense BCF sample count. This + * mirrors the generic parser: unselected sample columns may influence + * neither emitted widths nor output cardinality. + */ + for (j = 0; j < plan->n_ops; j++) { + max_counts[j] = row_ops[j].can_compact ? 0 : row_ops[j].width; + direct_offsets[j] = 0; + vcf_plan_int_range_init(&ranges[j]); + } + + for (j = 0; j < direct_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + + bcf_enc_int1(&v->indiv, op->key); + if (op->kind == VCF_FORMAT_ROW_GT2) { + if (bcf_enc_size(&v->indiv, 2, BCF_BT_INT8) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * 2) < 0) + goto error; + direct_offsets[j] = v->indiv.l; + v->indiv.l += (size_t)output_nsamples * 2; + } else { + if (bcf_enc_size(&v->indiv, 1, BCF_BT_FLOAT) < 0 || + ks_resize(&v->indiv, v->indiv.l + (size_t)output_nsamples * sizeof(float)) < 0) + goto error; + direct_offsets[j] = v->indiv.l; + v->indiv.l += (size_t)output_nsamples * sizeof(float); + } + } + + mem->l = string_spans_end; + for (j = direct_ops; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + + if ((uint64_t) mem->l + output_nsamples * (uint64_t) op->size > INT_MAX) + goto error; + if (align_mem(mem) < 0) + goto error; + op->offset = mem->l; + if (ks_resize(mem, mem->l + output_nsamples * (size_t) op->size) < 0) + goto error; + mem->l += output_nsamples * (size_t) op->size; + } + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + if (j < direct_ops) { + op_base[j] = (uint8_t *)v->indiv.s + direct_offsets[j]; + op_stride[j] = op->kind == VCF_FORMAT_ROW_GT2 ? 2 : (size_t)op->size; + } else { + op_base[j] = (uint8_t *)mem->s + op->offset; + op_stride[j] = (size_t)op->size; + } + } + + for (sample = 0; sample < nsamples && cur < end; sample++) { + if (h->keep_samples && !bit_array_test(h->keep_samples, sample)) { + cur = vcf_format_skip_sample_column(cur, end); + continue; + } + for (j = 0; j < plan->n_ops; j++) { + vcf_format_row_op_t *op = &row_ops[j]; + uint8_t *buf = op_base[j] + kept * op_stride[j]; + int n = op->width; + + /* + * Each op parser consumes exactly one sample subfield and leaves cur + * on the following ':' or tab. Values outside this executor's + * supported subset, such as non-simple GT encodings, trigger + * generic fallback. + */ + switch (op->kind) { + case VCF_FORMAT_ROW_GT2: + if (vcf_plan_gt2_u8(&cur, buf) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_GT_SHAPE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_INT1: + if (vcf_plan_int_scalar_flexible_range(&cur, (int32_t *)buf, &ranges[j]) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_INTVEC: + if (vcf_plan_parse_int_vector_flexible_counted_range(&cur, (int32_t *)buf, + op->width, &n, &ranges[j]) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_FLOAT1: + if (j < direct_ops) { + float f; + if (vcf_plan_float_scalar_flexible(&cur, &f) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + float_to_le(f, buf); + } else if (vcf_plan_float_scalar_flexible(&cur, (float *)buf) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_FLOATN: + if (vcf_plan_parse_float_vector_dynamic_counted(&cur, (float *)buf, + op->width, &n) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + break; + case VCF_FORMAT_ROW_STR: + if (string_span_offsets && string_span_offsets[j] != (size_t)-1) { + vcf_format_string_span_t *spans = + (vcf_format_string_span_t *)(mem->s + string_span_offsets[j]); + if (vcf_plan_copy_string_span(&spans[kept], &cur, + (char *)buf, op->width) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + goto fallback; + } + } else if (vcf_plan_copy_string(&cur, (char *)buf, op->width) < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_STRING_WIDTH); + goto fallback; + } + break; + default: + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_PARSE); + goto fallback; + } + if (row_ops[j].can_compact && max_counts[j] < n) + max_counts[j] = n; + + if (j + 1 < plan->n_ops) { + if (vcf_plan_expect_sep(&cur, ':') < 0) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + goto fallback; + } + } else { + if (*cur == '\t') + cur++; + else if (*cur == '\0' || cur >= end) + ; + else { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SEPARATOR); + goto fallback; + } + } + } + if (++kept == output_nsamples) + break; + } + if (kept != output_nsamples) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_SAMPLE_COUNT); + goto fallback; + } + for (j = 0; j < plan->n_ops; j++) { + if (!row_ops[j].can_compact) + continue; + if (max_counts[j] <= 0 || max_counts[j] > row_ops[j].width) { + vcf_format_plan_set_reason(reason, VCF_FORMAT_PLAN_FB_NUMERIC_WIDTH); + goto fallback; + } + if (max_counts[j] < row_ops[j].width) { + /* + * The generic parser encodes fixed-width vector rows at the + * observed row maximum, not necessarily the conservative + * header-derived width. Compacting here avoids unnecessary + * whole-row fallback while keeping byte-identical BCF output. + */ + if (vcf_format_compact_row_op(mem, output_nsamples, &row_ops[j], max_counts[j]) < 0) + goto error; + } + } + + v->n_fmt = plan->n_ops; + v->n_sample = output_nsamples; + if (vcf_format_general_encode_row_ops_from_ranges(&v->indiv, mem, output_nsamples, + plan->n_ops, row_ops, + ranges, direct_ops) < 0) + goto error; + if (vcf_parse_format_check7(h, v) < 0) + goto error; + if (vcf_format_plan_stats_enabled()) { + vcf_format_plan_stats.hits++; + vcf_format_plan_stats.parsed_samples += output_nsamples; + } + return 0; + +fallback: + /* + * Only v->indiv is mutated by this executor before success is known. All + * scratch data lives in h->mem and can be overwritten by the fallback parse. + */ + v->indiv.l = indiv_l0; + return -4; +error: + v->indiv.l = indiv_l0; + return -1; +} + +/* + * Planned FORMAT parser: entry points and fallback contract. + * + * vcf_parse_format_planned() is called before the generic FORMAT pipeline. It + * returns 0 after a successful planned parse, -3 when the caller should run the + * generic parser, and a hard error for allocation or consistency failures. + */ +static int vcf_parse_format_general_strict(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, + const vcf_format_general_plan_t *plan, + char *q, + vcf_format_plan_fallback_reason_t *reason) +{ + int widths[MAX_N_FMT]; + vcf_format_row_op_t row_ops[MAX_N_FMT]; + size_t string_span_offsets_buf[MAX_N_FMT]; + size_t *string_span_offsets = plan->has_string_spans ? string_span_offsets_buf : NULL; + size_t string_spans_end = 0; + int ret; + + ret = vcf_format_general_strict_widths(s, h, plan, v, q, widths, + string_span_offsets, + &string_spans_end, reason); + if (ret < 0) + return ret; + if (vcf_format_general_resolve_ops(plan, v, widths, row_ops, reason) < 0) + return -4; + return vcf_parse_format_general_composable(s, h, v, plan, q, row_ops, + string_span_offsets, + string_spans_end, reason); +} + +static int vcf_parse_format_general_planned(kstring_t *s, const bcf_hdr_t *h, + bcf1_t *v, char *p, char *q) +{ + vcf_format_general_plan_t *plan; + vcf_format_plan_fallback_reason_t reason = VCF_FORMAT_PLAN_FB_PARSE; + int nsamples, ret; + + plan = vcf_format_general_plan_get(h, p, &reason); + if (!plan) { + reason = VCF_FORMAT_PLAN_FB_UNSUPPORTED; + goto fallback; + } + + nsamples = bcf_hdr_nsamples(h); + if (!nsamples) + return 0; + ret = vcf_parse_format_general_strict(s, h, v, plan, q, &reason); + if (ret == 0) + return ret; + if (ret != -4) + return ret; + +fallback: + vcf_format_plan_note_fallback(reason); + return -3; +} + +static int vcf_parse_format_planned(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) +{ + if (!vcf_format_plan_enabled()) + return -3; + if (vcf_format_plan_stats_enabled()) + vcf_format_plan_stats.attempts++; + + return vcf_parse_format_general_planned(s, h, v, p, q); +} + +// detect FORMAT "." +static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q) { + const char *end = s->s + s->l; + if ( q>=end ) + { + hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_NCOLS; + return -1; + } + + v->n_fmt = 0; + if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "." + { + v->n_sample = bcf_hdr_nsamples(h); + return 1; + } + + return 0; +} + +// get format information from the dictionary +static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q, fmt_aux_t *fmt) { + const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + char *t; + int j; + ks_tokaux_t aux1; + + for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { + if (j >= MAX_N_FMT) { + v->errcode |= BCF_ERR_LIMITS; + hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle", + bcf_seqname_safe(h,v), v->pos+1); + return -1; + } + + *(char*)aux1.p = 0; + khint_t k = kh_get(vdict, d, t); + if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { + if ( t[0]=='.' && t[1]==0 ) + { + hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_TAG_INVALID; + return -1; + } + hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1); + kstring_t tmp = {0,0,0}; + int l; + ksprintf(&tmp, "##FORMAT=", t); + bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); + free(tmp.s); + int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; + if (res < 0) bcf_hrec_destroy(hrec); + if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); + + k = kh_get(vdict, d, t); + v->errcode |= BCF_ERR_TAG_UNDEF; + if (res || k == kh_end(d)) { + hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_TAG_INVALID; + return -1; + } + } + fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; + fmt[j].key = kh_val(d, k).id; + fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]); + fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; + v->n_fmt++; + } + return 0; +} + +// compute max +static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { + int n_sample_ori = -1; + char *r = q + 1; // r: position in the format string + int l = 0, m = 1, g = 1, j; + v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + const char *end = s->s + s->l; + + while ( rkeep_samples ) + { + n_sample_ori++; + if ( !bit_array_test(h->keep_samples,n_sample_ori) ) + { + while ( *r!='\t' && ris_gt) g++; + break; + + case '\t': + *r = 0; // fall through + + default: // valid due to while loop above. + case '\0': + case ':': + l = r - r_start; r_start = r; + if (f->max_m < m) f->max_m = m; + if (f->max_l < l) f->max_l = l; + if (f->is_gt && f->max_g < g) f->max_g = g; + l = 0, m = g = 1; + if ( *r==':' ) { + j++; f++; + if ( j>=v->n_fmt ) { + hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"", + h->id[BCF_DT_CTG][v->rid].key, v->pos+1); + v->errcode |= BCF_ERR_NCOLS; + return -1; + } + } else goto end_for; + break; + } + if ( r>=end ) break; + r++; + } + end_for: + v->n_sample++; + if ( v->n_sample == bcf_hdr_nsamples(h) ) break; + r++; + } + + return 0; +} + +// allocate memory for arrays +static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q, + fmt_aux_t *fmt) { + kstring_t *mem = (kstring_t*)&h->mem; + + int j; + for (j = 0; j < v->n_fmt; ++j) { + fmt_aux_t *f = &fmt[j]; + if ( !f->max_m ) f->max_m = 1; // omitted trailing format field + + if ((f->y>>4&0xf) == BCF_HT_STR) { + f->size = f->is_gt? f->max_g << 2 : f->max_l; + } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { + f->size = f->max_m << 2; + } else { + hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_TAG_INVALID; + return -1; } if (align_mem(mem) < 0) { @@ -3686,7 +5233,13 @@ static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { + int pret; if ( !bcf_hdr_nsamples(h) ) return 0; + + pret = vcf_parse_format_planned(s, h, v, p, q); + if (pret != -3) + return pret; + kstring_t *mem = (kstring_t*)&h->mem; mem->l = 0;