From 60ac4ea61846dad99346298350b1b9abde8c240c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 12 May 2026 17:22:44 +0100 Subject: [PATCH] Make faidx work with very long (>4 Gbyte!) lines Although faidx should support very long references, writing one longer than 4Gbases on a single line broke it because it used a uint32_t field to store the line length. To make it work with such inputs, faidx1_t::line_blen is increased in size to uint64_t so the correct length can be stored. To avoid having to do the same for faidx1_t::line_len, which would make each entry quite a bit bigger for a fairly rare use-case, that field is changed so that it stores the number of bytes to be skipped at the end of each line instead of the full length. As this value will usually only be 1 or 2, a uint32_t is plenty big enough for it. Combined with the fact that the original structure had a four-byte hole in it (between line_blen and len), it's possible to store the longer line lengths while keeping faidx1_t exactly the same size as it had before. --- faidx.c | 66 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/faidx.c b/faidx.c index c7fc022a8..f2539b678 100644 --- a/faidx.c +++ b/faidx.c @@ -68,10 +68,13 @@ static inline int bgzf_getc_(BGZF *fp) { typedef struct { int id; // faidx_t->name[id] is for this struct. - uint32_t line_len, line_blen; - uint64_t len; - uint64_t seq_offset; - uint64_t qual_offset; + uint32_t line_extra; // Bytes to skip at the end of each line + // Usually 1 for \n or 2 for \r\n ending + uint64_t line_blen; // Number of bases (or quality values) on each line + // Total line length = line_blen + line_extra + uint64_t len; // Length of entire sequence + uint64_t seq_offset; // File offset to start of sequence + uint64_t qual_offset; // File offset to start of quality values (if fastq) } faidx1_t; KHASH_MAP_INIT_STR(s, faidx1_t) @@ -90,10 +93,20 @@ static int fai_name2id(void *v, const char *ref) return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id; } -static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset) +static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint64_t line_len, uint64_t line_blen, uint64_t seq_offset, uint64_t qual_offset) { if (!name) { - hts_log_error("Malformed line"); + hts_log_error("Malformed line: no name"); + return -1; + } + if (line_len < line_blen) { + hts_log_error("Malformed line: width (%"PRIu64") less than base count (%"PRIu64")", + line_len, line_blen); + return -1; + } + if (line_len - line_blen > UINT32_MAX) { + hts_log_error("Malformed line: difference between width (%"PRIu64") and base count (%"PRIu64") too large", + line_len, line_blen); return -1; } @@ -120,7 +133,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, v->id = idx->n; idx->name[idx->n++] = name_key; v->len = len; - v->line_len = line_len; + v->line_extra = (uint32_t) (line_len - line_blen); v->line_blen = line_blen; v->seq_offset = seq_offset; v->qual_offset = qual_offset; @@ -362,12 +375,13 @@ static int fai_save(const faidx_t *fai, hFILE *fp) { if (fai->format == FAI_FASTA) { snprintf(buf, sizeof(buf), - "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\n", - x.len, x.seq_offset, x.line_blen, x.line_len); + "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", + x.len, x.seq_offset, x.line_blen, x.line_blen + x.line_extra); } else { snprintf(buf, sizeof(buf), - "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\n", - x.len, x.seq_offset, x.line_blen, x.line_len, x.qual_offset); + "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", + x.len, x.seq_offset, x.line_blen, + x.line_blen + x.line_extra, x.qual_offset); } if (hputs(fai->name[i], fp) != 0) return -1; @@ -393,10 +407,11 @@ static faidx_t *fai_read(hFILE *fp, const char *fname, int format) if (!buf) goto fail; while ((l = hgetln(buf, 0x10000, fp)) > 0) { - uint32_t line_len, line_blen, n; + uint64_t line_len, line_blen; uint64_t len; uint64_t seq_offset; uint64_t qual_offset = 0; + int n; for (p = buf; *p && !isspace_c(*p); ++p); @@ -405,14 +420,14 @@ static faidx_t *fai_read(hFILE *fp, const char *fname, int format) } if (format == FAI_FASTA) { - n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len); + n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu64"%"SCNu64, &len, &seq_offset, &line_blen, &line_len); if (n != 4) { hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum); goto fail; } } else { - n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset); + n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu64"%"SCNu64"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset); if (n != 5) { if (n == 4) { @@ -726,14 +741,16 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } if (val->line_blen <= 0) { - hts_log_error("Invalid line length in index: %d", val->line_blen); + hts_log_error("Invalid line length in index: %"PRIu64, val->line_blen); *len = -1; return NULL; } + uint64_t line_len = val->line_blen + val->line_extra; + ret = bgzf_useek(fai->bgzf, offset - + beg / val->line_blen * val->line_len + + beg / val->line_blen * line_len + beg % val->line_blen, SEEK_SET); if (ret < 0) { @@ -743,7 +760,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } // Over-allocate so there is extra space for one end-of-line sequence - buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1); + buffer = (char*)malloc((size_t) end - beg + line_len - val->line_blen + 1); if (!buffer) { *len = -1; return NULL; @@ -761,7 +778,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } s = buffer; - firstline_len = val->line_len - beg % val->line_blen; + firstline_len = line_len - beg % val->line_blen; // Read the (partial) first line and its line terminator, but increment s past the // line contents only, so the terminator characters will be overwritten by the next line. @@ -772,8 +789,8 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, // Similarly read complete lines and their line terminator characters, but overwrite the latter. while (remaining > val->line_blen) { - nread = bgzf_read_small(fai->bgzf, s, val->line_len); - if (nread < (ssize_t) val->line_len) goto error; + nread = bgzf_read_small(fai->bgzf, s, line_len); + if (nread < (ssize_t) line_len) goto error; s += val->line_blen; remaining -= val->line_blen; } @@ -827,9 +844,10 @@ static int fai_get_val(const faidx_t *fai, const char *str, } /* - * The internal still has line_blen as uint32_t, but our references - * can be longer, so for future proofing we use hts_pos_t. We also needed - * a signed value so we can return negatives as an error. + * Returns hts_pos_t for historic reasons (line_blen was uint32_t, + * but this function returned a wider value in case it was expanded + * which has now happened). It also needs to return a negative value + * on error. */ hts_pos_t fai_line_length(const faidx_t *fai, const char *str) { @@ -840,7 +858,7 @@ hts_pos_t fai_line_length(const faidx_t *fai, const char *str) if (fai_get_val(fai, str, &len, &val, &beg, &end)) return -1; else - return val.line_blen; + return (hts_pos_t) (val.line_blen <= HTS_POS_MAX ? val.line_blen : HTS_POS_MAX); } char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)