From 6b085af9d83872b029a2590d4454a8027f830b52 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 Jan 2024 14:49:56 -0500 Subject: [PATCH 1/7] maf q-line support via q column tags --- taffy/impl/maf.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ taffy/impl/paf.c | 1 - taffy/inc/taf.h | 2 ++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/taffy/impl/maf.c b/taffy/impl/maf.c index e4b63fe..8811cee 100644 --- a/taffy/impl/maf.c +++ b/taffy/impl/maf.c @@ -81,13 +81,67 @@ Tag *maf_read_header(LI *li) { void maf_write_block(Alignment *alignment, LW *lw) { LW_write(lw, "a\n"); + + // check for base quality ('q' tag). Just looking at position[0] to not be too inefficient + // so the assumption is either you have a quality for every base in every column, or nothing + // will fail on an error if this doesn't hold (but we can relax if needed later) + bool has_qualities = false; + Tag **col_qualities = NULL; + char *qual_buffer = NULL; + if (alignment->column_number > 0) { + for (Tag *col_tag = alignment->column_tags[0]; col_tag && !has_qualities; col_tag = col_tag->n_tag) { + if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) { + has_qualities = true; + } + } + if (has_qualities) { + col_qualities = (Tag**)st_calloc(alignment->column_number, sizeof(Tag*)); + qual_buffer = (char*)st_calloc(alignment->column_number + 1, sizeof(char)); + // if we have qualites, fill in col_qualities so we don't have to fish for the tags again + for (int64_t col = 0; col < alignment->column_number; ++col) { + for (Tag *col_tag = alignment->column_tags[col]; col_tag && !col_qualities[col]; col_tag = col_tag->n_tag) { + if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) { + col_qualities[col] = col_tag; + } + } + if (col_qualities[col] == NULL) { + // see comment above + fprintf(stderr, "Error: missing base quality at column in block with base qualities\n"); + exit(1); + } + } + } + } + Alignment_Row *row = alignment->row; + int64_t row_idx = 0; while(row != NULL) { LW_write(lw, "s\t%s\t%" PRIi64 "\t%" PRIi64 "\t%s\t%" PRIi64 "\t%s\n", row->sequence_name, row->start, row->length, row->strand ? "+" : "-", row->sequence_length, row->bases); + + if (has_qualities && row->length > 0) { + int64_t i = 0; + for (int64_t col = 0; col < alignment->column_number; ++col) { + if (row->bases[col] != '-') { + // this is an ascii-shifted phred score + unsigned char qual = col_qualities[col]->value[row_idx] - (unsigned char)33; + // do the transformation shown here + // https://genome.ucsc.edu/FAQ/FAQformat.html#format5 + // MAF quality value = min( floor(actual quality value/5), 9 ) + qual_buffer[i++] = qual >= 99 ? 'F' : (qual >= 45 ? '9' : '0' + qual/5); + } + } + assert(i == row->length); + qual_buffer[i] = '\0'; + LW_write(lw, "q\t%s\t%s\n", row->sequence_name, qual_buffer); + } + row = row->n_row; + ++row_idx; } LW_write(lw, "\n"); // Add a blank line at the end of the block + free(qual_buffer); + free(col_qualities); } void write_header(Tag *tag, LW *lw, char *header_prefix, char *delimiter, char *end); diff --git a/taffy/impl/paf.c b/taffy/impl/paf.c index 164d313..4181d0a 100644 --- a/taffy/impl/paf.c +++ b/taffy/impl/paf.c @@ -117,7 +117,6 @@ static void paf_write_row(Alignment_Row *q_row, Alignment_Row *t_row, int64_t nu // finalize trailing event if (current_start < num_col && current_start >= 0 && current_event[0] != '.') { - int64_t i = num_col; // note, this code block is (must be) identical to above. todo: factor out if (cs_cigar) { query_buffer[query_buffer_length] = '\0'; diff --git a/taffy/inc/taf.h b/taffy/inc/taf.h index f0a6980..ddf7d3d 100644 --- a/taffy/inc/taf.h +++ b/taffy/inc/taf.h @@ -7,6 +7,8 @@ #include "sonLib.h" #include "line_iterator.h" +#define TAF_BASE_QUALITY_TAG_KEY "q" + /* * Structures to represent blocks of an alignment */ From ef67f7775bb06008c34e8f746920a4e69413281f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Jan 2024 10:59:15 -0500 Subject: [PATCH 2/7] fix mem leak in tag_destruct() --- taffy/impl/alignment_block.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/taffy/impl/alignment_block.c b/taffy/impl/alignment_block.c index da2a303..36d43cb 100644 --- a/taffy/impl/alignment_block.c +++ b/taffy/impl/alignment_block.c @@ -6,6 +6,8 @@ void tag_destruct(Tag *tag) { while(tag != NULL) { Tag *p_tag = tag; tag = tag->n_tag; + free(p_tag->key); + free(p_tag->value); free(p_tag); } } From b5a24717b2a0835cb813f790deadf250e44c9009 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 30 Jan 2024 09:51:15 -0500 Subject: [PATCH 3/7] add gaps to qual strings --- taffy/impl/maf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/taffy/impl/maf.c b/taffy/impl/maf.c index 8811cee..e3a30aa 100644 --- a/taffy/impl/maf.c +++ b/taffy/impl/maf.c @@ -129,9 +129,11 @@ void maf_write_block(Alignment *alignment, LW *lw) { // https://genome.ucsc.edu/FAQ/FAQformat.html#format5 // MAF quality value = min( floor(actual quality value/5), 9 ) qual_buffer[i++] = qual >= 99 ? 'F' : (qual >= 45 ? '9' : '0' + qual/5); + } else { + qual_buffer[i++] = '-'; } } - assert(i == row->length); + assert(i == alignment->column_number); qual_buffer[i] = '\0'; LW_write(lw, "q\t%s\t%s\n", row->sequence_name, qual_buffer); } From 3b57d06d0ebe886eff509bdc0ebf210b9f2a0702 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 30 Jan 2024 10:06:43 -0500 Subject: [PATCH 4/7] align qualites with bases --- taffy/impl/maf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taffy/impl/maf.c b/taffy/impl/maf.c index e3a30aa..ed2ea86 100644 --- a/taffy/impl/maf.c +++ b/taffy/impl/maf.c @@ -135,7 +135,7 @@ void maf_write_block(Alignment *alignment, LW *lw) { } assert(i == alignment->column_number); qual_buffer[i] = '\0'; - LW_write(lw, "q\t%s\t%s\n", row->sequence_name, qual_buffer); + LW_write(lw, "q\t%s\t\t\t\t\t%s\n", row->sequence_name, qual_buffer); } row = row->n_row; From beba2160f199c18e250a176e7fb4eea0e393599b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 30 Jan 2024 12:26:34 -0500 Subject: [PATCH 5/7] maf qual parser --- taffy/impl/maf.c | 100 ++++++++++++++++++++++++++++++++++++---------- tests/view_test.c | 12 ++++++ 2 files changed, 91 insertions(+), 21 deletions(-) diff --git a/taffy/impl/maf.c b/taffy/impl/maf.c index ed2ea86..e528db2 100644 --- a/taffy/impl/maf.c +++ b/taffy/impl/maf.c @@ -5,6 +5,36 @@ #include "sonLib.h" #include "line_iterator.h" +static void set_maf_qualities(Alignment *alignment, stList* row_qualities, stList* row_quality_rows) { + // transpose our row qualities into the tags. + // first, make a tag for each quality + for (int64_t i = 0; i < alignment->column_number; ++i) { + char *column_qualities = (char*)st_calloc(alignment->row_number, sizeof(char)); + int64_t col_row_idx = 0; + for (int64_t j = 0; j < alignment->row_number; ++j) { + char qual = 'F'; // todo: is there a better default? + if (col_row_idx < stList_length(row_quality_rows) && + j == (int64_t)stList_get(row_quality_rows, col_row_idx)) { + char* row_quals = (char*)stList_get(row_qualities, col_row_idx); + qual = row_quals[i]; + // clamp it to 0-9 + if (qual < '0') { + qual = '0'; + } else if (qual > '9') { + qual = '9'; + } + ++col_row_idx; + } + // convert to ascii phred (which is bounded between 0x21 (!) and 0x7e (~) + column_qualities[j] = qual == 'F' ? '~' : '!' + (char)(5 * (qual - '0')); + } + alignment->column_tags[i] = tag_construct(TAF_BASE_QUALITY_TAG_KEY, column_qualities, NULL); + free(column_qualities); + } + stList_destruct(row_qualities); + stList_destruct(row_quality_rows); +} + Alignment *maf_read_block(LI *li) { while(1) { char *line = LI_get_next_line(li); @@ -21,42 +51,70 @@ Alignment *maf_read_block(LI *li) { stList_destruct(tokens); Alignment *alignment = st_calloc(1, sizeof(Alignment)); Alignment_Row **p_row = &(alignment->row); + Alignment_Row *last_row = NULL; + stList *row_qualities = NULL; + stList *row_quality_rows = NULL; while(1) { line = LI_get_next_line(li); if(line == NULL) { + if (row_qualities != NULL) { + set_maf_qualities(alignment, row_qualities, row_quality_rows); + } return alignment; } tokens = stString_split(line); free(line); if(stList_length(tokens) == 0) { stList_destruct(tokens); + if (row_qualities != NULL) { + set_maf_qualities(alignment, row_qualities, row_quality_rows); + } return alignment; } - if(strcmp(stList_get(tokens, 0), "s") != 0) { + if(strcmp(stList_get(tokens, 0), "s") == 0) { + assert(strcmp(stList_get(tokens, 0), "s") == 0); // Must be an "s" line + Alignment_Row *row = st_calloc(1, sizeof(Alignment_Row)); + alignment->row_number++; + *p_row = row; + p_row = &(row->n_row); + last_row = row; + row->sequence_name = stString_copy(stList_get(tokens, 1)); + row->start = atol(stList_get(tokens, 2)); + row->length = atol(stList_get(tokens, 3)); + assert(strcmp(stList_get(tokens, 4), "+") == 0 || strcmp(stList_get(tokens, 4), "-") == 0); + row->strand = strcmp(stList_get(tokens, 4), "+") == 0; + row->sequence_length = atol(stList_get(tokens, 5)); + row->bases = stString_copy(stList_get(tokens, 6)); + stList_destruct(tokens); + if(alignment->row_number == 1) { + alignment->column_number = strlen(row->bases); + alignment->column_tags = st_calloc(alignment->column_number, sizeof(Tag *)); + } + else { + assert(alignment->column_number == strlen(row->bases)); + } + } else if(strcmp(stList_get(tokens, 0), TAF_BASE_QUALITY_TAG_KEY) == 0) { + if (row_qualities == NULL) { + row_qualities = stList_construct3(0, free); + row_quality_rows = stList_construct(); + } + char *qual_seq_name = stList_get(tokens, 1); + if (last_row == NULL || strcmp(qual_seq_name, last_row->sequence_name) != 0) { + fprintf(stderr, "Error: q line invalid because sequence name does not match previous s line: %s\n", line); + exit(1); + } + assert(stList_length(tokens) == 3); + stList_append(row_qualities, stString_copy(stList_get(tokens, 2))); + stList_append(row_quality_rows, (void*)(alignment->row_number - 1)); + stList_destruct(tokens); + } else { assert(strcmp(stList_get(tokens, 0), "i") == 0 || strcmp(stList_get(tokens, 0), "e") == 0); // Must be an "i" or "e" line, which we ignore stList_destruct(tokens); continue; } - assert(strcmp(stList_get(tokens, 0), "s") == 0); // Must be an "s" line - Alignment_Row *row = st_calloc(1, sizeof(Alignment_Row)); - alignment->row_number++; - *p_row = row; - p_row = &(row->n_row); - row->sequence_name = stString_copy(stList_get(tokens, 1)); - row->start = atol(stList_get(tokens, 2)); - row->length = atol(stList_get(tokens, 3)); - assert(strcmp(stList_get(tokens, 4), "+") == 0 || strcmp(stList_get(tokens, 4), "-") == 0); - row->strand = strcmp(stList_get(tokens, 4), "+") == 0; - row->sequence_length = atol(stList_get(tokens, 5)); - row->bases = stString_copy(stList_get(tokens, 6)); - stList_destruct(tokens); - if(alignment->row_number == 1) { - alignment->column_number = strlen(row->bases); - alignment->column_tags = st_calloc(alignment->column_number, sizeof(Tag *)); - } - else { - assert(alignment->column_number == strlen(row->bases)); - } + } + if (row_qualities != NULL) { + set_maf_qualities(alignment, row_qualities, row_quality_rows); } return alignment; } diff --git a/tests/view_test.c b/tests/view_test.c index 22fd19a..bc1f11a 100644 --- a/tests/view_test.c +++ b/tests/view_test.c @@ -72,8 +72,20 @@ static void test_paf(CuTest *testCase) { } +static void test_maf_quals(CuTest *testCase) { + char *example_file = "./tests/qual.maf"; + char *output_file = "./tests/qual.out.maf"; + char *truth_file = "./tests/qual.maf"; + int i = st_system("./bin/taffy view -i %s -m > %s", example_file, output_file); + CuAssertIntEquals(testCase, 0, i); // return value should be zero + int diff_ret = st_system("diff %s %s", output_file, truth_file); + CuAssertIntEquals(testCase, 0, diff_ret); // return value should be zero if files sames + st_system("rm -f %s", output_file); +} + CuSuite* view_test_suite(void) { CuSuite* suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_paf); + SUITE_ADD_TEST(suite, test_maf_quals); return suite; } From 57ca84c7471f3e6758107f5bc275eb8d72917db8 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 13 Mar 2024 11:32:52 -0400 Subject: [PATCH 6/7] switch to base64 encoding for taf qualities --- .gitmodules | 3 ++ Makefile | 14 +++++++--- README.md | 6 ++++ taffy/impl/maf.c | 61 ++++++++++++++++++++++++----------------- taffy/submodules/base64 | 1 + 5 files changed, 56 insertions(+), 29 deletions(-) create mode 160000 taffy/submodules/base64 diff --git a/.gitmodules b/.gitmodules index bff9d10..53d3cc3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,6 @@ [submodule "taffy/submodules/abPOA"] path = taffy/submodules/abPOA url = https://github.com/yangao07/abPOA.git +[submodule "taffy/submodules/base64"] + path = taffy/submodules/base64 + url = https://github.com/zhicheng/base64.git diff --git a/Makefile b/Makefile index 0c2416d..c5e5692 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ include ${rootPath}/include.mk srcDir = taffy/impl libHeaders = taffy/inc/*.h libTests = tests/*.c +base64Dir = taffy/submodules/base64 all: all_libs all_progs all_libs: ${LIBDIR}/libstTaf.a @@ -31,10 +32,14 @@ ${sonLibDir}/sonLib.a : sonLib ${sonLibDir}/cuTest.a : sonLib +${base64Dir}/base64.o : ${base64Dir}/base64.h ${base64Dir}/base64.c + cd ${base64Dir} && ${CC} ${CFLAGS} -o base64.o -c base64.c + ln -f ${base64Dir}/base64.h ${INCLDIR} + stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a -${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} - ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o +${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} ${base64Dir}/base64.o + ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${base64Dir}/base64.o mv libstTaf.a ${LIBDIR}/ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders} @@ -43,7 +48,7 @@ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders} ${srcDir}/line_iterator.o : ${srcDir}/line_iterator.c ${libHeaders} ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/line_iterator.o -c ${srcDir}/line_iterator.c -${srcDir}/maf.o : ${srcDir}/maf.c ${libHeaders} +${srcDir}/maf.o : ${srcDir}/maf.c ${libHeaders} ${base64Dir}/base64.o ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/maf.o -c ${srcDir}/maf.c ${srcDir}/paf.o : ${srcDir}/paf.c ${libHeaders} @@ -70,7 +75,7 @@ ${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders} ${BINDIR}/stTafTests : ${libTests} ${LIBDIR}/libstTaf.a ${stTafDependencies} ${CC} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/stTafTests ${libTests} ${LIBDIR}/libstTaf.a ${LDLIBS} -${BINDIR}/taffy : taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o ${LIBDIR}/libstTaf.a ${libHeaders} ${stTafDependencies} +${BINDIR}/taffy : taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o ${LIBDIR}/libstTaf.a ${libHeaders} ${stTafDependencies} ${CXX} ${CPPFLAGS} ${CXXFLAGS} taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o -o ${BINDIR}/taffy ${LIBDIR}/libstTaf.a ${LDLIBS} taffy_main.o : taffy_main.cpp ${stTafDependencies} ${libHeaders} @@ -104,6 +109,7 @@ python_test: all ${BINDIR}/stTafTests clean : cd ${sonLibRootDir} && ${MAKE} clean cd taffy/submodules/abPOA && ${MAKE} clean + rm -f ${base64Dir}/base64.o rm -rf *.o taffy/impl/*.o ${LIBDIR} ${BINDIR} static : diff --git a/README.md b/README.md index 42ea358..e91ac9f 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,12 @@ The corresponding TAF file (265 bytes): CCCCC AGGGG +# Base Qualities + +The [MAF format](https://genome.cse.ucsc.edu/FAQ/FAQformat.html#format5) can represent base qualites via `q`-lines. Taffy will read these into `q`-tags in TAF. Whereas MAF only supports 11 quality values, `0-9` and `F`, the TAF `q`-tags allow one full bytes per quality value so roundtripping from TAF to MAF then back to TAF will be lossy. These qualites are encoded in base64 in TAF (similar to `vg`'s GAM format). To access the ith quality value in Python, you'd use `base64.b64decode(quality_string)[i]`. + +Base qualities can be stored in a `q` tag in TAF. + # Installing Taffy CLI/C Library Do build this repo clone the repo as follows and then make: diff --git a/taffy/impl/maf.c b/taffy/impl/maf.c index 1879935..dda69ec 100644 --- a/taffy/impl/maf.c +++ b/taffy/impl/maf.c @@ -4,32 +4,37 @@ #include "taf.h" #include "sonLib.h" #include "line_iterator.h" +#include "base64.h" static void set_maf_qualities(Alignment *alignment, stList* row_qualities, stList* row_quality_rows) { // transpose our row qualities into the tags. - // first, make a tag for each quality + // first, make a tag for each column for (int64_t i = 0; i < alignment->column_number; ++i) { - char *column_qualities = (char*)st_calloc(alignment->row_number, sizeof(char)); + unsigned char *column_qualities = (unsigned char*)st_calloc(alignment->row_number, sizeof(unsigned char)); int64_t col_row_idx = 0; for (int64_t j = 0; j < alignment->row_number; ++j) { - char qual = 'F'; // todo: is there a better default? + unsigned char qual = 255; // todo: is there a better default? if (col_row_idx < stList_length(row_quality_rows) && j == (int64_t)stList_get(row_quality_rows, col_row_idx)) { - char* row_quals = (char*)stList_get(row_qualities, col_row_idx); - qual = row_quals[i]; - // clamp it to 0-9 - if (qual < '0') { - qual = '0'; - } else if (qual > '9') { - qual = '9'; + char* row_quals = (char*)stList_get(row_qualities, col_row_idx); + qual = (unsigned char)row_quals[i]; + // apply conversion from maf qual to raw qual + // as defined on https://genome.cse.ucsc.edu/FAQ/FAQformat.html#format5 + if (qual == 'F') { + qual = 99; + } else if (qual != '-') { + assert(qual >= '0' && qual <= '9'); + qual = (qual - '0') * 5; } ++col_row_idx; } - // convert to ascii phred (which is bounded between 0x21 (!) and 0x7e (~) - column_qualities[j] = qual == 'F' ? '~' : '!' + (char)(5 * (qual - '0')); + column_qualities[j] = qual; } - alignment->column_tags[i] = tag_construct(TAF_BASE_QUALITY_TAG_KEY, column_qualities, NULL); + char *encoded_qualities = (char*)st_calloc(BASE64_ENCODE_OUT_SIZE(alignment->row_number) + 1, sizeof(char)); + base64_encode(column_qualities, alignment->row_number, encoded_qualities); + alignment->column_tags[i] = tag_construct(TAF_BASE_QUALITY_TAG_KEY, encoded_qualities, NULL); free(column_qualities); + free(encoded_qualities); } stList_destruct(row_qualities); stList_destruct(row_quality_rows); @@ -144,7 +149,7 @@ void maf_write_block2(Alignment *alignment, LW *lw, bool color_bases) { // so the assumption is either you have a quality for every base in every column, or nothing // will fail on an error if this doesn't hold (but we can relax if needed later) bool has_qualities = false; - Tag **col_qualities = NULL; + unsigned char **decoded_col_qualities = NULL; char *qual_buffer = NULL; if (alignment->column_number > 0) { for (Tag *col_tag = alignment->column_tags[0]; col_tag && !has_qualities; col_tag = col_tag->n_tag) { @@ -153,16 +158,17 @@ void maf_write_block2(Alignment *alignment, LW *lw, bool color_bases) { } } if (has_qualities) { - col_qualities = (Tag**)st_calloc(alignment->column_number, sizeof(Tag*)); - qual_buffer = (char*)st_calloc(alignment->column_number + 1, sizeof(char)); - // if we have qualites, fill in col_qualities so we don't have to fish for the tags again + decoded_col_qualities = (unsigned char**)st_calloc(alignment->column_number, sizeof(unsigned char*)); + qual_buffer = (char*)st_calloc(alignment->column_number, sizeof(char)); + // if we have qualites, decode them into a matrix for the entire block so they can be transposed for (int64_t col = 0; col < alignment->column_number; ++col) { - for (Tag *col_tag = alignment->column_tags[col]; col_tag && !col_qualities[col]; col_tag = col_tag->n_tag) { + for (Tag *col_tag = alignment->column_tags[col]; col_tag && !decoded_col_qualities[col]; col_tag = col_tag->n_tag) { if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) { - col_qualities[col] = col_tag; + decoded_col_qualities[col] = (unsigned char*)st_calloc(alignment->row_number + 1, sizeof(unsigned char)); + base64_decode(col_tag->value, BASE64_ENCODE_OUT_SIZE(alignment->row_number) - 1, decoded_col_qualities[col]); } } - if (col_qualities[col] == NULL) { + if (decoded_col_qualities[col] == NULL) { // see comment above fprintf(stderr, "Error: missing base quality at column in block with base qualities\n"); exit(1); @@ -182,12 +188,12 @@ void maf_write_block2(Alignment *alignment, LW *lw, bool color_bases) { int64_t i = 0; for (int64_t col = 0; col < alignment->column_number; ++col) { if (row->bases[col] != '-') { - // this is an ascii-shifted phred score - unsigned char qual = col_qualities[col]->value[row_idx] - (unsigned char)33; + // this is a raw quality value + unsigned char qual = decoded_col_qualities[col][row_idx]; // do the transformation shown here // https://genome.ucsc.edu/FAQ/FAQformat.html#format5 // MAF quality value = min( floor(actual quality value/5), 9 ) - qual_buffer[i++] = qual >= 99 ? 'F' : (qual >= 45 ? '9' : '0' + qual/5); + qual_buffer[i++] = (char)(qual >= 99 ? 'F' : (qual >= 45 ? '9' : '0' + qual/5)); } else { qual_buffer[i++] = '-'; } @@ -204,8 +210,13 @@ void maf_write_block2(Alignment *alignment, LW *lw, bool color_bases) { } } LW_write(lw, "\n"); // Add a blank line at the end of the block - free(qual_buffer); - free(col_qualities); + if (has_qualities) { + free(qual_buffer); + for (int64_t i = 0; i < alignment->column_number; ++i) { + free(decoded_col_qualities[i]); + } + free(decoded_col_qualities); + } } void maf_write_block(Alignment *alignment, LW *lw) { diff --git a/taffy/submodules/base64 b/taffy/submodules/base64 new file mode 160000 index 0000000..81060e3 --- /dev/null +++ b/taffy/submodules/base64 @@ -0,0 +1 @@ +Subproject commit 81060e3338120b43d759ee8adfe24619370c5f36 From f1e72aafbbc0d39182a924346764388512f8d5ea Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 Apr 2024 12:22:22 -0400 Subject: [PATCH 7/7] add missing test file to git --- tests/qual.maf | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/qual.maf diff --git a/tests/qual.maf b/tests/qual.maf new file mode 100644 index 0000000..b771d27 --- /dev/null +++ b/tests/qual.maf @@ -0,0 +1,35 @@ +##maf version=1 + +a +s x 0 8 + 50 CAA---ATAAG +q x 999---99999 +s ins 0 11 + 11 CAAGGGATAAG +q ins 94490049989 + +a +s x 8 1 + 50 G + +a +s x 9 1 + 50 C + +a +s x 10 3 + 50 TTG + +a +s x 13 1 + 50 G + +a +s x 14 19 + 50 AAATTTTCTGGAGTTCTAT + +a +s x 33 1 + 50 T + +a +s x 34 4 + 50 ATAT + +a +s x 38 1 + 50 T + +a +s x 39 11 + 50 CCAACTCTCTG +