diff --git a/.gitmodules b/.gitmodules index bff9d10..53d3cc3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,6 @@ [submodule "taffy/submodules/abPOA"] path = taffy/submodules/abPOA url = https://github.com/yangao07/abPOA.git +[submodule "taffy/submodules/base64"] + path = taffy/submodules/base64 + url = https://github.com/zhicheng/base64.git diff --git a/Makefile b/Makefile index 0c2416d..c5e5692 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ include ${rootPath}/include.mk srcDir = taffy/impl libHeaders = taffy/inc/*.h libTests = tests/*.c +base64Dir = taffy/submodules/base64 all: all_libs all_progs all_libs: ${LIBDIR}/libstTaf.a @@ -31,10 +32,14 @@ ${sonLibDir}/sonLib.a : sonLib ${sonLibDir}/cuTest.a : sonLib +${base64Dir}/base64.o : ${base64Dir}/base64.h ${base64Dir}/base64.c + cd ${base64Dir} && ${CC} ${CFLAGS} -o base64.o -c base64.c + ln -f ${base64Dir}/base64.h ${INCLDIR} + stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a -${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} - ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o +${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} ${base64Dir}/base64.o + ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${base64Dir}/base64.o mv libstTaf.a ${LIBDIR}/ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders} @@ -43,7 +48,7 @@ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders} ${srcDir}/line_iterator.o : ${srcDir}/line_iterator.c ${libHeaders} ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/line_iterator.o -c ${srcDir}/line_iterator.c -${srcDir}/maf.o : ${srcDir}/maf.c ${libHeaders} +${srcDir}/maf.o : ${srcDir}/maf.c ${libHeaders} ${base64Dir}/base64.o ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/maf.o -c ${srcDir}/maf.c ${srcDir}/paf.o : ${srcDir}/paf.c ${libHeaders} @@ -70,7 +75,7 @@ ${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders} ${BINDIR}/stTafTests : ${libTests} ${LIBDIR}/libstTaf.a ${stTafDependencies} ${CC} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/stTafTests ${libTests} ${LIBDIR}/libstTaf.a ${LDLIBS} -${BINDIR}/taffy : taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o ${LIBDIR}/libstTaf.a ${libHeaders} ${stTafDependencies} +${BINDIR}/taffy : taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o ${LIBDIR}/libstTaf.a ${libHeaders} ${stTafDependencies} ${CXX} ${CPPFLAGS} ${CXXFLAGS} taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o -o ${BINDIR}/taffy ${LIBDIR}/libstTaf.a ${LDLIBS} taffy_main.o : taffy_main.cpp ${stTafDependencies} ${libHeaders} @@ -104,6 +109,7 @@ python_test: all ${BINDIR}/stTafTests clean : cd ${sonLibRootDir} && ${MAKE} clean cd taffy/submodules/abPOA && ${MAKE} clean + rm -f ${base64Dir}/base64.o rm -rf *.o taffy/impl/*.o ${LIBDIR} ${BINDIR} static : diff --git a/README.md b/README.md index 42ea358..e91ac9f 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,12 @@ The corresponding TAF file (265 bytes): CCCCC AGGGG +# Base Qualities + +The [MAF format](https://genome.cse.ucsc.edu/FAQ/FAQformat.html#format5) can represent base qualites via `q`-lines. Taffy will read these into `q`-tags in TAF. Whereas MAF only supports 11 quality values, `0-9` and `F`, the TAF `q`-tags allow one full bytes per quality value so roundtripping from TAF to MAF then back to TAF will be lossy. These qualites are encoded in base64 in TAF (similar to `vg`'s GAM format). To access the ith quality value in Python, you'd use `base64.b64decode(quality_string)[i]`. + +Base qualities can be stored in a `q` tag in TAF. + # Installing Taffy CLI/C Library Do build this repo clone the repo as follows and then make: diff --git a/taffy/impl/alignment_block.c b/taffy/impl/alignment_block.c index a3d4c53..747b137 100644 --- a/taffy/impl/alignment_block.c +++ b/taffy/impl/alignment_block.c @@ -48,6 +48,8 @@ void tag_destruct(Tag *tag) { while(tag != NULL) { Tag *p_tag = tag; tag = tag->n_tag; + free(p_tag->key); + free(p_tag->value); free(p_tag); } } diff --git a/taffy/impl/maf.c b/taffy/impl/maf.c index 70c395d..dda69ec 100644 --- a/taffy/impl/maf.c +++ b/taffy/impl/maf.c @@ -4,6 +4,41 @@ #include "taf.h" #include "sonLib.h" #include "line_iterator.h" +#include "base64.h" + +static void set_maf_qualities(Alignment *alignment, stList* row_qualities, stList* row_quality_rows) { + // transpose our row qualities into the tags. + // first, make a tag for each column + for (int64_t i = 0; i < alignment->column_number; ++i) { + unsigned char *column_qualities = (unsigned char*)st_calloc(alignment->row_number, sizeof(unsigned char)); + int64_t col_row_idx = 0; + for (int64_t j = 0; j < alignment->row_number; ++j) { + unsigned char qual = 255; // todo: is there a better default? + if (col_row_idx < stList_length(row_quality_rows) && + j == (int64_t)stList_get(row_quality_rows, col_row_idx)) { + char* row_quals = (char*)stList_get(row_qualities, col_row_idx); + qual = (unsigned char)row_quals[i]; + // apply conversion from maf qual to raw qual + // as defined on https://genome.cse.ucsc.edu/FAQ/FAQformat.html#format5 + if (qual == 'F') { + qual = 99; + } else if (qual != '-') { + assert(qual >= '0' && qual <= '9'); + qual = (qual - '0') * 5; + } + ++col_row_idx; + } + column_qualities[j] = qual; + } + char *encoded_qualities = (char*)st_calloc(BASE64_ENCODE_OUT_SIZE(alignment->row_number) + 1, sizeof(char)); + base64_encode(column_qualities, alignment->row_number, encoded_qualities); + alignment->column_tags[i] = tag_construct(TAF_BASE_QUALITY_TAG_KEY, encoded_qualities, NULL); + free(column_qualities); + free(encoded_qualities); + } + stList_destruct(row_qualities); + stList_destruct(row_quality_rows); +} Alignment *maf_read_block(LI *li) { while(1) { @@ -21,42 +56,70 @@ Alignment *maf_read_block(LI *li) { stList_destruct(tokens); Alignment *alignment = st_calloc(1, sizeof(Alignment)); Alignment_Row **p_row = &(alignment->row); + Alignment_Row *last_row = NULL; + stList *row_qualities = NULL; + stList *row_quality_rows = NULL; while(1) { line = LI_get_next_line(li); if(line == NULL) { + if (row_qualities != NULL) { + set_maf_qualities(alignment, row_qualities, row_quality_rows); + } return alignment; } tokens = stString_split(line); free(line); if(stList_length(tokens) == 0) { stList_destruct(tokens); + if (row_qualities != NULL) { + set_maf_qualities(alignment, row_qualities, row_quality_rows); + } return alignment; } - if(strcmp(stList_get(tokens, 0), "s") != 0) { + if(strcmp(stList_get(tokens, 0), "s") == 0) { + assert(strcmp(stList_get(tokens, 0), "s") == 0); // Must be an "s" line + Alignment_Row *row = st_calloc(1, sizeof(Alignment_Row)); + alignment->row_number++; + *p_row = row; + p_row = &(row->n_row); + last_row = row; + row->sequence_name = stString_copy(stList_get(tokens, 1)); + row->start = atol(stList_get(tokens, 2)); + row->length = atol(stList_get(tokens, 3)); + assert(strcmp(stList_get(tokens, 4), "+") == 0 || strcmp(stList_get(tokens, 4), "-") == 0); + row->strand = strcmp(stList_get(tokens, 4), "+") == 0; + row->sequence_length = atol(stList_get(tokens, 5)); + row->bases = stString_copy(stList_get(tokens, 6)); + stList_destruct(tokens); + if(alignment->row_number == 1) { + alignment->column_number = strlen(row->bases); + alignment->column_tags = st_calloc(alignment->column_number, sizeof(Tag *)); + } + else { + assert(alignment->column_number == strlen(row->bases)); + } + } else if(strcmp(stList_get(tokens, 0), TAF_BASE_QUALITY_TAG_KEY) == 0) { + if (row_qualities == NULL) { + row_qualities = stList_construct3(0, free); + row_quality_rows = stList_construct(); + } + char *qual_seq_name = stList_get(tokens, 1); + if (last_row == NULL || strcmp(qual_seq_name, last_row->sequence_name) != 0) { + fprintf(stderr, "Error: q line invalid because sequence name does not match previous s line: %s\n", line); + exit(1); + } + assert(stList_length(tokens) == 3); + stList_append(row_qualities, stString_copy(stList_get(tokens, 2))); + stList_append(row_quality_rows, (void*)(alignment->row_number - 1)); + stList_destruct(tokens); + } else { assert(strcmp(stList_get(tokens, 0), "i") == 0 || strcmp(stList_get(tokens, 0), "e") == 0); // Must be an "i" or "e" line, which we ignore stList_destruct(tokens); continue; } - assert(strcmp(stList_get(tokens, 0), "s") == 0); // Must be an "s" line - Alignment_Row *row = st_calloc(1, sizeof(Alignment_Row)); - alignment->row_number++; - *p_row = row; - p_row = &(row->n_row); - row->sequence_name = stString_copy(stList_get(tokens, 1)); - row->start = atol(stList_get(tokens, 2)); - row->length = atol(stList_get(tokens, 3)); - assert(strcmp(stList_get(tokens, 4), "+") == 0 || strcmp(stList_get(tokens, 4), "-") == 0); - row->strand = strcmp(stList_get(tokens, 4), "+") == 0; - row->sequence_length = atol(stList_get(tokens, 5)); - row->bases = stString_copy(stList_get(tokens, 6)); - stList_destruct(tokens); - if(alignment->row_number == 1) { - alignment->column_number = strlen(row->bases); - alignment->column_tags = st_calloc(alignment->column_number, sizeof(Tag *)); - } - else { - assert(alignment->column_number == strlen(row->bases)); - } + } + if (row_qualities != NULL) { + set_maf_qualities(alignment, row_qualities, row_quality_rows); } return alignment; } @@ -81,17 +144,79 @@ Tag *maf_read_header(LI *li) { void maf_write_block2(Alignment *alignment, LW *lw, bool color_bases) { LW_write(lw, "a\n"); + + // check for base quality ('q' tag). Just looking at position[0] to not be too inefficient + // so the assumption is either you have a quality for every base in every column, or nothing + // will fail on an error if this doesn't hold (but we can relax if needed later) + bool has_qualities = false; + unsigned char **decoded_col_qualities = NULL; + char *qual_buffer = NULL; + if (alignment->column_number > 0) { + for (Tag *col_tag = alignment->column_tags[0]; col_tag && !has_qualities; col_tag = col_tag->n_tag) { + if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) { + has_qualities = true; + } + } + if (has_qualities) { + decoded_col_qualities = (unsigned char**)st_calloc(alignment->column_number, sizeof(unsigned char*)); + qual_buffer = (char*)st_calloc(alignment->column_number, sizeof(char)); + // if we have qualites, decode them into a matrix for the entire block so they can be transposed + for (int64_t col = 0; col < alignment->column_number; ++col) { + for (Tag *col_tag = alignment->column_tags[col]; col_tag && !decoded_col_qualities[col]; col_tag = col_tag->n_tag) { + if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) { + decoded_col_qualities[col] = (unsigned char*)st_calloc(alignment->row_number + 1, sizeof(unsigned char)); + base64_decode(col_tag->value, BASE64_ENCODE_OUT_SIZE(alignment->row_number) - 1, decoded_col_qualities[col]); + } + } + if (decoded_col_qualities[col] == NULL) { + // see comment above + fprintf(stderr, "Error: missing base quality at column in block with base qualities\n"); + exit(1); + } + } + } + } + Alignment_Row *row = alignment->row; + int64_t row_idx = 0; while(row != NULL) { char *bases = color_bases ? color_base_string(row->bases, alignment->column_number) : row->bases; LW_write(lw, "s\t%s\t%" PRIi64 "\t%" PRIi64 "\t%s\t%" PRIi64 "\t%s\n", row->sequence_name, row->start, row->length, - row->strand ? "+" : "-", row->sequence_length, bases); + row->strand ? "+" : "-", row->sequence_length, bases); + + if (has_qualities && row->length > 0) { + int64_t i = 0; + for (int64_t col = 0; col < alignment->column_number; ++col) { + if (row->bases[col] != '-') { + // this is a raw quality value + unsigned char qual = decoded_col_qualities[col][row_idx]; + // do the transformation shown here + // https://genome.ucsc.edu/FAQ/FAQformat.html#format5 + // MAF quality value = min( floor(actual quality value/5), 9 ) + qual_buffer[i++] = (char)(qual >= 99 ? 'F' : (qual >= 45 ? '9' : '0' + qual/5)); + } else { + qual_buffer[i++] = '-'; + } + } + assert(i == alignment->column_number); + qual_buffer[i] = '\0'; + LW_write(lw, "q\t%s\t\t\t\t\t%s\n", row->sequence_name, qual_buffer); + } + row = row->n_row; + ++row_idx; if(color_bases) { free(bases); } } LW_write(lw, "\n"); // Add a blank line at the end of the block + if (has_qualities) { + free(qual_buffer); + for (int64_t i = 0; i < alignment->column_number; ++i) { + free(decoded_col_qualities[i]); + } + free(decoded_col_qualities); + } } void maf_write_block(Alignment *alignment, LW *lw) { diff --git a/taffy/impl/paf.c b/taffy/impl/paf.c index 164d313..4181d0a 100644 --- a/taffy/impl/paf.c +++ b/taffy/impl/paf.c @@ -117,7 +117,6 @@ static void paf_write_row(Alignment_Row *q_row, Alignment_Row *t_row, int64_t nu // finalize trailing event if (current_start < num_col && current_start >= 0 && current_event[0] != '.') { - int64_t i = num_col; // note, this code block is (must be) identical to above. todo: factor out if (cs_cigar) { query_buffer[query_buffer_length] = '\0'; diff --git a/taffy/inc/taf.h b/taffy/inc/taf.h index 50bd97f..d9907e7 100644 --- a/taffy/inc/taf.h +++ b/taffy/inc/taf.h @@ -7,6 +7,8 @@ #include "sonLib.h" #include "line_iterator.h" +#define TAF_BASE_QUALITY_TAG_KEY "q" + /* * Structures to represent blocks of an alignment */ diff --git a/taffy/submodules/base64 b/taffy/submodules/base64 new file mode 160000 index 0000000..81060e3 --- /dev/null +++ b/taffy/submodules/base64 @@ -0,0 +1 @@ +Subproject commit 81060e3338120b43d759ee8adfe24619370c5f36 diff --git a/tests/qual.maf b/tests/qual.maf new file mode 100644 index 0000000..b771d27 --- /dev/null +++ b/tests/qual.maf @@ -0,0 +1,35 @@ +##maf version=1 + +a +s x 0 8 + 50 CAA---ATAAG +q x 999---99999 +s ins 0 11 + 11 CAAGGGATAAG +q ins 94490049989 + +a +s x 8 1 + 50 G + +a +s x 9 1 + 50 C + +a +s x 10 3 + 50 TTG + +a +s x 13 1 + 50 G + +a +s x 14 19 + 50 AAATTTTCTGGAGTTCTAT + +a +s x 33 1 + 50 T + +a +s x 34 4 + 50 ATAT + +a +s x 38 1 + 50 T + +a +s x 39 11 + 50 CCAACTCTCTG + diff --git a/tests/view_test.c b/tests/view_test.c index 83a2792..17b010c 100644 --- a/tests/view_test.c +++ b/tests/view_test.c @@ -72,6 +72,17 @@ static void test_paf(CuTest *testCase) { } +static void test_maf_quals(CuTest *testCase) { + char *example_file = "./tests/qual.maf"; + char *output_file = "./tests/qual.out.maf"; + char *truth_file = "./tests/qual.maf"; + int i = st_system("./bin/taffy view -i %s -m > %s", example_file, output_file); + CuAssertIntEquals(testCase, 0, i); // return value should be zero + int diff_ret = st_system("diff %s %s", output_file, truth_file); + CuAssertIntEquals(testCase, 0, diff_ret); // return value should be zero if files sames + st_system("rm -f %s", output_file); +} + static void test_lineage_diffs(CuTest *testCase) { { char *example_file = "./tests/evolverMammals.maf.mini"; @@ -103,6 +114,7 @@ static void test_ref_diffs(CuTest *testCase) { CuSuite* view_test_suite(void) { CuSuite* suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_paf); + SUITE_ADD_TEST(suite, test_maf_quals); SUITE_ADD_TEST(suite, test_lineage_diffs); SUITE_ADD_TEST(suite, test_ref_diffs); return suite;