Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
[submodule "taffy/submodules/abPOA"]
path = taffy/submodules/abPOA
url = https://github.com/yangao07/abPOA.git
[submodule "taffy/submodules/base64"]
path = taffy/submodules/base64
url = https://github.com/zhicheng/base64.git
14 changes: 10 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include ${rootPath}/include.mk
srcDir = taffy/impl
libHeaders = taffy/inc/*.h
libTests = tests/*.c
base64Dir = taffy/submodules/base64

all: all_libs all_progs
all_libs: ${LIBDIR}/libstTaf.a
Expand Down Expand Up @@ -31,10 +32,14 @@ ${sonLibDir}/sonLib.a : sonLib

${sonLibDir}/cuTest.a : sonLib

${base64Dir}/base64.o : ${base64Dir}/base64.h ${base64Dir}/base64.c
cd ${base64Dir} && ${CC} ${CFLAGS} -o base64.o -c base64.c
ln -f ${base64Dir}/base64.h ${INCLDIR}

stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a

${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies}
${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o
${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} ${base64Dir}/base64.o
${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/tai.o ${base64Dir}/base64.o
mv libstTaf.a ${LIBDIR}/

${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders}
Expand All @@ -43,7 +48,7 @@ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders}
${srcDir}/line_iterator.o : ${srcDir}/line_iterator.c ${libHeaders}
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/line_iterator.o -c ${srcDir}/line_iterator.c

${srcDir}/maf.o : ${srcDir}/maf.c ${libHeaders}
${srcDir}/maf.o : ${srcDir}/maf.c ${libHeaders} ${base64Dir}/base64.o
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/maf.o -c ${srcDir}/maf.c

${srcDir}/paf.o : ${srcDir}/paf.c ${libHeaders}
Expand All @@ -70,7 +75,7 @@ ${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders}
${BINDIR}/stTafTests : ${libTests} ${LIBDIR}/libstTaf.a ${stTafDependencies}
${CC} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/stTafTests ${libTests} ${LIBDIR}/libstTaf.a ${LDLIBS}

${BINDIR}/taffy : taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o ${LIBDIR}/libstTaf.a ${libHeaders} ${stTafDependencies}
${BINDIR}/taffy : taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o ${LIBDIR}/libstTaf.a ${libHeaders} ${stTafDependencies}
${CXX} ${CPPFLAGS} ${CXXFLAGS} taf_norm.o taf_add_gap_bases.o taf_index.o taf_view.o taf_sort.o taf_stats.o taffy_main.o -o ${BINDIR}/taffy ${LIBDIR}/libstTaf.a ${LDLIBS}

taffy_main.o : taffy_main.cpp ${stTafDependencies} ${libHeaders}
Expand Down Expand Up @@ -104,6 +109,7 @@ python_test: all ${BINDIR}/stTafTests
clean :
cd ${sonLibRootDir} && ${MAKE} clean
cd taffy/submodules/abPOA && ${MAKE} clean
rm -f ${base64Dir}/base64.o
rm -rf *.o taffy/impl/*.o ${LIBDIR} ${BINDIR}

static :
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,12 @@ The corresponding TAF file (265 bytes):
CCCCC
AGGGG

# Base Qualities

The [MAF format](https://genome.cse.ucsc.edu/FAQ/FAQformat.html#format5) can represent base qualites via `q`-lines. Taffy will read these into `q`-tags in TAF. Whereas MAF only supports 11 quality values, `0-9` and `F`, the TAF `q`-tags allow one full bytes per quality value so roundtripping from TAF to MAF then back to TAF will be lossy. These qualites are encoded in base64 in TAF (similar to `vg`'s GAM format). To access the ith quality value in Python, you'd use `base64.b64decode(quality_string)[i]`.

Base qualities can be stored in a `q` tag in TAF.

# Installing Taffy CLI/C Library

Do build this repo clone the repo as follows and then make:
Expand Down
2 changes: 2 additions & 0 deletions taffy/impl/alignment_block.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ void tag_destruct(Tag *tag) {
while(tag != NULL) {
Tag *p_tag = tag;
tag = tag->n_tag;
free(p_tag->key);
free(p_tag->value);
free(p_tag);
}
}
Expand Down
169 changes: 147 additions & 22 deletions taffy/impl/maf.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,41 @@
#include "taf.h"
#include "sonLib.h"
#include "line_iterator.h"
#include "base64.h"

static void set_maf_qualities(Alignment *alignment, stList* row_qualities, stList* row_quality_rows) {
// transpose our row qualities into the tags.
// first, make a tag for each column
for (int64_t i = 0; i < alignment->column_number; ++i) {
unsigned char *column_qualities = (unsigned char*)st_calloc(alignment->row_number, sizeof(unsigned char));
int64_t col_row_idx = 0;
for (int64_t j = 0; j < alignment->row_number; ++j) {
unsigned char qual = 255; // todo: is there a better default?
if (col_row_idx < stList_length(row_quality_rows) &&
j == (int64_t)stList_get(row_quality_rows, col_row_idx)) {
char* row_quals = (char*)stList_get(row_qualities, col_row_idx);
qual = (unsigned char)row_quals[i];
// apply conversion from maf qual to raw qual
// as defined on https://genome.cse.ucsc.edu/FAQ/FAQformat.html#format5
if (qual == 'F') {
qual = 99;
} else if (qual != '-') {
assert(qual >= '0' && qual <= '9');
qual = (qual - '0') * 5;
}
++col_row_idx;
}
column_qualities[j] = qual;
}
char *encoded_qualities = (char*)st_calloc(BASE64_ENCODE_OUT_SIZE(alignment->row_number) + 1, sizeof(char));
base64_encode(column_qualities, alignment->row_number, encoded_qualities);
alignment->column_tags[i] = tag_construct(TAF_BASE_QUALITY_TAG_KEY, encoded_qualities, NULL);
free(column_qualities);
free(encoded_qualities);
}
stList_destruct(row_qualities);
stList_destruct(row_quality_rows);
}

Alignment *maf_read_block(LI *li) {
while(1) {
Expand All @@ -21,42 +56,70 @@ Alignment *maf_read_block(LI *li) {
stList_destruct(tokens);
Alignment *alignment = st_calloc(1, sizeof(Alignment));
Alignment_Row **p_row = &(alignment->row);
Alignment_Row *last_row = NULL;
stList *row_qualities = NULL;
stList *row_quality_rows = NULL;
while(1) {
line = LI_get_next_line(li);
if(line == NULL) {
if (row_qualities != NULL) {
set_maf_qualities(alignment, row_qualities, row_quality_rows);
}
return alignment;
}
tokens = stString_split(line);
free(line);
if(stList_length(tokens) == 0) {
stList_destruct(tokens);
if (row_qualities != NULL) {
set_maf_qualities(alignment, row_qualities, row_quality_rows);
}
return alignment;
}
if(strcmp(stList_get(tokens, 0), "s") != 0) {
if(strcmp(stList_get(tokens, 0), "s") == 0) {
assert(strcmp(stList_get(tokens, 0), "s") == 0); // Must be an "s" line
Alignment_Row *row = st_calloc(1, sizeof(Alignment_Row));
alignment->row_number++;
*p_row = row;
p_row = &(row->n_row);
last_row = row;
row->sequence_name = stString_copy(stList_get(tokens, 1));
row->start = atol(stList_get(tokens, 2));
row->length = atol(stList_get(tokens, 3));
assert(strcmp(stList_get(tokens, 4), "+") == 0 || strcmp(stList_get(tokens, 4), "-") == 0);
row->strand = strcmp(stList_get(tokens, 4), "+") == 0;
row->sequence_length = atol(stList_get(tokens, 5));
row->bases = stString_copy(stList_get(tokens, 6));
stList_destruct(tokens);
if(alignment->row_number == 1) {
alignment->column_number = strlen(row->bases);
alignment->column_tags = st_calloc(alignment->column_number, sizeof(Tag *));
}
else {
assert(alignment->column_number == strlen(row->bases));
}
} else if(strcmp(stList_get(tokens, 0), TAF_BASE_QUALITY_TAG_KEY) == 0) {
if (row_qualities == NULL) {
row_qualities = stList_construct3(0, free);
row_quality_rows = stList_construct();
}
char *qual_seq_name = stList_get(tokens, 1);
if (last_row == NULL || strcmp(qual_seq_name, last_row->sequence_name) != 0) {
fprintf(stderr, "Error: q line invalid because sequence name does not match previous s line: %s\n", line);
exit(1);
}
assert(stList_length(tokens) == 3);
stList_append(row_qualities, stString_copy(stList_get(tokens, 2)));
stList_append(row_quality_rows, (void*)(alignment->row_number - 1));
stList_destruct(tokens);
} else {
assert(strcmp(stList_get(tokens, 0), "i") == 0 || strcmp(stList_get(tokens, 0), "e") == 0); // Must be an "i" or "e" line, which we ignore
stList_destruct(tokens);
continue;
}
assert(strcmp(stList_get(tokens, 0), "s") == 0); // Must be an "s" line
Alignment_Row *row = st_calloc(1, sizeof(Alignment_Row));
alignment->row_number++;
*p_row = row;
p_row = &(row->n_row);
row->sequence_name = stString_copy(stList_get(tokens, 1));
row->start = atol(stList_get(tokens, 2));
row->length = atol(stList_get(tokens, 3));
assert(strcmp(stList_get(tokens, 4), "+") == 0 || strcmp(stList_get(tokens, 4), "-") == 0);
row->strand = strcmp(stList_get(tokens, 4), "+") == 0;
row->sequence_length = atol(stList_get(tokens, 5));
row->bases = stString_copy(stList_get(tokens, 6));
stList_destruct(tokens);
if(alignment->row_number == 1) {
alignment->column_number = strlen(row->bases);
alignment->column_tags = st_calloc(alignment->column_number, sizeof(Tag *));
}
else {
assert(alignment->column_number == strlen(row->bases));
}
}
if (row_qualities != NULL) {
set_maf_qualities(alignment, row_qualities, row_quality_rows);
}
return alignment;
}
Expand All @@ -81,17 +144,79 @@ Tag *maf_read_header(LI *li) {

void maf_write_block2(Alignment *alignment, LW *lw, bool color_bases) {
LW_write(lw, "a\n");

// check for base quality ('q' tag). Just looking at position[0] to not be too inefficient
// so the assumption is either you have a quality for every base in every column, or nothing
// will fail on an error if this doesn't hold (but we can relax if needed later)
bool has_qualities = false;
unsigned char **decoded_col_qualities = NULL;
char *qual_buffer = NULL;
if (alignment->column_number > 0) {
for (Tag *col_tag = alignment->column_tags[0]; col_tag && !has_qualities; col_tag = col_tag->n_tag) {
if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) {
has_qualities = true;
}
}
if (has_qualities) {
decoded_col_qualities = (unsigned char**)st_calloc(alignment->column_number, sizeof(unsigned char*));
qual_buffer = (char*)st_calloc(alignment->column_number, sizeof(char));
// if we have qualites, decode them into a matrix for the entire block so they can be transposed
for (int64_t col = 0; col < alignment->column_number; ++col) {
for (Tag *col_tag = alignment->column_tags[col]; col_tag && !decoded_col_qualities[col]; col_tag = col_tag->n_tag) {
if (strcmp(col_tag->key, TAF_BASE_QUALITY_TAG_KEY) == 0) {
decoded_col_qualities[col] = (unsigned char*)st_calloc(alignment->row_number + 1, sizeof(unsigned char));
base64_decode(col_tag->value, BASE64_ENCODE_OUT_SIZE(alignment->row_number) - 1, decoded_col_qualities[col]);
}
}
if (decoded_col_qualities[col] == NULL) {
// see comment above
fprintf(stderr, "Error: missing base quality at column in block with base qualities\n");
exit(1);
}
}
}
}

Alignment_Row *row = alignment->row;
int64_t row_idx = 0;
while(row != NULL) {
char *bases = color_bases ? color_base_string(row->bases, alignment->column_number) : row->bases;
LW_write(lw, "s\t%s\t%" PRIi64 "\t%" PRIi64 "\t%s\t%" PRIi64 "\t%s\n", row->sequence_name, row->start, row->length,
row->strand ? "+" : "-", row->sequence_length, bases);
row->strand ? "+" : "-", row->sequence_length, bases);

if (has_qualities && row->length > 0) {
int64_t i = 0;
for (int64_t col = 0; col < alignment->column_number; ++col) {
if (row->bases[col] != '-') {
// this is a raw quality value
unsigned char qual = decoded_col_qualities[col][row_idx];
// do the transformation shown here
// https://genome.ucsc.edu/FAQ/FAQformat.html#format5
// MAF quality value = min( floor(actual quality value/5), 9 )
qual_buffer[i++] = (char)(qual >= 99 ? 'F' : (qual >= 45 ? '9' : '0' + qual/5));
} else {
qual_buffer[i++] = '-';
}
}
assert(i == alignment->column_number);
qual_buffer[i] = '\0';
LW_write(lw, "q\t%s\t\t\t\t\t%s\n", row->sequence_name, qual_buffer);
}

row = row->n_row;
++row_idx;
if(color_bases) {
free(bases);
}
}
LW_write(lw, "\n"); // Add a blank line at the end of the block
if (has_qualities) {
free(qual_buffer);
for (int64_t i = 0; i < alignment->column_number; ++i) {
free(decoded_col_qualities[i]);
}
free(decoded_col_qualities);
}
}

void maf_write_block(Alignment *alignment, LW *lw) {
Expand Down
1 change: 0 additions & 1 deletion taffy/impl/paf.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ static void paf_write_row(Alignment_Row *q_row, Alignment_Row *t_row, int64_t nu

// finalize trailing event
if (current_start < num_col && current_start >= 0 && current_event[0] != '.') {
int64_t i = num_col;
// note, this code block is (must be) identical to above. todo: factor out
if (cs_cigar) {
query_buffer[query_buffer_length] = '\0';
Expand Down
2 changes: 2 additions & 0 deletions taffy/inc/taf.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "sonLib.h"
#include "line_iterator.h"

#define TAF_BASE_QUALITY_TAG_KEY "q"

/*
* Structures to represent blocks of an alignment
*/
Expand Down
1 change: 1 addition & 0 deletions taffy/submodules/base64
Submodule base64 added at 81060e
35 changes: 35 additions & 0 deletions tests/qual.maf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
##maf version=1

a
s x 0 8 + 50 CAA---ATAAG
q x 999---99999
s ins 0 11 + 11 CAAGGGATAAG
q ins 94490049989

a
s x 8 1 + 50 G

a
s x 9 1 + 50 C

a
s x 10 3 + 50 TTG

a
s x 13 1 + 50 G

a
s x 14 19 + 50 AAATTTTCTGGAGTTCTAT

a
s x 33 1 + 50 T

a
s x 34 4 + 50 ATAT

a
s x 38 1 + 50 T

a
s x 39 11 + 50 CCAACTCTCTG

12 changes: 12 additions & 0 deletions tests/view_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,17 @@ static void test_paf(CuTest *testCase) {

}

static void test_maf_quals(CuTest *testCase) {
char *example_file = "./tests/qual.maf";
char *output_file = "./tests/qual.out.maf";
char *truth_file = "./tests/qual.maf";
int i = st_system("./bin/taffy view -i %s -m > %s", example_file, output_file);
CuAssertIntEquals(testCase, 0, i); // return value should be zero
int diff_ret = st_system("diff %s %s", output_file, truth_file);
CuAssertIntEquals(testCase, 0, diff_ret); // return value should be zero if files sames
st_system("rm -f %s", output_file);
}

static void test_lineage_diffs(CuTest *testCase) {
{
char *example_file = "./tests/evolverMammals.maf.mini";
Expand Down Expand Up @@ -103,6 +114,7 @@ static void test_ref_diffs(CuTest *testCase) {
CuSuite* view_test_suite(void) {
CuSuite* suite = CuSuiteNew();
SUITE_ADD_TEST(suite, test_paf);
SUITE_ADD_TEST(suite, test_maf_quals);
SUITE_ADD_TEST(suite, test_lineage_diffs);
SUITE_ADD_TEST(suite, test_ref_diffs);
return suite;
Expand Down