From 50dc2455fc38347ee2bdab964ad164b7b24cb4fc Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 19 Sep 2014 22:39:09 +0300 Subject: [PATCH 01/32] Update seqtk.c added -B/-E to trimfq for keeping first/last INT bp" --- seqtk.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/seqtk.c b/seqtk.c index e169c9a..7913ace 100644 --- a/seqtk.c +++ b/seqtk.c @@ -272,22 +272,26 @@ int stk_trimfq(int argc, char *argv[]) gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0; + int i, c, min_len = 30, left = 0, right = 0, left_keep = 0, right_keep = 0; while ((c = getopt(argc, argv, "l:q:b:e:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; case 'b': left = atoi(optarg); break; case 'e': right = atoi(optarg); break; + case 'B': left_keep = atoi(optarg); break; + case 'E': right_keep = atoi(optarg); break; } } if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len); + fprintf(stderr, " -l INT maximally trim down from rigth to INT bp (disabled by -b/-e) [%d]\n", min_len); fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); + fprintf(stderr, " -B INT trim down from right and keep first INT bp from left (disabled by -q/-l/-b/-e) [%d]\n", left_keep); + fprintf(stderr, " -E INT trim down from left and keep last INT bp from right (disabled by -q/-l/-b/-e/-B) [%d]\n", right_keep); fprintf(stderr, "\n"); return 1; } @@ -301,6 +305,11 @@ int stk_trimfq(int argc, char *argv[]) if (left || right) { beg = left; end = seq->seq.l - right; if (beg >= end) beg = end = 0; + } else if (left_keep) { + beg = 
0; end = left_keep; + } else if (right_keep) { + beg = seq->seq.l - right_keep; end = seq->seq.l; + if (beg < 0) beg = 0; } else if (seq->qual.l > min_len) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From 7be87da5e051f5dd3a24fdf2bc9562d0b0be44e2 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 19 Sep 2014 23:23:56 +0300 Subject: [PATCH 02/32] Update seqtk.c fixing bugs and also "short-circuiting" the case when param == 0 --- seqtk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/seqtk.c b/seqtk.c index 7913ace..59bdef0 100644 --- a/seqtk.c +++ b/seqtk.c @@ -273,7 +273,7 @@ int stk_trimfq(int argc, char *argv[]) kseq_t *seq; double param = 0.05, q_int2real[128]; int i, c, min_len = 30, left = 0, right = 0, left_keep = 0, right_keep = 0; - while ((c = getopt(argc, argv, "l:q:b:e:")) >= 0) { + while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; @@ -307,10 +307,11 @@ int stk_trimfq(int argc, char *argv[]) if (beg >= end) beg = end = 0; } else if (left_keep) { beg = 0; end = left_keep; + if (seq->seq.l < end) end = seq->seq.l; } else if (right_keep) { beg = seq->seq.l - right_keep; end = seq->seq.l; if (beg < 0) beg = 0; - } else if (seq->qual.l > min_len) { + } else if (seq->qual.l > min_len && param != 0) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; if (q < 36) q = 36; @@ -321,7 +322,7 @@ int stk_trimfq(int argc, char *argv[]) } /* max never set; all low qual, just give first min_len bp */ - if (max == 0.) beg = 0, end = min_len; + if (max == 0. 
|| param == 0) beg = 0, end = min_len; if (end - beg < min_len) { // window-based int is, imax; From 0b1591092893f6e6371aa93bc01f807f27d22667 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 19 Sep 2014 23:28:32 +0300 Subject: [PATCH 03/32] Update seqtk.c fixing bugs --- seqtk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/seqtk.c b/seqtk.c index 59bdef0..08c027f 100644 --- a/seqtk.c +++ b/seqtk.c @@ -301,7 +301,7 @@ int stk_trimfq(int argc, char *argv[]) q_int2real[i] = pow(10., -(i - 33) / 10.); while (kseq_read(seq) >= 0) { int beg, tmp, end; - double s, max; + double s, max = 0.; if (left || right) { beg = left; end = seq->seq.l - right; if (beg >= end) beg = end = 0; @@ -311,7 +311,7 @@ int stk_trimfq(int argc, char *argv[]) } else if (right_keep) { beg = seq->seq.l - right_keep; end = seq->seq.l; if (beg < 0) beg = 0; - } else if (seq->qual.l > min_len && param != 0) { + } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; if (q < 36) q = 36; @@ -322,7 +322,7 @@ int stk_trimfq(int argc, char *argv[]) } /* max never set; all low qual, just give first min_len bp */ - if (max == 0. || param == 0) beg = 0, end = min_len; + if (max == 0.) 
beg = 0, end = min_len; if (end - beg < min_len) { // window-based int is, imax; From fd8400b50c689df5bfdfe45f6d38151497494c21 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sat, 20 Sep 2014 10:06:05 +0300 Subject: [PATCH 04/32] Update seqtk.c fixing description --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index 08c027f..96af0f1 100644 --- a/seqtk.c +++ b/seqtk.c @@ -287,7 +287,7 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down from rigth to INT bp (disabled by -b/-e) [%d]\n", min_len); + fprintf(stderr, " -l INT maximally trim down from rigth to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); fprintf(stderr, " -B INT trim down from right and keep first INT bp from left (disabled by -q/-l/-b/-e) [%d]\n", left_keep); From 63918fb47f575d32b9558d551e35aba51e91875a Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sat, 20 Sep 2014 11:21:28 +0300 Subject: [PATCH 05/32] Update README.md Updated with -B/-E for trimfq examples --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 9f71109..88bf43f 100644 --- a/README.md +++ b/README.md @@ -54,3 +54,10 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa +* Keep first 50bp from the left end of each read by trimming the right end: + + seqtk trimfq -B 50 in.fq > out.fq + +* Keep last 50bp from the right end of each read by trimming the left end: + + seqtk trimfq -E 50 in.fq > out.fq From c99d7fcfefa45701d3d398434e502f44ec161da2 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sat, 20 Sep 2014 12:18:47 +0300 Subject: [PATCH 06/32] Update 
seqtk.c fixed typo "rigth" to "right" --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index 96af0f1..377296f 100644 --- a/seqtk.c +++ b/seqtk.c @@ -287,7 +287,7 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down from rigth to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); + fprintf(stderr, " -l INT maximally trim down from right end to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); fprintf(stderr, " -B INT trim down from right and keep first INT bp from left (disabled by -q/-l/-b/-e) [%d]\n", left_keep); From 77bc4e717cc4def2611c96f549ed7d7f208e4f6a Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Mon, 22 Sep 2014 08:39:43 +0300 Subject: [PATCH 07/32] Update seqtk.c updated version string to 1.0-r68a-dirty --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index 377296f..b27c831 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1360,7 +1360,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.0-r68-dirty\n\n"); + fprintf(stderr, "Version: 1.0-r68a-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n"); From e5d97e7742ab9e5d79d00d84ece978ec71af6e24 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Thu, 25 Sep 2014 11:51:15 +0300 Subject: [PATCH 08/32] Update seqtk.c Added the possibility to use simultaneously in trimfq: a) -e with -E, and b) -b with -B. 
--- seqtk.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/seqtk.c b/seqtk.c index b27c831..4f5dc9f 100644 --- a/seqtk.c +++ b/seqtk.c @@ -287,11 +287,11 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down from right end to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -B INT trim down from right and keep first INT bp from left (disabled by -q/-l/-b/-e) [%d]\n", left_keep); - fprintf(stderr, " -E INT trim down from left and keep last INT bp from right (disabled by -q/-l/-b/-e/-B) [%d]\n", right_keep); + fprintf(stderr, " -l INT maximally trims down from right end to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l; it has priority over -B) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l; it has priority over -E) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-l/-e) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (disabled by -q/-l/-b/-B) [%d]\n", right_keep); fprintf(stderr, "\n"); return 1; } @@ -302,15 +302,17 @@ int stk_trimfq(int argc, char *argv[]) while (kseq_read(seq) >= 0) { int beg, tmp, end; double s, max = 0.; - if (left || right) { - beg = left; end = seq->seq.l - right; - if (beg >= end) beg = end = 0; - } else if (left_keep) { - beg = 0; end = left_keep; + if (left_keep) { + beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; + if (seq->seq.l < beg ) beg = end = 0; } else if (right_keep) { - beg = 
seq->seq.l - right_keep; end = seq->seq.l; + beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; + if (end < 0 ) beg = end = 0; + } else if (left || right) { + beg = left; end = seq->seq.l - right; + if (beg >= end) beg = end = 0; } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; @@ -1360,7 +1362,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.0-r68a-dirty\n\n"); + fprintf(stderr, "Version: 1.0-r68b-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n"); From cbe7863fc10cfac89a26514297679f47a3168921 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Thu, 25 Sep 2014 11:53:27 +0300 Subject: [PATCH 09/32] Update README.md Added examples for simultaneous use with trimfq of: a) -e with -E, and b) -b with -B. 
--- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 88bf43f..b2b7beb 100644 --- a/README.md +++ b/README.md @@ -61,3 +61,11 @@ Seqtk Examples * Keep last 50bp from the right end of each read by trimming the left end: seqtk trimfq -E 50 in.fq > out.fq + +* Trim 5bp from left end and keep next 50bp from left end of each read: + + seqtk trimfq -B 50 -b 5 in.fq > out.fq + +* Trim 5bp from right end and keep the 50bp from right end of each read: + + seqtk trimfq -E 50 -e 5 in.fq > out.fq From 744b2ae9995353078d593e94010e3fc449f9358d Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Thu, 2 Oct 2014 14:56:27 +0300 Subject: [PATCH 10/32] Update seqtk.c setting the minimum length --- seqtk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/seqtk.c b/seqtk.c index 4f5dc9f..6d7ee1c 100644 --- a/seqtk.c +++ b/seqtk.c @@ -272,7 +272,7 @@ int stk_trimfq(int argc, char *argv[]) gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0, left_keep = 0, right_keep = 0; + int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; @@ -305,14 +305,14 @@ int stk_trimfq(int argc, char *argv[]) if (left_keep) { beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; - if (seq->seq.l < beg ) beg = end = 0; + if (seq->seq.l < beg || end < min_len || end - beg < min_len) { beg = 0; end = min_len } } else if (right_keep) { beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; - if (end < 0 ) beg = end = 0; + if (end < min_len || end - beg < min_len) { beg = 0; end = min_len } } else if (left || right) { beg = left; end = seq->seq.l - right; - if (beg >= end) beg = end = 0; + if (beg >= end || end < min_len || end - beg < min_len) { beg = 0; end = min_len; } } else if (seq->qual.l > min_len && 
param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From f97e5831a0827d9365415d5da65aadfc3fec3911 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Thu, 2 Oct 2014 15:45:23 +0300 Subject: [PATCH 11/32] Update seqtk.c the minimum reads after the trimming is 1 instead of zero (no reads with length 0 are created) --- seqtk.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/seqtk.c b/seqtk.c index 6d7ee1c..c814b9b 100644 --- a/seqtk.c +++ b/seqtk.c @@ -272,7 +272,7 @@ int stk_trimfq(int argc, char *argv[]) gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + int i, c, min_len = 30, left = 0, right = 0, left_keep = 0, right_keep = 0; while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; @@ -305,14 +305,18 @@ int stk_trimfq(int argc, char *argv[]) if (left_keep) { beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; - if (seq->seq.l < beg || end < min_len || end - beg < min_len) { beg = 0; end = min_len } + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < 1) { beg = 0; end = 1 } } else if (right_keep) { beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; - if (end < min_len || end - beg < min_len) { beg = 0; end = min_len } + if (end < 0) end = 0; + if (end - beg < 1) { beg = 0; end = 1 } } else if (left || right) { beg = left; end = seq->seq.l - right; - if (beg >= end || end < min_len || end - beg < min_len) { beg = 0; end = min_len; } + if (end < 0) end = 0; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < 1) { beg = 0; end = 1 } } else if (seq->qual.l > min_len && param != 0.) 
{ for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From 442fa31b83fa8a0da84be951bf6289bc11e370f7 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Thu, 2 Oct 2014 18:46:54 +0300 Subject: [PATCH 12/32] Update seqtk.c fixes --- seqtk.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/seqtk.c b/seqtk.c index c814b9b..27f3be1 100644 --- a/seqtk.c +++ b/seqtk.c @@ -287,11 +287,11 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trims down from right end to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l; it has priority over -B) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l; it has priority over -E) [0]\n"); - fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-l/-e) [%d]\n", left_keep); - fprintf(stderr, " -E INT keep last INT bp from right (disabled by -q/-l/-b/-B) [%d]\n", right_keep); + fprintf(stderr, " -l INT maximally trims down from right end to INT bp [%d]\n", min_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q; it has priority over -B) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q; it has priority over -E) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-e) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (disabled by -q/-b/-B) [%d]\n", right_keep); fprintf(stderr, "\n"); return 1; } @@ -306,17 +306,17 @@ int stk_trimfq(int argc, char *argv[]) beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < 1) { beg = 0; end = 1 } + if 
(end - beg < min_len) { beg = 0; end = min_len } } else if (right_keep) { beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; if (end < 0) end = 0; - if (end - beg < 1) { beg = 0; end = 1 } + if (end - beg < min_len) { beg = 0; end = min_len } } else if (left || right) { beg = left; end = seq->seq.l - right; if (end < 0) end = 0; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < 1) { beg = 0; end = 1 } + if (end - beg < min_len) { beg = 0; end = min_len } } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From a87a7f7ef7caa0758235c415e11f652a66a53b26 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 3 Oct 2014 10:04:49 +0300 Subject: [PATCH 13/32] Update seqtk.c fixes bugs and descriptions of "trimfq -l" --- seqtk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/seqtk.c b/seqtk.c index 27f3be1..0902a37 100644 --- a/seqtk.c +++ b/seqtk.c @@ -287,7 +287,7 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trims down from right end to INT bp [%d]\n", min_len); + fprintf(stderr, " -l INT maximally trims down from right end to INT bp when the trimming results in read length below this [%d]\n", min_len); fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q; it has priority over -B) [0]\n"); fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q; it has priority over -E) [0]\n"); fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-e) [%d]\n", left_keep); @@ -306,17 +306,17 @@ int stk_trimfq(int argc, char *argv[]) beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - 
beg < min_len) { beg = 0; end = min_len } + if (end - beg < min_len) { beg = 0; end = min_len; } } else if (right_keep) { beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; if (end < 0) end = 0; - if (end - beg < min_len) { beg = 0; end = min_len } + if (end - beg < min_len) { beg = 0; end = min_len; } } else if (left || right) { beg = left; end = seq->seq.l - right; if (end < 0) end = 0; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { beg = 0; end = min_len } + if (end - beg < min_len) { beg = 0; end = min_len; } } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From f646fb4ca7c673cd9e92e1de94aa6b737e38b391 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 3 Oct 2014 10:10:32 +0300 Subject: [PATCH 14/32] Update README.md Adding example for "trimfq -l" usage with other trimming options. --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index b2b7beb..b4f2a67 100644 --- a/README.md +++ b/README.md @@ -69,3 +69,7 @@ Seqtk Examples * Trim 5bp from right end and keep the 50bp from right end of each read: seqtk trimfq -E 50 -e 5 in.fq > out.fq + +* Trim 5bp from right end and keep the 50bp from right end of each read and if trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: + + seqtk trimfq -E 50 -e 5 -l 20 in.fq > out.fq From a30bec29897a7c45099fbb3b66710bf41cd64e3d Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 3 Oct 2014 11:28:36 +0300 Subject: [PATCH 15/32] Update seqtk.c trying to fix locally the bug where reads with no sequence are converted to FASTA format in TRIMFQ and also setting by default "trimfq -l" to 1 instead of 30. 
--- seqtk.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/seqtk.c b/seqtk.c index 0902a37..6ba7c54 100644 --- a/seqtk.c +++ b/seqtk.c @@ -272,7 +272,7 @@ int stk_trimfq(int argc, char *argv[]) gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0, left_keep = 0, right_keep = 0; + int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; @@ -302,7 +302,16 @@ int stk_trimfq(int argc, char *argv[]) while (kseq_read(seq) >= 0) { int beg, tmp, end; double s, max = 0.; - if (left_keep) { + if (seq->seq.l == 0) { // trying to fix locally the bug where reads with no sequence are converted to FASTA format + beg = 0; + end = 1; + seq->seq.l = 1; + seq->qual.l = 1; + seq->seq.s = (char*)malloc(2); + seq->seq.s[0] = 'A'; + seq->qual.l = (char*)malloc(2); + seq->qual.s[0]='F'; + } else if (left_keep) { beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; if (seq->seq.l < beg) beg = seq->seq.l; From 9063c2a7f7ba6b40b1a2472a1881070dddae72c4 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 3 Oct 2014 11:35:46 +0300 Subject: [PATCH 16/32] Update seqtk.c fixing bug in trimfq --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index 6ba7c54..aebd9de 100644 --- a/seqtk.c +++ b/seqtk.c @@ -309,7 +309,7 @@ int stk_trimfq(int argc, char *argv[]) seq->qual.l = 1; seq->seq.s = (char*)malloc(2); seq->seq.s[0] = 'A'; - seq->qual.l = (char*)malloc(2); + seq->qual.s = (char*)malloc(2); seq->qual.s[0]='F'; } else if (left_keep) { beg = left; end = left + left_keep; From f07d0c9443d99b53390301a31319525261b85b7c Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Fri, 3 Oct 2014 22:57:27 +0300 Subject: [PATCH 17/32] Update seqtk.c fixed bugs regarding trimfq --- seqtk.c | 18 +++++++++++++++--- 1 file changed, 
15 insertions(+), 3 deletions(-) diff --git a/seqtk.c b/seqtk.c index aebd9de..ca558c1 100644 --- a/seqtk.c +++ b/seqtk.c @@ -315,17 +315,29 @@ int stk_trimfq(int argc, char *argv[]) beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { beg = 0; end = min_len; } + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } } else if (right_keep) { beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; if (end < 0) end = 0; - if (end - beg < min_len) { beg = 0; end = min_len; } + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } } else if (left || right) { beg = left; end = seq->seq.l - right; if (end < 0) end = 0; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { beg = 0; end = min_len; } + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } } else if (seq->qual.l > min_len && param != 0.) 
{ for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From 7a3462169a6ce225498df4124e4278254780582d Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Mon, 6 Oct 2014 10:44:41 +0300 Subject: [PATCH 18/32] Update seqtk.c updated to the version string to: 1.0-r68e-dirty --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index ca558c1..a1f3464 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1387,7 +1387,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.0-r68b-dirty\n\n"); + fprintf(stderr, "Version: 1.0-r68e-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n"); From 9ce4ca962125442f0965eba90bbc9d6862d6dc6d Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 22 Oct 2015 12:50:28 +0300 Subject: [PATCH 19/32] updates --- kseq.h | 183 ++-- seqtk.c | 3075 +++++++++++++++++++++++++++++-------------------------- 2 files changed, 1728 insertions(+), 1530 deletions(-) diff --git a/kseq.h b/kseq.h index 84c1fa3..b2238d1 100644 --- a/kseq.h +++ b/kseq.h @@ -37,46 +37,43 @@ #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) #define KS_SEP_MAX 2 -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - int begin, end; \ - int is_eof:2, bufsize:30; \ - type_t f; \ - unsigned char *buf; \ +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ } kstream_t; #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) -#define __KS_BASIC(SCOPE, type_t, __bufsize) \ - SCOPE kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; ks->bufsize = __bufsize; \ - ks->buf = 
(unsigned char*)malloc(__bufsize); \ - return ks; \ - } \ - SCOPE void ks_destroy(kstream_t *ks) \ - { \ - if (!ks) return; \ - free(ks->buf); \ - free(ks); \ +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ } -#define __KS_INLINED(__read) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1;} \ + } \ + return (int)ks->buf[ks->begin++]; \ + } #ifndef KSTRING_T #define KSTRING_T kstring_t @@ -90,74 +87,64 @@ typedef struct __kstring_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#define __KS_GETUNTIL(SCOPE, __read) \ - SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ - { \ - if (dret) *dret = 0; \ - str->l = append? 
str->l : 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter == KS_SEP_LINE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == '\n') break; \ - } else if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + int gotany = 0; \ + if (dret) *dret = 0; \ + str->l = append? str->l : 0; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! 
*/ \ - if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->s == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + gotany = 1; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (!gotany && ks_eof(ks)) return -1; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ - return str->l; \ - } - -#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(SCOPE, type_t, __bufsize) \ - __KS_GETUNTIL(SCOPE, __read) \ - __KS_INLINED(__read) - -#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) - -#define KSTREAM_DECLARE(type_t, __read) \ - __KS_TYPE(type_t) \ - extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int 
*dret, int append); \ - extern kstream_t *ks_init(type_t f); \ - extern void ks_destroy(kstream_t *ks); \ - __KS_INLINED(__read) + return str->l; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } -/****************** - * FASTA/Q parser * - ******************/ +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) @@ -231,7 +218,7 @@ typedef struct __kstring_t { } kseq_t; #define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(SCOPE, type_t) \ __KSEQ_READ(SCOPE) diff --git a/seqtk.c b/seqtk.c index a1f3464..0e522fe 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1,1432 +1,1643 @@ -/* The MIT License - - Copyright (c) 20082-2012 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -typedef struct { - int n, m; - uint64_t *a; -} reglist_t; - -#include "khash.h" -KHASH_MAP_INIT_STR(reg, reglist_t) -KHASH_SET_INIT_INT64(64) - -typedef kh_reg_t reghash_t; - -reghash_t *stk_reg_read(const char *fn) -{ - reghash_t *h = kh_init(reg); - gzFile fp; - kstream_t *ks; - int dret; - kstring_t *str; - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int beg = -1, end = -1; - reglist_t *p; - khint_t k = kh_get(reg, h, str->s); - if (k == kh_end(h)) { - int ret; - char *s = strdup(str->s); - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - beg = atoi(str->s); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - end = atoi(str->s); - if (end < 0) end = -1; - } - } - } - } - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column - if (beg < 0) beg = 0, end = INT_MAX; - if (p->n == p->m) { - p->m = p->m? 
p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | end; - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return h; -} - -void stk_reg_destroy(reghash_t *h) -{ - khint_t k; - if (h == 0) return; - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); -} - -/* constant table */ - -unsigned char seq_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; - -unsigned char seq_nt6_table[256] = { - 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 -}; - -char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; -unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; -int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; -char comp_tab[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', - 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, - 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', - 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 -}; - -static void stk_printstr(const kstring_t *s, unsigned line_len) -{ - if (line_len != UINT_MAX && line_len != 0) { - int i, rest = s->l; - for (i = 0; i < s->l; i += line_len, rest -= line_len) { - putchar('\n'); - if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout); - else fwrite(s->s + i, 1, rest, stdout); - } - putchar('\n'); - } else { - putchar('\n'); - puts(s->s); - } -} - -void stk_printseq(const kseq_t *s, int line_len) -{ - putchar(s->qual.l? '@' : '>'); - fputs(s->name.s, stdout); - if (s->comment.l) { - putchar(' '); fputs(s->comment.s, stdout); - } - stk_printstr(&s->seq, line_len); - if (s->qual.l) { - putchar('+'); - stk_printstr(&s->qual, line_len); - } -} - -/* - 64-bit Mersenne Twister pseudorandom number generator. 
Adapted from: - - http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/mt19937-64.c - - which was written by Takuji Nishimura and Makoto Matsumoto and released - under the 3-clause BSD license. -*/ - -typedef uint64_t krint64_t; - -struct _krand_t; -typedef struct _krand_t krand_t; - -#define KR_NN 312 -#define KR_MM 156 -#define KR_UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */ -#define KR_LM 0x7FFFFFFFULL /* Least significant 31 bits */ - -struct _krand_t { - int mti; - krint64_t mt[KR_NN]; -}; - -static void kr_srand0(krint64_t seed, krand_t *kr) -{ - kr->mt[0] = seed; - for (kr->mti = 1; kr->mti < KR_NN; ++kr->mti) - kr->mt[kr->mti] = 6364136223846793005ULL * (kr->mt[kr->mti - 1] ^ (kr->mt[kr->mti - 1] >> 62)) + kr->mti; -} - -krand_t *kr_srand(krint64_t seed) -{ - krand_t *kr; - kr = malloc(sizeof(krand_t)); - kr_srand0(seed, kr); - return kr; -} - -krint64_t kr_rand(krand_t *kr) -{ - krint64_t x; - static const krint64_t mag01[2] = { 0, 0xB5026F5AA96619E9ULL }; - if (kr->mti >= KR_NN) { - int i; - if (kr->mti == KR_NN + 1) kr_srand0(5489ULL, kr); - for (i = 0; i < KR_NN - KR_MM; ++i) { - x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); - kr->mt[i] = kr->mt[i + KR_MM] ^ (x>>1) ^ mag01[(int)(x&1)]; - } - for (; i < KR_NN - 1; ++i) { - x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); - kr->mt[i] = kr->mt[i + (KR_MM - KR_NN)] ^ (x>>1) ^ mag01[(int)(x&1)]; - } - x = (kr->mt[KR_NN - 1] & KR_UM) | (kr->mt[0] & KR_LM); - kr->mt[KR_NN - 1] = kr->mt[KR_MM - 1] ^ (x>>1) ^ mag01[(int)(x&1)]; - kr->mti = 0; - } - x = kr->mt[kr->mti++]; - x ^= (x >> 29) & 0x5555555555555555ULL; - x ^= (x << 17) & 0x71D67FFFEDA60000ULL; - x ^= (x << 37) & 0xFFF7EEE000000000ULL; - x ^= (x >> 43); - return x; -} - -#define kr_drand(_kr) ((kr_rand(_kr) >> 11) * (1.0/9007199254740992.0)) - - -/* quality based trimming with Mott's algorithm */ -int stk_trimfq(int argc, char *argv[]) -{ // FIXME: when a record with zero length will always be treated as a fasta record - gzFile 
fp; - kseq_t *seq; - double param = 0.05, q_int2real[128]; - int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; - while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { - switch (c) { - case 'q': param = atof(optarg); break; - case 'l': min_len = atoi(optarg); break; - case 'b': left = atoi(optarg); break; - case 'e': right = atoi(optarg); break; - case 'B': left_keep = atoi(optarg); break; - case 'E': right_keep = atoi(optarg); break; - } - } - if (optind == argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); - fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trims down from right end to INT bp when the trimming results in read length below this [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q; it has priority over -B) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q; it has priority over -E) [0]\n"); - fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-e) [%d]\n", left_keep); - fprintf(stderr, " -E INT keep last INT bp from right (disabled by -q/-b/-B) [%d]\n", right_keep); - fprintf(stderr, "\n"); - return 1; - } - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - for (i = 0; i < 128; ++i) - q_int2real[i] = pow(10., -(i - 33) / 10.); - while (kseq_read(seq) >= 0) { - int beg, tmp, end; - double s, max = 0.; - if (seq->seq.l == 0) { // trying to fix locally the bug where reads with no sequence are converted to FASTA format - beg = 0; - end = 1; - seq->seq.l = 1; - seq->qual.l = 1; - seq->seq.s = (char*)malloc(2); - seq->seq.s[0] = 'A'; - seq->qual.s = (char*)malloc(2); - seq->qual.s[0]='F'; - } else if (left_keep) { - beg = left; end = left + left_keep; - if (seq->seq.l < end) end = seq->seq.l; - if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { - beg = 0; - end = min_len; - if (end > seq->seq.l) end = seq->seq.l; - } - } else if (right_keep) { - beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; - if (beg < 0) beg = 0; - if (end < 0) end = 0; - if (end - beg < min_len) { - beg = 0; - end = min_len; - if (end > seq->seq.l) end = seq->seq.l; - } - } else if (left || right) { - beg = left; end = seq->seq.l - right; - if (end < 0) end = 0; - if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { - beg = 0; - end = min_len; - if (end > seq->seq.l) end = seq->seq.l; - } - } else if (seq->qual.l > min_len && param != 0.) { - for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { - int q = seq->qual.s[i]; - if (q < 36) q = 36; - if (q > 127) q = 127; - s += param - q_int2real[q]; - if (s > max) max = s, beg = tmp, end = i + 1; - if (s < 0) s = 0, tmp = i + 1; - } - - /* max never set; all low qual, just give first min_len bp */ - if (max == 0.) 
beg = 0, end = min_len; - - if (end - beg < min_len) { // window-based - int is, imax; - for (i = 0, is = 0; i < min_len; ++i) - is += seq->qual.s[i] - 33; - for (imax = is, beg = 0; i < seq->qual.l; ++i) { - is += (int)seq->qual.s[i] - seq->qual.s[i - min_len]; - if (imax < is) imax = is, beg = i - min_len + 1; - } - end = beg + min_len; - } - } else beg = 0, end = seq->seq.l; - putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout); - if (seq->comment.l) { - putchar(' '); puts(seq->comment.s); - } else putchar('\n'); - fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n'); - if (seq->qual.l) { - puts("+"); - fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n'); - } - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* composition */ -int stk_comp(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, upper_only = 0, from_stdin; - reghash_t *h = 0; - reglist_t dummy; - while ((c = getopt(argc, argv, "ur:")) >= 0) { - switch (c) { - case 'u': upper_only = 1; break; - case 'r': h = stk_reg_read(optarg); break; - } - } - from_stdin = !isatty(fileno(stdin)); - if (argc == optind && !from_stdin) { - fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); - fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); - return 1; - } - if (from_stdin && strcmp(argv[optind], "-") != 0) - fprintf(stderr, "[W::%s] stdin is available; the input file is ignored!\n", __func__); - fp = optind < argc && strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); - while ((l = kseq_read(seq)) >= 0) { - int i, k; - reglist_t *p = 0; - if (h) { - khint_t k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) p = &kh_val(h, k); - } else { - p = &dummy; - dummy.a[0] = l; - } - for (k = 0; p && k < p->n; ++k) { - int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; - int la, lb, lc, na, nb, nc, cnt[11]; - if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; - else la = 'a', lb = -1, lc = 0; - na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - memset(cnt, 0, 11 * sizeof(int)); - for (i = beg; i < end; ++i) { - int is_CpG = 0, a, b, c; - a = na; b = nb; c = nc; - na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - if (b == 2 || b == 10) { // C or Y - if (nb == 4 || nb == 5) is_CpG = 1; - } else if (b == 4 || b == 5) { // G or R - if (lb == 2 || lb == 10) is_CpG = 1; - } - if (upper_only == 0 || isupper(a)) { - if (c > 1) ++cnt[c+2]; - if (c == 1) ++cnt[seq_nt16to4_table[b]]; - if (b == 10 || b == 5) ++cnt[9]; - else if (c == 2) { - ++cnt[8]; - } - if (is_CpG) { - ++cnt[7]; - if (b == 10 || b == 5) ++cnt[10]; - } - } - la = a; lb = b; lc = c; - } - if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); - else printf("%s\t%d", seq->name.s, l); - for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); - putchar('\n'); - } - fflush(stdout); - } - free(dummy.a); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_randbase(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk randbase \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - int i; - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - int c, b, a, j, k, m; - b = seq->seq.s[i]; - c = seq_nt16_table[b]; - a = bitcnt_table[c]; - if (a == 2) { - m = (drand48() < 0.5); - for (j = k = 0; j < 4; ++j) { - if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; - } - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_hety(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; - char *buf; - uint32_t cnt[3]; - if (argc == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk hety [options] \n\n"); - fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); - fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); - fprintf(stderr, " -m treat lowercases as masked\n"); - fprintf(stderr, "\n"); - return 1; - } - while ((c = getopt(argc, argv, "w:t:m")) >= 0) { - switch (c) { - case 'w': win_size = atoi(optarg); break; - case 't': n_start = atoi(optarg); break; - case 'm': is_lower_mask = 1; break; - } - } - fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - win_step = win_size / n_start; - buf = calloc(win_size, 1); - while ((l = kseq_read(seq)) >= 0) { - int x, i, y, z, next = 0; - cnt[0] = cnt[1] = cnt[2] = 0; - for (i = 0; i <= l; ++i) { - if ((i >= win_size && i % win_step == 0) || i == l) { - if (i == l && l >= win_size) { - for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; - } - if (cnt[1] + cnt[2] > 0) - printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, - (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); - next = i; - } - if (i < l) { - y = i % win_size; - c = seq->seq.s[i]; - if (is_lower_mask && islower(c)) c = 'N'; - c = seq_nt16_table[c]; - x = bitcnt_table[c]; - if (i >= win_size) --cnt[(int)buf[y]]; - buf[y] = z = x > 2? 0 : x == 2? 2 : 1; - ++cnt[z]; - } - } - } - free(buf); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* subseq */ - -int stk_subseq(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - int l, i, j, c, is_tab = 0, line = 0; - khint_t k; - while ((c = getopt(argc, argv, "tl:")) >= 0) { - switch (c) { - case 't': is_tab = 1; break; - case 'l': line = atoi(optarg); break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk subseq [options] |\n\n"); - fprintf(stderr, "Options: -t TAB delimited output\n"); - fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); - fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); - return 1; - } - h = stk_reg_read(argv[optind+1]); - // subseq - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) continue; - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) { - fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); - continue; - } - if (end > seq->seq.l) end = seq->seq.l; - if (is_tab == 0) { - printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); - if (beg > 0 || (int)p->a[i] != INT_MAX) { - if (end == INT_MAX) { - if (beg) printf(":%d", beg+1); - } else printf(":%d-%d", beg+1, end); - } - if (seq->comment.l) printf("\t%s", seq->comment.s); - } else printf("%s\t%d\t", seq->name.s, beg + 1); - if (end > seq->seq.l) end = seq->seq.l; - for (j = 0; j < end - beg; ++j) { - if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n'); - putchar(seq->seq.s[j + beg]); - } - putchar('\n'); - if (seq->qual.l != seq->seq.l || is_tab) continue; - printf("+"); - for (j = 0; j < end - beg; ++j) { - if (j == 0 || (line > 0 && j % line == 0)) putchar('\n'); - putchar(seq->qual.s[j + beg]); - } - putchar('\n'); - } - } - // free - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - return 0; -} - -/* mergefa */ -int stk_mergefa(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0, is_randhet = 0; - uint64_t cnt[5]; - while ((c = getopt(argc, argv, "himrq:")) >= 0) { - switch (c) { - case 'i': is_intersect = 1; break; - case 'h': is_haploid = 1; break; - case 'm': is_mask = 1; break; - case 'r': is_randhet = 1; break; - case 'q': qual = atoi(optarg); break; - } - } - if (is_mask && is_intersect) { - fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); - return 1; - } - if (optind + 2 > argc) { - fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); - 
fprintf(stderr, "Options: -q INT quality threshold [0]\n"); - fprintf(stderr, " -i take intersection\n"); - fprintf(stderr, " -m convert to lowercase when one of the input base is N\n"); - fprintf(stderr, " -r pick a random allele from het\n"); - fprintf(stderr, " -h suppress hets in the input\n\n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; - srand48(11); - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2], b[2], is_upper; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); - if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); - if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else is_upper = (isupper(c[0]) && isupper(c[1]))? 
1 : 0; - c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; - if (c[0] == 0) c[0] = 15; - if (c[1] == 0) c[1] = 15; - b[0] = bitcnt_table[c[0]]; - b[1] = bitcnt_table[c[1]]; - if (is_upper) { - if (b[0] == 1 && b[1] == 1) { - if (c[0] == c[1]) ++cnt[0]; - else ++cnt[1]; - } else if (b[0] == 1 && b[1] == 2) ++cnt[2]; - else if (b[0] == 2 && b[1] == 1) ++cnt[3]; - else if (b[0] == 2 && b[1] == 2) ++cnt[4]; - } - if (is_haploid && (b[0] > 1 || b[1] > 1)) is_upper = 0; - if (is_intersect) { - c[0] = c[0] & c[1]; - if (c[0] == 0) is_upper = 0; // FIXME: is this a bug - c[0] cannot be 0! - } else if (is_mask) { - if (c[0] == 15 || c[1] == 15) is_upper = 0; - c[0] &= c[1]; - if (c[0] == 0) is_upper = 0; - } else if (is_randhet) { - if (b[0] == 1 && b[1] == 1) { // two homs - c[0] |= c[1]; - } else if (((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1)) && (c[0]&c[1])) { // one hom, one het - c[0] = (lrand48()&1)? (c[0] & c[1]) : (c[0] | c[1]); - } else if (b[0] == 2 && b[1] == 2 && c[0] == c[1]) { // double hets - if (lrand48()&1) { - if (lrand48()&1) { - for (i = 8; i >= 1; i >>= 1) // pick the "larger" allele - if (c[0]&i) c[0] &= i; - } else { - for (i = 1; i <= 8; i <<= 1) // pick the "smaller" allele - if (c[0]&i) c[0] &= i; - } - } // else set as het - } else is_upper = 0; - } else c[0] |= c[1]; - c[0] = seq_nt16_rev_table[c[0]]; - if (!is_upper) c[0] = tolower(c[0]); - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - fprintf(stderr, "[%s] (same,diff,hom-het,het-hom,het-het)=(%ld,%ld,%ld,%ld,%ld)\n", __func__, (long)cnt[0], (long)cnt[1], (long)cnt[2], (long)cnt[3], (long)cnt[4]); - return 0; -} - -int stk_famask(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk famask \n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2]; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (c[1] == 'x') c[0] = tolower(c[0]); - else if (c[1] != 'X') c[0] = c[1]; - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - return 0; -} - -int stk_mutfa(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - kstream_t *ks; - int l, i, dret; - kstring_t *str; - khint_t k; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk mutfa \n\n"); - fprintf(stderr, "Note: contains at least four columns per line which are:\n"); - fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); - return 1; - } - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - char *s = strdup(str->s); - int beg = 0, ret; - reglist_t *p; - k = kh_get(reg, h, s); - if (k == kh_end(h)) { - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col - ks_getuntil(ks, 0, str, &dret); // 3rd col - ks_getuntil(ks, 0, str, &dret); // 4th col - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (isalpha(str->s[0]) && str->l == 1) { - if (p->n == p->m) { - p->m = p->m? 
p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; - } - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - // mutfa - fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) { - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32; - if (beg < seq->seq.l) - seq->seq.s[beg] = (int)p->a[i]; - } - } - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - // free - kseq_destroy(seq); - gzclose(fp); - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); - return 0; -} - -int stk_listhet(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int i, l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk listhet \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - for (i = 0; i < l; ++i) { - int b = seq->seq.s[i]; - if (bitcnt_table[seq_nt16_table[b]] == 2) - printf("%s\t%d\t%c\n", seq->name.s, i+1, b); - } - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* cutN */ -static int cutN_min_N_tract = 1000; -static int cutN_nonN_penalty = 10; - -static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) -{ - int i, b, e; - while (k < ks->seq.l) { - if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { - int score, max; - score = 0; e = max = -1; - for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, e = i; - } - score = 0; b = max = -1; - for (i = e; i >= 0 && score >= 0; --i) { /* backward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, b = i; - } - if (e + 1 - b >= cutN_min_N_tract) { - *begin = b; - *end = e + 1; - return *end; - } - k = e + 1; - } else ++k; - } - return -1; -} -static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) -{ - int i; - if (begin >= end) return; // FIXME: why may this happen? Understand it! - fprintf(fpout, "%c%s:%d-%d", ks->qual.l? 
'@' : '>', ks->name.s, begin+1, end); - for (i = begin; i < end && i < ks->seq.l; ++i) { - if ((i - begin)%60 == 0) fputc('\n', fpout); - fputc(ks->seq.s[i], fpout); - } - fputc('\n', fpout); - if (ks->qual.l == 0) return; - fputs("+\n", fpout); - for (i = begin; i < end && i < ks->qual.l; ++i) { - if ((i - begin)%60 == 0) fputc('\n', fpout); - fputc(ks->qual.s[i], fpout); - } - fputc('\n', fpout); -} -int stk_cutN(int argc, char *argv[]) -{ - int c, l, gap_only = 0; - gzFile fp; - kseq_t *ks; - while ((c = getopt(argc, argv, "n:p:g")) >= 0) { - switch (c) { - case 'n': cutN_min_N_tract = atoi(optarg); break; - case 'p': cutN_nonN_penalty = atoi(optarg); break; - case 'g': gap_only = 1; break; - default: return 1; - } - } - if (argc == optind) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); - fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); - fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); - fprintf(stderr, " -g print gaps only, no sequence\n\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - ks = kseq_init(fp); - while ((l = kseq_read(ks)) >= 0) { - int k = 0, begin = 0, end = 0; - while (find_next_cut(ks, k, &begin, &end) >= 0) { - if (begin != 0) { - if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); - else print_seq(stdout, ks, k, begin); - } - k = end; - } - if (!gap_only) print_seq(stdout, ks, k, l); - } - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -int stk_hrun(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *ks; - int min_len = 7, l = 0, c = 0, beg = 0, i; - if (argc == optind) { - fprintf(stderr, "Usage: seqtk hrun [minLen=%d]\n", min_len); - return 1; - } - if (argc == optind + 2) min_len = atoi(argv[optind+1]); - fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - ks = kseq_init(fp); - while (kseq_read(ks) >= 0) { - c = ks->seq.s[0]; l = 1; beg = 0; - for (i = 1; i < ks->seq.l; ++i) { - if (ks->seq.s[i] != c) { - if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); - c = ks->seq.s[i]; l = 1; beg = i; - } else ++l; - } - } - if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -/* sample */ - -static void cpy_kstr(kstring_t *dst, const kstring_t *src) -{ - if (src->l == 0) return; - if (src->l + 1 > dst->m) { - dst->m = src->l + 1; - kroundup32(dst->m); - dst->s = realloc(dst->s, dst->m); - } - dst->l = src->l; - memcpy(dst->s, src->s, src->l + 1); -} - -static void cpy_kseq(kseq_t *dst, const kseq_t *src) -{ - cpy_kstr(&dst->name, &src->name); - cpy_kstr(&dst->seq, &src->seq); - cpy_kstr(&dst->qual, &src->qual); - cpy_kstr(&dst->comment, &src->comment); -} - -int stk_sample(int argc, char *argv[]) -{ - int c, twopass = 0; - uint64_t i, num = 0, n_seqs = 0; - double frac = 0.; - gzFile fp; - kseq_t *seq; - krand_t *kr = 0; - - while ((c = getopt(argc, argv, "2s:")) >= 0) - if (c == 's') kr = kr_srand(atol(optarg)); - else if (c == '2') twopass = 1; - - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk sample [-2] [-s seed=11] |\n\n"); - fprintf(stderr, "Options: -s INT RNG seed [11]\n"); - fprintf(stderr, " -2 2-pass mode: twice as slow but with much reduced memory\n\n"); - return 1; - } - frac = atof(argv[optind+1]); - if (frac > 1.) 
num = (uint64_t)(frac + .499), frac = 0.; - else if (twopass) { - fprintf(stderr, "[W::%s] when sampling a fraction, option -2 is ignored.", __func__); - twopass = 0; - } - if (kr == 0) kr = kr_srand(11); - - if (!twopass) { // the streaming version - kseq_t *buf = 0; - if (num > 0) buf = calloc(num, sizeof(kseq_t)); - if (num > 0 && buf == NULL) { - fprintf(stderr, "[E::%s] Could not allocate enough memory for %" PRIu64 " sequences. Exiting...\n", __func__, num); - free(kr); - return 1; - } - - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - n_seqs = 0; - while (kseq_read(seq) >= 0) { - double r = kr_drand(kr); - ++n_seqs; - if (num) { - uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs); - if (y < num) cpy_kseq(&buf[y], seq); - } else if (r < frac) stk_printseq(seq, UINT_MAX); - } - for (i = 0; i < num; ++i) { - kseq_t *p = &buf[i]; - if (p->seq.l) stk_printseq(p, UINT_MAX); - free(p->seq.s); free(p->qual.s); free(p->name.s); - } - if (buf != NULL) free(buf); - } else { - uint64_t *buf; - khash_t(64) *hash; - int absent; - - if (strcmp(argv[optind], "-") == 0) { - fprintf(stderr, "[E::%s] in the 2-pass mode, the input cannot be STDIN.\n", __func__); - free(kr); - return 1; - } - - // 1st pass - buf = malloc(num * 8); - for (i = 0; i < num; ++i) buf[i] = UINT64_MAX; - fp = gzopen(argv[optind], "r"); - seq = kseq_init(fp); - n_seqs = 0; - while (kseq_read(seq) >= 0) { - double r = kr_drand(kr); - uint64_t y; - ++n_seqs; - y = n_seqs - 1 < num? 
n_seqs - 1 : (uint64_t)(r * n_seqs); - if (y < num) buf[y] = n_seqs; - } - kseq_destroy(seq); - gzclose(fp); - hash = kh_init(64); - for (i = 0; i < num; ++i) kh_put(64, hash, buf[i], &absent); - free(buf); - // 2nd pass - fp = gzopen(argv[optind], "r"); - seq = kseq_init(fp); - n_seqs = 0; - while (kseq_read(seq) >= 0) - if (kh_get(64, hash, ++n_seqs) != kh_end(hash)) - stk_printseq(seq, UINT_MAX); - kh_destroy(64, hash); - } - - kseq_destroy(seq); - gzclose(fp); - free(kr); - return 0; -} - -/* seq */ - -void stk_mask(kseq_t *seq, const khash_t(reg) *h, int is_complement, int mask_chr) -{ - unsigned i, j; - khiter_t k; - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) { // not found in the hash table - if (is_complement) { - if (mask_chr) { - for (j = 0; j < seq->seq.l; ++j) - seq->seq.s[j] = mask_chr; - } else { - for (j = 0; j < seq->seq.l; ++j) - seq->seq.s[j] = tolower(seq->seq.s[j]); - } - } - } else { - reglist_t *p = &kh_val(h, k); - if (!is_complement) { - for (i = 0; i < p->n; ++i) { - unsigned beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) continue; - if (end > seq->seq.l) end = seq->seq.l; - if (!mask_chr) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); - else for (j = beg; j < end; ++j) seq->seq.s[j] = mask_chr; - } - } else { - int8_t *mask = calloc(seq->seq.l, 1); - for (i = 0; i < p->n; ++i) { - unsigned beg = p->a[i]>>32, end = p->a[i]; - if (end >= seq->seq.l) end = seq->seq.l; - for (j = beg; j < end; ++j) mask[j] = 1; - } - if (mask_chr) { - for (j = 0; j < seq->seq.l; ++j) - if (mask[j] == 0) seq->seq.s[j] = mask_chr; - } else { - for (j = 0; j < seq->seq.l; ++j) - if (mask[j] == 0) seq->seq.s[j] = tolower(seq->seq.s[j]); - } - free(mask); - } - } -} - -int stk_seq(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, from_stdin, max_q = 255; - unsigned i, line_len = 0; - int64_t n_seqs = 0; - double frac = 1.; - khash_t(reg) 
*h = 0; - krand_t *kr = 0; - - while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVX:")) >= 0) { - switch (c) { - case 'a': - case 'A': flag |= 1; break; - case 'C': flag |= 2; break; - case 'r': flag |= 4; break; - case 'c': flag |= 8; break; - case '1': flag |= 16; break; - case '2': flag |= 32; break; - case 'V': flag |= 64; break; - case 'N': flag |= 128; break; - case 'M': h = stk_reg_read(optarg); break; - case 'n': mask_chr = *optarg; break; - case 'Q': qual_shift = atoi(optarg); break; - case 'q': qual_thres = atoi(optarg); break; - case 'X': max_q = atoi(optarg); break; - case 'l': line_len = atoi(optarg); break; - case 'L': min_len = atoi(optarg); break; - case 's': kr = kr_srand(atol(optarg)); break; - case 'f': frac = atof(optarg); break; - } - } - if (kr == 0) kr = kr_srand(11); - from_stdin = !isatty(fileno(stdin)); - if (argc == optind && !from_stdin) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk seq [options] |\n\n"); - fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n"); - fprintf(stderr, " -X INT mask bases with quality higher than INT [255]\n"); - fprintf(stderr, " -n CHAR masked bases converted to CHAR; 0 for lowercase [0]\n"); - fprintf(stderr, " -l INT number of residues per line; 0 for 2^32-1 [%d]\n", line_len); - fprintf(stderr, " -Q INT quality shift: ASCII-INT gives base quality [%d]\n", qual_shift); - fprintf(stderr, " -s INT random seed (effective with -f) [11]\n"); - fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n"); - fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n"); - fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n"); - fprintf(stderr, " -c mask complement region (effective with -M)\n"); - fprintf(stderr, " -r reverse complement\n"); - fprintf(stderr, " -A force FASTA output (discard quality)\n"); - fprintf(stderr, " -C drop comments at the header lines\n"); - fprintf(stderr, " -N drop sequences containing ambiguous 
bases\n"); - fprintf(stderr, " -1 output the 2n-1 reads only\n"); - fprintf(stderr, " -2 output the 2n reads only\n"); - fprintf(stderr, " -V shift quality by '(-Q) - 33'\n"); - fprintf(stderr, "\n"); - free(kr); - return 1; - } - if (line_len == 0) line_len = UINT_MAX; - fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - qual_thres += qual_shift; - while (kseq_read(seq) >= 0) { - ++n_seqs; - if (seq->seq.l < min_len) continue; // NB: length filter before taking random - if (frac < 1. && kr_drand(kr) >= frac) continue; - if (flag & 48) { // then choose odd/even reads only - if ((flag&16) && (n_seqs&1) == 0) continue; - if ((flag&32) && (n_seqs&1) == 1) continue; - } - if (seq->qual.l && qual_thres > qual_shift) { - if (mask_chr) { - for (i = 0; i < seq->seq.l; ++i) - if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) - seq->seq.s[i] = mask_chr; - } else { - for (i = 0; i < seq->seq.l; ++i) - if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) - seq->seq.s[i] = tolower(seq->seq.s[i]); - } - } - if (flag & 1) seq->qual.l = 0; - if (flag & 2) seq->comment.l = 0; - if (h) stk_mask(seq, h, flag&8, mask_chr); // masking - if (flag & 4) { // reverse complement - int c0, c1; - for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence - c0 = comp_tab[(int)seq->seq.s[i]]; - c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]]; - seq->seq.s[i] = c1; - seq->seq.s[seq->seq.l - 1 - i] = c0; - } - if (seq->seq.l & 1) // complement the remaining base - seq->seq.s[seq->seq.l>>1] = comp_tab[(int)seq->seq.s[seq->seq.l>>1]]; - if (seq->qual.l) { - for (i = 0; i < seq->seq.l>>1; ++i) // reverse quality - c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0; - } - } - if ((flag & 64) && seq->qual.l && qual_shift != 33) - for (i = 0; i < seq->qual.l; ++i) - seq->qual.s[i] -= qual_shift - 33; - if (flag & 128) { - for (i = 0; i < 
seq->seq.l; ++i) - if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break; - if (i < seq->seq.l) continue; - } - stk_printseq(seq, line_len); - } - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - free(kr); - return 0; -} - -int stk_mergepe(int argc, char *argv[]) -{ - gzFile fp1, fp2; - kseq_t *seq[2]; - - if (argc < 3) { - fprintf(stderr, "Usage: seqtk mergepe \n"); - return 1; - } - fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - seq[0] = kseq_init(fp1); - seq[1] = kseq_init(fp2); - while (kseq_read(seq[0]) >= 0) { - if (kseq_read(seq[1]) < 0) { - fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__); - break; - } - stk_printseq(seq[0], 0); - stk_printseq(seq[1], 0); - } - if (kseq_read(seq[1]) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__); - kseq_destroy(seq[0]); gzclose(fp1); - kseq_destroy(seq[1]); gzclose(fp2); - return 0; -} - -int stk_dropse(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq, last; - int from_stdin; - - from_stdin = !isatty(fileno(stdin)); - if (argc == 1 && !from_stdin) { - fprintf(stderr, "Usage: seqtk dropSE \n"); - return 1; - } - if (from_stdin && argc != 1 && strcmp(argv[1], "-") != 0) - fprintf(stderr, "[W::%s] stdin is available; the input file is ignored!\n", __func__); - fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - - memset(&last, 0, sizeof(kseq_t)); - while (kseq_read(seq) >= 0) { - if (last.name.l) { - kstring_t *p = &last.name, *q = &seq->name; - int is_diff; - if (p->l == q->l) { - int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? 
p->l - 2 : p->l; - is_diff = strncmp(p->s, q->s, l); - } else is_diff = 1; - if (!is_diff) { - stk_printseq(&last, 0); - stk_printseq(seq, 0); - last.name.l = 0; - } else cpy_kseq(&last, seq); - } else cpy_kseq(&last, seq); - } - - kseq_destroy(seq); - gzclose(fp); - // free last! - return 0; -} - - -int stk_kfreq(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *ks; - int kmer, i, l, mask; - char *nei; - - if (argc < 2) { - fprintf(stderr, "Usage: seqtk kfreq \n"); - return 1; - } - - // get the k-mer - l = strlen(argv[1]); - for (i = kmer = 0; i < l; ++i) { - int c = seq_nt6_table[(int)argv[1][i]]; - assert(c >= 1 && c <= 4); - kmer = kmer << 2 | (c - 1); - } - mask = (1<<2*l) - 1; - - // get the neighbors - nei = calloc(1, 1<<2*l); - for (i = 0; i < l; ++i) { - int j, x; - x = kmer & ~(3 << 2*i); - for (j = 0; j < 4; ++j) - nei[x|j<<2*i] = 1; - } - - fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r"); - ks = kseq_init(fp); - while (kseq_read(ks) >= 0) { - int k, x[2], cnt[2], cnt_nei[2], which; - x[0] = x[1] = k = cnt[0] = cnt[1] = cnt_nei[0] = cnt_nei[1] = 0; - for (i = 0; i < ks->seq.l; ++i) { - int c = seq_nt6_table[(int)ks->seq.s[i]]; - if (c >= 1 && c <= 4) { - x[0] = (x[0] << 2 | (c - 1)) & mask; - x[1] = (x[1] >> 2 | (4 - c) << 2*(l-1)); - if (k < l) ++k; - if (k == l) { - if (x[0] == kmer) ++cnt[0]; - else if (x[1] == kmer) ++cnt[1]; - if (nei[x[0]]) ++cnt_nei[0]; - else if (nei[x[1]]) ++cnt_nei[1]; - } - } else k = 0; - } - which = cnt_nei[0] > cnt_nei[1]? 
0 : 1; - printf("%s\t%ld\t%c\t%d\t%d\n", ks->name.s, ks->seq.l, "+-"[which], cnt_nei[which], cnt[which]); - } - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -/* main function */ -static int usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.0-r68e-dirty\n\n"); - fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); - fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); - fprintf(stderr, " sample subsample sequences\n"); - fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); - fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n"); - fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n"); - fprintf(stderr, " hety regional heterozygosity\n"); - fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); - fprintf(stderr, " mergefa merge two FASTA/Q files\n"); - fprintf(stderr, " dropse drop unpaired from interleaved PE FASTA/Q\n"); - fprintf(stderr, " randbase choose a random base from hets\n"); - fprintf(stderr, " cutN cut sequence at long N\n"); - fprintf(stderr, " listhet extract the position of each het\n"); - fprintf(stderr, "\n"); - return 1; -} - -int main(int argc, char *argv[]) -{ - if (argc == 1) return usage(); - if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); - else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); - else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); - else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); - else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); - else if (strcmp(argv[1], "mergepe") == 0) stk_mergepe(argc-1, argv+1); - else if (strcmp(argv[1], "dropse") == 0) stk_dropse(argc-1, argv+1); - else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); - else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); - else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); - else if (strcmp(argv[1], 
"famask") == 0) stk_famask(argc-1, argv+1); - else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1); - else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1); - else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1); - else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1); - else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1); - else { - fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]); - return 1; - } - return 0; -} +/* The MIT License + + Copyright (c) 20082-2012 by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int n, m; + uint64_t *a; +} reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, reglist_t) +KHASH_SET_INIT_INT64(64) + +typedef kh_reg_t reghash_t; + +reghash_t *stk_reg_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int beg = -1, end = -1; + reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); + if (end < 0) end = -1; + } + } + } + } + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg < 0) beg = 0, end = INT_MAX; + if (p->n == p->m) { + p->m = p->m? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return h; +} + +void stk_reg_destroy(reghash_t *h) +{ + khint_t k; + if (h == 0) return; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} + +/* constant table */ + +unsigned char seq_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +unsigned char seq_nt6_table[256] = { + 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 +}; + +char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; +unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; +unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; +int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; +char comp_tab[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', + 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, + 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', + 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 +}; + +static void stk_printstr(const kstring_t *s, unsigned line_len) +{ + if (line_len != UINT_MAX && line_len != 0) { + int i, rest = s->l; + for (i = 0; i < s->l; i += line_len, rest -= line_len) { + putchar('\n'); + if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout); + else fwrite(s->s + i, 1, rest, stdout); + } + putchar('\n'); + } else { + putchar('\n'); + puts(s->s); + } +} + +static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n) +{ + putchar(s->qual.l? 
'@' : '>'); + if (n >= 0) { + if (prefix) fputs(prefix, stdout); + printf("%lld", (long long)n); + } else fputs(s->name.s, stdout); + if (s->comment.l) { + putchar(' '); fputs(s->comment.s, stdout); + } + stk_printstr(&s->seq, line_len); + if (s->qual.l) { + putchar('+'); + stk_printstr(&s->qual, line_len); + } +} + +inline void stk_printseq(const kseq_t *s, int line_len) +{ + stk_printseq_renamed(s, line_len, 0, -1); +} + +/* + 64-bit Mersenne Twister pseudorandom number generator. Adapted from: + + http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/mt19937-64.c + + which was written by Takuji Nishimura and Makoto Matsumoto and released + under the 3-clause BSD license. +*/ + +typedef uint64_t krint64_t; + +struct _krand_t; +typedef struct _krand_t krand_t; + +#define KR_NN 312 +#define KR_MM 156 +#define KR_UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */ +#define KR_LM 0x7FFFFFFFULL /* Least significant 31 bits */ + +struct _krand_t { + int mti; + krint64_t mt[KR_NN]; +}; + +static void kr_srand0(krint64_t seed, krand_t *kr) +{ + kr->mt[0] = seed; + for (kr->mti = 1; kr->mti < KR_NN; ++kr->mti) + kr->mt[kr->mti] = 6364136223846793005ULL * (kr->mt[kr->mti - 1] ^ (kr->mt[kr->mti - 1] >> 62)) + kr->mti; +} + +krand_t *kr_srand(krint64_t seed) +{ + krand_t *kr; + kr = malloc(sizeof(krand_t)); + kr_srand0(seed, kr); + return kr; +} + +krint64_t kr_rand(krand_t *kr) +{ + krint64_t x; + static const krint64_t mag01[2] = { 0, 0xB5026F5AA96619E9ULL }; + if (kr->mti >= KR_NN) { + int i; + if (kr->mti == KR_NN + 1) kr_srand0(5489ULL, kr); + for (i = 0; i < KR_NN - KR_MM; ++i) { + x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); + kr->mt[i] = kr->mt[i + KR_MM] ^ (x>>1) ^ mag01[(int)(x&1)]; + } + for (; i < KR_NN - 1; ++i) { + x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); + kr->mt[i] = kr->mt[i + (KR_MM - KR_NN)] ^ (x>>1) ^ mag01[(int)(x&1)]; + } + x = (kr->mt[KR_NN - 1] & KR_UM) | (kr->mt[0] & KR_LM); + kr->mt[KR_NN - 1] = kr->mt[KR_MM - 1] ^ (x>>1) ^ 
mag01[(int)(x&1)]; + kr->mti = 0; + } + x = kr->mt[kr->mti++]; + x ^= (x >> 29) & 0x5555555555555555ULL; + x ^= (x << 17) & 0x71D67FFFEDA60000ULL; + x ^= (x << 37) & 0xFFF7EEE000000000ULL; + x ^= (x >> 43); + return x; +} + +#define kr_drand(_kr) ((kr_rand(_kr) >> 11) * (1.0/9007199254740992.0)) + + +/* quality based trimming with Mott's algorithm */ +int stk_trimfq(int argc, char *argv[]) +{ // FIXME: when a record with zero length will always be treated as a fasta record + gzFile fp; + kseq_t *seq; + double param = 0.05, q_int2real[128]; + int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { + switch (c) { + case 'q': param = atof(optarg); break; + case 'l': min_len = atoi(optarg); break; + case 'b': left = atoi(optarg); break; + case 'e': right = atoi(optarg); break; + case 'B': left_keep = atoi(optarg); break; + case 'E': right_keep = atoi(optarg); break; + } + } + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); + fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); + fprintf(stderr, " -l INT maximally trims down from right end to INT bp when the trimming results in read length below this [%d]\n", min_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q; it has priority over -B) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q; it has priority over -E) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-e) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (disabled by -q/-b/-B) [%d]\n", right_keep); + fprintf(stderr, "\n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + for (i = 0; i < 128; ++i) + q_int2real[i] = pow(10., -(i - 33) / 10.); + while (kseq_read(seq) >= 0) { + int beg, tmp, end; + double s, max = 0.; + if (seq->seq.l == 0) { // trying to fix locally the bug where reads with no sequence are converted to FASTA format + beg = 0; + end = 1; + seq->seq.l = 1; + seq->qual.l = 1; + seq->seq.s = (char*)malloc(2); + seq->seq.s[0] = 'A'; + seq->qual.s = (char*)malloc(2); + seq->qual.s[0]='F'; + } else if (left_keep) { + beg = left; end = left + left_keep; + if (seq->seq.l < end) end = seq->seq.l; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (right_keep) { + beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; + if (beg < 0) beg = 0; + if (end < 0) end = 0; + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (left || right) { + beg = left; end = seq->seq.l - right; + if (end < 0) end = 0; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (seq->qual.l > min_len && param != 0.) { + for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { + int q = seq->qual.s[i]; + if (q < 36) q = 36; + if (q > 127) q = 127; + s += param - q_int2real[q]; + if (s > max) max = s, beg = tmp, end = i + 1; + if (s < 0) s = 0, tmp = i + 1; + } + + /* max never set; all low qual, just give first min_len bp */ + if (max == 0.) 
beg = 0, end = min_len; + + if (end - beg < min_len) { // window-based + int is, imax; + for (i = 0, is = 0; i < min_len; ++i) + is += seq->qual.s[i] - 33; + for (imax = is, beg = 0; i < seq->qual.l; ++i) { + is += (int)seq->qual.s[i] - seq->qual.s[i - min_len]; + if (imax < is) imax = is, beg = i - min_len + 1; + } + end = beg + min_len; + } + } else beg = 0, end = seq->seq.l; + putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout); + if (seq->comment.l) { + putchar(' '); puts(seq->comment.s); + } else putchar('\n'); + fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n'); + if (seq->qual.l) { + puts("+"); + fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n'); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* composition */ +int stk_comp(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, upper_only = 0; + reghash_t *h = 0; + reglist_t dummy; + + while ((c = getopt(argc, argv, "ur:")) >= 0) { + switch (c) { + case 'u': upper_only = 1; break; + case 'r': h = stk_reg_read(optarg); break; + } + } + if (argc == optind && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); + fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); + return 1; + } + fp = optind < argc && strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); + while ((l = kseq_read(seq)) >= 0) { + int i, k; + reglist_t *p = 0; + if (h) { + khint_t k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) p = &kh_val(h, k); + } else { + p = &dummy; + dummy.a[0] = l; + } + for (k = 0; p && k < p->n; ++k) { + int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; + int la, lb, lc, na, nb, nc, cnt[11]; + if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; + else la = 'a', lb = -1, lc = 0; + na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + memset(cnt, 0, 11 * sizeof(int)); + for (i = beg; i < end; ++i) { + int is_CpG = 0, a, b, c; + a = na; b = nb; c = nc; + na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + if (b == 2 || b == 10) { // C or Y + if (nb == 4 || nb == 5) is_CpG = 1; + } else if (b == 4 || b == 5) { // G or R + if (lb == 2 || lb == 10) is_CpG = 1; + } + if (upper_only == 0 || isupper(a)) { + if (c > 1) ++cnt[c+2]; + if (c == 1) ++cnt[seq_nt16to4_table[b]]; + if (b == 10 || b == 5) ++cnt[9]; + else if (c == 2) { + ++cnt[8]; + } + if (is_CpG) { + ++cnt[7]; + if (b == 10 || b == 5) ++cnt[10]; + } + } + la = a; lb = b; lc = c; + } + if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); + else printf("%s\t%d", seq->name.s, l); + for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); + putchar('\n'); + } + fflush(stdout); + } + free(dummy.a); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_randbase(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk randbase \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + int i; + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + int c, b, a, j, k, m; + b = seq->seq.s[i]; + c = seq_nt16_table[b]; + a = bitcnt_table[c]; + if (a == 2) { + m = (drand48() < 0.5); + for (j = k = 0; j < 4; ++j) { + if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; + } + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_hety(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; + char *buf; + uint32_t cnt[3]; + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk hety [options] \n\n"); + fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); + fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); + fprintf(stderr, " -m treat lowercases as masked\n"); + fprintf(stderr, "\n"); + return 1; + } + while ((c = getopt(argc, argv, "w:t:m")) >= 0) { + switch (c) { + case 'w': win_size = atoi(optarg); break; + case 't': n_start = atoi(optarg); break; + case 'm': is_lower_mask = 1; break; + } + } + fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + win_step = win_size / n_start; + buf = calloc(win_size, 1); + while ((l = kseq_read(seq)) >= 0) { + int x, i, y, z, next = 0; + cnt[0] = cnt[1] = cnt[2] = 0; + for (i = 0; i <= l; ++i) { + if ((i >= win_size && i % win_step == 0) || i == l) { + if (i == l && l >= win_size) { + for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; + } + if (cnt[1] + cnt[2] > 0) + printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, + (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); + next = i; + } + if (i < l) { + y = i % win_size; + c = seq->seq.s[i]; + if (is_lower_mask && islower(c)) c = 'N'; + c = seq_nt16_table[c]; + x = bitcnt_table[c]; + if (i >= win_size) --cnt[(int)buf[y]]; + buf[y] = z = x > 2? 0 : x == 2? 2 : 1; + ++cnt[z]; + } + } + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* subseq */ + +int stk_subseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_tab = 0, line = 0; + khint_t k; + while ((c = getopt(argc, argv, "tl:")) >= 0) { + switch (c) { + case 't': is_tab = 1; break; + case 'l': line = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk subseq [options] |\n\n"); + fprintf(stderr, "Options: -t TAB delimited output\n"); + fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); + fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // subseq + fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); + continue; + } + if (end > seq->seq.l) end = seq->seq.l; + if (is_tab == 0) { + printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); + if (beg > 0 || (int)p->a[i] != INT_MAX) { + if (end == INT_MAX) { + if (beg) printf(":%d", beg+1); + } else printf(":%d-%d", beg+1, end); + } + if (seq->comment.l) printf("\t%s", seq->comment.s); + } else printf("%s\t%d\t", seq->name.s, beg + 1); + if (end > seq->seq.l) end = seq->seq.l; + for (j = 0; j < end - beg; ++j) { + if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n'); + putchar(seq->seq.s[j + beg]); + } + putchar('\n'); + if (seq->qual.l != seq->seq.l || is_tab) continue; + printf("+"); + for (j = 0; j < end - beg; ++j) { + if (j == 0 || (line > 0 && j % line == 0)) putchar('\n'); + putchar(seq->qual.s[j + beg]); + } + putchar('\n'); + } + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* mergefa */ +int stk_mergefa(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0, is_randhet = 0; + uint64_t cnt[5]; + while ((c = getopt(argc, argv, "himrq:")) >= 0) { + switch (c) { + case 'i': is_intersect = 1; break; + case 'h': is_haploid = 1; break; + case 'm': is_mask = 1; break; + case 'r': is_randhet = 1; break; + case 'q': qual = atoi(optarg); break; + } + } + if (is_mask && is_intersect) { + fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); + return 1; + } + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); + 
fprintf(stderr, "Options: -q INT quality threshold [0]\n"); + fprintf(stderr, " -i take intersection\n"); + fprintf(stderr, " -m convert to lowercase when one of the input base is N\n"); + fprintf(stderr, " -r pick a random allele from het\n"); + fprintf(stderr, " -h suppress hets in the input\n\n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; + srand48(11); + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2], b[2], is_upper; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); + if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); + if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else is_upper = (isupper(c[0]) && isupper(c[1]))? 
1 : 0; + c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; + if (c[0] == 0) c[0] = 15; + if (c[1] == 0) c[1] = 15; + b[0] = bitcnt_table[c[0]]; + b[1] = bitcnt_table[c[1]]; + if (is_upper) { + if (b[0] == 1 && b[1] == 1) { + if (c[0] == c[1]) ++cnt[0]; + else ++cnt[1]; + } else if (b[0] == 1 && b[1] == 2) ++cnt[2]; + else if (b[0] == 2 && b[1] == 1) ++cnt[3]; + else if (b[0] == 2 && b[1] == 2) ++cnt[4]; + } + if (is_haploid && (b[0] > 1 || b[1] > 1)) is_upper = 0; + if (is_intersect) { + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; // FIXME: is this a bug - c[0] cannot be 0! + } else if (is_mask) { + if (c[0] == 15 || c[1] == 15) is_upper = 0; + c[0] &= c[1]; + if (c[0] == 0) is_upper = 0; + } else if (is_randhet) { + if (b[0] == 1 && b[1] == 1) { // two homs + c[0] |= c[1]; + } else if (((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1)) && (c[0]&c[1])) { // one hom, one het + c[0] = (lrand48()&1)? (c[0] & c[1]) : (c[0] | c[1]); + } else if (b[0] == 2 && b[1] == 2 && c[0] == c[1]) { // double hets + if (lrand48()&1) { + if (lrand48()&1) { + for (i = 8; i >= 1; i >>= 1) // pick the "larger" allele + if (c[0]&i) c[0] &= i; + } else { + for (i = 1; i <= 8; i <<= 1) // pick the "smaller" allele + if (c[0]&i) c[0] &= i; + } + } // else set as het + } else is_upper = 0; + } else c[0] |= c[1]; + c[0] = seq_nt16_rev_table[c[0]]; + if (!is_upper) c[0] = tolower(c[0]); + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + fprintf(stderr, "[%s] (same,diff,hom-het,het-hom,het-het)=(%ld,%ld,%ld,%ld,%ld)\n", __func__, (long)cnt[0], (long)cnt[1], (long)cnt[2], (long)cnt[3], (long)cnt[4]); + return 0; +} + +int stk_famask(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk famask \n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2]; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (c[1] == 'x') c[0] = tolower(c[0]); + else if (c[1] != 'X') c[0] = c[1]; + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_mutfa(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + kstream_t *ks; + int l, i, dret; + kstring_t *str; + khint_t k; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mutfa \n\n"); + fprintf(stderr, "Note: contains at least four columns per line which are:\n"); + fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); + return 1; + } + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + char *s = strdup(str->s); + int beg = 0, ret; + reglist_t *p; + k = kh_get(reg, h, s); + if (k == kh_end(h)) { + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col + ks_getuntil(ks, 0, str, &dret); // 3rd col + ks_getuntil(ks, 0, str, &dret); // 4th col + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (isalpha(str->s[0]) && str->l == 1) { + if (p->n == p->m) { + p->m = p->m? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + // mutfa + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) { + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32; + if (beg < seq->seq.l) + seq->seq.s[beg] = (int)p->a[i]; + } + } + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); + return 0; +} + +int stk_listhet(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk listhet \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = 0; i < l; ++i) { + int b = seq->seq.s[i]; + if (bitcnt_table[seq_nt16_table[b]] == 2) + printf("%s\t%d\t%c\n", seq->name.s, i+1, b); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* cutN */ +static int cutN_min_N_tract = 1000; +static int cutN_nonN_penalty = 10; + +static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) +{ + int i, b, e; + while (k < ks->seq.l) { + if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { + int score, max; + score = 0; e = max = -1; + for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, e = i; + } + score = 0; b = max = -1; + for (i = e; i >= 0 && score >= 0; --i) { /* backward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, b = i; + } + if (e + 1 - b >= cutN_min_N_tract) { + *begin = b; + *end = e + 1; + return *end; + } + k = e + 1; + } else ++k; + } + return -1; +} +static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) +{ + int i; + if (begin >= end) return; // FIXME: why may this happen? Understand it! + fprintf(fpout, "%c%s:%d-%d", ks->qual.l? 
'@' : '>', ks->name.s, begin+1, end); + for (i = begin; i < end && i < ks->seq.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->seq.s[i], fpout); + } + fputc('\n', fpout); + if (ks->qual.l == 0) return; + fputs("+\n", fpout); + for (i = begin; i < end && i < ks->qual.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->qual.s[i], fpout); + } + fputc('\n', fpout); +} +int stk_cutN(int argc, char *argv[]) +{ + int c, l, gap_only = 0; + gzFile fp; + kseq_t *ks; + while ((c = getopt(argc, argv, "n:p:g")) >= 0) { + switch (c) { + case 'n': cutN_min_N_tract = atoi(optarg); break; + case 'p': cutN_nonN_penalty = atoi(optarg); break; + case 'g': gap_only = 1; break; + default: return 1; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); + fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); + fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); + fprintf(stderr, " -g print gaps only, no sequence\n\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + ks = kseq_init(fp); + while ((l = kseq_read(ks)) >= 0) { + int k = 0, begin = 0, end = 0; + while (find_next_cut(ks, k, &begin, &end) >= 0) { + if (begin != 0) { + if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); + else print_seq(stdout, ks, k, begin); + } + k = end; + } + if (!gap_only) print_seq(stdout, ks, k, l); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +int stk_hrun(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *ks; + int min_len = 7, l = 0, c = 0, beg = 0, i; + if (argc == optind) { + fprintf(stderr, "Usage: seqtk hrun [minLen=%d]\n", min_len); + return 1; + } + if (argc == optind + 2) min_len = atoi(argv[optind+1]); + fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + ks = kseq_init(fp); + while (kseq_read(ks) >= 0) { + c = ks->seq.s[0]; l = 1; beg = 0; + for (i = 1; i < ks->seq.l; ++i) { + if (ks->seq.s[i] != c) { + if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); + c = ks->seq.s[i]; l = 1; beg = i; + } else ++l; + } + } + if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +/* sample */ + +static void cpy_kstr(kstring_t *dst, const kstring_t *src) +{ + if (src->l == 0) return; + if (src->l + 1 > dst->m) { + dst->m = src->l + 1; + kroundup32(dst->m); + dst->s = realloc(dst->s, dst->m); + } + dst->l = src->l; + memcpy(dst->s, src->s, src->l + 1); +} + +static void cpy_kseq(kseq_t *dst, const kseq_t *src) +{ + cpy_kstr(&dst->name, &src->name); + cpy_kstr(&dst->seq, &src->seq); + cpy_kstr(&dst->qual, &src->qual); + cpy_kstr(&dst->comment, &src->comment); +} + +int stk_sample(int argc, char *argv[]) +{ + int c, twopass = 0; + uint64_t i, num = 0, n_seqs = 0; + double frac = 0.; + gzFile fp; + kseq_t *seq; + krand_t *kr = 0; + + while ((c = getopt(argc, argv, "2s:")) >= 0) + if (c == 's') kr = kr_srand(atol(optarg)); + else if (c == '2') twopass = 1; + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk sample [-2] [-s seed=11] |\n\n"); + fprintf(stderr, "Options: -s INT RNG seed [11]\n"); + fprintf(stderr, " -2 2-pass mode: twice as slow but with much reduced memory\n\n"); + return 1; + } + frac = atof(argv[optind+1]); + if (frac > 1.) 
num = (uint64_t)(frac + .499), frac = 0.; + else if (twopass) { + fprintf(stderr, "[W::%s] when sampling a fraction, option -2 is ignored.", __func__); + twopass = 0; + } + if (kr == 0) kr = kr_srand(11); + + if (!twopass) { // the streaming version + kseq_t *buf = 0; + if (num > 0) buf = calloc(num, sizeof(kseq_t)); + if (num > 0 && buf == NULL) { + fprintf(stderr, "[E::%s] Could not allocate enough memory for %" PRIu64 " sequences. Exiting...\n", __func__, num); + free(kr); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + n_seqs = 0; + while (kseq_read(seq) >= 0) { + double r = kr_drand(kr); + ++n_seqs; + if (num) { + uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs); + if (y < num) cpy_kseq(&buf[y], seq); + } else if (r < frac) stk_printseq(seq, UINT_MAX); + } + for (i = 0; i < num; ++i) { + kseq_t *p = &buf[i]; + if (p->seq.l) stk_printseq(p, UINT_MAX); + free(p->seq.s); free(p->qual.s); free(p->name.s); + } + if (buf != NULL) free(buf); + } else { + uint64_t *buf; + khash_t(64) *hash; + int absent; + + if (strcmp(argv[optind], "-") == 0) { + fprintf(stderr, "[E::%s] in the 2-pass mode, the input cannot be STDIN.\n", __func__); + free(kr); + return 1; + } + + // 1st pass + buf = malloc(num * 8); + for (i = 0; i < num; ++i) buf[i] = UINT64_MAX; + fp = gzopen(argv[optind], "r"); + seq = kseq_init(fp); + n_seqs = 0; + while (kseq_read(seq) >= 0) { + double r = kr_drand(kr); + uint64_t y; + ++n_seqs; + y = n_seqs - 1 < num? 
n_seqs - 1 : (uint64_t)(r * n_seqs); + if (y < num) buf[y] = n_seqs; + } + kseq_destroy(seq); + gzclose(fp); + hash = kh_init(64); + for (i = 0; i < num; ++i) kh_put(64, hash, buf[i], &absent); + free(buf); + // 2nd pass + fp = gzopen(argv[optind], "r"); + seq = kseq_init(fp); + n_seqs = 0; + while (kseq_read(seq) >= 0) + if (kh_get(64, hash, ++n_seqs) != kh_end(hash)) + stk_printseq(seq, UINT_MAX); + kh_destroy(64, hash); + } + + kseq_destroy(seq); + gzclose(fp); + free(kr); + return 0; +} + +/* seq */ + +void stk_mask(kseq_t *seq, const khash_t(reg) *h, int is_complement, int mask_chr) +{ + unsigned i, j; + khiter_t k; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) { // not found in the hash table + if (is_complement) { + if (mask_chr) { + for (j = 0; j < seq->seq.l; ++j) + seq->seq.s[j] = mask_chr; + } else { + for (j = 0; j < seq->seq.l; ++j) + seq->seq.s[j] = tolower(seq->seq.s[j]); + } + } + } else { + reglist_t *p = &kh_val(h, k); + if (!is_complement) { + for (i = 0; i < p->n; ++i) { + unsigned beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) continue; + if (end > seq->seq.l) end = seq->seq.l; + if (!mask_chr) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); + else for (j = beg; j < end; ++j) seq->seq.s[j] = mask_chr; + } + } else { + int8_t *mask = calloc(seq->seq.l, 1); + for (i = 0; i < p->n; ++i) { + unsigned beg = p->a[i]>>32, end = p->a[i]; + if (end >= seq->seq.l) end = seq->seq.l; + for (j = beg; j < end; ++j) mask[j] = 1; + } + if (mask_chr) { + for (j = 0; j < seq->seq.l; ++j) + if (mask[j] == 0) seq->seq.s[j] = mask_chr; + } else { + for (j = 0; j < seq->seq.l; ++j) + if (mask[j] == 0) seq->seq.s[j] = tolower(seq->seq.s[j]); + } + free(mask); + } + } +} + +int stk_seq(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255; + unsigned i, line_len = 0; + int64_t n_seqs = 0; + double frac = 1.; + khash_t(reg) *h = 0; + 
krand_t *kr = 0; + + while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:")) >= 0) { + switch (c) { + case 'a': + case 'A': flag |= 1; break; + case 'C': flag |= 2; break; + case 'r': flag |= 4; break; + case 'c': flag |= 8; break; + case '1': flag |= 16; break; + case '2': flag |= 32; break; + case 'V': flag |= 64; break; + case 'N': flag |= 128; break; + case 'U': flag |= 256; break; + case 'M': h = stk_reg_read(optarg); break; + case 'n': mask_chr = *optarg; break; + case 'Q': qual_shift = atoi(optarg); break; + case 'q': qual_thres = atoi(optarg); break; + case 'X': max_q = atoi(optarg); break; + case 'l': line_len = atoi(optarg); break; + case 'L': min_len = atoi(optarg); break; + case 's': kr = kr_srand(atol(optarg)); break; + case 'f': frac = atof(optarg); break; + } + } + if (kr == 0) kr = kr_srand(11); + if (argc == optind && isatty(fileno(stdin))) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk seq [options] |\n\n"); + fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n"); + fprintf(stderr, " -X INT mask bases with quality higher than INT [255]\n"); + fprintf(stderr, " -n CHAR masked bases converted to CHAR; 0 for lowercase [0]\n"); + fprintf(stderr, " -l INT number of residues per line; 0 for 2^32-1 [%d]\n", line_len); + fprintf(stderr, " -Q INT quality shift: ASCII-INT gives base quality [%d]\n", qual_shift); + fprintf(stderr, " -s INT random seed (effective with -f) [11]\n"); + fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n"); + fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n"); + fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n"); + fprintf(stderr, " -c mask complement region (effective with -M)\n"); + fprintf(stderr, " -r reverse complement\n"); + fprintf(stderr, " -A force FASTA output (discard quality)\n"); + fprintf(stderr, " -C drop comments at the header lines\n"); + fprintf(stderr, " -N drop sequences containing ambiguous 
bases\n"); + fprintf(stderr, " -1 output the 2n-1 reads only\n"); + fprintf(stderr, " -2 output the 2n reads only\n"); + fprintf(stderr, " -V shift quality by '(-Q) - 33'\n"); + fprintf(stderr, " -U convert all bases to uppercases\n"); + fprintf(stderr, "\n"); + free(kr); + return 1; + } + if (line_len == 0) line_len = UINT_MAX; + fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + qual_thres += qual_shift; + while (kseq_read(seq) >= 0) { + ++n_seqs; + if (seq->seq.l < min_len) continue; // NB: length filter before taking random + if (frac < 1. && kr_drand(kr) >= frac) continue; + if (flag & 48) { // then choose odd/even reads only + if ((flag&16) && (n_seqs&1) == 0) continue; + if ((flag&32) && (n_seqs&1) == 1) continue; + } + if (seq->qual.l && qual_thres > qual_shift) { + if (mask_chr) { + for (i = 0; i < seq->seq.l; ++i) + if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) + seq->seq.s[i] = mask_chr; + } else { + for (i = 0; i < seq->seq.l; ++i) + if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) + seq->seq.s[i] = tolower(seq->seq.s[i]); + } + } + if (flag & 256) + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = toupper(seq->seq.s[i]); + if (flag & 1) seq->qual.l = 0; + if (flag & 2) seq->comment.l = 0; + if (h) stk_mask(seq, h, flag&8, mask_chr); // masking + if (flag & 4) { // reverse complement + int c0, c1; + for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence + c0 = comp_tab[(int)seq->seq.s[i]]; + c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]]; + seq->seq.s[i] = c1; + seq->seq.s[seq->seq.l - 1 - i] = c0; + } + if (seq->seq.l & 1) // complement the remaining base + seq->seq.s[seq->seq.l>>1] = comp_tab[(int)seq->seq.s[seq->seq.l>>1]]; + if (seq->qual.l) { + for (i = 0; i < seq->seq.l>>1; ++i) // reverse quality + c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0; + } + } + if 
((flag & 64) && seq->qual.l && qual_shift != 33) + for (i = 0; i < seq->qual.l; ++i) + seq->qual.s[i] -= qual_shift - 33; + if (flag & 128) { + for (i = 0; i < seq->seq.l; ++i) + if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break; + if (i < seq->seq.l) continue; + } + stk_printseq(seq, line_len); + } + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + free(kr); + return 0; +} + +int stk_gc(int argc, char *argv[]) +{ + int c, is_at = 0, min_l = 20; + double frac = 0.6f, xdropoff = 10.0f, q; + gzFile fp; + kseq_t *seq; + + while ((c = getopt(argc, argv, "wx:f:l:")) >= 0) { + if (c == 'x') xdropoff = atof(optarg); + else if (c == 'w') is_at = 1; + else if (c == 'f') frac = atof(optarg); + else if (c == 'l') min_l = atoi(optarg); + } + if (optind + 1 > argc) { + fprintf(stderr, "Usage: seqtk gc [options] \n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -w identify high-AT regions\n"); + fprintf(stderr, " -f FLOAT min GC fraction (or AT fraction for -w) [%.2f]\n", frac); + fprintf(stderr, " -l INT min region length to output [%d]\n", min_l); + fprintf(stderr, " -x FLOAT X-dropoff [%.1f]\n", xdropoff); + return 1; + } + q = (1.0f - frac) / frac; + + fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while (kseq_read(seq) >= 0) { + int i, start = 0, max_i = 0, n_hits = 0, start_hits = 0, max_hits = 0; + double sc = 0., max = 0.; + for (i = 0; i < seq->seq.l; ++i) { + int hit; + c = seq_nt16_table[(int)seq->seq.s[i]]; + if (is_at) hit = (c == 1 || c == 8 || c == 9); + else hit = (c == 2 || c == 4 || c == 6); + n_hits += hit; + if (hit) { + if (sc == 0) start = i, start_hits = n_hits; + sc += q; + if (sc > max) max = sc, max_i = i, max_hits = n_hits; + } else if (sc > 0) { + sc += -1.0f; + if (sc < 0 || max - sc > xdropoff) { + if (max_i + 1 - start >= min_l) + printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); + sc = max = 0; + i = max_i; + } + } + } + if (max > 0. && max_i + 1 - start >= min_l) + printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_mergepe(int argc, char *argv[]) +{ + gzFile fp1, fp2; + kseq_t *seq[2]; + + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mergepe \n"); + return 1; + } + fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + seq[0] = kseq_init(fp1); + seq[1] = kseq_init(fp2); + while (kseq_read(seq[0]) >= 0) { + if (kseq_read(seq[1]) < 0) { + fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__); + break; + } + stk_printseq(seq[0], 0); + stk_printseq(seq[1], 0); + } + if (kseq_read(seq[1]) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__); + kseq_destroy(seq[0]); gzclose(fp1); + kseq_destroy(seq[1]); gzclose(fp2); + return 0; +} + +int stk_dropse(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq, last; + + if (argc == 1 && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk dropse \n"); + return 1; + } + fp = argc > 1 && strcmp(argv[1], "-")? 
gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + + memset(&last, 0, sizeof(kseq_t)); + while (kseq_read(seq) >= 0) { + if (last.name.l) { + kstring_t *p = &last.name, *q = &seq->name; + int is_diff; + if (p->l == q->l) { + int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l; + is_diff = strncmp(p->s, q->s, l); + } else is_diff = 1; + if (!is_diff) { + stk_printseq(&last, 0); + stk_printseq(seq, 0); + last.name.l = 0; + } else cpy_kseq(&last, seq); + } else cpy_kseq(&last, seq); + } + + kseq_destroy(seq); + gzclose(fp); + // free last! + return 0; +} + +int stk_rename(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq, last; + char *prefix = 0; + uint64_t n = 1; + + if (argc == 1 && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk rename [prefix]\n"); + return 1; + } + fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + if (argc > 2) prefix = argv[2]; + + memset(&last, 0, sizeof(kseq_t)); + while (kseq_read(seq) >= 0) { + if (last.name.l) { + kstring_t *p = &last.name, *q = &seq->name; + int is_diff; + if (p->l == q->l) { + int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l; + is_diff = strncmp(p->s, q->s, l); + } else is_diff = 1; + if (!is_diff) { + stk_printseq_renamed(&last, 0, prefix, n); + stk_printseq_renamed(seq, 0, prefix, n); + last.name.l = 0; + ++n; + } else { + stk_printseq_renamed(&last, 0, prefix, n); + ++n; + cpy_kseq(&last, seq); + } + } else cpy_kseq(&last, seq); + } + if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n); + + kseq_destroy(seq); + gzclose(fp); + // free last! 
+ return 0; +} + +int stk_kfreq(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *ks; + int kmer, i, l, mask; + char *nei; + + if (argc < 2) { + fprintf(stderr, "Usage: seqtk kfreq \n"); + return 1; + } + + // get the k-mer + l = strlen(argv[1]); + for (i = kmer = 0; i < l; ++i) { + int c = seq_nt6_table[(int)argv[1][i]]; + assert(c >= 1 && c <= 4); + kmer = kmer << 2 | (c - 1); + } + mask = (1<<2*l) - 1; + + // get the neighbors + nei = calloc(1, 1<<2*l); + for (i = 0; i < l; ++i) { + int j, x; + x = kmer & ~(3 << 2*i); + for (j = 0; j < 4; ++j) + nei[x|j<<2*i] = 1; + } + + fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r"); + ks = kseq_init(fp); + while (kseq_read(ks) >= 0) { + int k, x[2], cnt[2], cnt_nei[2], which; + x[0] = x[1] = k = cnt[0] = cnt[1] = cnt_nei[0] = cnt_nei[1] = 0; + for (i = 0; i < ks->seq.l; ++i) { + int c = seq_nt6_table[(int)ks->seq.s[i]]; + if (c >= 1 && c <= 4) { + x[0] = (x[0] << 2 | (c - 1)) & mask; + x[1] = (x[1] >> 2 | (4 - c) << 2*(l-1)); + if (k < l) ++k; + if (k == l) { + if (x[0] == kmer) ++cnt[0]; + else if (x[1] == kmer) ++cnt[1]; + if (nei[x[0]]) ++cnt_nei[0]; + else if (nei[x[1]]) ++cnt_nei[1]; + } + } else k = 0; + } + which = cnt_nei[0] > cnt_nei[1]? 0 : 1; + printf("%s\t%ld\t%c\t%d\t%d\n", ks->name.s, ks->seq.l, "+-"[which], cnt_nei[which], cnt[which]); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +/* fqchk */ + +typedef struct { + int64_t q[94], b[5]; +} posstat_t; + +static void fqc_aux(posstat_t *p, int pos, int64_t allq[94], double perr[94], int qthres) +{ + int k; + int64_t sum = 0, qsum = 0, sum_low = 0; + double psum = 0; + if (pos <= 0) printf("ALL"); + else printf("%d", pos); + for (k = 0; k <= 4; ++k) sum += p->b[k]; + printf("\t%lld", (long long)sum); + for (k = 0; k <= 4; ++k) + printf("\t%.1f", 100. 
* p->b[k] / sum); + for (k = 0; k <= 93; ++k) { + qsum += p->q[k] * k, psum += p->q[k] * perr[k]; + if (k < qthres) sum_low += p->q[k]; + } + printf("\t%.1f\t%.1f", (double)qsum/sum, -4.343*log((psum+1e-6)/(sum+1e-6))); + if (qthres <= 0) { + for (k = 0; k <= 93; ++k) + if (allq[k] > 0) printf("\t%.2f", 100. * p->q[k] / sum); + } else printf("\t%.1f\t%.1f", 100. * sum_low / sum, 100. * (sum - sum_low) / sum); + putchar('\n'); +} + +int stk_fqchk(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, c, k, max_len = 0, min_len = 0x7fffffff, max_alloc = 0, offset = 33, n_diffQ = 0, qthres = 20; + int64_t tot_len = 0, n = 0; + double perr[94]; + posstat_t all, *pos = 0; + + while ((c = getopt(argc, argv, "q:")) >= 0) + if (c == 'q') qthres = atoi(optarg); + + if (optind == argc) { + fprintf(stderr, "Usage: seqtk fqchk [-q %d] \n", qthres); + fprintf(stderr, "Note: use -q0 to get the distribution of all quality values\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + for (k = 0; k <= 93; ++k) + perr[k] = pow(10., -.1 * k); + perr[0] = perr[1] = perr[2] = perr[3] = .5; + while (kseq_read(seq) >= 0) { + if (seq->qual.l == 0) continue; + ++n; + tot_len += seq->seq.l; + min_len = min_len < seq->seq.l? min_len : seq->seq.l; + max_len = max_len > seq->seq.l? max_len : seq->seq.l; + if (max_len > max_alloc) { + int old_max = max_alloc; + max_alloc = max_len; + kroundup32(max_alloc); + pos = realloc(pos, max_alloc * sizeof(posstat_t)); + memset(&pos[old_max], 0, (max_alloc - old_max) * sizeof(posstat_t)); + } + for (i = 0; i < seq->qual.l; ++i) { + int q = seq->qual.s[i] - offset; + int b = seq_nt6_table[(int)seq->seq.s[i]]; + b = b? b - 1 : 4; + q = q < 93? 
q : 93; + ++pos[i].q[q]; + ++pos[i].b[b]; + } + } + kseq_destroy(seq); + gzclose(fp); + + memset(&all, 0, sizeof(posstat_t)); + for (i = 0; i < max_len; ++i) { + for (k = 0; k <= 93; ++k) + all.q[k] += pos[i].q[k]; + for (k = 0; k <= 4; ++k) + all.b[k] += pos[i].b[k]; + } + for (k = n_diffQ = 0; k <= 93; ++k) + if (all.q[k]) ++n_diffQ; + printf("min_len: %d; max_len: %d; avg_len: %.2f; %d distinct quality values\n", min_len, max_len, (double)tot_len/n, n_diffQ); + printf("POS\t#bases\t%%A\t%%C\t%%G\t%%T\t%%N\tavgQ\terrQ"); + if (qthres <= 0) { + for (k = 0; k <= 93; ++k) + if (all.q[k] > 0) printf("\t%%Q%d", k); + } else printf("\t%%low\t%%high"); + putchar('\n'); + fqc_aux(&all, 0, all.q, perr, qthres); + for (i = 0; i < max_len; ++i) + fqc_aux(&pos[i], i + 1, all.q, perr, qthres); + free(pos); + return 0; +} + +/* main function */ +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk \n"); + fprintf(stderr, "Version: 1.0-r82b-dirty\n\n"); + fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); + fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); + fprintf(stderr, " sample subsample sequences\n"); + fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); + fprintf(stderr, " fqchk fastq QC (base/quality summary)\n"); + fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n"); + fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n"); + fprintf(stderr, " hety regional heterozygosity\n"); + fprintf(stderr, " gc identify high- or low-GC regions\n"); + fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); + fprintf(stderr, " mergefa merge two FASTA/Q files\n"); + fprintf(stderr, " dropse drop unpaired from interleaved PE FASTA/Q\n"); + fprintf(stderr, " rename rename sequence names\n"); + fprintf(stderr, " randbase choose a random base from hets\n"); + fprintf(stderr, " cutN cut sequence at long N\n"); + fprintf(stderr, " listhet extract the position of each het\n"); + 
fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) return usage(); + if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); + else if (strcmp(argv[1], "fqchk") == 0) stk_fqchk(argc-1, argv+1); + else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); + else if (strcmp(argv[1], "gc") == 0) stk_gc(argc-1, argv+1); + else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); + else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); + else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); + else if (strcmp(argv[1], "mergepe") == 0) stk_mergepe(argc-1, argv+1); + else if (strcmp(argv[1], "dropse") == 0) stk_dropse(argc-1, argv+1); + else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); + else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); + else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); + else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); + else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1); + else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1); + else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1); + else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1); + else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1); + else if (strcmp(argv[1], "rename") == 0) stk_rename(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized command '%s'. 
Abort!\n", argv[1]); + return 1; + } + return 0; +} From 60513badf286c80c820a2ae17b101002f21b5a11 Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 22 Oct 2015 12:50:37 +0300 Subject: [PATCH 20/32] updates --- LICENSE | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0380c7e --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +The MIT License +Copyright (c) 2008-2012 by Heng Li +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From dd555ecb012ee2073e84aa74bee7a999b472188f Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Thu, 22 Oct 2015 12:55:16 +0300 Subject: [PATCH 21/32] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b4f2a67..2378a3a 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,4 @@ Seqtk Examples * Trim 5bp from right end and keep the 50bp from right end of each read and if trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: seqtk trimfq -E 50 -e 5 -l 20 in.fq > out.fq + From 0b4b78d1ead6449bca9dc092cfccbe5d0b22fea3 Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 5 Mar 2017 07:53:05 +0200 Subject: [PATCH 22/32] updates --- Makefile | 5 +- README.md | 27 +- kseq.h | 218 ++-- ksort.h | 298 ----- kstring.h | 169 --- ksw.c | 454 ------- ksw.h | 69 -- kvec.h | 90 -- seqtk.c | 3359 +++++++++++++++++++++++++++------------------------- trimadap.c | 184 --- 10 files changed, 1836 insertions(+), 3037 deletions(-) delete mode 100644 ksort.h delete mode 100644 kstring.h delete mode 100644 ksw.c delete mode 100644 ksw.h delete mode 100644 kvec.h delete mode 100644 trimadap.c diff --git a/Makefile b/Makefile index 0b9f9c2..12bd575 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,10 @@ CC=gcc CFLAGS=-g -Wall -O2 -Wno-unused-function -all:seqtk trimadap +all:seqtk seqtk:seqtk.c khash.h kseq.h $(CC) $(CFLAGS) seqtk.c -o $@ -lz -lm -trimadap:trimadap.c kseq.h ksw.h - $(CC) $(CFLAGS) ksw.c trimadap.c -o $@ -lz -lm - clean: rm -fr gmon.out *.o ext/*.o a.out seqtk trimadap *~ *.a *.dSYM session* diff --git a/README.md b/README.md index 2378a3a..6dec1cc 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,12 @@ Introduction Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. It seamlessly parses both FASTA and FASTQ files which can also be -optionally compressed by gzip. +optionally compressed by gzip. 
To install `seqtk`, +```sh +git clone https://github.com/lh3/seqtk.git; +cd seqtk; make +``` +The only library dependency is zlib. Seqtk Examples -------------- @@ -54,23 +59,3 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa -* Keep first 50bp from the left end of each read by trimming the right end: - - seqtk trimfq -B 50 in.fq > out.fq - -* Keep last 50bp from the right end of each read by trimming the left end: - - seqtk trimfq -E 50 in.fq > out.fq - -* Trim 5bp from left end and keep next 50bp from left end of each read: - - seqtk trimfq -B 50 -b 5 in.fq > out.fq - -* Trim 5bp from right end and keep the 50bp from right end of each read: - - seqtk trimfq -E 50 -e 5 in.fq > out.fq - -* Trim 5bp from right end and keep the 50bp from right end of each read and if trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: - - seqtk trimfq -E 50 -e 5 -l 20 in.fq > out.fq - diff --git a/kseq.h b/kseq.h index b2238d1..8f9e498 100644 --- a/kseq.h +++ b/kseq.h @@ -23,7 +23,7 @@ SOFTWARE. 
*/ -/* Last Modified: 05MAR2012 */ +/* Last Modified: 2017-02-11 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -37,42 +37,45 @@ #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) #define KS_SEP_MAX 2 -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - unsigned char *buf; \ - int begin, end, is_eof; \ - type_t f; \ +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ } kstream_t; +#define ks_err(ks) ((ks)->end < 0) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = (unsigned char*)malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ } -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end == 0) { ks->is_eof = 1; return -1;} \ - } \ - return (int)ks->buf[ks->begin++]; \ +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks_err(ks)) return -3; \ + if (ks_eof(ks)) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1; } \ 
+ else if (ks->end < 0) { ks->is_eof = 1; return -3; } \ + } \ + return (int)ks->buf[ks->begin++]; \ } #ifndef KSTRING_T @@ -87,140 +90,145 @@ typedef struct __kstring_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#define __KS_GETUNTIL(__read, __bufsize) \ +#define __KS_GETUNTIL(__read, __bufsize) \ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ - { \ - int gotany = 0; \ - if (dret) *dret = 0; \ - str->l = append? str->l : 0; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end == 0) { ks->is_eof = 1; break; } \ - } else break; \ - } \ + { \ + int gotany = 0; \ + if (dret) *dret = 0; \ + str->l = append? str->l : 0; \ + for (;;) { \ + int i; \ + if (ks_err(ks)) return -3; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ + } else break; \ + } \ if (delimiter == KS_SEP_LINE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == '\n') break; \ - } else if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; 
/* never come to here! */ \ - if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - gotany = 1; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + gotany = 1; \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (!gotany && ks_eof(ks)) return -1; \ - if (str->s == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (!gotany && ks_eof(ks)) return -1; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ - str->s[str->l] = '\0'; \ - return str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ } \ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) -#define __KSEQ_BASIC(SCOPE, type_t) \ - SCOPE kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - SCOPE void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ +#define __KSEQ_BASIC(SCOPE, 
type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ + ks_destroy(ks->f); \ + free(ks); \ } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string + -3 error reading stream */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ - int c; \ + int c,r; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error*/ \ seq->last_char = c; \ } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (c == '>' || c == '@') seq->last_char = c; 
/* the first header char has been read */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ + seq->is_fastq = (c == '+'); \ + if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); /* read quality until it is as long as the sequence */ \ + if (c == -3) return -3; /* stream error */ \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ } -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char, is_fastq; \ + kstream_t *f; \ } kseq_t; -#define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT(type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(SCOPE, type_t) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ __KSEQ_READ(SCOPE) #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) diff --git a/ksort.h b/ksort.h deleted
file mode 100644 index 4da7a13..0000000 --- a/ksort.h +++ /dev/null @@ -1,298 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2011-04-10 (0.1.6): - - * Added sample - - 2011-03 (0.1.5): - - * Added shuffle/permutation - - 2008-11-16 (0.1.4): - - * Fixed a bug in introsort() that happens in rare cases. - - 2008-11-05 (0.1.3): - - * Fixed a bug in introsort() for complex comparisons. - - * Fixed a bug in mergesort(). The previous version is not stable. - - 2008-09-15 (0.1.2): - - * Accelerated introsort. On my Mac (not on another Linux machine), - my implementation is as fast as std::sort on random input. - - * Added combsort and in introsort, switch to combsort if the - recursion is too deep. 
- - 2008-09-13 (0.1.1): - - * Added k-small algorithm - - 2008-09-05 (0.1.0): - - * Initial version - -*/ - -#ifndef AC_KSORT_H -#define AC_KSORT_H - -#include -#include - -typedef struct { - void *left, *right; - int depth; -} ks_isort_stack_t; - -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } - -#define KSORT_INIT(name, type_t, __sort_lt) \ - void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ - { \ - type_t *a2[2], *a, *b; \ - int curr, shift; \ - \ - a2[0] = array; \ - a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ - for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ - ks_heapadjust_##name(i, lsize, l); \ - } \ - void ks_heapsort_##name(size_t lsize, type_t l[]) \ - { \ - size_t i; \ - for (i = lsize - 1; i > 0; --i) { \ - type_t tmp; \ - tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ - } \ - } \ - static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ - { \ - type_t *i, *j, swap_tmp; \ - for (i = s + 1; i < t; ++i) \ - for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ - swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ - } \ - } \ - void ks_combsort_##name(size_t n, type_t a[]) \ - { \ - const double shrink_factor = 1.2473309501039786540366528676643; \ - int do_swap; \ - size_t gap = n; \ - type_t tmp, *i, *j; \ - do { \ - if (gap > 2) { \ - gap = (size_t)(gap / shrink_factor); \ - if (gap == 9 || gap == 10) gap = 11; \ - } \ - do_swap = 0; \ - for (i = a; i < a + n - gap; ++i) { \ - j = i + gap; \ - if (__sort_lt(*j, *i)) { \ - tmp = *i; *i = *j; *j = tmp; \ - do_swap = 1; \ - } \ - } \ - } while (do_swap || gap > 2); \ - if (gap != 1) __ks_insertsort_##name(a, a + n); \ - } \ - void ks_introsort_##name(size_t n, type_t a[]) \ - { \ - int d; \ - ks_isort_stack_t *top, *stack; \ - type_t rp, swap_tmp; \ - type_t *s, *t, *i, *j, *k; \ - \ - if (n < 1) return; \ - else if (n == 2) { \ - if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = 
swap_tmp; } \ - return; \ - } \ - for (d = 2; 1ul<>1) + 1; \ - if (__sort_lt(*k, *i)) { \ - if (__sort_lt(*k, *j)) k = j; \ - } else k = __sort_lt(*j, *i)? i : j; \ - rp = *k; \ - if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ - for (;;) { \ - do ++i; while (__sort_lt(*i, rp)); \ - do --j; while (i <= j && __sort_lt(rp, *j)); \ - if (j <= i) break; \ - swap_tmp = *i; *i = *j; *j = swap_tmp; \ - } \ - swap_tmp = *i; *i = *t; *t = swap_tmp; \ - if (i-s > t-i) { \ - if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ - s = t-i > 16? i+1 : t; \ - } else { \ - if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ - t = i-s > 16? i-1 : s; \ - } \ - } else { \ - if (top == stack) { \ - free(stack); \ - __ks_insertsort_##name(a, a+n); \ - return; \ - } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ - } \ - } \ - } \ - /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ - /* 0 <= kk < n */ \ - type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ - { \ - type_t *low, *high, *k, *ll, *hh, *mid; \ - low = arr; high = arr + n - 1; k = arr + kk; \ - for (;;) { \ - if (high <= low) return *k; \ - if (high == low + 1) { \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - return *k; \ - } \ - mid = low + (high - low) / 2; \ - if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ - KSORT_SWAP(type_t, *mid, *(low+1)); \ - ll = low + 1; hh = high; \ - for (;;) { \ - do ++ll; while (__sort_lt(*ll, *low)); \ - do --hh; while (__sort_lt(*low, *hh)); \ - if (hh < ll) break; \ - KSORT_SWAP(type_t, *ll, *hh); \ - } \ - KSORT_SWAP(type_t, *low, *hh); \ - if (hh <= k) low = ll; \ - if (hh >= k) high = hh - 1; \ - } \ - } \ - void ks_shuffle_##name(size_t n, type_t a[]) \ - { \ - int i, j; \ - for (i = n; i > 1; --i) { \ - 
type_t tmp; \ - j = (int)(drand48() * i); \ - tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ - } \ - } \ - void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \ - { /* reference: http://code.activestate.com/recipes/272884/ */ \ - int i, k, pop = n; \ - for (i = (int)r, k = 0; i >= 0; --i) { \ - double z = 1., x = drand48(); \ - type_t tmp; \ - while (x < z) z -= z * i / (pop--); \ - if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \ - ++k; \ - } \ - } - -#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) -#define ks_introsort(name, n, a) ks_introsort_##name(n, a) -#define ks_combsort(name, n, a) ks_combsort_##name(n, a) -#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) -#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) -#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) -#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) -#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) - -#define ks_lt_generic(a, b) ((a) < (b)) -#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) - -typedef const char *ksstr_t; - -#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) -#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) - -#endif diff --git a/kstring.h b/kstring.h deleted file mode 100644 index 6639bd8..0000000 --- a/kstring.h +++ /dev/null @@ -1,169 +0,0 @@ -/* The MIT License - - Copyright (c) by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or 
substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef KSTRING_H -#define KSTRING_H - -#include -#include -#include - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - uint32_t l, m; - char *s; -} kstring_t; -#endif - -typedef struct { - uint64_t tab[4]; - int sep, finished; - const char *p; // end of the current token -} ks_tokaux_t; - -#ifdef __cplusplus -extern "C" { -#endif - - int ksprintf(kstring_t *s, const char *fmt, ...); - int ksprintf_fast(kstring_t *s, const char *fmt, ...); - int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); - char *kstrstr(const char *str, const char *pat, int **_prep); - char *kstrnstr(const char *str, const char *pat, int n, int **_prep); - void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); - - /* kstrtok() is similar to strtok_r() except that str is not - * modified and both str and sep can be NULL. For efficiency, it is - * actually recommended to set both to NULL in the subsequent calls - * if sep is not changed. 
*/ - char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); - -#ifdef __cplusplus -} -#endif - -static inline void ks_resize(kstring_t *s, size_t size) -{ - if (s->m < size) { - s->m = size; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } -} - -static inline int kputsn(const char *p, int l, kstring_t *s) -{ - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - memcpy(s->s + s->l, p, l); - s->l += l; - s->s[s->l] = 0; - return l; -} - -static inline int kputs(const char *p, kstring_t *s) -{ - return kputsn(p, strlen(p), s); -} - -static inline int kputc(int c, kstring_t *s) -{ - if (s->l + 1 >= s->m) { - s->m = s->l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - s->s[s->l++] = c; - s->s[s->l] = 0; - return c; -} - -static inline int kputw(int c, kstring_t *s) -{ - char buf[16]; - int l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputuw(unsigned c, kstring_t *s) -{ - char buf[16]; - int l, i; - unsigned x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputl(long c, kstring_t *s) -{ - char buf[32]; - long l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -static inline int *ksplit(kstring_t *s, int delimiter, int *n) -{ - int max = 0, *offsets = 0; - *n = ksplit_core(s->s, delimiter, &max, &offsets); - return offsets; -} - -#endif diff --git a/ksw.c b/ksw.c deleted file mode 100644 index 8aaa7fc..0000000 --- a/ksw.c +++ /dev/null @@ -1,454 +0,0 @@ -/* The MIT License - - Copyright (c) 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -#include -#include -#include -#include "ksw.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#define UNLIKELY(x) __builtin_expect((x),0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; - -struct _kswq_t { - int qlen, slen; - uint8_t shift, mdiff, max, size; - __m128i *qp, *H0, *H1, *E, *Hmax; -}; - -/** - * Initialize the query data structure - * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet - * @param mat Scoring matrix in a one-dimension array - * - * @return Query data structure - */ -kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) -{ - kswq_t *q; - int slen, a, tmp, p; - - size = size > 1? 2 : 1; - p = 8 * (3 - size); // # values per __m128i - slen = (qlen + p - 1) / p; // segmented length - q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory - q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory - q->H0 = q->qp + slen * m; - q->H1 = q->H0 + slen; - q->E = q->H1 + slen; - q->Hmax = q->E + slen; - q->slen = slen; q->qlen = qlen; q->size = size; - // compute shift - tmp = m * m; - for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score - if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; - if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; - } - q->max = q->mdiff; - q->shift = 256 - q->shift; // NB: q->shift is uint8_t - q->mdiff += q->shift; // this is the difference between the min and max scores - // An example: p=8, qlen=19, slen=3 and segmentation: - // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} - if (size == 1) { - int8_t *t = (int8_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < 
slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; - } - } else { - int16_t *t = (int16_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]); - } - } - return q; -} - -kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) -{ - int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; - uint64_t *b; - __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; - kswr_t r; - -#define __max_16(ret, xx) do { \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ - (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ - } while (0) - - // initialization - r = g_defr; - minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; - endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; - m_b = n_b = 0; b = 0; - zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi8(_gapo + _gape); - gape = _mm_set1_epi8(_gape); - shift = _mm_set1_epi8(q->shift); - H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; - slen = q->slen; - for (i = 0; i < slen; ++i) { - _mm_store_si128(E + i, zero); - _mm_store_si128(H0 + i, zero); - _mm_store_si128(Hmax + i, zero); - } - // the core loop - for (i = 0; i < tlen; ++i) { - int j, k, cmp, imax; - __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector - h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example - h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian - for (j = 0; LIKELY(j < slen); ++j) { - /* SW cells are computed in the following order: - * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} - * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} - * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} - */ - // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) - h = _mm_adds_epu8(h, _mm_load_si128(S + j)); - h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) - e = _mm_load_si128(E + j); // e=E'(i,j) - h = _mm_max_epu8(h, e); - h = _mm_max_epu8(h, f); // h=H'(i,j) - max = _mm_max_epu8(max, h); // set max - _mm_store_si128(H1 + j, h); // save to H'(i,j) - // now compute E'(i+1,j) - h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo - e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape - e = _mm_max_epu8(e, h); // e=E'(i+1,j) - _mm_store_si128(E + j, e); // save to E'(i+1,j) - // now compute F'(i,j+1) - f = _mm_subs_epu8(f, gape); - f = _mm_max_epu8(f, h); - // get H'(i-1,j) and prepare for the next j - h = _mm_load_si128(H0 + j); // h=H'(i-1,j) - } - // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion - for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max - f = _mm_slli_si128(f, 1); - for (j = 0; LIKELY(j < slen); ++j) { - h = 
_mm_load_si128(H1 + j); - h = _mm_max_epu8(h, f); // h=H'(i,j) - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu8(h, gapoe); - f = _mm_subs_epu8(f, gape); - cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); - if (UNLIKELY(cmp == 0xffff)) goto end_loop16; - } - } -end_loop16: - //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); - __max_16(imax, max); // imax is the maximum number in max - if (imax >= minsc) { // write the b array; this condition adds branching unfornately - if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append - if (n_b == m_b) { - m_b = m_b? m_b<<1 : 8; - b = (uint64_t*)realloc(b, 8 * m_b); - } - b[n_b++] = (uint64_t)imax<<32 | i; - } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last - } - if (imax > gmax) { - gmax = imax; te = i; // te is the end position on the target - for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector - _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax + q->shift >= 255 || gmax >= endsc) break; - } - S = H1; H1 = H0; H0 = S; // swap H0 and H1 - } - r.score = gmax + q->shift < 255? 
gmax : 255; - r.te = te; - if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score - int max = -1, low, high, qlen = slen * 16; - uint8_t *t = (uint8_t*)Hmax; - for (i = 0; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; - //printf("%d,%d\n", max, gmax); - if (b) { - i = (r.score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) - r.score2 = b[i]>>32, r.te2 = e; - } - } - } - free(b); - return r; -} - -kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) -{ - int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; - uint64_t *b; - __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; - kswr_t r; - -#define __max_8(ret, xx) do { \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ - (ret) = _mm_extract_epi16((xx), 0); \ - } while (0) - - // initialization - r = g_defr; - minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; - endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; - m_b = n_b = 0; b = 0; - zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi16(_gapo + _gape); - gape = _mm_set1_epi16(_gape); - H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; - slen = q->slen; - for (i = 0; i < slen; ++i) { - _mm_store_si128(E + i, zero); - _mm_store_si128(H0 + i, zero); - _mm_store_si128(Hmax + i, zero); - } - // the core loop - for (i = 0; i < tlen; ++i) { - int j, k, imax; - __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector - h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example - h = _mm_slli_si128(h, 2); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_adds_epi16(h, *S++); - e = _mm_load_si128(E + j); - h = _mm_max_epi16(h, e); - h = _mm_max_epi16(h, f); - max = _mm_max_epi16(max, h); - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, gapoe); - e = _mm_subs_epu16(e, gape); - e = _mm_max_epi16(e, h); - _mm_store_si128(E + j, e); - f = _mm_subs_epu16(f, gape); - f = _mm_max_epi16(f, h); - h = _mm_load_si128(H0 + j); - } - for (k = 0; LIKELY(k < 16); ++k) { - f = _mm_slli_si128(f, 2); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_load_si128(H1 + j); - h = _mm_max_epi16(h, f); - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, gapoe); - f = _mm_subs_epu16(f, gape); - if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; - } - } -end_loop8: - __max_8(imax, max); - if (imax >= minsc) { - if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { - if (n_b == m_b) { - m_b = m_b? 
m_b<<1 : 8; - b = (uint64_t*)realloc(b, 8 * m_b); - } - b[n_b++] = (uint64_t)imax<<32 | i; - } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last - } - if (imax > gmax) { - gmax = imax; te = i; - for (j = 0; LIKELY(j < slen); ++j) - _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax >= endsc) break; - } - S = H1; H1 = H0; H0 = S; - } - r.score = gmax; r.te = te; - { - int max = -1, low, high, qlen = slen * 8; - uint16_t *t = (uint16_t*)Hmax; - for (i = 0, r.qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; - if (b) { - i = (r.score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) - r.score2 = b[i]>>32, r.te2 = e; - } - } - } - free(b); - return r; -} - -static void revseq(int l, uint8_t *s) -{ - int i, t; - for (i = 0; i < l>>1; ++i) - t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; -} - -kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) -{ - int size; - kswq_t *q; - kswr_t r, rr; - kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); - - q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); - if (qry && *qry == 0) *qry = q; - func = q->size == 2? 
ksw_i16 : ksw_u8; - size = q->size; - r = func(q, tlen, target, gapo, gape, xtra); - if (qry == 0) free(q); - if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; - revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end - q = ksw_qinit(size, r.qe + 1, query, m, mat); - rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); - revseq(r.qe + 1, query); revseq(r.te + 1, target); - free(q); - if (r.score == rr.score) - r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; - return r; -} - -/******************************************* - * Main function (not compiled by default) * - *******************************************/ - -#ifdef _KSW_MAIN - -#include -#include -#include -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -unsigned char seq_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -int main(int argc, char *argv[]) -{ - int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; - int8_t mat[25]; - int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; - uint8_t *rseq = 0; - gzFile fpt, fpq; - kseq_t *kst, *ksq; - - // parse command line - while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { - 
switch (c) { - case 'a': sa = atoi(optarg); break; - case 'b': sb = atoi(optarg); break; - case 'q': gapo = atoi(optarg); break; - case 'r': gape = atoi(optarg); break; - case 't': minsc = atoi(optarg); break; - case 'f': forward_only = 1; break; - case '1': xtra |= KSW_XBYTE; break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); - return 1; - } - if (minsc > 0xffff) minsc = 0xffff; - xtra |= KSW_XSUBO | minsc; - // initialize scoring matrix - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? sa : -sb; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; - // open file - fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); - fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); - // all-pair alignment - while (kseq_read(ksq) > 0) { - kswq_t *q[2] = {0, 0}; - kswr_t r; - for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; - if (!forward_only) { // reverse - if ((int)ksq->seq.m > max_rseq) { - max_rseq = ksq->seq.m; - rseq = (uint8_t*)realloc(rseq, max_rseq); - } - for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) - rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; - } - gzrewind(fpt); kseq_rewind(kst); - while (kseq_read(kst) > 0) { - for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; - r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); - if (r.score >= minsc) - printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); - if (rseq) { - r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); - if (r.score >= minsc) - printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); - } - } - free(q[0]); free(q[1]); - } - free(rseq); - kseq_destroy(kst); gzclose(fpt); - kseq_destroy(ksq); gzclose(fpq); - return 0; -} -#endif diff --git a/ksw.h b/ksw.h deleted file mode 100644 index e1ecf8d..0000000 --- a/ksw.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __AC_KSW_H -#define __AC_KSW_H - -#include - -#define KSW_XBYTE 0x10000 -#define KSW_XSTOP 0x20000 -#define KSW_XSUBO 0x40000 -#define KSW_XSTART 0x80000 - -struct _kswq_t; -typedef struct _kswq_t kswq_t; - -typedef struct { - int score; // best score - int te, qe; // target end and query end - int score2, te2; // second best score and ending position on the target - int tb, qb; // target start and query start -} kswr_t; - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * Aligning two sequences - * - * @param qlen length of the query sequence (typically - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished 
to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "kvec.h" -int main() { - kvec_t(int) array; - kv_init(array); - kv_push(int, array, 10); // append - kv_a(int, array, 20) = 5; // dynamic - kv_A(array, 20) = 4; // static - kv_destroy(array); - return 0; -} -*/ - -/* - 2008-09-22 (0.1.0): - - * The initial version. - -*/ - -#ifndef AC_KVEC_H -#define AC_KVEC_H - -#include - -#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) - -#define kvec_t(type) struct { size_t n, m; type *a; } -#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) -#define kv_destroy(v) free((v).a) -#define kv_A(v, i) ((v).a[(i)]) -#define kv_pop(v) ((v).a[--(v).n]) -#define kv_size(v) ((v).n) -#define kv_max(v) ((v).m) - -#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) - -#define kv_copy(type, v1, v0) do { \ - if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ - (v1).n = (v0).n; \ - memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ - } while (0) \ - -#define kv_push(type, v, x) do { \ - if ((v).n == (v).m) { \ - (v).m = (v).m? (v).m<<1 : 2; \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ - } \ - (v).a[(v).n++] = (x); \ - } while (0) - -#define kv_pushp(type, v) (((v).n == (v).m)? \ - ((v).m = ((v).m? 
(v).m<<1 : 2), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), ((v).a + ((v).n++)) - -#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ - ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ - : 0), (v).a[(i)]) - -#endif diff --git a/seqtk.c b/seqtk.c index 0e522fe..8fdd32e 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1,1643 +1,1716 @@ -/* The MIT License - - Copyright (c) 20082-2012 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -typedef struct { - int n, m; - uint64_t *a; -} reglist_t; - -#include "khash.h" -KHASH_MAP_INIT_STR(reg, reglist_t) -KHASH_SET_INIT_INT64(64) - -typedef kh_reg_t reghash_t; - -reghash_t *stk_reg_read(const char *fn) -{ - reghash_t *h = kh_init(reg); - gzFile fp; - kstream_t *ks; - int dret; - kstring_t *str; - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int beg = -1, end = -1; - reglist_t *p; - khint_t k = kh_get(reg, h, str->s); - if (k == kh_end(h)) { - int ret; - char *s = strdup(str->s); - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - beg = atoi(str->s); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - end = atoi(str->s); - if (end < 0) end = -1; - } - } - } - } - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column - if (beg < 0) beg = 0, end = INT_MAX; - if (p->n == p->m) { - p->m = p->m? 
p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | end; - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return h; -} - -void stk_reg_destroy(reghash_t *h) -{ - khint_t k; - if (h == 0) return; - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); -} - -/* constant table */ - -unsigned char seq_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; - -unsigned char seq_nt6_table[256] = { - 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 -}; - -char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; -unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; -int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; -char comp_tab[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', - 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, - 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', - 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 -}; - -static void stk_printstr(const kstring_t *s, unsigned line_len) -{ - if (line_len != UINT_MAX && line_len != 0) { - int i, rest = s->l; - for (i = 0; i < s->l; i += line_len, rest -= line_len) { - putchar('\n'); - if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout); - else fwrite(s->s + i, 1, rest, stdout); - } - putchar('\n'); - } else { - putchar('\n'); - puts(s->s); - } -} - -static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n) -{ - putchar(s->qual.l? 
'@' : '>'); - if (n >= 0) { - if (prefix) fputs(prefix, stdout); - printf("%lld", (long long)n); - } else fputs(s->name.s, stdout); - if (s->comment.l) { - putchar(' '); fputs(s->comment.s, stdout); - } - stk_printstr(&s->seq, line_len); - if (s->qual.l) { - putchar('+'); - stk_printstr(&s->qual, line_len); - } -} - -inline void stk_printseq(const kseq_t *s, int line_len) -{ - stk_printseq_renamed(s, line_len, 0, -1); -} - -/* - 64-bit Mersenne Twister pseudorandom number generator. Adapted from: - - http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/mt19937-64.c - - which was written by Takuji Nishimura and Makoto Matsumoto and released - under the 3-clause BSD license. -*/ - -typedef uint64_t krint64_t; - -struct _krand_t; -typedef struct _krand_t krand_t; - -#define KR_NN 312 -#define KR_MM 156 -#define KR_UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */ -#define KR_LM 0x7FFFFFFFULL /* Least significant 31 bits */ - -struct _krand_t { - int mti; - krint64_t mt[KR_NN]; -}; - -static void kr_srand0(krint64_t seed, krand_t *kr) -{ - kr->mt[0] = seed; - for (kr->mti = 1; kr->mti < KR_NN; ++kr->mti) - kr->mt[kr->mti] = 6364136223846793005ULL * (kr->mt[kr->mti - 1] ^ (kr->mt[kr->mti - 1] >> 62)) + kr->mti; -} - -krand_t *kr_srand(krint64_t seed) -{ - krand_t *kr; - kr = malloc(sizeof(krand_t)); - kr_srand0(seed, kr); - return kr; -} - -krint64_t kr_rand(krand_t *kr) -{ - krint64_t x; - static const krint64_t mag01[2] = { 0, 0xB5026F5AA96619E9ULL }; - if (kr->mti >= KR_NN) { - int i; - if (kr->mti == KR_NN + 1) kr_srand0(5489ULL, kr); - for (i = 0; i < KR_NN - KR_MM; ++i) { - x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); - kr->mt[i] = kr->mt[i + KR_MM] ^ (x>>1) ^ mag01[(int)(x&1)]; - } - for (; i < KR_NN - 1; ++i) { - x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); - kr->mt[i] = kr->mt[i + (KR_MM - KR_NN)] ^ (x>>1) ^ mag01[(int)(x&1)]; - } - x = (kr->mt[KR_NN - 1] & KR_UM) | (kr->mt[0] & KR_LM); - kr->mt[KR_NN - 1] = kr->mt[KR_MM - 1] ^ (x>>1) ^ 
mag01[(int)(x&1)]; - kr->mti = 0; - } - x = kr->mt[kr->mti++]; - x ^= (x >> 29) & 0x5555555555555555ULL; - x ^= (x << 17) & 0x71D67FFFEDA60000ULL; - x ^= (x << 37) & 0xFFF7EEE000000000ULL; - x ^= (x >> 43); - return x; -} - -#define kr_drand(_kr) ((kr_rand(_kr) >> 11) * (1.0/9007199254740992.0)) - - -/* quality based trimming with Mott's algorithm */ -int stk_trimfq(int argc, char *argv[]) -{ // FIXME: when a record with zero length will always be treated as a fasta record - gzFile fp; - kseq_t *seq; - double param = 0.05, q_int2real[128]; - int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; - while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { - switch (c) { - case 'q': param = atof(optarg); break; - case 'l': min_len = atoi(optarg); break; - case 'b': left = atoi(optarg); break; - case 'e': right = atoi(optarg); break; - case 'B': left_keep = atoi(optarg); break; - case 'E': right_keep = atoi(optarg); break; - } - } - if (optind == argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); - fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trims down from right end to INT bp when the trimming results in read length below this [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q; it has priority over -B) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q; it has priority over -E) [0]\n"); - fprintf(stderr, " -B INT keep first INT bp from left (disabled by -q/-e) [%d]\n", left_keep); - fprintf(stderr, " -E INT keep last INT bp from right (disabled by -q/-b/-B) [%d]\n", right_keep); - fprintf(stderr, "\n"); - return 1; - } - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - for (i = 0; i < 128; ++i) - q_int2real[i] = pow(10., -(i - 33) / 10.); - while (kseq_read(seq) >= 0) { - int beg, tmp, end; - double s, max = 0.; - if (seq->seq.l == 0) { // trying to fix locally the bug where reads with no sequence are converted to FASTA format - beg = 0; - end = 1; - seq->seq.l = 1; - seq->qual.l = 1; - seq->seq.s = (char*)malloc(2); - seq->seq.s[0] = 'A'; - seq->qual.s = (char*)malloc(2); - seq->qual.s[0]='F'; - } else if (left_keep) { - beg = left; end = left + left_keep; - if (seq->seq.l < end) end = seq->seq.l; - if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { - beg = 0; - end = min_len; - if (end > seq->seq.l) end = seq->seq.l; - } - } else if (right_keep) { - beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; - if (beg < 0) beg = 0; - if (end < 0) end = 0; - if (end - beg < min_len) { - beg = 0; - end = min_len; - if (end > seq->seq.l) end = seq->seq.l; - } - } else if (left || right) { - beg = left; end = seq->seq.l - right; - if (end < 0) end = 0; - if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { - beg = 0; - end = min_len; - if (end > seq->seq.l) end = seq->seq.l; - } - } else if (seq->qual.l > min_len && param != 0.) { - for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { - int q = seq->qual.s[i]; - if (q < 36) q = 36; - if (q > 127) q = 127; - s += param - q_int2real[q]; - if (s > max) max = s, beg = tmp, end = i + 1; - if (s < 0) s = 0, tmp = i + 1; - } - - /* max never set; all low qual, just give first min_len bp */ - if (max == 0.) 
beg = 0, end = min_len; - - if (end - beg < min_len) { // window-based - int is, imax; - for (i = 0, is = 0; i < min_len; ++i) - is += seq->qual.s[i] - 33; - for (imax = is, beg = 0; i < seq->qual.l; ++i) { - is += (int)seq->qual.s[i] - seq->qual.s[i - min_len]; - if (imax < is) imax = is, beg = i - min_len + 1; - } - end = beg + min_len; - } - } else beg = 0, end = seq->seq.l; - putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout); - if (seq->comment.l) { - putchar(' '); puts(seq->comment.s); - } else putchar('\n'); - fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n'); - if (seq->qual.l) { - puts("+"); - fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n'); - } - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* composition */ -int stk_comp(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, upper_only = 0; - reghash_t *h = 0; - reglist_t dummy; - - while ((c = getopt(argc, argv, "ur:")) >= 0) { - switch (c) { - case 'u': upper_only = 1; break; - case 'r': h = stk_reg_read(optarg); break; - } - } - if (argc == optind && isatty(fileno(stdin))) { - fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); - fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); - return 1; - } - fp = optind < argc && strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); - while ((l = kseq_read(seq)) >= 0) { - int i, k; - reglist_t *p = 0; - if (h) { - khint_t k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) p = &kh_val(h, k); - } else { - p = &dummy; - dummy.a[0] = l; - } - for (k = 0; p && k < p->n; ++k) { - int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; - int la, lb, lc, na, nb, nc, cnt[11]; - if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; - else la = 'a', lb = -1, lc = 0; - na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - memset(cnt, 0, 11 * sizeof(int)); - for (i = beg; i < end; ++i) { - int is_CpG = 0, a, b, c; - a = na; b = nb; c = nc; - na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - if (b == 2 || b == 10) { // C or Y - if (nb == 4 || nb == 5) is_CpG = 1; - } else if (b == 4 || b == 5) { // G or R - if (lb == 2 || lb == 10) is_CpG = 1; - } - if (upper_only == 0 || isupper(a)) { - if (c > 1) ++cnt[c+2]; - if (c == 1) ++cnt[seq_nt16to4_table[b]]; - if (b == 10 || b == 5) ++cnt[9]; - else if (c == 2) { - ++cnt[8]; - } - if (is_CpG) { - ++cnt[7]; - if (b == 10 || b == 5) ++cnt[10]; - } - } - la = a; lb = b; lc = c; - } - if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); - else printf("%s\t%d", seq->name.s, l); - for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); - putchar('\n'); - } - fflush(stdout); - } - free(dummy.a); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_randbase(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk randbase \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - int i; - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - int c, b, a, j, k, m; - b = seq->seq.s[i]; - c = seq_nt16_table[b]; - a = bitcnt_table[c]; - if (a == 2) { - m = (drand48() < 0.5); - for (j = k = 0; j < 4; ++j) { - if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; - } - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_hety(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; - char *buf; - uint32_t cnt[3]; - if (argc == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk hety [options] \n\n"); - fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); - fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); - fprintf(stderr, " -m treat lowercases as masked\n"); - fprintf(stderr, "\n"); - return 1; - } - while ((c = getopt(argc, argv, "w:t:m")) >= 0) { - switch (c) { - case 'w': win_size = atoi(optarg); break; - case 't': n_start = atoi(optarg); break; - case 'm': is_lower_mask = 1; break; - } - } - fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - win_step = win_size / n_start; - buf = calloc(win_size, 1); - while ((l = kseq_read(seq)) >= 0) { - int x, i, y, z, next = 0; - cnt[0] = cnt[1] = cnt[2] = 0; - for (i = 0; i <= l; ++i) { - if ((i >= win_size && i % win_step == 0) || i == l) { - if (i == l && l >= win_size) { - for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; - } - if (cnt[1] + cnt[2] > 0) - printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, - (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); - next = i; - } - if (i < l) { - y = i % win_size; - c = seq->seq.s[i]; - if (is_lower_mask && islower(c)) c = 'N'; - c = seq_nt16_table[c]; - x = bitcnt_table[c]; - if (i >= win_size) --cnt[(int)buf[y]]; - buf[y] = z = x > 2? 0 : x == 2? 2 : 1; - ++cnt[z]; - } - } - } - free(buf); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* subseq */ - -int stk_subseq(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - int l, i, j, c, is_tab = 0, line = 0; - khint_t k; - while ((c = getopt(argc, argv, "tl:")) >= 0) { - switch (c) { - case 't': is_tab = 1; break; - case 'l': line = atoi(optarg); break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk subseq [options] |\n\n"); - fprintf(stderr, "Options: -t TAB delimited output\n"); - fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); - fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); - return 1; - } - h = stk_reg_read(argv[optind+1]); - // subseq - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) continue; - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) { - fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); - continue; - } - if (end > seq->seq.l) end = seq->seq.l; - if (is_tab == 0) { - printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); - if (beg > 0 || (int)p->a[i] != INT_MAX) { - if (end == INT_MAX) { - if (beg) printf(":%d", beg+1); - } else printf(":%d-%d", beg+1, end); - } - if (seq->comment.l) printf("\t%s", seq->comment.s); - } else printf("%s\t%d\t", seq->name.s, beg + 1); - if (end > seq->seq.l) end = seq->seq.l; - for (j = 0; j < end - beg; ++j) { - if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n'); - putchar(seq->seq.s[j + beg]); - } - putchar('\n'); - if (seq->qual.l != seq->seq.l || is_tab) continue; - printf("+"); - for (j = 0; j < end - beg; ++j) { - if (j == 0 || (line > 0 && j % line == 0)) putchar('\n'); - putchar(seq->qual.s[j + beg]); - } - putchar('\n'); - } - } - // free - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - return 0; -} - -/* mergefa */ -int stk_mergefa(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0, is_randhet = 0; - uint64_t cnt[5]; - while ((c = getopt(argc, argv, "himrq:")) >= 0) { - switch (c) { - case 'i': is_intersect = 1; break; - case 'h': is_haploid = 1; break; - case 'm': is_mask = 1; break; - case 'r': is_randhet = 1; break; - case 'q': qual = atoi(optarg); break; - } - } - if (is_mask && is_intersect) { - fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); - return 1; - } - if (optind + 2 > argc) { - fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); - 
fprintf(stderr, "Options: -q INT quality threshold [0]\n"); - fprintf(stderr, " -i take intersection\n"); - fprintf(stderr, " -m convert to lowercase when one of the input base is N\n"); - fprintf(stderr, " -r pick a random allele from het\n"); - fprintf(stderr, " -h suppress hets in the input\n\n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; - srand48(11); - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2], b[2], is_upper; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); - if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); - if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else is_upper = (isupper(c[0]) && isupper(c[1]))? 
1 : 0; - c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; - if (c[0] == 0) c[0] = 15; - if (c[1] == 0) c[1] = 15; - b[0] = bitcnt_table[c[0]]; - b[1] = bitcnt_table[c[1]]; - if (is_upper) { - if (b[0] == 1 && b[1] == 1) { - if (c[0] == c[1]) ++cnt[0]; - else ++cnt[1]; - } else if (b[0] == 1 && b[1] == 2) ++cnt[2]; - else if (b[0] == 2 && b[1] == 1) ++cnt[3]; - else if (b[0] == 2 && b[1] == 2) ++cnt[4]; - } - if (is_haploid && (b[0] > 1 || b[1] > 1)) is_upper = 0; - if (is_intersect) { - c[0] = c[0] & c[1]; - if (c[0] == 0) is_upper = 0; // FIXME: is this a bug - c[0] cannot be 0! - } else if (is_mask) { - if (c[0] == 15 || c[1] == 15) is_upper = 0; - c[0] &= c[1]; - if (c[0] == 0) is_upper = 0; - } else if (is_randhet) { - if (b[0] == 1 && b[1] == 1) { // two homs - c[0] |= c[1]; - } else if (((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1)) && (c[0]&c[1])) { // one hom, one het - c[0] = (lrand48()&1)? (c[0] & c[1]) : (c[0] | c[1]); - } else if (b[0] == 2 && b[1] == 2 && c[0] == c[1]) { // double hets - if (lrand48()&1) { - if (lrand48()&1) { - for (i = 8; i >= 1; i >>= 1) // pick the "larger" allele - if (c[0]&i) c[0] &= i; - } else { - for (i = 1; i <= 8; i <<= 1) // pick the "smaller" allele - if (c[0]&i) c[0] &= i; - } - } // else set as het - } else is_upper = 0; - } else c[0] |= c[1]; - c[0] = seq_nt16_rev_table[c[0]]; - if (!is_upper) c[0] = tolower(c[0]); - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - fprintf(stderr, "[%s] (same,diff,hom-het,het-hom,het-het)=(%ld,%ld,%ld,%ld,%ld)\n", __func__, (long)cnt[0], (long)cnt[1], (long)cnt[2], (long)cnt[3], (long)cnt[4]); - return 0; -} - -int stk_famask(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk famask \n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2]; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (c[1] == 'x') c[0] = tolower(c[0]); - else if (c[1] != 'X') c[0] = c[1]; - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - return 0; -} - -int stk_mutfa(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - kstream_t *ks; - int l, i, dret; - kstring_t *str; - khint_t k; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk mutfa \n\n"); - fprintf(stderr, "Note: contains at least four columns per line which are:\n"); - fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); - return 1; - } - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - char *s = strdup(str->s); - int beg = 0, ret; - reglist_t *p; - k = kh_get(reg, h, s); - if (k == kh_end(h)) { - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col - ks_getuntil(ks, 0, str, &dret); // 3rd col - ks_getuntil(ks, 0, str, &dret); // 4th col - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (isalpha(str->s[0]) && str->l == 1) { - if (p->n == p->m) { - p->m = p->m? 
p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; - } - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - // mutfa - fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) { - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32; - if (beg < seq->seq.l) - seq->seq.s[beg] = (int)p->a[i]; - } - } - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - // free - kseq_destroy(seq); - gzclose(fp); - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); - return 0; -} - -int stk_listhet(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int i, l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk listhet \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - for (i = 0; i < l; ++i) { - int b = seq->seq.s[i]; - if (bitcnt_table[seq_nt16_table[b]] == 2) - printf("%s\t%d\t%c\n", seq->name.s, i+1, b); - } - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* cutN */ -static int cutN_min_N_tract = 1000; -static int cutN_nonN_penalty = 10; - -static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) -{ - int i, b, e; - while (k < ks->seq.l) { - if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { - int score, max; - score = 0; e = max = -1; - for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, e = i; - } - score = 0; b = max = -1; - for (i = e; i >= 0 && score >= 0; --i) { /* backward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, b = i; - } - if (e + 1 - b >= cutN_min_N_tract) { - *begin = b; - *end = e + 1; - return *end; - } - k = e + 1; - } else ++k; - } - return -1; -} -static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) -{ - int i; - if (begin >= end) return; // FIXME: why may this happen? Understand it! - fprintf(fpout, "%c%s:%d-%d", ks->qual.l? 
'@' : '>', ks->name.s, begin+1, end); - for (i = begin; i < end && i < ks->seq.l; ++i) { - if ((i - begin)%60 == 0) fputc('\n', fpout); - fputc(ks->seq.s[i], fpout); - } - fputc('\n', fpout); - if (ks->qual.l == 0) return; - fputs("+\n", fpout); - for (i = begin; i < end && i < ks->qual.l; ++i) { - if ((i - begin)%60 == 0) fputc('\n', fpout); - fputc(ks->qual.s[i], fpout); - } - fputc('\n', fpout); -} -int stk_cutN(int argc, char *argv[]) -{ - int c, l, gap_only = 0; - gzFile fp; - kseq_t *ks; - while ((c = getopt(argc, argv, "n:p:g")) >= 0) { - switch (c) { - case 'n': cutN_min_N_tract = atoi(optarg); break; - case 'p': cutN_nonN_penalty = atoi(optarg); break; - case 'g': gap_only = 1; break; - default: return 1; - } - } - if (argc == optind) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); - fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); - fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); - fprintf(stderr, " -g print gaps only, no sequence\n\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - ks = kseq_init(fp); - while ((l = kseq_read(ks)) >= 0) { - int k = 0, begin = 0, end = 0; - while (find_next_cut(ks, k, &begin, &end) >= 0) { - if (begin != 0) { - if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); - else print_seq(stdout, ks, k, begin); - } - k = end; - } - if (!gap_only) print_seq(stdout, ks, k, l); - } - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -int stk_hrun(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *ks; - int min_len = 7, l = 0, c = 0, beg = 0, i; - if (argc == optind) { - fprintf(stderr, "Usage: seqtk hrun [minLen=%d]\n", min_len); - return 1; - } - if (argc == optind + 2) min_len = atoi(argv[optind+1]); - fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - ks = kseq_init(fp); - while (kseq_read(ks) >= 0) { - c = ks->seq.s[0]; l = 1; beg = 0; - for (i = 1; i < ks->seq.l; ++i) { - if (ks->seq.s[i] != c) { - if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); - c = ks->seq.s[i]; l = 1; beg = i; - } else ++l; - } - } - if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -/* sample */ - -static void cpy_kstr(kstring_t *dst, const kstring_t *src) -{ - if (src->l == 0) return; - if (src->l + 1 > dst->m) { - dst->m = src->l + 1; - kroundup32(dst->m); - dst->s = realloc(dst->s, dst->m); - } - dst->l = src->l; - memcpy(dst->s, src->s, src->l + 1); -} - -static void cpy_kseq(kseq_t *dst, const kseq_t *src) -{ - cpy_kstr(&dst->name, &src->name); - cpy_kstr(&dst->seq, &src->seq); - cpy_kstr(&dst->qual, &src->qual); - cpy_kstr(&dst->comment, &src->comment); -} - -int stk_sample(int argc, char *argv[]) -{ - int c, twopass = 0; - uint64_t i, num = 0, n_seqs = 0; - double frac = 0.; - gzFile fp; - kseq_t *seq; - krand_t *kr = 0; - - while ((c = getopt(argc, argv, "2s:")) >= 0) - if (c == 's') kr = kr_srand(atol(optarg)); - else if (c == '2') twopass = 1; - - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk sample [-2] [-s seed=11] |\n\n"); - fprintf(stderr, "Options: -s INT RNG seed [11]\n"); - fprintf(stderr, " -2 2-pass mode: twice as slow but with much reduced memory\n\n"); - return 1; - } - frac = atof(argv[optind+1]); - if (frac > 1.) 
num = (uint64_t)(frac + .499), frac = 0.; - else if (twopass) { - fprintf(stderr, "[W::%s] when sampling a fraction, option -2 is ignored.", __func__); - twopass = 0; - } - if (kr == 0) kr = kr_srand(11); - - if (!twopass) { // the streaming version - kseq_t *buf = 0; - if (num > 0) buf = calloc(num, sizeof(kseq_t)); - if (num > 0 && buf == NULL) { - fprintf(stderr, "[E::%s] Could not allocate enough memory for %" PRIu64 " sequences. Exiting...\n", __func__, num); - free(kr); - return 1; - } - - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - n_seqs = 0; - while (kseq_read(seq) >= 0) { - double r = kr_drand(kr); - ++n_seqs; - if (num) { - uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs); - if (y < num) cpy_kseq(&buf[y], seq); - } else if (r < frac) stk_printseq(seq, UINT_MAX); - } - for (i = 0; i < num; ++i) { - kseq_t *p = &buf[i]; - if (p->seq.l) stk_printseq(p, UINT_MAX); - free(p->seq.s); free(p->qual.s); free(p->name.s); - } - if (buf != NULL) free(buf); - } else { - uint64_t *buf; - khash_t(64) *hash; - int absent; - - if (strcmp(argv[optind], "-") == 0) { - fprintf(stderr, "[E::%s] in the 2-pass mode, the input cannot be STDIN.\n", __func__); - free(kr); - return 1; - } - - // 1st pass - buf = malloc(num * 8); - for (i = 0; i < num; ++i) buf[i] = UINT64_MAX; - fp = gzopen(argv[optind], "r"); - seq = kseq_init(fp); - n_seqs = 0; - while (kseq_read(seq) >= 0) { - double r = kr_drand(kr); - uint64_t y; - ++n_seqs; - y = n_seqs - 1 < num? 
n_seqs - 1 : (uint64_t)(r * n_seqs); - if (y < num) buf[y] = n_seqs; - } - kseq_destroy(seq); - gzclose(fp); - hash = kh_init(64); - for (i = 0; i < num; ++i) kh_put(64, hash, buf[i], &absent); - free(buf); - // 2nd pass - fp = gzopen(argv[optind], "r"); - seq = kseq_init(fp); - n_seqs = 0; - while (kseq_read(seq) >= 0) - if (kh_get(64, hash, ++n_seqs) != kh_end(hash)) - stk_printseq(seq, UINT_MAX); - kh_destroy(64, hash); - } - - kseq_destroy(seq); - gzclose(fp); - free(kr); - return 0; -} - -/* seq */ - -void stk_mask(kseq_t *seq, const khash_t(reg) *h, int is_complement, int mask_chr) -{ - unsigned i, j; - khiter_t k; - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) { // not found in the hash table - if (is_complement) { - if (mask_chr) { - for (j = 0; j < seq->seq.l; ++j) - seq->seq.s[j] = mask_chr; - } else { - for (j = 0; j < seq->seq.l; ++j) - seq->seq.s[j] = tolower(seq->seq.s[j]); - } - } - } else { - reglist_t *p = &kh_val(h, k); - if (!is_complement) { - for (i = 0; i < p->n; ++i) { - unsigned beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) continue; - if (end > seq->seq.l) end = seq->seq.l; - if (!mask_chr) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); - else for (j = beg; j < end; ++j) seq->seq.s[j] = mask_chr; - } - } else { - int8_t *mask = calloc(seq->seq.l, 1); - for (i = 0; i < p->n; ++i) { - unsigned beg = p->a[i]>>32, end = p->a[i]; - if (end >= seq->seq.l) end = seq->seq.l; - for (j = beg; j < end; ++j) mask[j] = 1; - } - if (mask_chr) { - for (j = 0; j < seq->seq.l; ++j) - if (mask[j] == 0) seq->seq.s[j] = mask_chr; - } else { - for (j = 0; j < seq->seq.l; ++j) - if (mask[j] == 0) seq->seq.s[j] = tolower(seq->seq.s[j]); - } - free(mask); - } - } -} - -int stk_seq(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255; - unsigned i, line_len = 0; - int64_t n_seqs = 0; - double frac = 1.; - khash_t(reg) *h = 0; - 
krand_t *kr = 0; - - while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:")) >= 0) { - switch (c) { - case 'a': - case 'A': flag |= 1; break; - case 'C': flag |= 2; break; - case 'r': flag |= 4; break; - case 'c': flag |= 8; break; - case '1': flag |= 16; break; - case '2': flag |= 32; break; - case 'V': flag |= 64; break; - case 'N': flag |= 128; break; - case 'U': flag |= 256; break; - case 'M': h = stk_reg_read(optarg); break; - case 'n': mask_chr = *optarg; break; - case 'Q': qual_shift = atoi(optarg); break; - case 'q': qual_thres = atoi(optarg); break; - case 'X': max_q = atoi(optarg); break; - case 'l': line_len = atoi(optarg); break; - case 'L': min_len = atoi(optarg); break; - case 's': kr = kr_srand(atol(optarg)); break; - case 'f': frac = atof(optarg); break; - } - } - if (kr == 0) kr = kr_srand(11); - if (argc == optind && isatty(fileno(stdin))) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk seq [options] |\n\n"); - fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n"); - fprintf(stderr, " -X INT mask bases with quality higher than INT [255]\n"); - fprintf(stderr, " -n CHAR masked bases converted to CHAR; 0 for lowercase [0]\n"); - fprintf(stderr, " -l INT number of residues per line; 0 for 2^32-1 [%d]\n", line_len); - fprintf(stderr, " -Q INT quality shift: ASCII-INT gives base quality [%d]\n", qual_shift); - fprintf(stderr, " -s INT random seed (effective with -f) [11]\n"); - fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n"); - fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n"); - fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n"); - fprintf(stderr, " -c mask complement region (effective with -M)\n"); - fprintf(stderr, " -r reverse complement\n"); - fprintf(stderr, " -A force FASTA output (discard quality)\n"); - fprintf(stderr, " -C drop comments at the header lines\n"); - fprintf(stderr, " -N drop sequences containing ambiguous 
bases\n"); - fprintf(stderr, " -1 output the 2n-1 reads only\n"); - fprintf(stderr, " -2 output the 2n reads only\n"); - fprintf(stderr, " -V shift quality by '(-Q) - 33'\n"); - fprintf(stderr, " -U convert all bases to uppercases\n"); - fprintf(stderr, "\n"); - free(kr); - return 1; - } - if (line_len == 0) line_len = UINT_MAX; - fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - qual_thres += qual_shift; - while (kseq_read(seq) >= 0) { - ++n_seqs; - if (seq->seq.l < min_len) continue; // NB: length filter before taking random - if (frac < 1. && kr_drand(kr) >= frac) continue; - if (flag & 48) { // then choose odd/even reads only - if ((flag&16) && (n_seqs&1) == 0) continue; - if ((flag&32) && (n_seqs&1) == 1) continue; - } - if (seq->qual.l && qual_thres > qual_shift) { - if (mask_chr) { - for (i = 0; i < seq->seq.l; ++i) - if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) - seq->seq.s[i] = mask_chr; - } else { - for (i = 0; i < seq->seq.l; ++i) - if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) - seq->seq.s[i] = tolower(seq->seq.s[i]); - } - } - if (flag & 256) - for (i = 0; i < seq->seq.l; ++i) - seq->seq.s[i] = toupper(seq->seq.s[i]); - if (flag & 1) seq->qual.l = 0; - if (flag & 2) seq->comment.l = 0; - if (h) stk_mask(seq, h, flag&8, mask_chr); // masking - if (flag & 4) { // reverse complement - int c0, c1; - for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence - c0 = comp_tab[(int)seq->seq.s[i]]; - c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]]; - seq->seq.s[i] = c1; - seq->seq.s[seq->seq.l - 1 - i] = c0; - } - if (seq->seq.l & 1) // complement the remaining base - seq->seq.s[seq->seq.l>>1] = comp_tab[(int)seq->seq.s[seq->seq.l>>1]]; - if (seq->qual.l) { - for (i = 0; i < seq->seq.l>>1; ++i) // reverse quality - c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0; - } - } - if 
((flag & 64) && seq->qual.l && qual_shift != 33) - for (i = 0; i < seq->qual.l; ++i) - seq->qual.s[i] -= qual_shift - 33; - if (flag & 128) { - for (i = 0; i < seq->seq.l; ++i) - if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break; - if (i < seq->seq.l) continue; - } - stk_printseq(seq, line_len); - } - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - free(kr); - return 0; -} - -int stk_gc(int argc, char *argv[]) -{ - int c, is_at = 0, min_l = 20; - double frac = 0.6f, xdropoff = 10.0f, q; - gzFile fp; - kseq_t *seq; - - while ((c = getopt(argc, argv, "wx:f:l:")) >= 0) { - if (c == 'x') xdropoff = atof(optarg); - else if (c == 'w') is_at = 1; - else if (c == 'f') frac = atof(optarg); - else if (c == 'l') min_l = atoi(optarg); - } - if (optind + 1 > argc) { - fprintf(stderr, "Usage: seqtk gc [options] \n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -w identify high-AT regions\n"); - fprintf(stderr, " -f FLOAT min GC fraction (or AT fraction for -w) [%.2f]\n", frac); - fprintf(stderr, " -l INT min region length to output [%d]\n", min_l); - fprintf(stderr, " -x FLOAT X-dropoff [%.1f]\n", xdropoff); - return 1; - } - q = (1.0f - frac) / frac; - - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while (kseq_read(seq) >= 0) { - int i, start = 0, max_i = 0, n_hits = 0, start_hits = 0, max_hits = 0; - double sc = 0., max = 0.; - for (i = 0; i < seq->seq.l; ++i) { - int hit; - c = seq_nt16_table[(int)seq->seq.s[i]]; - if (is_at) hit = (c == 1 || c == 8 || c == 9); - else hit = (c == 2 || c == 4 || c == 6); - n_hits += hit; - if (hit) { - if (sc == 0) start = i, start_hits = n_hits; - sc += q; - if (sc > max) max = sc, max_i = i, max_hits = n_hits; - } else if (sc > 0) { - sc += -1.0f; - if (sc < 0 || max - sc > xdropoff) { - if (max_i + 1 - start >= min_l) - printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); - sc = max = 0; - i = max_i; - } - } - } - if (max > 0. && max_i + 1 - start >= min_l) - printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_mergepe(int argc, char *argv[]) -{ - gzFile fp1, fp2; - kseq_t *seq[2]; - - if (argc < 3) { - fprintf(stderr, "Usage: seqtk mergepe \n"); - return 1; - } - fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - seq[0] = kseq_init(fp1); - seq[1] = kseq_init(fp2); - while (kseq_read(seq[0]) >= 0) { - if (kseq_read(seq[1]) < 0) { - fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__); - break; - } - stk_printseq(seq[0], 0); - stk_printseq(seq[1], 0); - } - if (kseq_read(seq[1]) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__); - kseq_destroy(seq[0]); gzclose(fp1); - kseq_destroy(seq[1]); gzclose(fp2); - return 0; -} - -int stk_dropse(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq, last; - - if (argc == 1 && isatty(fileno(stdin))) { - fprintf(stderr, "Usage: seqtk dropse \n"); - return 1; - } - fp = argc > 1 && strcmp(argv[1], "-")? 
gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - - memset(&last, 0, sizeof(kseq_t)); - while (kseq_read(seq) >= 0) { - if (last.name.l) { - kstring_t *p = &last.name, *q = &seq->name; - int is_diff; - if (p->l == q->l) { - int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l; - is_diff = strncmp(p->s, q->s, l); - } else is_diff = 1; - if (!is_diff) { - stk_printseq(&last, 0); - stk_printseq(seq, 0); - last.name.l = 0; - } else cpy_kseq(&last, seq); - } else cpy_kseq(&last, seq); - } - - kseq_destroy(seq); - gzclose(fp); - // free last! - return 0; -} - -int stk_rename(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq, last; - char *prefix = 0; - uint64_t n = 1; - - if (argc == 1 && isatty(fileno(stdin))) { - fprintf(stderr, "Usage: seqtk rename [prefix]\n"); - return 1; - } - fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - if (argc > 2) prefix = argv[2]; - - memset(&last, 0, sizeof(kseq_t)); - while (kseq_read(seq) >= 0) { - if (last.name.l) { - kstring_t *p = &last.name, *q = &seq->name; - int is_diff; - if (p->l == q->l) { - int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l; - is_diff = strncmp(p->s, q->s, l); - } else is_diff = 1; - if (!is_diff) { - stk_printseq_renamed(&last, 0, prefix, n); - stk_printseq_renamed(seq, 0, prefix, n); - last.name.l = 0; - ++n; - } else { - stk_printseq_renamed(&last, 0, prefix, n); - ++n; - cpy_kseq(&last, seq); - } - } else cpy_kseq(&last, seq); - } - if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n); - - kseq_destroy(seq); - gzclose(fp); - // free last! 
- return 0; -} - -int stk_kfreq(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *ks; - int kmer, i, l, mask; - char *nei; - - if (argc < 2) { - fprintf(stderr, "Usage: seqtk kfreq \n"); - return 1; - } - - // get the k-mer - l = strlen(argv[1]); - for (i = kmer = 0; i < l; ++i) { - int c = seq_nt6_table[(int)argv[1][i]]; - assert(c >= 1 && c <= 4); - kmer = kmer << 2 | (c - 1); - } - mask = (1<<2*l) - 1; - - // get the neighbors - nei = calloc(1, 1<<2*l); - for (i = 0; i < l; ++i) { - int j, x; - x = kmer & ~(3 << 2*i); - for (j = 0; j < 4; ++j) - nei[x|j<<2*i] = 1; - } - - fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r"); - ks = kseq_init(fp); - while (kseq_read(ks) >= 0) { - int k, x[2], cnt[2], cnt_nei[2], which; - x[0] = x[1] = k = cnt[0] = cnt[1] = cnt_nei[0] = cnt_nei[1] = 0; - for (i = 0; i < ks->seq.l; ++i) { - int c = seq_nt6_table[(int)ks->seq.s[i]]; - if (c >= 1 && c <= 4) { - x[0] = (x[0] << 2 | (c - 1)) & mask; - x[1] = (x[1] >> 2 | (4 - c) << 2*(l-1)); - if (k < l) ++k; - if (k == l) { - if (x[0] == kmer) ++cnt[0]; - else if (x[1] == kmer) ++cnt[1]; - if (nei[x[0]]) ++cnt_nei[0]; - else if (nei[x[1]]) ++cnt_nei[1]; - } - } else k = 0; - } - which = cnt_nei[0] > cnt_nei[1]? 0 : 1; - printf("%s\t%ld\t%c\t%d\t%d\n", ks->name.s, ks->seq.l, "+-"[which], cnt_nei[which], cnt[which]); - } - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -/* fqchk */ - -typedef struct { - int64_t q[94], b[5]; -} posstat_t; - -static void fqc_aux(posstat_t *p, int pos, int64_t allq[94], double perr[94], int qthres) -{ - int k; - int64_t sum = 0, qsum = 0, sum_low = 0; - double psum = 0; - if (pos <= 0) printf("ALL"); - else printf("%d", pos); - for (k = 0; k <= 4; ++k) sum += p->b[k]; - printf("\t%lld", (long long)sum); - for (k = 0; k <= 4; ++k) - printf("\t%.1f", 100. 
* p->b[k] / sum); - for (k = 0; k <= 93; ++k) { - qsum += p->q[k] * k, psum += p->q[k] * perr[k]; - if (k < qthres) sum_low += p->q[k]; - } - printf("\t%.1f\t%.1f", (double)qsum/sum, -4.343*log((psum+1e-6)/(sum+1e-6))); - if (qthres <= 0) { - for (k = 0; k <= 93; ++k) - if (allq[k] > 0) printf("\t%.2f", 100. * p->q[k] / sum); - } else printf("\t%.1f\t%.1f", 100. * sum_low / sum, 100. * (sum - sum_low) / sum); - putchar('\n'); -} - -int stk_fqchk(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int i, c, k, max_len = 0, min_len = 0x7fffffff, max_alloc = 0, offset = 33, n_diffQ = 0, qthres = 20; - int64_t tot_len = 0, n = 0; - double perr[94]; - posstat_t all, *pos = 0; - - while ((c = getopt(argc, argv, "q:")) >= 0) - if (c == 'q') qthres = atoi(optarg); - - if (optind == argc) { - fprintf(stderr, "Usage: seqtk fqchk [-q %d] \n", qthres); - fprintf(stderr, "Note: use -q0 to get the distribution of all quality values\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - for (k = 0; k <= 93; ++k) - perr[k] = pow(10., -.1 * k); - perr[0] = perr[1] = perr[2] = perr[3] = .5; - while (kseq_read(seq) >= 0) { - if (seq->qual.l == 0) continue; - ++n; - tot_len += seq->seq.l; - min_len = min_len < seq->seq.l? min_len : seq->seq.l; - max_len = max_len > seq->seq.l? max_len : seq->seq.l; - if (max_len > max_alloc) { - int old_max = max_alloc; - max_alloc = max_len; - kroundup32(max_alloc); - pos = realloc(pos, max_alloc * sizeof(posstat_t)); - memset(&pos[old_max], 0, (max_alloc - old_max) * sizeof(posstat_t)); - } - for (i = 0; i < seq->qual.l; ++i) { - int q = seq->qual.s[i] - offset; - int b = seq_nt6_table[(int)seq->seq.s[i]]; - b = b? b - 1 : 4; - q = q < 93? 
q : 93; - ++pos[i].q[q]; - ++pos[i].b[b]; - } - } - kseq_destroy(seq); - gzclose(fp); - - memset(&all, 0, sizeof(posstat_t)); - for (i = 0; i < max_len; ++i) { - for (k = 0; k <= 93; ++k) - all.q[k] += pos[i].q[k]; - for (k = 0; k <= 4; ++k) - all.b[k] += pos[i].b[k]; - } - for (k = n_diffQ = 0; k <= 93; ++k) - if (all.q[k]) ++n_diffQ; - printf("min_len: %d; max_len: %d; avg_len: %.2f; %d distinct quality values\n", min_len, max_len, (double)tot_len/n, n_diffQ); - printf("POS\t#bases\t%%A\t%%C\t%%G\t%%T\t%%N\tavgQ\terrQ"); - if (qthres <= 0) { - for (k = 0; k <= 93; ++k) - if (all.q[k] > 0) printf("\t%%Q%d", k); - } else printf("\t%%low\t%%high"); - putchar('\n'); - fqc_aux(&all, 0, all.q, perr, qthres); - for (i = 0; i < max_len; ++i) - fqc_aux(&pos[i], i + 1, all.q, perr, qthres); - free(pos); - return 0; -} - -/* main function */ -static int usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.0-r82b-dirty\n\n"); - fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); - fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); - fprintf(stderr, " sample subsample sequences\n"); - fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); - fprintf(stderr, " fqchk fastq QC (base/quality summary)\n"); - fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n"); - fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n"); - fprintf(stderr, " hety regional heterozygosity\n"); - fprintf(stderr, " gc identify high- or low-GC regions\n"); - fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); - fprintf(stderr, " mergefa merge two FASTA/Q files\n"); - fprintf(stderr, " dropse drop unpaired from interleaved PE FASTA/Q\n"); - fprintf(stderr, " rename rename sequence names\n"); - fprintf(stderr, " randbase choose a random base from hets\n"); - fprintf(stderr, " cutN cut sequence at long N\n"); - fprintf(stderr, " listhet extract the position of each het\n"); - 
fprintf(stderr, "\n"); - return 1; -} - -int main(int argc, char *argv[]) -{ - if (argc == 1) return usage(); - if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); - else if (strcmp(argv[1], "fqchk") == 0) stk_fqchk(argc-1, argv+1); - else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); - else if (strcmp(argv[1], "gc") == 0) stk_gc(argc-1, argv+1); - else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); - else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); - else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); - else if (strcmp(argv[1], "mergepe") == 0) stk_mergepe(argc-1, argv+1); - else if (strcmp(argv[1], "dropse") == 0) stk_dropse(argc-1, argv+1); - else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); - else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); - else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); - else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); - else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1); - else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1); - else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1); - else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1); - else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1); - else if (strcmp(argv[1], "rename") == 0) stk_rename(argc-1, argv+1); - else { - fprintf(stderr, "[main] unrecognized command '%s'. 
Abort!\n", argv[1]); - return 1; - } - return 0; -} +/* The MIT License + + Copyright (c) 2008-2016 Broad Institute + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int n, m; + uint64_t *a; +} reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, reglist_t) +KHASH_SET_INIT_INT64(64) + +typedef kh_reg_t reghash_t; + +reghash_t *stk_reg_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + ks = ks_init(fp); + str = calloc(1, sizeof(kstring_t)); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int beg = -1, end = -1; + reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); + if (end < 0) end = -1; + } + } + } + } + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg < 0) beg = 0, end = INT_MAX; + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return h; +} + +void stk_reg_destroy(reghash_t *h) +{ + khint_t k; + if (h == 0) return; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} + +/* constant table */ + +unsigned char seq_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 
15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +unsigned char seq_nt6_table[256] = { + 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 +}; + +char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; +unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; +unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; +int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; +char comp_tab[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', + 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, + 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', + 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 
'x', 'r', 'z', 123, 124, 125, 126, 127 +}; + +static void stk_printstr(const kstring_t *s, unsigned line_len) +{ + if (line_len != UINT_MAX && line_len != 0) { + int i, rest = s->l; + for (i = 0; i < s->l; i += line_len, rest -= line_len) { + putchar('\n'); + if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout); + else fwrite(s->s + i, 1, rest, stdout); + } + putchar('\n'); + } else { + putchar('\n'); + puts(s->s); + } +} + +static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n) +{ + putchar(s->qual.l? '@' : '>'); + if (n >= 0) { + if (prefix) fputs(prefix, stdout); + printf("%lld", (long long)n); + } else fputs(s->name.s, stdout); + if (s->comment.l) { + putchar(' '); fputs(s->comment.s, stdout); + } + stk_printstr(&s->seq, line_len); + if (s->qual.l) { + putchar('+'); + stk_printstr(&s->qual, line_len); + } +} + +static inline void stk_printseq(const kseq_t *s, int line_len) +{ + stk_printseq_renamed(s, line_len, 0, -1); +} + +/* + 64-bit Mersenne Twister pseudorandom number generator. Adapted from: + + http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/mt19937-64.c + + which was written by Takuji Nishimura and Makoto Matsumoto and released + under the 3-clause BSD license. 
+*/ + +typedef uint64_t krint64_t; + +struct _krand_t; +typedef struct _krand_t krand_t; + +#define KR_NN 312 +#define KR_MM 156 +#define KR_UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */ +#define KR_LM 0x7FFFFFFFULL /* Least significant 31 bits */ + +struct _krand_t { + int mti; + krint64_t mt[KR_NN]; +}; + +static void kr_srand0(krint64_t seed, krand_t *kr) +{ + kr->mt[0] = seed; + for (kr->mti = 1; kr->mti < KR_NN; ++kr->mti) + kr->mt[kr->mti] = 6364136223846793005ULL * (kr->mt[kr->mti - 1] ^ (kr->mt[kr->mti - 1] >> 62)) + kr->mti; +} + +krand_t *kr_srand(krint64_t seed) +{ + krand_t *kr; + kr = malloc(sizeof(krand_t)); + kr_srand0(seed, kr); + return kr; +} + +krint64_t kr_rand(krand_t *kr) +{ + krint64_t x; + static const krint64_t mag01[2] = { 0, 0xB5026F5AA96619E9ULL }; + if (kr->mti >= KR_NN) { + int i; + if (kr->mti == KR_NN + 1) kr_srand0(5489ULL, kr); + for (i = 0; i < KR_NN - KR_MM; ++i) { + x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); + kr->mt[i] = kr->mt[i + KR_MM] ^ (x>>1) ^ mag01[(int)(x&1)]; + } + for (; i < KR_NN - 1; ++i) { + x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); + kr->mt[i] = kr->mt[i + (KR_MM - KR_NN)] ^ (x>>1) ^ mag01[(int)(x&1)]; + } + x = (kr->mt[KR_NN - 1] & KR_UM) | (kr->mt[0] & KR_LM); + kr->mt[KR_NN - 1] = kr->mt[KR_MM - 1] ^ (x>>1) ^ mag01[(int)(x&1)]; + kr->mti = 0; + } + x = kr->mt[kr->mti++]; + x ^= (x >> 29) & 0x5555555555555555ULL; + x ^= (x << 17) & 0x71D67FFFEDA60000ULL; + x ^= (x << 37) & 0xFFF7EEE000000000ULL; + x ^= (x >> 43); + return x; +} + +#define kr_drand(_kr) ((kr_rand(_kr) >> 11) * (1.0/9007199254740992.0)) + + +/* quality based trimming with Mott's algorithm */ +int stk_trimfq(int argc, char *argv[]) +{ // FIXME: when a record with zero length will always be treated as a fasta record + gzFile fp; + kseq_t *seq; + double param = 0.05, q_int2real[128]; + int i, c, min_len = 30, left = 0, right = 0, fixed_len = -1; + while ((c = getopt(argc, argv, "l:q:b:e:L:")) >= 0) { + switch (c) { + case 'q': 
param = atof(optarg); break; + case 'l': min_len = atoi(optarg); break; + case 'b': left = atoi(optarg); break; + case 'e': right = atoi(optarg); break; + case 'L': fixed_len = atoi(optarg); break; + } + } + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); + fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); + fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); + fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n"); + fprintf(stderr, " -Q force FASTQ output\n"); + fprintf(stderr, "\n"); + return 1; + } + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + for (i = 0; i < 128; ++i) + q_int2real[i] = pow(10., -(i - 33) / 10.); + while (kseq_read(seq) >= 0) { + int beg, tmp, end; + double s, max; + if (left || right || fixed_len > 0) { + beg = left; end = seq->seq.l - right; + if (beg >= end) beg = end = 0; + if (fixed_len > 0 && end - beg > fixed_len) end = beg + fixed_len; + } else if (seq->qual.l > min_len) { + for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { + int q = seq->qual.s[i]; + if (q < 36) q = 36; + if (q > 127) q = 127; + s += param - q_int2real[q]; + if (s > max) max = s, beg = tmp, end = i + 1; + if (s < 0) s = 0, tmp = i + 1; + } + + /* max never set; all low qual, just give first min_len bp */ + if (max == 0.) 
beg = 0, end = min_len; + + if (end - beg < min_len) { // window-based + int is, imax; + for (i = 0, is = 0; i < min_len; ++i) + is += seq->qual.s[i] - 33; + for (imax = is, beg = 0; i < seq->qual.l; ++i) { + is += (int)seq->qual.s[i] - seq->qual.s[i - min_len]; + if (imax < is) imax = is, beg = i - min_len + 1; + } + end = beg + min_len; + } + } else beg = 0, end = seq->seq.l; + putchar(seq->is_fastq? '@' : '>'); fputs(seq->name.s, stdout); + if (seq->comment.l) { + putchar(' '); puts(seq->comment.s); + } else putchar('\n'); + fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n'); + if (seq->is_fastq) { + puts("+"); + fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n'); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* composition */ +int stk_comp(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, upper_only = 0; + reghash_t *h = 0; + reglist_t dummy; + + while ((c = getopt(argc, argv, "ur:")) >= 0) { + switch (c) { + case 'u': upper_only = 1; break; + case 'r': h = stk_reg_read(optarg); break; + } + } + if (argc == optind && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); + fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); + return 1; + } + fp = optind < argc && strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); + while ((l = kseq_read(seq)) >= 0) { + int i, k; + reglist_t *p = 0; + if (h) { + khint_t k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) p = &kh_val(h, k); + } else { + p = &dummy; + dummy.a[0] = l; + } + for (k = 0; p && k < p->n; ++k) { + int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; + int la, lb, lc, na, nb, nc, cnt[11]; + if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; + else la = 'a', lb = -1, lc = 0; + na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + memset(cnt, 0, 11 * sizeof(int)); + for (i = beg; i < end; ++i) { + int is_CpG = 0, a, b, c; + a = na; b = nb; c = nc; + na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + if (b == 2 || b == 10) { // C or Y + if (nb == 4 || nb == 5) is_CpG = 1; + } else if (b == 4 || b == 5) { // G or R + if (lb == 2 || lb == 10) is_CpG = 1; + } + if (upper_only == 0 || isupper(a)) { + if (c > 1) ++cnt[c+2]; + if (c == 1) ++cnt[seq_nt16to4_table[b]]; + if (b == 10 || b == 5) ++cnt[9]; + else if (c == 2) { + ++cnt[8]; + } + if (is_CpG) { + ++cnt[7]; + if (b == 10 || b == 5) ++cnt[10]; + } + } + la = a; lb = b; lc = c; + } + if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); + else printf("%s\t%d", seq->name.s, l); + for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); + putchar('\n'); + } + fflush(stdout); + } + free(dummy.a); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_randbase(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk randbase \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + int i; + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + int c, b, a, j, k, m; + b = seq->seq.s[i]; + c = seq_nt16_table[b]; + a = bitcnt_table[c]; + if (a == 2) { + m = (drand48() < 0.5); + for (j = k = 0; j < 4; ++j) { + if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; + } + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_hety(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; + char *buf; + uint32_t cnt[3]; + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk hety [options] \n\n"); + fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); + fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); + fprintf(stderr, " -m treat lowercases as masked\n"); + fprintf(stderr, "\n"); + return 1; + } + while ((c = getopt(argc, argv, "w:t:m")) >= 0) { + switch (c) { + case 'w': win_size = atoi(optarg); break; + case 't': n_start = atoi(optarg); break; + case 'm': is_lower_mask = 1; break; + } + } + fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + win_step = win_size / n_start; + buf = calloc(win_size, 1); + while ((l = kseq_read(seq)) >= 0) { + int x, i, y, z, next = 0; + cnt[0] = cnt[1] = cnt[2] = 0; + for (i = 0; i <= l; ++i) { + if ((i >= win_size && i % win_step == 0) || i == l) { + if (i == l && l >= win_size) { + for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; + } + if (cnt[1] + cnt[2] > 0) + printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, + (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); + next = i; + } + if (i < l) { + y = i % win_size; + c = seq->seq.s[i]; + if (is_lower_mask && islower(c)) c = 'N'; + c = seq_nt16_table[c]; + x = bitcnt_table[c]; + if (i >= win_size) --cnt[(int)buf[y]]; + buf[y] = z = x > 2? 0 : x == 2? 2 : 1; + ++cnt[z]; + } + } + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* subseq */ + +int stk_subseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_tab = 0, line = 0; + khint_t k; + while ((c = getopt(argc, argv, "tl:")) >= 0) { + switch (c) { + case 't': is_tab = 1; break; + case 'l': line = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk subseq [options] |\n\n"); + fprintf(stderr, "Options: -t TAB delimited output\n"); + fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); + fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + if (h == 0) { + fprintf(stderr, "[E::%s] failed to read the list of regions in file '%s'\n", __func__, argv[optind+1]); + return 1; + } + // subseq + fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream\n", __func__); + return 1; + } + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); + continue; + } + if (end > seq->seq.l) end = seq->seq.l; + if (is_tab == 0) { + printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); + if (beg > 0 || (int)p->a[i] != INT_MAX) { + if (end == INT_MAX) { + if (beg) printf(":%d", beg+1); + } else printf(":%d-%d", beg+1, end); + } + if (seq->comment.l) printf(" %s", seq->comment.s); + } else printf("%s\t%d\t", seq->name.s, beg + 1); + if (end > seq->seq.l) end = seq->seq.l; + for (j = 0; j < end - beg; ++j) { + if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n'); + putchar(seq->seq.s[j + beg]); + } + putchar('\n'); + if (seq->qual.l != seq->seq.l || is_tab) continue; + printf("+"); + for (j = 0; j < end - beg; ++j) { + if (j == 0 || (line > 0 && j % line == 0)) putchar('\n'); + putchar(seq->qual.s[j + beg]); + } + putchar('\n'); + } + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* mergefa */ +int stk_mergefa(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0, is_randhet = 0; + uint64_t cnt[5]; + while ((c = getopt(argc, argv, "himrq:")) >= 0) { + switch (c) { + case 'i': is_intersect = 1; break; + case 'h': is_haploid = 1; break; + case 'm': is_mask = 1; break; + case 'r': is_randhet = 1; break; + case 'q': qual = atoi(optarg); break; + } + } + if (is_mask && is_intersect) { + fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); 
+ return 1; + } + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); + fprintf(stderr, "Options: -q INT quality threshold [0]\n"); + fprintf(stderr, " -i take intersection\n"); + fprintf(stderr, " -m convert to lowercase when one of the input base is N\n"); + fprintf(stderr, " -r pick a random allele from het\n"); + fprintf(stderr, " -h suppress hets in the input\n\n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + if (fp[0] == 0 || fp[1] == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0; + srand48(11); + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2], b[2], is_upper; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); + if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); + if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else is_upper = (isupper(c[0]) && isupper(c[1]))? 
1 : 0; + c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; + if (c[0] == 0) c[0] = 15; + if (c[1] == 0) c[1] = 15; + b[0] = bitcnt_table[c[0]]; + b[1] = bitcnt_table[c[1]]; + if (is_upper) { + if (b[0] == 1 && b[1] == 1) { + if (c[0] == c[1]) ++cnt[0]; + else ++cnt[1]; + } else if (b[0] == 1 && b[1] == 2) ++cnt[2]; + else if (b[0] == 2 && b[1] == 1) ++cnt[3]; + else if (b[0] == 2 && b[1] == 2) ++cnt[4]; + } + if (is_haploid && (b[0] > 1 || b[1] > 1)) is_upper = 0; + if (is_intersect) { + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; // FIXME: is this a bug - c[0] cannot be 0! + } else if (is_mask) { + if (c[0] == 15 || c[1] == 15) is_upper = 0; + c[0] &= c[1]; + if (c[0] == 0) is_upper = 0; + } else if (is_randhet) { + if (b[0] == 1 && b[1] == 1) { // two homs + c[0] |= c[1]; + } else if (((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1)) && (c[0]&c[1])) { // one hom, one het + c[0] = (lrand48()&1)? (c[0] & c[1]) : (c[0] | c[1]); + } else if (b[0] == 2 && b[1] == 2 && c[0] == c[1]) { // double hets + if (lrand48()&1) { + if (lrand48()&1) { + for (i = 8; i >= 1; i >>= 1) // pick the "larger" allele + if (c[0]&i) c[0] &= i; + } else { + for (i = 1; i <= 8; i <<= 1) // pick the "smaller" allele + if (c[0]&i) c[0] &= i; + } + } // else set as het + } else is_upper = 0; + } else c[0] |= c[1]; + c[0] = seq_nt16_rev_table[c[0]]; + if (!is_upper) c[0] = tolower(c[0]); + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + fprintf(stderr, "[%s] (same,diff,hom-het,het-hom,het-het)=(%ld,%ld,%ld,%ld,%ld)\n", __func__, (long)cnt[0], (long)cnt[1], (long)cnt[2], (long)cnt[3], (long)cnt[4]); + return 0; +} + +int stk_famask(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l, c; + while ((c = getopt(argc, argv, "")) >= 0); + if (argc - optind < 2) { + fprintf(stderr, "Usage: seqtk famask \n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + if (fp[0] == 0 || fp[1] == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2]; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (c[1] == 'x') c[0] = tolower(c[0]); + else if (c[1] != 'X') c[0] = c[1]; + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_mutfa(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + kstream_t *ks; + int l, i, dret; + kstring_t *str; + khint_t k; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mutfa \n\n"); + fprintf(stderr, "Note: contains at least four columns per line which are:\n"); + fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); + return 1; + } + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(argv[2], "-")? 
gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + char *s = strdup(str->s); + int beg = 0, ret; + reglist_t *p; + k = kh_get(reg, h, s); + if (k == kh_end(h)) { + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col + ks_getuntil(ks, 0, str, &dret); // 3rd col + ks_getuntil(ks, 0, str, &dret); // 4th col + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (isalpha(str->s[0]) && str->l == 1) { + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + // mutfa + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) { + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32; + if (beg < seq->seq.l) + seq->seq.s[beg] = (int)p->a[i]; + } + } + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); + return 0; +} + +int stk_listhet(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk listhet \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = 0; i < l; ++i) { + int b = seq->seq.s[i]; + if (bitcnt_table[seq_nt16_table[b]] == 2) + printf("%s\t%d\t%c\n", seq->name.s, i+1, b); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* cutN */ +static int cutN_min_N_tract = 1000; +static int cutN_nonN_penalty = 10; + +static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) +{ + int i, b, e; + while (k < ks->seq.l) { + if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { + int score, max; + score = 0; e = max = -1; + for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, e = i; + } + score = 0; b = max = -1; + for (i = e; i >= 0 && score >= 0; --i) { /* backward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, b = i; + } + if (e + 1 - b >= cutN_min_N_tract) { + *begin = b; + *end = e + 1; + return *end; + } + k = e + 1; + } else ++k; + } + return -1; +} +static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) +{ + int i; + if (begin >= end) return; // FIXME: why may this happen? Understand it! + fprintf(fpout, "%c%s:%d-%d", ks->qual.l? 
'@' : '>', ks->name.s, begin+1, end); + for (i = begin; i < end && i < ks->seq.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->seq.s[i], fpout); + } + fputc('\n', fpout); + if (ks->qual.l == 0) return; + fputs("+\n", fpout); + for (i = begin; i < end && i < ks->qual.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->qual.s[i], fpout); + } + fputc('\n', fpout); +} +int stk_cutN(int argc, char *argv[]) +{ + int c, l, gap_only = 0; + gzFile fp; + kseq_t *ks; + while ((c = getopt(argc, argv, "n:p:g")) >= 0) { + switch (c) { + case 'n': cutN_min_N_tract = atoi(optarg); break; + case 'p': cutN_nonN_penalty = atoi(optarg); break; + case 'g': gap_only = 1; break; + default: return 1; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); + fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); + fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); + fprintf(stderr, " -g print gaps only, no sequence\n\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + ks = kseq_init(fp); + while ((l = kseq_read(ks)) >= 0) { + int k = 0, begin = 0, end = 0; + while (find_next_cut(ks, k, &begin, &end) >= 0) { + if (begin != 0) { + if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); + else print_seq(stdout, ks, k, begin); + } + k = end; + } + if (!gap_only) print_seq(stdout, ks, k, l); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +int stk_hrun(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *ks; + int min_len = 7, l = 0, c = 0, beg = 0, i; + if (argc == optind) { + fprintf(stderr, "Usage: seqtk hrun [minLen=%d]\n", min_len); + return 1; + } + if (argc == optind + 2) min_len = atoi(argv[optind+1]); + fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + ks = kseq_init(fp); + while (kseq_read(ks) >= 0) { + c = ks->seq.s[0]; l = 1; beg = 0; + for (i = 1; i < ks->seq.l; ++i) { + if (ks->seq.s[i] != c) { + if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); + c = ks->seq.s[i]; l = 1; beg = i; + } else ++l; + } + } + if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c); + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +/* sample */ + +static void cpy_kstr(kstring_t *dst, const kstring_t *src) +{ + if (src->l == 0) return; + if (src->l + 1 > dst->m) { + dst->m = src->l + 1; + kroundup32(dst->m); + dst->s = realloc(dst->s, dst->m); + } + dst->l = src->l; + memcpy(dst->s, src->s, src->l + 1); +} + +static void cpy_kseq(kseq_t *dst, const kseq_t *src) +{ + cpy_kstr(&dst->name, &src->name); + cpy_kstr(&dst->seq, &src->seq); + cpy_kstr(&dst->qual, &src->qual); + cpy_kstr(&dst->comment, &src->comment); +} + +int stk_sample(int argc, char *argv[]) +{ + int c, twopass = 0; + uint64_t i, num = 0, n_seqs = 0; + double frac = 0.; + gzFile fp; + kseq_t *seq; + krand_t *kr = 0; + + while ((c = getopt(argc, argv, "2s:")) >= 0) + if (c == 's') kr = kr_srand(atol(optarg)); + else if (c == '2') twopass = 1; + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk sample [-2] [-s seed=11] |\n\n"); + fprintf(stderr, "Options: -s INT RNG seed [11]\n"); + fprintf(stderr, " -2 2-pass mode: twice as slow but with much reduced memory\n\n"); + return 1; + } + frac = atof(argv[optind+1]); + if (frac > 1.) 
num = (uint64_t)(frac + .499), frac = 0.; + else if (twopass) { + fprintf(stderr, "[W::%s] when sampling a fraction, option -2 is ignored.", __func__); + twopass = 0; + } + if (kr == 0) kr = kr_srand(11); + + if (!twopass) { // the streaming version + kseq_t *buf = 0; + if (num > 0) buf = calloc(num, sizeof(kseq_t)); + if (num > 0 && buf == NULL) { + fprintf(stderr, "[E::%s] Could not allocate enough memory for %" PRIu64 " sequences. Exiting...\n", __func__, num); + free(kr); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + n_seqs = 0; + while (kseq_read(seq) >= 0) { + double r = kr_drand(kr); + ++n_seqs; + if (num) { + uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs); + if (y < num) cpy_kseq(&buf[y], seq); + } else if (r < frac) stk_printseq(seq, UINT_MAX); + } + for (i = 0; i < num; ++i) { + kseq_t *p = &buf[i]; + if (p->seq.l) stk_printseq(p, UINT_MAX); + free(p->seq.s); free(p->qual.s); free(p->name.s); + } + if (buf != NULL) free(buf); + } else { + uint64_t *buf; + khash_t(64) *hash; + int absent; + + if (strcmp(argv[optind], "-") == 0) { + fprintf(stderr, "[E::%s] in the 2-pass mode, the input cannot be STDIN.\n", __func__); + free(kr); + return 1; + } + + // 1st pass + buf = malloc(num * 8); + for (i = 0; i < num; ++i) buf[i] = UINT64_MAX; + fp = gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + n_seqs = 0; + while (kseq_read(seq) >= 0) { + double r = kr_drand(kr); + uint64_t y; + ++n_seqs; + y = n_seqs - 1 < num? 
n_seqs - 1 : (uint64_t)(r * n_seqs); + if (y < num) buf[y] = n_seqs; + } + kseq_destroy(seq); + gzclose(fp); + hash = kh_init(64); + for (i = 0; i < num; ++i) kh_put(64, hash, buf[i], &absent); + free(buf); + // 2nd pass + fp = gzopen(argv[optind], "r"); + seq = kseq_init(fp); + n_seqs = 0; + while (kseq_read(seq) >= 0) + if (kh_get(64, hash, ++n_seqs) != kh_end(hash)) + stk_printseq(seq, UINT_MAX); + kh_destroy(64, hash); + } + + kseq_destroy(seq); + gzclose(fp); + free(kr); + return 0; +} + +/* seq */ + +void stk_mask(kseq_t *seq, const khash_t(reg) *h, int is_complement, int mask_chr) +{ + unsigned i, j; + khiter_t k; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) { // not found in the hash table + if (is_complement) { + if (mask_chr) { + for (j = 0; j < seq->seq.l; ++j) + seq->seq.s[j] = mask_chr; + } else { + for (j = 0; j < seq->seq.l; ++j) + seq->seq.s[j] = tolower(seq->seq.s[j]); + } + } + } else { + reglist_t *p = &kh_val(h, k); + if (!is_complement) { + for (i = 0; i < p->n; ++i) { + unsigned beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) continue; + if (end > seq->seq.l) end = seq->seq.l; + if (!mask_chr) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); + else for (j = beg; j < end; ++j) seq->seq.s[j] = mask_chr; + } + } else { + int8_t *mask = calloc(seq->seq.l, 1); + for (i = 0; i < p->n; ++i) { + unsigned beg = p->a[i]>>32, end = p->a[i]; + if (end >= seq->seq.l) end = seq->seq.l; + for (j = beg; j < end; ++j) mask[j] = 1; + } + if (mask_chr) { + for (j = 0; j < seq->seq.l; ++j) + if (mask[j] == 0) seq->seq.s[j] = mask_chr; + } else { + for (j = 0; j < seq->seq.l; ++j) + if (mask[j] == 0) seq->seq.s[j] = tolower(seq->seq.s[j]); + } + free(mask); + } + } +} + +int stk_seq(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255; + unsigned i, line_len = 0; + int64_t n_seqs = 0; + double frac = 1.; + khash_t(reg) *h = 0; + 
krand_t *kr = 0; + + while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:S")) >= 0) { + switch (c) { + case 'a': + case 'A': flag |= 1; break; + case 'C': flag |= 2; break; + case 'r': flag |= 4; break; + case 'c': flag |= 8; break; + case '1': flag |= 16; break; + case '2': flag |= 32; break; + case 'V': flag |= 64; break; + case 'N': flag |= 128; break; + case 'U': flag |= 256; break; + case 'S': flag |= 512; break; + case 'M': h = stk_reg_read(optarg); break; + case 'n': mask_chr = *optarg; break; + case 'Q': qual_shift = atoi(optarg); break; + case 'q': qual_thres = atoi(optarg); break; + case 'X': max_q = atoi(optarg); break; + case 'l': line_len = atoi(optarg); break; + case 'L': min_len = atoi(optarg); break; + case 's': kr = kr_srand(atol(optarg)); break; + case 'f': frac = atof(optarg); break; + } + } + if (kr == 0) kr = kr_srand(11); + if (argc == optind && isatty(fileno(stdin))) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk seq [options] |\n\n"); + fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n"); + fprintf(stderr, " -X INT mask bases with quality higher than INT [255]\n"); + fprintf(stderr, " -n CHAR masked bases converted to CHAR; 0 for lowercase [0]\n"); + fprintf(stderr, " -l INT number of residues per line; 0 for 2^32-1 [%d]\n", line_len); + fprintf(stderr, " -Q INT quality shift: ASCII-INT gives base quality [%d]\n", qual_shift); + fprintf(stderr, " -s INT random seed (effective with -f) [11]\n"); + fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n"); + fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n"); + fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n"); + fprintf(stderr, " -c mask complement region (effective with -M)\n"); + fprintf(stderr, " -r reverse complement\n"); + fprintf(stderr, " -A force FASTA output (discard quality)\n"); + fprintf(stderr, " -C drop comments at the header lines\n"); + fprintf(stderr, " -N drop 
sequences containing ambiguous bases\n"); + fprintf(stderr, " -1 output the 2n-1 reads only\n"); + fprintf(stderr, " -2 output the 2n reads only\n"); + fprintf(stderr, " -V shift quality by '(-Q) - 33'\n"); + fprintf(stderr, " -U convert all bases to uppercases\n"); + fprintf(stderr, " -S strip of white spaces in sequences\n"); + fprintf(stderr, "\n"); + free(kr); + return 1; + } + if (line_len == 0) line_len = UINT_MAX; + fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + qual_thres += qual_shift; + while (kseq_read(seq) >= 0) { + ++n_seqs; + if (seq->seq.l < min_len) continue; // NB: length filter before taking random + if (frac < 1. && kr_drand(kr) >= frac) continue; + if (flag & 48) { // then choose odd/even reads only + if ((flag&16) && (n_seqs&1) == 0) continue; + if ((flag&32) && (n_seqs&1) == 1) continue; + } + if (flag & 512) { // option -S: squeeze out white spaces + int k; + if (seq->qual.l) { + for (i = k = 0; i < seq->seq.l; ++i) + if (!isspace(seq->seq.s[i])) + seq->qual.s[k++] = seq->qual.s[i]; + seq->qual.l = k; + } + for (i = k = 0; i < seq->seq.l; ++i) + if (!isspace(seq->seq.s[i])) + seq->seq.s[k++] = seq->seq.s[i]; + seq->seq.l = k; + } + if (seq->qual.l && qual_thres > qual_shift) { + if (mask_chr) { + for (i = 0; i < seq->seq.l; ++i) + if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) + seq->seq.s[i] = mask_chr; + } else { + for (i = 0; i < seq->seq.l; ++i) + if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q) + seq->seq.s[i] = tolower(seq->seq.s[i]); + } + } + if (flag & 256) // option -U: convert to uppercases + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = toupper(seq->seq.s[i]); + if (flag & 1) seq->qual.l = 0; // option -a: fastq -> fasta + if (flag & 2) seq->comment.l = 0; // option -C: drop fasta/q comments + if (h) 
stk_mask(seq, h, flag&8, mask_chr); // masking + if (flag & 4) { // option -r: reverse complement + int c0, c1; + for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence + c0 = comp_tab[(int)seq->seq.s[i]]; + c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]]; + seq->seq.s[i] = c1; + seq->seq.s[seq->seq.l - 1 - i] = c0; + } + if (seq->seq.l & 1) // complement the remaining base + seq->seq.s[seq->seq.l>>1] = comp_tab[(int)seq->seq.s[seq->seq.l>>1]]; + if (seq->qual.l) { + for (i = 0; i < seq->seq.l>>1; ++i) // reverse quality + c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0; + } + } + if ((flag & 64) && seq->qual.l && qual_shift != 33) + for (i = 0; i < seq->qual.l; ++i) + seq->qual.s[i] -= qual_shift - 33; + if (flag & 128) { // option -N: drop sequences containing ambiguous bases - Note: this is the last step! + for (i = 0; i < seq->seq.l; ++i) + if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break; + if (i < seq->seq.l) continue; + } + stk_printseq(seq, line_len); + } + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + free(kr); + return 0; +}
+/* stk_gc ("seqtk gc"): scan every sequence for segments rich in "hit" bases (G/C-like by default, A/T-like with -w) and print one "name, start, end, hit count" tab-separated line per segment spanning at least min_l bases; returns 1 on a usage or open error, 0 otherwise. */
+int stk_gc(int argc, char *argv[])
+{
+	int c, is_at = 0, min_l = 20;
+	double frac = 0.6f, xdropoff = 10.0f, q; // q: score added per hit base (a non-hit base costs 1)
+	gzFile fp;
+	kseq_t *seq;
+
+	while ((c = getopt(argc, argv, "wx:f:l:")) >= 0) {
+		if (c == 'x') xdropoff = atof(optarg);
+		else if (c == 'w') is_at = 1;
+		else if (c == 'f') frac = atof(optarg);
+		else if (c == 'l') min_l = atoi(optarg);
+	}
+	if (optind + 1 > argc) { // no input file argument: print the usage and stop
+		fprintf(stderr, "Usage: seqtk gc [options] \n");
+		fprintf(stderr, "Options:\n");
+		fprintf(stderr, " -w identify high-AT regions\n");
+		fprintf(stderr, " -f FLOAT min GC fraction (or AT fraction for -w) [%.2f]\n", frac);
+		fprintf(stderr, " -l INT min region length to output [%d]\n", min_l);
+		fprintf(stderr, " -x FLOAT X-dropoff [%.1f]\n", xdropoff);
+		return 1;
+	}
+	q = (1.0f - frac) / frac; // with +q per hit and -1 per miss, a segment's total is >= 0 exactly when its hit fraction is >= frac
+
+	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); // "-" selects stdin
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	seq = kseq_init(fp);
+	while (kseq_read(seq) >= 0) {
+		int i, start = 0, max_i = 0, n_hits = 0, start_hits = 0, max_hits = 0; // start/max_i: first base and best-scoring last base of the open segment
+		double sc = 0., max = 0.; // sc: running score of the open segment; max: best score it has reached
+		for (i = 0; i < seq->seq.l; ++i) {
+			int hit;
+			c = seq_nt16_table[(int)seq->seq.s[i]]; // NOTE(review): presumably the 4-bit IUPAC code (A=1, C=2, G=4, T=8); the table is defined outside this chunk - confirm
+			if (is_at) hit = (c == 1 || c == 8 || c == 9);
+			else hit = (c == 2 || c == 4 || c == 6);
+			n_hits += hit; // hit counter; only differences between its snapshots (start_hits, max_hits) are used
+			if (hit) {
+				if (sc == 0) start = i, start_hits = n_hits; // score is at zero: a new candidate segment opens at this hit
+				sc += q;
+				if (sc > max) max = sc, max_i = i, max_hits = n_hits; // remember the best end point seen so far
+			} else if (sc > 0) { // misses only matter while a segment is open
+				sc += -1.0f;
+				if (sc < 0 || max - sc > xdropoff) { // score went negative or fell more than xdropoff below the best: close the segment
+					if (max_i + 1 - start >= min_l)
+						printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1); // start_hits was taken after counting the first hit, hence the +1
+					sc = max = 0;
+					i = max_i; // rewind so that scanning resumes right after the best end point
+				}
+			}
+		}
+		if (max > 0. && max_i + 1 - start >= min_l) // flush a segment still open at the end of the sequence
+			printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1);
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
+int stk_mergepe(int argc, char *argv[]) +{ + gzFile fp1, fp2; + kseq_t *seq[2]; + + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mergepe \n"); + return 1; + } + fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + fp2 = strcmp(argv[2], "-")? 
gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + if (fp1 == 0 || fp2 == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq[0] = kseq_init(fp1); + seq[1] = kseq_init(fp2); + while (kseq_read(seq[0]) >= 0) { + if (kseq_read(seq[1]) < 0) { + fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__); + break; + } + stk_printseq(seq[0], 0); + stk_printseq(seq[1], 0); + } + if (kseq_read(seq[1]) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__); + kseq_destroy(seq[0]); gzclose(fp1); + kseq_destroy(seq[1]); gzclose(fp2); + return 0; +}
+
+/*
+ * stk_dropse ("seqtk dropse"): drop unpaired reads from an interleaved stream.
+ *
+ * Reads records from argv[1] (or stdin when the argument is missing or "-").
+ * A record is held back in `last`; when the next record carries the same name
+ * (a trailing "/<digit>" on both names is ignored), both are printed as a
+ * pair, otherwise the held record is discarded and replaced by the new one.
+ *
+ * Returns 1 on a usage or open error, 0 otherwise.
+ */
+int stk_dropse(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq, last;
+
+	if (argc == 1 && isatty(fileno(stdin))) {
+		fprintf(stderr, "Usage: seqtk dropse \n");
+		return 1;
+	}
+	fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	seq = kseq_init(fp);
+
+	memset(&last, 0, sizeof(kseq_t)); // all buffers start out NULL, so the free() calls below are always safe
+	while (kseq_read(seq) >= 0) {
+		if (last.name.l) { // a candidate first mate is pending
+			kstring_t *p = &last.name, *q = &seq->name;
+			int is_diff;
+			if (p->l == q->l) {
+				// compare without the "/1", "/2" style suffix when both names have one;
+				// <ctype.h> functions need an unsigned char value
+				int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit((unsigned char)p->s[p->l-1]) && isdigit((unsigned char)q->s[q->l-1]))? p->l - 2 : p->l;
+				is_diff = strncmp(p->s, q->s, l);
+			} else is_diff = 1;
+			if (!is_diff) {
+				stk_printseq(&last, 0);
+				stk_printseq(seq, 0);
+				last.name.l = 0; // pair emitted: nothing pending any more
+			} else cpy_kseq(&last, seq); // the pending read was a singleton: replace it
+		} else cpy_kseq(&last, seq);
+	}
+
+	kseq_destroy(seq);
+	gzclose(fp);
+	// release the copy buffers of `last`
+	// NOTE(review): assumes cpy_kseq (defined outside this chunk) only allocates these four kstring buffers - confirm
+	free(last.name.s); free(last.comment.s); free(last.seq.s); free(last.qual.s);
+	return 0;
+}
+
+int stk_rename(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq, last; + char *prefix = 0; + uint64_t n = 1; + + if (argc == 1 && isatty(fileno(stdin))) { + fprintf(stderr, "Usage: seqtk rename [prefix]\n"); + return 1; + } + fp = argc > 1 && strcmp(argv[1], "-")? 
gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + if (argc > 2) prefix = argv[2]; + + memset(&last, 0, sizeof(kseq_t)); + while (kseq_read(seq) >= 0) { + if (last.name.l) { + kstring_t *p = &last.name, *q = &seq->name; + int is_diff; + if (p->l == q->l) { + int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l; + is_diff = strncmp(p->s, q->s, l); + } else is_diff = 1; + if (!is_diff) { + stk_printseq_renamed(&last, 0, prefix, n); + stk_printseq_renamed(seq, 0, prefix, n); + last.name.l = 0; + ++n; + } else { + stk_printseq_renamed(&last, 0, prefix, n); + ++n; + cpy_kseq(&last, seq); + } + } else cpy_kseq(&last, seq); + } + if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n); + + kseq_destroy(seq); + gzclose(fp); + // free last! + return 0; +}
+
+/*
+ * stk_kfreq ("seqtk kfreq"): count occurrences of a k-mer in each sequence.
+ *
+ * argv[1] is the k-mer, argv[2] the input file (stdin when missing or "-").
+ * For every record one line is printed: name, length, the strand ('+' or '-')
+ * with more neighbour matches, the number of windows on that strand matching
+ * the k-mer or any k-mer differing from it at a single position, and the
+ * number of exact matches on that strand.
+ *
+ * The k-mer is packed 2 bits per base into a 32-bit word, so its length is
+ * limited to 1..15 bases; the neighbour lookup table has 4^k one-byte entries.
+ *
+ * Returns 1 on a usage, k-mer, allocation or open error, 0 otherwise.
+ */
+int stk_kfreq(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *ks;
+	int i, l;
+	uint32_t kmer = 0, mask;
+	size_t klen;
+	char *nei;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: seqtk kfreq \n");
+		return 1;
+	}
+
+	// get the k-mer
+	klen = strlen(argv[1]);
+	if (klen < 1 || klen > 15) { // 16 bases would need a 32-bit shift (undefined) and a 4 GiB table
+		fprintf(stderr, "[E::%s] the k-mer length must be between 1 and 15.\n", __func__);
+		return 1;
+	}
+	l = (int)klen;
+	for (i = 0; i < l; ++i) {
+		// NOTE(review): assumes seq_nt6_table (defined outside this chunk) has 256 entries - confirm
+		int c = seq_nt6_table[(unsigned char)argv[1][i]];
+		if (c < 1 || c > 4) { // reported explicitly: an assert() would vanish under NDEBUG
+			fprintf(stderr, "[E::%s] unsupported base '%c' in the k-mer.\n", __func__, argv[1][i]);
+			return 1;
+		}
+		kmer = kmer << 2 | (uint32_t)(c - 1);
+	}
+	mask = (1u << 2*l) - 1;
+
+	// get the neighbors: the k-mer itself plus every k-mer differing at exactly one position
+	nei = calloc((size_t)1 << 2*l, 1);
+	if (nei == 0) {
+		fprintf(stderr, "[E::%s] failed to allocate the neighbor table.\n", __func__);
+		return 1;
+	}
+	for (i = 0; i < l; ++i) {
+		uint32_t j, x;
+		x = kmer & ~(3u << 2*i); // clear base i ...
+		for (j = 0; j < 4; ++j)
+			nei[x | j << 2*i] = 1; // ... and try all four bases there
+	}
+
+	fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		free(nei);
+		return 1;
+	}
+	ks = kseq_init(fp);
+	while (kseq_read(ks) >= 0) {
+		int k, cnt[2], cnt_nei[2], which;
+		uint32_t x[2]; // x[0]: forward window, x[1]: its reverse complement
+		x[0] = x[1] = 0;
+		k = cnt[0] = cnt[1] = cnt_nei[0] = cnt_nei[1] = 0;
+		for (i = 0; i < ks->seq.l; ++i) {
+			int c = seq_nt6_table[(unsigned char)ks->seq.s[i]];
+			if (c >= 1 && c <= 4) {
+				x[0] = (x[0] << 2 | (uint32_t)(c - 1)) & mask;
+				x[1] = x[1] >> 2 | (uint32_t)(4 - c) << 2*(l-1);
+				if (k < l) ++k; // k: number of valid bases currently in the window
+				if (k == l) {
+					if (x[0] == kmer) ++cnt[0];
+					else if (x[1] == kmer) ++cnt[1];
+					if (nei[x[0]]) ++cnt_nei[0];
+					else if (nei[x[1]]) ++cnt_nei[1];
+				}
+			} else k = 0; // any other symbol breaks the window
+		}
+		which = cnt_nei[0] > cnt_nei[1]? 0 : 1;
+		printf("%s\t%ld\t%c\t%d\t%d\n", ks->name.s, (long)ks->seq.l, "+-"[which], cnt_nei[which], cnt[which]);
+	}
+	kseq_destroy(ks);
+	gzclose(fp);
+	free(nei);
+	return 0;
+}
+
+/* fqchk */ + +typedef struct { + int64_t q[94], b[5]; +} posstat_t; + +static void fqc_aux(posstat_t *p, int pos, int64_t allq[94], double perr[94], int qthres) +{ + int k; + int64_t sum = 0, qsum = 0, sum_low = 0; + double psum = 0; + if (pos <= 0) printf("ALL"); + else printf("%d", pos); + for (k = 0; k <= 4; ++k) sum += p->b[k]; + printf("\t%lld", (long long)sum); + for (k = 0; k <= 4; ++k) + printf("\t%.1f", 100. * p->b[k] / sum); + for (k = 0; k <= 93; ++k) { + qsum += p->q[k] * k, psum += p->q[k] * perr[k]; + if (k < qthres) sum_low += p->q[k]; + } + printf("\t%.1f\t%.1f", (double)qsum/sum, -4.343*log((psum+1e-6)/(sum+1e-6))); + if (qthres <= 0) { + for (k = 0; k <= 93; ++k) + if (allq[k] > 0) printf("\t%.2f", 100. * p->q[k] / sum); + } else printf("\t%.1f\t%.1f", 100. * sum_low / sum, 100. 
* (sum - sum_low) / sum); + putchar('\n'); +} + +int stk_fqchk(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, c, k, max_len = 0, min_len = 0x7fffffff, max_alloc = 0, offset = 33, n_diffQ = 0, qthres = 20; + int64_t tot_len = 0, n = 0; + double perr[94]; + posstat_t all, *pos = 0; + + while ((c = getopt(argc, argv, "q:")) >= 0) + if (c == 'q') qthres = atoi(optarg); + + if (optind == argc) { + fprintf(stderr, "Usage: seqtk fqchk [-q %d] \n", qthres); + fprintf(stderr, "Note: use -q0 to get the distribution of all quality values\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__); + return 1; + } + seq = kseq_init(fp); + for (k = 0; k <= 93; ++k) + perr[k] = pow(10., -.1 * k); + perr[0] = perr[1] = perr[2] = perr[3] = .5; + while (kseq_read(seq) >= 0) { + if (seq->qual.l == 0) continue; + ++n; + tot_len += seq->seq.l; + min_len = min_len < seq->seq.l? min_len : seq->seq.l; + max_len = max_len > seq->seq.l? max_len : seq->seq.l; + if (max_len > max_alloc) { + int old_max = max_alloc; + max_alloc = max_len; + kroundup32(max_alloc); + pos = realloc(pos, max_alloc * sizeof(posstat_t)); + memset(&pos[old_max], 0, (max_alloc - old_max) * sizeof(posstat_t)); + } + for (i = 0; i < seq->qual.l; ++i) { + int q = seq->qual.s[i] - offset; + int b = seq_nt6_table[(int)seq->seq.s[i]]; + b = b? b - 1 : 4; + q = q < 93? 
q : 93; + ++pos[i].q[q]; + ++pos[i].b[b]; + } + } + kseq_destroy(seq); + gzclose(fp); + + memset(&all, 0, sizeof(posstat_t)); + for (i = 0; i < max_len; ++i) { + for (k = 0; k <= 93; ++k) + all.q[k] += pos[i].q[k]; + for (k = 0; k <= 4; ++k) + all.b[k] += pos[i].b[k]; + } + for (k = n_diffQ = 0; k <= 93; ++k) + if (all.q[k]) ++n_diffQ; + printf("min_len: %d; max_len: %d; avg_len: %.2f; %d distinct quality values\n", min_len, max_len, (double)tot_len/n, n_diffQ); + printf("POS\t#bases\t%%A\t%%C\t%%G\t%%T\t%%N\tavgQ\terrQ"); + if (qthres <= 0) { + for (k = 0; k <= 93; ++k) + if (all.q[k] > 0) printf("\t%%Q%d", k); + } else printf("\t%%low\t%%high"); + putchar('\n'); + fqc_aux(&all, 0, all.q, perr, qthres); + for (i = 0; i < max_len; ++i) + fqc_aux(&pos[i], i + 1, all.q, perr, qthres); + free(pos); + return 0; +} + +/* main function */ +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk \n"); + fprintf(stderr, "Version: 1.2-r101-dirty\n\n"); + fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); + fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); + fprintf(stderr, " sample subsample sequences\n"); + fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); + fprintf(stderr, " fqchk fastq QC (base/quality summary)\n"); + fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n"); + fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n"); + fprintf(stderr, " hety regional heterozygosity\n"); + fprintf(stderr, " gc identify high- or low-GC regions\n"); + fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); + fprintf(stderr, " mergefa merge two FASTA/Q files\n"); + fprintf(stderr, " famask apply a X-coded FASTA to a source FASTA\n"); + fprintf(stderr, " dropse drop unpaired from interleaved PE FASTA/Q\n"); + fprintf(stderr, " rename rename sequence names\n"); + fprintf(stderr, " randbase choose a random base from hets\n"); + fprintf(stderr, " cutN cut sequence at long 
N\n"); + fprintf(stderr, " listhet extract the position of each het\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) return usage(); + if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); + else if (strcmp(argv[1], "fqchk") == 0) stk_fqchk(argc-1, argv+1); + else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); + else if (strcmp(argv[1], "gc") == 0) stk_gc(argc-1, argv+1); + else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); + else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); + else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); + else if (strcmp(argv[1], "mergepe") == 0) stk_mergepe(argc-1, argv+1); + else if (strcmp(argv[1], "dropse") == 0) stk_dropse(argc-1, argv+1); + else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); + else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); + else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); + else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); + else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1); + else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1); + else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1); + else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1); + else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1); + else if (strcmp(argv[1], "rename") == 0) stk_rename(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized command '%s'. 
Abort!\n", argv[1]); + return 1; + } + return 0; +} diff --git a/trimadap.c b/trimadap.c deleted file mode 100644 index b968c81..0000000 --- a/trimadap.c +++ /dev/null @@ -1,184 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "ksw.h" -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -unsigned char seq_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -typedef struct { - int type, len; - uint8_t *seq; - kswq_t *qp; - uint64_t cnt; -} ta_adap_t; - -int main(int argc, char *argv[]) -{ - int n_adaps, m_adaps; - int c, i, j, k, from_stdin; - int sa = 1, sb = 2, go = 1, ge = 3, type = 1; - int min_sc = 15, min_len = 10; - double max_diff = 0.15; - ta_adap_t *adaps; - kseq_t *ks; - gzFile fp; - int8_t mat[25]; - kstring_t str = {0,0,0}; - - n_adaps = m_adaps = 0; adaps = 0; - while ((c = getopt(argc, argv, "5:3:s:t:l:")) >= 0) { - if (c == '5' || c == '3') { - ta_adap_t *p; - if (m_adaps == n_adaps) { - m_adaps = m_adaps? 
m_adaps<<1 : 4; - adaps = realloc(adaps, m_adaps * sizeof(ta_adap_t)); - } - p = &adaps[n_adaps++]; - p->seq = (uint8_t*)strdup(optarg); - p->type = c - '0'; - } else if (c == 't') { - if (strcmp(optarg, "ilpe") == 0) type = 1; - } else if (c == 's') min_sc = atoi(optarg); - else if (c == 'd') max_diff = atof(optarg); - else if (c == 'l') min_len = atoi(optarg); - } - - // preset - if (type == 1 && n_adaps == 0) { - m_adaps = n_adaps = 3; - adaps = malloc(m_adaps * sizeof(ta_adap_t)); - adaps[0].type = 5; adaps[0].seq = (uint8_t*)strdup("AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"); - adaps[1].type = 3; adaps[1].seq = (uint8_t*)strdup("AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"); - adaps[2].type = 3; adaps[2].seq = (uint8_t*)strdup("ATCTCGTATGCCGTCTTCTGCTTG"); - } - - // update adapter info - for (j = 0; j < n_adaps; ++j) { - ta_adap_t *p = &adaps[j]; - p->len = strlen((char*)p->seq); - p->qp = 0; - p->cnt = 0; - for (i = 0; i < p->len; ++i) - p->seq[i] = seq_nt4_table[(uint8_t)p->seq[i]]; - } - - from_stdin = !isatty(fileno(stdin)); - if (optind == argc && !from_stdin) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: trimadap [options] \n\n"); - fprintf(stderr, "Options: -5 STR 5'-end adapter\n"); - fprintf(stderr, " -3 STR 3'-end adapter\n"); - fprintf(stderr, " -l INT min length [%d]\n", min_len); - fprintf(stderr, " -s INT min score [%d]\n", min_sc); - fprintf(stderr, " -d FLOAT max difference [%.3f]\n", max_diff); - fprintf(stderr, "\n"); - return 1; // FIXME: memory leak - } - - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? sa : -sb; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; - - fp = optind < argc && strcmp(argv[optind], "-")? 
gzopen(argv[optind], "rb") : gzdopen(fileno(stdin), "rb"); - ks = kseq_init(fp); - while (kseq_read(ks) >= 0) { - if (str.m < ks->seq.m) { - str.m = ks->seq.m; - str.s = realloc(str.s, str.m); - } - str.l = ks->seq.l; - for (i = 0; i < ks->seq.l; ++i) - str.s[i] = seq_nt4_table[(uint8_t)ks->seq.s[i]]; - for (j = 0; j < n_adaps; ++j) { - kswr_t r; - double diff; - int type; - ta_adap_t *p = &adaps[j]; - r = ksw_align(p->len, p->seq, str.l, (uint8_t*)str.s, 5, mat, go, ge, KSW_XBYTE|KSW_XSTART|(min_len*sa), &p->qp); - ++r.te; ++r.qe; // change to 0-based - k = r.qe - r.qb < r.te - r.tb? r.qe - r.qb : r.te - r.tb; - diff = (double)(k * sa - r.score) / sb / k; - //printf("%d:%.3f [%d,%d):%d <=> [%d,%d):%d\n", r.score, diff, r.qb, r.qe, p->len, r.tb, r.te, (int)str.l); - if (r.qb <= r.tb && p->len - r.qe <= str.l - r.te) { // contained - if (r.qb * sa > sa + sb) continue; - if ((p->len - r.qe) * sa > sa + sb) continue; - type = 1; - } else if (r.qb <= r.tb) { // 3' overlap - if (r.qb * sa > sa + sb) continue; - if ((str.l - r.te) * sa > sa + sb) continue; - type = 2; - } else { - if ((p->len - r.qe) * sa > sa + sb) continue; - if (r.tb * sa > sa + sb) continue; - type = 3; - } - if (p->type == 5) { - if (r.tb == 0 && r.qe == p->len && (r.te - r.tb) * sa == r.score) - type = 4; - } else if (p->type == 3) { - if (r.qb == 0 && r.te == str.l && (r.te - r.tb) * sa == r.score) - type = 4; - } - if (type == 4) { - if (r.te - r.tb < min_len) continue; - } else { - if (r.score < min_sc || diff > max_diff) continue; - } - ++p->cnt; - if (p->type == 5) { - k = r.te + (p->len - r.qe); - k = k < str.l? k : str.l; - for (i = 0; i < k; ++i) ks->seq.s[i] = 'X'; - } else if (p->type == 3) { - k = r.tb > r.qb? r.tb - r.qb : 0; - for (i = k; i < str.l; ++i) ks->seq.s[i] = 'X'; - } - } - putchar(ks->qual.l? 
'@' : '>'); - puts(ks->name.s); - puts(ks->seq.s); - if (ks->qual.l) { - puts("+"); - puts(ks->qual.s); - } - } - free(str.s); - kseq_destroy(ks); - gzclose(fp); - - for (j = 0; j < n_adaps; ++j) { - ta_adap_t *p = &adaps[j]; - fprintf(stderr, "%-15ld ", (long)p->cnt); - for (i = 0; i < p->len; ++i) fputc("ACGTN"[(int)p->seq[i]], stderr); - fputc('\n', stderr); - free(p->seq); - free(p->qp); - } - free(adaps); - return 0; -} From 4e81d668b3b433c14778fb80cad628d5d018c390 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 08:00:16 +0200 Subject: [PATCH 23/32] Added options -B/-E for trimfq. --- seqtk.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/seqtk.c b/seqtk.c index 8fdd32e..54745d3 100644 --- a/seqtk.c +++ b/seqtk.c @@ -277,29 +277,31 @@ krint64_t kr_rand(krand_t *kr) /* quality based trimming with Mott's algorithm */ int stk_trimfq(int argc, char *argv[]) -{ // FIXME: when a record with zero length will always be treated as a fasta record +{ gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0, fixed_len = -1; - while ((c = getopt(argc, argv, "l:q:b:e:L:")) >= 0) { + int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; case 'b': left = atoi(optarg); break; case 'e': right = atoi(optarg); break; - case 'L': fixed_len = atoi(optarg); break; + case 'B': left_keep = atoi(optarg); break; + case 'E': right_keep = atoi(optarg); break; } } if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); - fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len); - fprintf(stderr, " -b INT 
trim INT bp from left (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -Q force FASTQ output\n"); + fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e/-B/-E) [%.2f]\n", param); + fprintf(stderr, " -l INT maximally trims down from right end to INT bp when the trimming results in read length below this [%d]\n", min_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-B) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-E) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep); +// fprintf(stderr, " -Q force FASTQ output\n"); fprintf(stderr, "\n"); return 1; } @@ -314,11 +316,33 @@ int stk_trimfq(int argc, char *argv[]) while (kseq_read(seq) >= 0) { int beg, tmp, end; double s, max; - if (left || right || fixed_len > 0) { + if (left_keep) { + beg = left; end = left + left_keep; + if (seq->seq.l < end) end = seq->seq.l; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (right_keep) { + beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; + if (beg < 0) beg = 0; + if (end < 0) end = 0; + if (end - beg < min_len) { + beg = 0; + end = min_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (left || right) { beg = left; end = seq->seq.l - right; - if (beg >= end) beg = end = 0; - if (fixed_len > 0 && end - beg > fixed_len) end = beg + fixed_len; - } else if (seq->qual.l > min_len) { + if (end < 0) end = 0; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < min_len) { + beg = 0; + end = min_len; 
+ if (end > seq->seq.l) end = seq->seq.l; + } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; if (q < 36) q = 36; From 44023198091a25001f83dbbba456534e1e86a52e Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 08:02:39 +0200 Subject: [PATCH 24/32] Update seqtk.c --- seqtk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/seqtk.c b/seqtk.c index 54745d3..6878f98 100644 --- a/seqtk.c +++ b/seqtk.c @@ -342,6 +342,7 @@ int stk_trimfq(int argc, char *argv[]) beg = 0; end = min_len; if (end > seq->seq.l) end = seq->seq.l; + } } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; From 760020c4093dc979cb95e0d5261bf2f93addd10b Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 08:23:08 +0200 Subject: [PATCH 25/32] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 6dec1cc..b177f5e 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,6 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa +* Trim 5bp from right end and keep the 50bp from right end of each read and if trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: + + seqtk trimfq -E 50 -e 5 -l 20 in.fq > out.fq From d95666f452b6b2890122b8dc48f49544b77b2e71 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 08:31:41 +0200 Subject: [PATCH 26/32] Update seqtk.c --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index 6878f98..99372f9 100644 --- a/seqtk.c +++ b/seqtk.c @@ -300,7 +300,7 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-B) [0]\n"); fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-E) [0]\n"); fprintf(stderr, " -B 
INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep); - fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep); + fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-e/-B) [%d]\n", right_keep); // fprintf(stderr, " -Q force FASTQ output\n"); fprintf(stderr, "\n"); return 1; From 1b69f69b187e99d781873b9ed17e4a63ad48b206 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 09:04:01 +0200 Subject: [PATCH 27/32] added threshold for shortest read while trimming --- seqtk.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/seqtk.c b/seqtk.c index 99372f9..c43500d 100644 --- a/seqtk.c +++ b/seqtk.c @@ -281,11 +281,12 @@ int stk_trimfq(int argc, char *argv[]) gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; - while ((c = getopt(argc, argv, "l:q:b:e:B:E:")) >= 0) { + int i, c, min_len = 30, shortest_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + while ((c = getopt(argc, argv, "l:s:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; + case 's': shortest_len = atoi(optarg); break; case 'b': left = atoi(optarg); break; case 'e': right = atoi(optarg); break; case 'B': left_keep = atoi(optarg); break; @@ -296,11 +297,12 @@ int stk_trimfq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e/-B/-E) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trims down from right end to INT bp when the trimming results in read length below this [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-B) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-E) 
[0]\n"); + fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); + fprintf(stderr, " -s INT trimming by -b/-e/-B/-E shall not produce reads shorter then INT bp [%d]\n", shortest_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q) [0]\n"); fprintf(stderr, " -B INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep); - fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-e/-B) [%d]\n", right_keep); + fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep); // fprintf(stderr, " -Q force FASTQ output\n"); fprintf(stderr, "\n"); return 1; @@ -320,27 +322,27 @@ int stk_trimfq(int argc, char *argv[]) beg = left; end = left + left_keep; if (seq->seq.l < end) end = seq->seq.l; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { + if (end - beg < shortest_len) { beg = 0; - end = min_len; + end = shortest_len; if (end > seq->seq.l) end = seq->seq.l; } } else if (right_keep) { beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; if (beg < 0) beg = 0; if (end < 0) end = 0; - if (end - beg < min_len) { + if (end - beg < shortest_len) { beg = 0; - end = min_len; + end = shortest_len; if (end > seq->seq.l) end = seq->seq.l; } } else if (left || right) { beg = left; end = seq->seq.l - right; if (end < 0) end = 0; if (seq->seq.l < beg) beg = seq->seq.l; - if (end - beg < min_len) { + if (end - beg < shortest_len) { beg = 0; - end = min_len; + end = shortest_len; if (end > seq->seq.l) end = seq->seq.l; } } else if (seq->qual.l > min_len && param != 0.) 
{ From 89aade80d62c35235eb0f606023aaf3f50591880 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 12:33:30 +0200 Subject: [PATCH 28/32] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b177f5e..1642638 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,6 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa -* Trim 5bp from right end and keep the 50bp from right end of each read and if trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: +* Keep the 50bp from right end of each read by trimming the rest and if the trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: - seqtk trimfq -E 50 -e 5 -l 20 in.fq > out.fq + seqtk trimfq -E 50 -s 20 in.fq > out.fq From 893a380e0950fe33eb05328e5bc8e99af2d601a2 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Sun, 5 Mar 2017 13:14:44 +0200 Subject: [PATCH 29/32] Update seqtk.c --- seqtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqtk.c b/seqtk.c index c43500d..5e59b40 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1691,7 +1691,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.2-r101-dirty\n\n"); + fprintf(stderr, "Version: 1.2-r101b-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n"); From 649aa9fb8618510400bd97538225463dbaa1394a Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Mon, 6 Mar 2017 10:02:38 +0200 Subject: [PATCH 30/32] back to original --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 1642638..6dec1cc 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,3 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa -* Keep the 50bp from right end of each read by 
trimming the rest and if the trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: - - seqtk trimfq -E 50 -s 20 in.fq > out.fq From 9464ef3943246e4c5caeb44aacb99ac3bd462695 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Mon, 6 Mar 2017 10:04:21 +0200 Subject: [PATCH 31/32] more precise trimming options added for trimfq, such as -s/-E/-B. --- README.md | 3 +++ seqtk.c | 57 ++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6dec1cc..1642638 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,6 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa +* Keep the 50bp from right end of each read by trimming the rest and if the trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: + + seqtk trimfq -E 50 -s 20 in.fq > out.fq diff --git a/seqtk.c b/seqtk.c index 8fdd32e..5e59b40 100644 --- a/seqtk.c +++ b/seqtk.c @@ -277,29 +277,33 @@ krint64_t kr_rand(krand_t *kr) /* quality based trimming with Mott's algorithm */ int stk_trimfq(int argc, char *argv[]) -{ // FIXME: when a record with zero length will always be treated as a fasta record +{ gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0, fixed_len = -1; - while ((c = getopt(argc, argv, "l:q:b:e:L:")) >= 0) { + int i, c, min_len = 30, shortest_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + while ((c = getopt(argc, argv, "l:s:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; + case 's': shortest_len = atoi(optarg); break; case 'b': left = atoi(optarg); break; case 'e': right = atoi(optarg); break; - case 'L': fixed_len = atoi(optarg); break; + case 'B': left_keep = atoi(optarg); break; + case 'E': right_keep = atoi(optarg); break; } } if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk 
trimfq [options] \n\n"); - fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -Q force FASTQ output\n"); + fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e/-B/-E) [%.2f]\n", param); + fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); + fprintf(stderr, " -s INT trimming by -b/-e/-B/-E shall not produce reads shorter then INT bp [%d]\n", shortest_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep); +// fprintf(stderr, " -Q force FASTQ output\n"); fprintf(stderr, "\n"); return 1; } @@ -314,11 +318,34 @@ int stk_trimfq(int argc, char *argv[]) while (kseq_read(seq) >= 0) { int beg, tmp, end; double s, max; - if (left || right || fixed_len > 0) { + if (left_keep) { + beg = left; end = left + left_keep; + if (seq->seq.l < end) end = seq->seq.l; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (right_keep) { + beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; + if (beg < 0) beg = 0; + if (end < 0) end = 0; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; 
+ } + } else if (left || right) { beg = left; end = seq->seq.l - right; - if (beg >= end) beg = end = 0; - if (fixed_len > 0 && end - beg > fixed_len) end = beg + fixed_len; - } else if (seq->qual.l > min_len) { + if (end < 0) end = 0; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; if (q < 36) q = 36; @@ -1664,7 +1691,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.2-r101-dirty\n\n"); + fprintf(stderr, "Version: 1.2-r101b-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n"); From a83376cb50c3c1d0d2f2b5b0a56072e55c707b29 Mon Sep 17 00:00:00 2001 From: Daniel Nicorici Date: Tue, 6 Jun 2017 11:09:55 +0300 Subject: [PATCH 32/32] added option -e for subseq such that exclusion is performed instead of inclusion --- seqtk.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/seqtk.c b/seqtk.c index 5e59b40..587abc1 100644 --- a/seqtk.c +++ b/seqtk.c @@ -575,11 +575,13 @@ int stk_subseq(int argc, char *argv[]) khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; - int l, i, j, c, is_tab = 0, line = 0; + int l, i, j, c, is_tab = 0, line = 0, is_exclude = 0; + reglist_t dummy; khint_t k; - while ((c = getopt(argc, argv, "tl:")) >= 0) { + while ((c = getopt(argc, argv, "tel:")) >= 0) { switch (c) { case 't': is_tab = 1; break; + case 'e': is_exclude = 1; break; case 'l': line = atoi(optarg); break; } } @@ -587,6 +589,7 @@ int stk_subseq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk subseq [options] |\n\n"); fprintf(stderr, "Options: 
-t TAB delimited output\n"); + fprintf(stderr, " -e exclusion instead of inclusion for sequences from \n"); fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); return 1; @@ -602,12 +605,19 @@ int stk_subseq(int argc, char *argv[]) fprintf(stderr, "[E::%s] failed to open the input file/stream\n", __func__); return 1; } + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) continue; - p = &kh_val(h, k); + if (is_exclude == 0) { + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + } else { + if (k != kh_end(h)) continue; + p = &dummy; + dummy.a[0] = INT_MAX; + } for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (beg >= seq->seq.l) { @@ -1691,7 +1701,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.2-r101b-dirty\n\n"); + fprintf(stderr, "Version: 1.2-r101c-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n");