-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathprocess_vcf_get_aa_seq.cpp
More file actions
153 lines (128 loc) · 5.85 KB
/
Copy pathprocess_vcf_get_aa_seq.cpp
File metadata and controls
153 lines (128 loc) · 5.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
//
// process_vcf_get_aa_seq.cpp
// process_vcf
//
// Created by Milan Malinsky on 17/02/2014.
// Copyright (c) 2014 Milan Malinsky. All rights reserved.
//
#include "process_vcf_get_aa_seq.h"
#define SUBPROGRAM "aa-seq"
#define DEBUG 1
static const char *AA_SEQ_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] REF_FASTA_FROM_MAF.fa ANCESTRAL_WITH_GAPS.fa\n"
"Generate ancestral sequence in ref genome coordinates for a scaffold\n"
"REF_FASTA_FROM_MAF.fa is generated from a multiple alignment (.maf file) by calling:\n"
"msa_view Mz_target_Pn_Ab_Nb_ms1_scaffold_${i}_iRows.maf --refseq scaffold_${i}.fa --in-format MAF --out-format FASTA\n"
"ANCESTRAL_WITH_GAPS.fa is generated by 'prequel':\n"
"prequel --no-probs --keep-gaps --refseq scaffold_${i}.fa --msa-format MAF "
"--seqs haplochromines Mz_target_Pn_Ab_Nb_ms1_scaffold_${i}_iRows.maf ... \n"
"\n"
" -h, --help display this help and exit\n"
" -o, --out=FILE_ROOT the outpur file will be 'FILE_ROOT.ancestralSequence.fa'\n"
" --anc-from-maf=1/0 ANCESTRAL_WITH_GAPS.fa is not from 'prequel' but rather a sequence for a specific species\n"
" generated by the msa view call from the .maf file\n"
" 0 means that deletions in the ancestral sequence will be filled by the reference (e.g. for phylogeny that uses only SNPs)\n"
" 1 means that deletions in the ancestral sequence will be filled by 'N' characters (e.g. for filling the AA field in the VCF file)\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
static const char* shortopts = "ho:";
enum { OPT_ANC_FROM_MAF };
static const struct option longopts[] = {
{ "help", no_argument, NULL, 'h' },
{ "anc-from-maf", required_argument, NULL, OPT_ANC_FROM_MAF },
{ "out", required_argument, NULL, 'o' },
{ NULL, 0, NULL, 0 }
};
namespace opt
{
static string refFastaFile;
static string ancWithGapsFile;
static string out;
static bool bAncFromMaf = false;
static bool deletionAsN = false;
}
int getAaSeqMain(int argc, char** argv) {
parseAaSeqOptions(argc, argv);
std::ifstream* refFastaFile = new std::ifstream(opt::refFastaFile.c_str());
std::ifstream* ancWithGapsFile = new std::ifstream(opt::ancWithGapsFile.c_str());
string refFastaFileRoot;
if (opt::out.empty()) {
refFastaFileRoot = stripExtension(opt::refFastaFile);
} else {
refFastaFileRoot = opt::out;
}
string outAncestralFN;
if (!opt::bAncFromMaf) { outAncestralFN = refFastaFileRoot + ".ancestralSequence.fa"; }
else { std::cerr << "Using maf" << std::endl;
if (opt::deletionAsN) outAncestralFN = refFastaFileRoot + ".PNsequence.deletionsAsN.fa";
else outAncestralFN = refFastaFileRoot + ".PNsequence.NoIndels.fa";
}
std::cerr << "Output should be in:" << std::endl;
std::cerr << outAncestralFN << std::endl;
std::ofstream* outAncestralFile = new std::ofstream(outAncestralFN.c_str());
string line;
getline(*ancWithGapsFile, line); getline(*refFastaFile, line);
*outAncestralFile << line << std::endl;
// Read in everything
string refFastaSeq; refFastaSeq.reserve(50000000);
string ancWithGapsSeq; ancWithGapsSeq.reserve(50000000);
while (getline(*refFastaFile, line)) { refFastaSeq.append(line); }
while (getline(*ancWithGapsFile, line)) { ancWithGapsSeq.append(line); }
// Now generate the final ancestral sequence
assert(refFastaSeq.length() == ancWithGapsSeq.length());
string outAncestralSeq; outAncestralSeq.reserve(50000000);
if (!opt::bAncFromMaf) {
for (string::size_type i = 0; i != refFastaSeq.length(); i++) {
if (refFastaSeq[i] == '-') { continue; }
else if (refFastaSeq[i] == 'N') { outAncestralSeq += refFastaSeq[i]; }
else if (refFastaSeq[i] == ancWithGapsSeq[i]) { outAncestralSeq += refFastaSeq[i]; }
else { outAncestralSeq += ancWithGapsSeq[i]; }
}
print80bpPerLineFile(outAncestralFile, outAncestralSeq);
} else {
for (string::size_type i = 0; i != refFastaSeq.length(); i++) {
if (refFastaSeq[i] == '-') { continue; }
else if (refFastaSeq[i] == 'N') { outAncestralSeq += refFastaSeq[i]; }
else if (refFastaSeq[i] == ancWithGapsSeq[i]) { outAncestralSeq += refFastaSeq[i]; }
else if (ancWithGapsSeq[i] == '-') {
if (opt::deletionAsN) outAncestralSeq += 'N';
else outAncestralSeq += refFastaSeq[i];
} else if (ancWithGapsSeq[i] == '*') { outAncestralSeq += 'N'; }
else { outAncestralSeq += ancWithGapsSeq[i]; }
}
print80bpPerLineFile(outAncestralFile, outAncestralSeq);
}
return 0;
}
void parseAaSeqOptions(int argc, char** argv) {
bool die = false;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
{
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c)
{
case '?': die = true; break;
case 'o': arg >> opt::out; break;
case OPT_ANC_FROM_MAF: opt::bAncFromMaf = true; int a; arg >> a; if (a == 1) opt::deletionAsN = true; break;
case 'h':
std::cout << AA_SEQ_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
}
}
if (argc - optind < 2) {
std::cerr << "missing arguments\n";
die = true;
}
else if (argc - optind > 2)
{
std::cerr << "too many arguments\n";
die = true;
}
if (die) {
std::cout << "\n" << AA_SEQ_USAGE_MESSAGE;
exit(EXIT_FAILURE);
}
// Parse the input filenames
opt::refFastaFile = argv[optind++];
opt::ancWithGapsFile = argv[optind++];
}