# dbSNPparser/test_parser.py (Spencer-Smith/dbSNPparser, master branch)
#This class tests the Parser script

import unittest
import os
import Carnicero
import juicer
import functionfilter

class ParserTestCase(unittest.TestCase):

	def setUp(self):
		self.parser = Carnicero.Carnicero()
		self.juicer = juicer.Juicer("","")
		self.funfil = functionfilter.FunctionFilter("","")
		self.funfil.SetFunctionCodes()
		self.output = "TESTfiles\output.txt"

	def checkmywork(self, file1, file2):
		#This method just checks that two files are equivalent
		in1 = open(file1,'r')
		in2 = open(file2,'r')

		while True:
			line1 = in1.readline()
			line2 = in2.readline()
			if line1 != line2:
				in1.close()
				in2.close()
				return False

			if line1 == '' and line2 == '':
				break
		in1.close()
		in2.close()
		return True

	#The Dictator method makes a dictionary given a file a column of the file.
		# Prototype: Dictator(InputPath, TargetColumn)
	def test_make_dictionary_from_one_column_file(self):
		"""Can the dictator method make a dictionary from a one column file?"""
		expectedDictionary = {"this":1, "is":1, "a":1, "dictionary":1}
		path = "TESTfiles\TEST_one_column_file.txt"
		self.assertEqual(self.parser.Dictator(path,0), expectedDictionary)

	def test_make_dictionary_from_two_column_file(self):
		"""Can the dictator method make a dictionary from a two column file?"""
		expectedDictionary = {"this":1, "is":1, "a":1, "dictionary":1}
		path = "TESTfiles\TEST_two_column_file.txt"
		self.assertEqual(self.parser.Dictator(path,1), expectedDictionary)

	#The Cut method reads in a file, and parses out only the desired columns. Each row can
		# have a column compared against a dictionary of acceptable values, excluding those rows
		# which do not meet the criteria. The remaining columns of each row are printed to an
		# output file. A passed header is passed, it will be output first to the output file,
		# however if the Header argument is left as an empty string (""), then the first line of
		# the input file will be considered a header and written to the output file
		# Prototype: Cut(InPath, OutPath, ColumnsToKeep, DictionaryOfAcceptableValues,
		# 							ColumnToChallengeDictionary, Header)

	def test_cut_one_column(self):
		"""Can the cut method keep all but one column?"""
		path = "TESTfiles\TEST_table.txt"
		columns = [0,1,3]
		dictionary = {"blue":1,"red":1,"orange":1,"black":1,"teal":1,"silver":1,"outofspace":1,"green":1,"purple":1}
		col = 3
		header = "number\tword\tcolor\n"
		self.parser.Cut(path,self.output,columns,dictionary,col,header)
		expected = "TESTfiles\expected_cut_one_column.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

	def test_keep_one_column(self):
		"""Can the cut method keep only one column?"""
		path = "TESTfiles\TEST_table.txt"
		columns = [3]
		dictionary = {"blue":1,"red":1,"orange":1,"black":1,"teal":1,"silver":1,"outofspace":1,"green":1,"purple":1}
		col = 3
		header = "color\n"
		self.parser.Cut(path,self.output,columns,dictionary,col,header)
		expected = "TESTfiles\expected_keep_one_column.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

	def test_limit_by_dictionary(self):
		"""Can the cut method remove rows with a value not pertaining to a dictionary?"""
		path = "TESTfiles\TEST_table.txt"
		columns = [0,1,2,3]
		dictionary = {"blue":1,"orange":1,"black":1,"outofspace":1,"green":1,"purple":1}
		col = 3
		header = "number\tword\tplace\tcolor\n"
		self.parser.Cut(path,self.output,columns,dictionary,col,header)
		expected = "TESTfiles\expected_limit_by_dictionary.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

	def test_cut_two_columns_limit_by_dictionary(self):
		"""Can the cut method cut and limit at the same time?"""
		path = "TESTfiles\TEST_table.txt"
		columns = [1,2]
		dictionary = {"ephemeral":1,"zephyr":1,"cordial":1,"external":1}
		col = 1
		header = "word\tplace\n"
		self.parser.Cut(path,self.output,columns,dictionary,col,header)
		expected = "TESTfiles\expected_cut_two_columns_limit_by_dictionary.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

	#The Juicer class uses it's ParseFile method to parse files. It stores the entries on a line,
		# in a buffer until the first column changes. It then calls CheckBuffer, which ensures
		# that at least one of the frequencies in the buffer has a minor allele frequency greater
		# than 1%, and then PrintBuffer. The buffer should contain ten lines, but is printed as
		# two lines by placing all frequencies the same population ID on the same line. NOTE: This
		# last operation does not depend on detecting the population ID, but instead attaches every
		# other line to a first or second line, then prints them.

	def test_buffer_all_MAF_greater_than_one(self):
		"""Can CheckBuffer pass a buffer where all MAF are greater than 1%?"""
		self.juicer.Buffer = [[".011" for i in range(4)],[".989" for j in range(4)]] * 5
		self.assertTrue(self.juicer.CheckBuffer())

	def test_buffer_one_MAF_greater_than_one(self):
		"""Can CheckBuffer pass a buffer where only one MAF is greater than 1%
			and the rest are less than 1%?"""
		self.juicer.Buffer = [[".009" for i in range(4)],[".991" for j in range(4)]] * 4
		self.juicer.Buffer += [".011" for i in range(4)],[".989" for j in range(4)]
		self.assertTrue(self.juicer.CheckBuffer())

	def test_buffer_all_MAF_less_than_one(self):
		"""Can CheckBuffer fail a buffer where all MAF are less than 1%?"""
		self.juicer.Buffer = [[".009" for i in range(4)],[".991" for j in range(4)]] * 5
		self.assertFalse(self.juicer.CheckBuffer())

	def test_buffer_MAF_equal_to_one(self):
		"""Can CheckBuffer fail a buffer where all MAF are equal to 1%? Or
			where one is equal and the rest are less than 1%? Can it pass
			when one or more are equal to one but another is greater?"""
		self.juicer.Buffer = [[".01" for i in range(4)],[".99" for j in range(4)]] * 5
		self.assertFalse(self.juicer.CheckBuffer(), msg="CheckBuffer should not pass when equal to 1%")

		self.juicer.Buffer = [[".009" for i in range(4)],[".991" for j in range(4)]] * 4
		self.juicer.Buffer += [".01" for i in range(4)],[".99" for j in range(4)]
		self.assertFalse(self.juicer.CheckBuffer(), msg="CheckBuffer should not pass when equal to 1%")

		self.juicer.Buffer = [[".01" for i in range(4)],[".99" for j in range(4)]] * 4
		self.juicer.Buffer += [".011" for i in range(4)],[".991" for j in range(4)]
		self.assertTrue(self.juicer.CheckBuffer())

		self.juicer.Buffer = [[".011" for i in range(4)],[".989" for j in range(4)]] * 4
		self.juicer.Buffer += [".01" for i in range(4)],[".99" for j in range(4)]
		self.assertTrue(self.juicer.CheckBuffer())

	def test_buffer_not_full(self):
		"""Will CheckBuffer fail if the buffer has missing information?"""
		self.juicer.Buffer = [[".011" for i in range(3)],[".989" for j in range(3)]] * 5
		self.assertFalse(self.juicer.CheckBuffer(), msg="Buffer not full, should return false")

	def test_juicer_write_buffer(self):
		"""Does WriteBuffer properly output the data from the buffer?"""
		self.juicer.Buffer = [["1","51","2","0.91"],["1","51","7","0.09"],["1","52","2","0.95"],
			["1","52","7","0.05"],["1","53","2","0.93"],["1","53","7","0.07"],["1","54","2","0.99"],
			["1","54","7","0.01"],["1","55","2","0.89"],["1","55","7","0.11"]]
		out = open(self.output, 'w')
		self.juicer.WriteBuffer(out)
		out.close()
		expected = "TESTfiles\expected_juicer_write_buffer.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

	#The FunctionFilter class also has a ParseFile method, which also begins by storing lines in a
		# buffer to keep information for a single SNP together. When the data being read in no longer
		# pertains to the same SNP, the buffer is checked with a CheckBuffer method, which makes sure
		# that the function of the SNP is one that we're interested in. It then writes the buffer,
		# excluding columns that we're not interested in. SNPs with reversed mRNA orientation will
		# also have their alleles switched to the reverse compliment through the ReverseCompliment
		# method.

	def test_buffer_valid_codes(self):
		"""Will CheckBuffer pass all valid codes?"""
		self.funfil.Buffer = [["8" for i in range(25)] for j in range(2)]
		self.assertTrue(self.funfil.CheckBuffer(), msg="8 is a valid function code")
		self.funfil.Buffer = [["41" for i in range(25)] for j in range(2)]
		self.assertTrue(self.funfil.CheckBuffer(), msg="41 is a valid function code")
		self.funfil.Buffer = [["42" for i in range(25)] for j in range(2)]
		self.assertTrue(self.funfil.CheckBuffer(), msg="42 is a valid function code")
		self.funfil.Buffer = [["43" for i in range(25)] for j in range(2)]
		self.assertTrue(self.funfil.CheckBuffer(), msg="43 is a valid function code")
		self.funfil.Buffer = [["44" for i in range(25)] for j in range(2)]
		self.assertTrue(self.funfil.CheckBuffer(), msg="44 is a valid function code")
		self.funfil.Buffer = [["45" for i in range(25)] for j in range(2)]
		self.assertTrue(self.funfil.CheckBuffer(), msg="45 is a valid function code")

	def test_buffer_nonvalid_codes(self):
		"""Will CheckBuffer fail invalid codes?"""
		self.funfil.Buffer = [["3" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="3 is not a valid function code")
		self.funfil.Buffer = [["6" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="6 is not a valid function code")
		self.funfil.Buffer = [["9" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="9 is not a valid function code")
		self.funfil.Buffer = [["30" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="30 is not a valid function code")
		self.funfil.Buffer = [["53" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="53 is not a valid function code")
		self.funfil.Buffer = [["75" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="75 is not a valid function code")

	def test_buffer_nonsense_codes(self):
		"""Does CheckBuffer know what to do with input that is not codes at all?"""
		self.funfil.Buffer = [["-21" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="-21 isn't even a code at all")
		self.funfil.Buffer = [["over 9000" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="over 9000 isn't even a code at all")
		self.funfil.Buffer = [["A" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="A isn't even a code at all")
		self.funfil.Buffer = [["zenith" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg="zenith isn't even a code at all")
		self.funfil.Buffer = [["...." for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg=".... isn't even a code at all")
		self.funfil.Buffer = [[";)" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg=";) isn't even a code at all")
		self.funfil.Buffer = [["" for i in range(25)] for j in range(2)]
		self.assertFalse(self.funfil.CheckBuffer(), msg=" isn't even a code at all")

	def test_buffer_mixed_codes(self):
		"""Will CheckBuffer fail valid codes when invalid codes are present?"""
		self.funfil.Buffer = [["8" for i in range(25)],["3" for i in range(25)]]
		self.assertFalse(self.funfil.CheckBuffer(), msg="8 is a valid function code")

	def test_reverse_comp_single_nucleotides(self):
		"""Can ReverseCompliment return the compliment of a single nucleotide?"""
		nuc = "A"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "T")
		nuc = "C"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "G")
		nuc = "G"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "C")
		nuc = "T"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "A")

	def test_reverse_comp_long_sequences(self):
		"""Can ReverseCompliment make the reverse compliments of longer (3-9
			nucleotides) sequences?"""
		nuc = "CAT"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "ATG")
		nuc = "TAGAC"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "GTCTA")
		nuc = "ACCATAGGA"
		self.assertEqual(self.funfil.ReverseCompliment(nuc), "TCCTATGGT")

	def test_reverse_comp_hyphen(self):
		"""In the SNPContigLocusId table, hyphens("-") indicate the absense of
			an allele (e.g, in indel SNPs). Does ReverseCompliment correctly
			return a hyphen in response to a hyphen?"""
		hyphy = "-"
		self.assertEqual(self.funfil.ReverseCompliment(hyphy), hyphy)

	def test_reverse_comp_nonnucleotide_input(self):
		"""Does ReverseCompliment correctly return any character other than a
			nucleotide or a hyphen as an empty string ?"""
		notnuc = "M"
		self.assertEqual(self.funfil.ReverseCompliment(notnuc), "",
				msg="'M' is not a nucleotide")
		notnuc = "win"
		self.assertEqual(self.funfil.ReverseCompliment(notnuc), "",
			msg="No character of 'win' is a valid nucleotide")
		notnuc = "a"
		self.assertEqual(self.funfil.ReverseCompliment(notnuc), "",
			msg="Nucleotides must be capitalized to be considered valid")
		notnuc = "?"
		self.assertEqual(self.funfil.ReverseCompliment(notnuc), "",
				msg="'?' is not a nucleotide")
		notnuc = ""
		self.assertEqual(self.funfil.ReverseCompliment(notnuc), "",
				msg="Empty string is not a nucleotide")

	def test_funfil_write_buffer(self):
		"""Does WriteBuffer write the correct columns to output?"""
		self.funfil.Buffer = [["8" for i in range(25)],["42" for i in range(25)]]
		self.funfil.Buffer[0][13] = "A"
		self.funfil.Buffer[1][13] = "G"
		self.funfil.Buffer[0][24] = self.funfil.Buffer[1][24] = "0"
		out = open(self.output, 'w')
		self.funfil.WriteBuffer(out)
		out.close()
		expected = "TESTfiles\expected_funfil_write_buffer.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

	def test_funfil_write_buffer_reverse_comp(self):
		"""Does WriteBuffer call reverse compliment when necessary?"""
		self.funfil.Buffer = [["8" for i in range(25)],["42" for i in range(25)]]
		self.funfil.Buffer[0][13] = "A"
		self.funfil.Buffer[1][13] = "G"
		self.funfil.Buffer[0][24] = self.funfil.Buffer[1][24] = "1"
		out = open(self.output, 'w')
		self.funfil.WriteBuffer(out)
		out.close()
		expected = "TESTfiles\expected_funfil_write_buffer_reverse_comp.txt"
		self.assertTrue(self.checkmywork(self.output, expected), msg="Output not equal to expected")

# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()