-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_GC_bin.py
More file actions
executable file
·71 lines (62 loc) · 1.98 KB
/
generate_GC_bin.py
File metadata and controls
executable file
·71 lines (62 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
import os
import commands
import fileinput
import struct
import sys
import argparse
last_chrom = ''
fasta_buf = ''
fasta_start = 0
fasta_end = 0
linesize = 51
i = 0
parser = argparse.ArgumentParser(description='QuicK-mer GC axillary file generator.')
parser.add_argument('-w',help="GC window size. Default = 400")
parser.add_argument('kmer',help="File path to kmer list")
parser.add_argument('genome',help="Directory containing FASTA file for reference genome assembly used to generate kmer list.")
parser.add_argument('output',help="File path to GC binary output.")
args = parser.parse_args()
kmerfile = open(args.kmer, 'r')
genome_path = args.genome
if genome_path[-1] != '/':
genome_path += '/'
kmersize = len(kmerfile.readline().rstrip().split()[4])
kmerfile.seek(0)
window_size = 400
if args.w != None:
window_size = int(args.w)
output = open(args.output,'wb')
for line in kmerfile:
i+=1
arr = line.split()
chro = arr[0]
pos = int(arr[1])
Upstream = pos - int((window_size - kmersize)/2.0) + 1
Downstream = pos + int((window_size + kmersize)/2.0)
#if the file is new, open and add to dictionary
if chro != last_chrom:
#open corresponding chromosome files
#print 'Opening '+chro
chromosome= open(genome_path+chro+'.fa','r')
fasta_buf = chromosome.read()
#the header size
fasta_start = fasta_buf.find('\n')+1
#the file size
chromosome.seek(0,2)
fasta_end = chromosome.tell()
chromosome.close()
last_chrom = chro
print chro+' loaded successfully'
#figure out the string byte location need to chop out
bytestart = (Upstream /(linesize -1)) * linesize + fasta_start -1 + (Upstream % (linesize-1))
byteend = (Downstream /(linesize -1)) * linesize + fasta_start -1 + (Downstream % (linesize-1))
seq = fasta_buf[max(bytestart,fasta_start):min(byteend+1,fasta_end)].upper()
#Calculate GC content
C=seq.count('C');
G=seq.count('G');
A=seq.count('A');
T=seq.count('T');
length = A+T+C+G
output.write(struct.pack('f',float(C+G)/length*100))
output.close()