-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path05_combine-contigs-based-on-links.py
More file actions
executable file
·115 lines (89 loc) · 3.18 KB
/
05_combine-contigs-based-on-links.py
File metadata and controls
executable file
·115 lines (89 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import genutils
import os
import sys
from optparse import OptionParser
#############################################################################
USAGE = """combine-contigs-based-on-links.py --vcf <input vcf file> --outdirbase <base directory for assembly output>
Combines contigs into scaffolds (based on links) and runs RepeatMasker
"""
parser = OptionParser(USAGE)
parser.add_option('--vcf',dest='vcfFile', help = 'input vcf file name')
parser.add_option('--outdirbase',dest='outDirBase', help = 'base directory for assembly output')
(options,args)=parser.parse_args()
if options.vcfFile is None:
parser.error('vcf file name not given')
if options.outDirBase is None:
parser.error('base dir for assembly output not given')
#############################################################################
inputVCFCalls = options.vcfFile
outDirBase = options.outDirBase
intervalsToReadNames = {}
nameToSeq = {}
regionsToDo = []
inFile = open(inputVCFCalls,'r')
for line in inFile:
if line[0] == '#':
continue
line = line.rstrip()
line = line.split()
c = line[0]
p = line[1]
k = c + '_' + p
regionsToDo.append([k,c])
inFile.close()
print 'Read in %i regions to do' % (len(regionsToDo))
gapSeq = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
for region in regionsToDo:
c = region[1]
region = region[0]
print '***',region,'***'
outPutDir = outDirBase + '/' + c + '/' + region
outPutDir += '/'
print region
print outPutDir
if os.path.isdir(outPutDir) is False:
print 'Dir not exist, skipping'
continue
contigsFile = outPutDir + 'reads.fasta.cap.contigs'
contigFileSize = os.path.getsize(contigsFile)
if contigFileSize == 0:
newContigFile = outPutDir + 'combined.scaffolds.fa'
outFile = open(newContigFile,'w')
outFile.close()
continue
contigs = genutils.read_fasta_file_to_list(contigsFile)
linksFile = outPutDir + 'reads.fasta.cap.contigs.links'
conLinks = []
inFile = open(linksFile,'r')
for line in inFile:
line = line.rstrip()
if line == '':
continue
line = line.split(' ')
c1 = line[0]
c2 = line[2]
conLinks.append([c1,c2])
inFile.close()
conLinks = conLinks[::-1]
print 'have %i contig links' % len(conLinks)
for i in conLinks:
contigs[i[0]]['seq'] += gapSeq + contigs[i[1]]['seq']
del contigs[i[1]]
newContigFile = outPutDir + 'combined.scaffolds.fa'
outFile = open(newContigFile,'w')
cNames = contigs.keys()
cNames.sort()
for c in cNames:
s = genutils.add_breaks_to_line(contigs[c]['seq'])
cN = c.replace('Contig','Scafftig')
outFile.write('>%s\n%s\n' % (cN,s))
outFile.close()
fastaSizes = newContigFile + '.sizes'
cmd = 'fastalength %s > %s ' % (newContigFile,fastaSizes)
genutils.runCMD(cmd)
# change this based on species
cmd = 'RepeatMasker --species human ' + newContigFile
# cmd = 'RepeatMasker --species dog ' + newContigFile
print cmd
genutils.runCMD(cmd)
print 'ALL DONE'