-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample-based-on-ppm.py
More file actions
80 lines (62 loc) · 3.25 KB
/
sample-based-on-ppm.py
File metadata and controls
80 lines (62 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# make random samples from genome, weighted by some PPM
# reads genome into memory, so be sure to have enough RAM!
# assumes that are looking at L1 endonuclease cleavage pattern
import sampleutils
import sys
from optparse import OptionParser
###############################################################################
USAGE = """
sample-based-on-ppm.py --ppm <position probability matrix> --genome <genome file to extract from>
--outpre <prefix for random sample set files>
--n_per_set <number of random samples per set>
--n_sets <number of random sets to produce>
--exclusionbed <bed file of regions to exclude from sampling>
For example, to make 10 sets of 50 random samples, use --n_per_set 50 --n_sets 10.
"""
parser = OptionParser(USAGE)
parser.add_option('--ppm',dest='ppm', help = 'position probability matrix')
parser.add_option('--genome', dest='genome', help = 'genome fasta file for selection')
parser.add_option('--outpre', dest='outFilePrefix', help = 'prefix for output file')
parser.add_option('--n_sets', dest='numSet',type='int', help = 'number of random set to make')
parser.add_option('--n_per_set', dest='numPerSet',type='int', help = 'number of random samples per set to make')
parser.add_option('--exclusionbed', dest='exclusionBed', help = 'bed file of regions to exclude from sampling',default='NONE')
(options, args) = parser.parse_args()
parser = OptionParser(USAGE)
if options.ppm is None:
parser.error('input ppm file name not given')
if options.genome is None:
parser.error('genome fasta file not given')
if options.outFilePrefix is None:
parser.error('output files prefix not given')
if options.numSet is None:
parser.error('total number of sets not given')
if options.numPerSet is None:
parser.error('number of random samples per set not given')
###############################################################################
myData = {}
# read in genome sequences to list of fasta files
myData['genomeFasta'] = options.genome
myData['ppmFile'] = options.ppm
if options.exclusionBed == 'NONE':
myData['useExclusionRegions'] = False
else:
myData['useExclusionRegions'] = True
myData['exclusionBedFile'] = options.exclusionBed
print 'Will exclude random sites that occur in file',options.exclusionBed
sampleutils.initialize_ppm(myData)
sampleutils.initialize_genome_sequences(myData)
print 'Initialization complete, ready to sample'
print 'Will make %i sets, each of which consists of %i random samples' % (options.numSet,options.numPerSet)
for setNum in range(options.numSet):
outFileName = options.outFilePrefix + '.%i' %setNum
print 'Sampled locations being written to',outFileName
outFile = open(outFileName,'w')
for sample_i in xrange(options.numPerSet):
if sample_i % 100 == 0:
print 'Did sample %i of %i...' % (sample_i,options.numPerSet)
randSel = sampleutils.select_random_position_with_weights(myData)
# write output as bed file for first 3 columns
# pos+1, because pos is 0 zero based already....
outFile.write('%s\t%i\t%i\t%s\t%s\n' % (randSel['chrom'],randSel['pos'],randSel['pos']+1,randSel['strand'],randSel['extractedSeq']))
outFile.close()