Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions apply_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ def encode_me(rid, read, read_info, context, circle, edge_trim):
chrom=chrom.replace('-','__')
if ':' in chrom:
chrom=chrom.replace(':','___')
if '.' in chrom:
chrom=chrom.replace('.','____')
start = read_info.loc[rid, 'start']
end = read_info.loc[rid, 'end']

Expand Down Expand Up @@ -211,7 +213,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
all_starts.append(','.join(starts.astype(str)))
all_lengths.append(','.join(lengths.astype(str)))
all_counts.append(len(starts))

pbar2.update(1)

pbar2.set_description(f"Writing chunk {i}")
Expand All @@ -226,7 +228,18 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes']
b12 = pd.concat([b12, no_me_b12])
b12 = b12.sort_values(by=['chrom', 'start'])


# Restore the original chromosome names. The encoding applied upstream maps
# '-' -> '__', ':' -> '___' and '.' -> '____', so decode longest-first and
# chain the replacements on the running result: '____' also contains '___'
# and '__', and replacing on the still-encoded name in independent branches
# lets the last matching branch overwrite the earlier ones (e.g.
# 'chr1____1' would become 'chr1--1' instead of 'chr1.1').
# NOTE: a name that originally contained '__' cannot be told apart from an
# encoded '-' and will be decoded as '-'.
restored_names = {
    enc_name: enc_name.replace('____', '.').replace('___', ':').replace('__', '-')
    for enc_name in b12['chrom'].unique()
}
# Assign the column back instead of masking in place through b12['chrom'];
# an in-place update on a column selection does not modify b12 when pandas
# Copy-on-Write is enabled.
b12['chrom'] = b12['chrom'].map(restored_names)

# Write to a temporary file (split by chromosome if necessary)
tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
b12.to_csv(tmp_file, sep='\t', index=False)
Expand Down Expand Up @@ -283,4 +296,4 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
fout.write(fin.read())
os.remove(tmp_file)

os.rmdir(tmp_dir)
os.rmdir(tmp_dir)
15 changes: 14 additions & 1 deletion apply_model_multiprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def encode_me(rid, read, read_info, context, circle, edge_trim):
chrom=chrom.replace('-','__')
if ':' in chrom:
chrom=chrom.replace(':','___')
if '.' in chrom:
chrom=chrom.replace('.','____')
start = read_info.loc[rid, 'start']
end = read_info.loc[rid, 'end']

Expand Down Expand Up @@ -203,6 +205,17 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si
b12 = pd.concat([b12, no_me_b12])
b12 = b12.sort_values(by=['chrom', 'start'])

# Restore the original chromosome names. The encoding applied upstream maps
# '-' -> '__', ':' -> '___' and '.' -> '____', so decode longest-first and
# chain the replacements on the running result: '____' also contains '___'
# and '__', and replacing on the still-encoded name in independent branches
# lets the last matching branch overwrite the earlier ones (e.g.
# 'chr1____1' would become 'chr1--1' instead of 'chr1.1').
# NOTE: a name that originally contained '__' cannot be told apart from an
# encoded '-' and will be decoded as '-'.
restored_names = {
    enc_name: enc_name.replace('____', '.').replace('___', ':').replace('__', '-')
    for enc_name in b12['chrom'].unique()
}
# Assign the column back instead of masking in place through b12['chrom'];
# an in-place update on a column selection does not modify b12 when pandas
# Copy-on-Write is enabled.
b12['chrom'] = b12['chrom'].map(restored_names)

# Write to a temporary file
tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
b12.to_csv(tmp_file, sep='\t', index=False, header=False)
Expand Down Expand Up @@ -322,4 +335,4 @@ def combine_temp_files(chromlist, tmp_dir, outdir, dataset):

#this consistently fails on my tests because of permissions, but it's not a huge issue
#os.rmdir(tmp_dir)
logging.info("Temporary directory removed and script completed.")
logging.info("Temporary directory removed and script completed.")
5 changes: 4 additions & 1 deletion encode_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,14 @@ def make_fa_dic(infile):
with open(infile, 'r') as f:
for line in tqdm(f, desc="Importing fasta", leave=False, total = total_lines):
line=line.rstrip()
if '>' in line and 'chr' in line:
if line.startswith('>'):
chrom=line.split(' ')[0].replace('>','') #grab chromosome name up until first whitespace
if '-' in chrom: #replace forbidden characters
chrom=chrom.replace('-','__')
if ':' in chrom:
chrom=chrom.replace(':','___')
if '.' in chrom:
chrom=chrom.replace('.','____')

chrom_filter=True # This is preserved in case I want to hardcode leaving out specific chromosomes.
# This can be useful in weird assemblies with many 1000s of contigs if you don't
Expand Down
4 changes: 3 additions & 1 deletion train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ def encode_me(rid, read, read_info, context, edge_trim, me_col):
chrom=chrom.replace('-','__')
if ':' in chrom:
chrom=chrom.replace(':','___')
if '.' in chrom:
chrom=chrom.replace('.','____')
start = read_info.loc[rid, 'start']
end = read_info.loc[rid, 'end']

Expand Down Expand Up @@ -289,4 +291,4 @@ def train_HMM(emission_probs, train_arrays):
with open(outdir+'/all_models.pickle', 'wb') as handle:
pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)

pd.DataFrame(train_rids, columns=['rid']).to_csv(outdir+'/training-reads.tsv')
pd.DataFrame(train_rids, columns=['rid']).to_csv(outdir+'/training-reads.tsv')