diff --git a/apply_model.py b/apply_model.py
index 18de12e..56cdcd8 100644
--- a/apply_model.py
+++ b/apply_model.py
@@ -74,6 +74,8 @@ def encode_me(rid, read, read_info, context, circle, edge_trim):
         chrom=chrom.replace('-','__')
     if ':' in chrom:
         chrom=chrom.replace(':','___')
+    if '.' in chrom:
+        chrom=chrom.replace('.','____')
     start = read_info.loc[rid, 'start']
     end = read_info.loc[rid, 'end']
 
@@ -211,7 +213,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
                 all_starts.append(','.join(starts.astype(str)))
                 all_lengths.append(','.join(lengths.astype(str)))
                 all_counts.append(len(starts))
-                
+
             pbar2.update(1)
             pbar2.set_description(f"Writing chunk {i}")
 
@@ -226,7 +228,18 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
         b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes']
         b12 = pd.concat([b12, no_me_b12])
         b12 = b12.sort_values(by=['chrom', 'start'])
-        
+
+        # Restore the original chromosome names. Decode the longest
+        # placeholder first and chain the replacements: '____' contains
+        # '___' and '__', so any other order mangles '.' and ':' names.
+        decoded_names = {
+            enc: enc.replace('____', '.')
+                    .replace('___', ':')
+                    .replace('__', '-')
+            for enc in b12['chrom'].unique()
+        }
+        b12['chrom'] = b12['chrom'].map(decoded_names)
+
         # Write to a temporary file (split by chromosome if necessary)
         tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
         b12.to_csv(tmp_file, sep='\t', index=False)
@@ -283,4 +296,4 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
                 fout.write(fin.read())
             os.remove(tmp_file)
 
-os.rmdir(tmp_dir)
\ No newline at end of file
+os.rmdir(tmp_dir)
diff --git a/apply_model_multiprocess.py b/apply_model_multiprocess.py
index c69ab5c..bd2b40e 100644
--- a/apply_model_multiprocess.py
+++ b/apply_model_multiprocess.py
@@ -78,6 +78,8 @@ def encode_me(rid, read, read_info, context, circle, edge_trim):
         chrom=chrom.replace('-','__')
     if ':' in chrom:
         chrom=chrom.replace(':','___')
+    if '.' in chrom:
+        chrom=chrom.replace('.','____')
     start = read_info.loc[rid, 'start']
     end = read_info.loc[rid, 'end']
 
@@ -203,6 +205,17 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si
     b12 = pd.concat([b12, no_me_b12])
     b12 = b12.sort_values(by=['chrom', 'start'])
 
+    # Restore the original chromosome names. Decode the longest
+    # placeholder first and chain the replacements: '____' contains
+    # '___' and '__', so any other order mangles '.' and ':' names.
+    decoded_names = {
+        enc: enc.replace('____', '.')
+                .replace('___', ':')
+                .replace('__', '-')
+        for enc in b12['chrom'].unique()
+    }
+    b12['chrom'] = b12['chrom'].map(decoded_names)
+
     # Write to a temporary file
     tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
     b12.to_csv(tmp_file, sep='\t', index=False, header=False)
@@ -322,4 +335,4 @@ def combine_temp_files(chromlist, tmp_dir, outdir, dataset):
 #this consistently fails on my tests because of permissions, but it's not a huge issue
 #os.rmdir(tmp_dir)
 
-logging.info("Temporary directory removed and script completed.")
\ No newline at end of file
+logging.info("Temporary directory removed and script completed.")
diff --git a/encode_context.py b/encode_context.py
index 05014d7..92ebae5 100644
--- a/encode_context.py
+++ b/encode_context.py
@@ -43,11 +43,14 @@ def make_fa_dic(infile):
     with open(infile, 'r') as f:
         for line in tqdm(f, desc="Importing fasta", leave=False, total = total_lines):
             line=line.rstrip()
-            if '>' in line and 'chr' in line:
+            if line.startswith('>'):
                 chrom=line.split(' ')[0].replace('>','') #grab chromosome name up until first whitespace
                 if '-' in chrom: #replace forbidden characters
                     chrom=chrom.replace('-','__')
+                if ':' in chrom:
                     chrom=chrom.replace(':','___')
+                if '.' in chrom:
+                    chrom=chrom.replace('.','____')
                 chrom_filter=True
                 # This is preserved in case I want to hardcode leaving out specific chromosomes.
                 # This can be useful in weird assemblies with many 1000s of contigs if you don't
diff --git a/train_model.py b/train_model.py
index 22ebff9..d66b5eb 100644
--- a/train_model.py
+++ b/train_model.py
@@ -105,6 +105,8 @@ def encode_me(rid, read, read_info, context, edge_trim, me_col):
         chrom=chrom.replace('-','__')
     if ':' in chrom:
         chrom=chrom.replace(':','___')
+    if '.' in chrom:
+        chrom=chrom.replace('.','____')
     start = read_info.loc[rid, 'start']
     end = read_info.loc[rid, 'end']
 
@@ -289,4 +291,4 @@ def train_HMM(emission_probs, train_arrays):
 with open(outdir+'/all_models.pickle', 'wb') as handle:
     pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
-pd.DataFrame(train_rids, columns=['rid']).to_csv(outdir+'/training-reads.tsv')
\ No newline at end of file
+pd.DataFrame(train_rids, columns=['rid']).to_csv(outdir+'/training-reads.tsv')