fiberseq · Edmri · Oct 24, 2025 · Oct 28, 2025
diff --git a/apply_model.py b/apply_model.py
@@ -74,6 +74,8 @@ def encode_me(rid, read, read_info, context, circle, edge_trim):
         chrom=chrom.replace('-','__')
     if ':' in chrom:
         chrom=chrom.replace(':','___')
+    if '.' in chrom:
+        chrom=chrom.replace('.','____')
     start = read_info.loc[rid, 'start']
     end = read_info.loc[rid, 'end']
 
@@ -211,7 +213,7 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
                             all_starts.append(','.join(starts.astype(str)))
                             all_lengths.append(','.join(lengths.astype(str)))
                             all_counts.append(len(starts))
-
+                        
                         pbar2.update(1)
 
                     pbar2.set_description(f"Writing chunk {i}")
@@ -226,7 +228,18 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
                     b12.columns = ['chrom', 'start', 'end', 'name', 'thickStart', 'thickEnd', 'blockCount', 'itemRgb', 'blockStarts', 'blockSizes']
                     b12 = pd.concat([b12, no_me_b12])
                     b12 = b12.sort_values(by=['chrom', 'start'])
-
+
+                    # Restore the original chromosome names by undoing encode_me's
+                    # substitutions ('.' -> '____', ':' -> '___', '-' -> '__'). Decode the
+                    # longest placeholder first and chain the calls: '____' also contains
+                    # '___' and '__', so replacing on the untouched encoded name each time
+                    # let the last match win ('chr1____1' became 'chr1--1', not 'chr1.1').
+                    decoded = {
+                        enc: enc.replace('____', '.').replace('___', ':').replace('__', '-')
+                        for enc in b12['chrom'].unique()
+                    }
+                    b12['chrom'] = b12['chrom'].map(decoded)
+
                     # Write to a temporary file (split by chromosome if necessary)
                     tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
                     b12.to_csv(tmp_file, sep='\t', index=False)
@@ -283,4 +296,4 @@ def apply_model(model, f, outdir, context, chromlist, train_rids, me_col, chunk_
             fout.write(fin.read())
         os.remove(tmp_file) 
 
-os.rmdir(tmp_dir)
+os.rmdir(tmp_dir)
diff --git a/apply_model_multiprocess.py b/apply_model_multiprocess.py
@@ -78,6 +78,8 @@ def encode_me(rid, read, read_info, context, circle, edge_trim):
         chrom=chrom.replace('-','__')
     if ':' in chrom:
         chrom=chrom.replace(':','___')
+    if '.' in chrom:
+        chrom=chrom.replace('.','____')
     start = read_info.loc[rid, 'start']
     end = read_info.loc[rid, 'end']
 
@@ -203,6 +205,17 @@ def process_chunk(chunk, model, context, chromlist, train_rids, me_col, chunk_si
         b12 = pd.concat([b12, no_me_b12])
         b12 = b12.sort_values(by=['chrom', 'start'])
 
+        # Restore the original chromosome names by undoing encode_me's
+        # substitutions ('.' -> '____', ':' -> '___', '-' -> '__'). Decode the
+        # longest placeholder first and chain the calls: '____' also contains
+        # '___' and '__', so replacing on the untouched encoded name each time
+        # let the last match win ('chr1____1' became 'chr1--1', not 'chr1.1').
+        decoded = {
+            enc: enc.replace('____', '.').replace('___', ':').replace('__', '-')
+            for enc in b12['chrom'].unique()
+        }
+        b12['chrom'] = b12['chrom'].map(decoded)
+
         # Write to a temporary file 
         tmp_file = os.path.join(tmp_dir, f"{dataset}_{i}.bed")
         b12.to_csv(tmp_file, sep='\t', index=False, header=False)
@@ -322,4 +335,4 @@ def combine_temp_files(chromlist, tmp_dir, outdir, dataset):
 
 #this consistently fails on my tests because of permissions, but it's not a huge issue
 #os.rmdir(tmp_dir)
-logging.info("Temporary directory removed and script completed.")
+logging.info("Temporary directory removed and script completed.")
diff --git a/encode_context.py b/encode_context.py
@@ -43,11 +43,14 @@ def make_fa_dic(infile):
     with open(infile, 'r') as f:
         for line in tqdm(f, desc="Importing fasta", leave=False, total = total_lines):
             line=line.rstrip()
-            if '>' in line and 'chr' in line:
+            if line.startswith('>'):
                 chrom=line.split(' ')[0].replace('>','') #grab chromosome name up until first whitespace
                 if '-' in chrom: #replace forbidden characters
                     chrom=chrom.replace('-','__')
+                if ':' in chrom:
                     chrom=chrom.replace(':','___')
+                if '.' in chrom:
+                    chrom=chrom.replace('.','____') #'.' is a forbidden character too
 
                 chrom_filter=True   # This is preserved in case I want to hardcode leaving out specific chromosomes.
                                     # This can be useful in weird assemblies with many 1000s of contigs if you don't

diff --git a/train_model.py b/train_model.py
@@ -105,6 +105,8 @@ def encode_me(rid, read, read_info, context, edge_trim, me_col):
         chrom=chrom.replace('-','__')
     if ':' in chrom:
         chrom=chrom.replace(':','___')
+    if '.' in chrom:
+        chrom=chrom.replace('.','____')
     start = read_info.loc[rid, 'start']
     end = read_info.loc[rid, 'end']
 
@@ -289,4 +291,4 @@ def train_HMM(emission_probs, train_arrays):
 with open(outdir+'/all_models.pickle', 'wb') as handle:
     pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
-pd.DataFrame(train_rids, columns=['rid']).to_csv(outdir+'/training-reads.tsv')
+pd.DataFrame(train_rids, columns=['rid']).to_csv(outdir+'/training-reads.tsv')