@@ -259,6 +259,54 @@ def simplify_headers_drop(inputfile, keepfile, dropfile, base="contig_", drop=[]
259259 return names
260260
261261
def filter_contigs_by_length(inputfile, outputfile, min_length=10000, base="contig_"):
    """
    Filter contigs by minimum length and simplify headers.

    Reads records from a FASTA file, keeps those whose sequence length is at
    least ``min_length``, and writes the kept records to ``outputfile`` under
    new sequential headers (``<base>1``, ``<base>2``, ...). Numbering counts
    kept contigs only, so the output headers are consecutive. This is
    particularly useful for training ab initio gene predictors where short
    contigs are not informative and can hurt training quality.

    The output file is opened in write mode, so it is created (or truncated)
    even when no contig passes the filter.

    Args:
        inputfile (str): Path to the input FASTA file.
        outputfile (str): Path to the output file for filtered contigs.
        min_length (int, optional): Minimum contig length to keep; contigs of
            exactly this length are kept. Defaults to 10000.
        base (str, optional): Prefix for simplified headers. Defaults to "contig_".

    Returns:
        tuple: A tuple containing:
            - dict: Mapping of simplified headers to original headers for kept contigs.
            - int: Number of contigs kept.
            - int: Number of contigs filtered out.
            - int: Total length of kept contigs.
            - int: Total length of filtered contigs.
    """
    names = {}
    kept_count = 0
    filtered_count = 0
    kept_length = 0
    filtered_length = 0

    with open(outputfile, "w") as outfile:
        # build_index=False streams (header, sequence) pairs without creating
        # an index file next to the input.
        for title, seq in pyfastx.Fasta(inputfile, build_index=False):
            seq_length = len(seq)

            if seq_length < min_length:
                # Too short: only account for it in the statistics.
                filtered_count += 1
                filtered_length += seq_length
                continue

            kept_count += 1
            kept_length += seq_length
            # No whitespace inside the header: the first whitespace-delimited
            # token is what downstream tools treat as the sequence ID, and it
            # must match the key stored in `names`.
            simplified_name = f"{base}{kept_count}"
            names[simplified_name] = title
            # softwrap is defined elsewhere in this module; its output is
            # written verbatim as the sequence body, with no padding added.
            outfile.write(f">{simplified_name}\n{softwrap(seq)}\n")

    return names, kept_count, filtered_count, kept_length, filtered_length
308+
309+
262310def list2groups (L ):
263311 """
264312 Identify groups of continuous numbers in a list.
0 commit comments