From e089aa10baec5f16bbb0e526049b3fc39847b34f Mon Sep 17 00:00:00 2001 From: li6in9muyou Date: Tue, 11 May 2021 23:16:36 +0800 Subject: [PATCH] issue: #2 FASTA file parser now shows a helpful message to the user --- data/bad_seq.txt | 4 ++++ util/FileProcessing.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 data/bad_seq.txt diff --git a/data/bad_seq.txt b/data/bad_seq.txt new file mode 100644 index 0000000..5fba2e1 --- /dev/null +++ b/data/bad_seq.txt @@ -0,0 +1,4 @@ +>P_1|1|training +CGCCUCCCACGCGGGAGACCCGGGUUCAAUUCCCGGCCAAU +>P_21|training +CCGGGUUCAAUUCCCGGCCACUGCACGUGGUUGUUUUUCAC diff --git a/util/FileProcessing.py b/util/FileProcessing.py index 5926a22..bc596c2 100644 --- a/util/FileProcessing.py +++ b/util/FileProcessing.py @@ -34,7 +34,7 @@ def __init__(self, file): else: - self.error_msg = 'File format error.' + pass def read_fasta(self, file): """ @@ -55,6 +55,8 @@ def read_fasta(self, file): header, sequence = array[0].split()[0], re.sub('[^ACDEFGHIKLMNPQRSTUVWY-]', '-', ''.join(array[1:]).upper()) header_array = header.split('|') name = header_array[0] + if len(header_array) != 3: + return [], None, f"fasta file parsing failed at \"{header}\"" label = header_array[1] if len(header_array) >= 2 else '0' label_train = header_array[2] if len(header_array) >= 3 else 'training' fasta_sequences.append([name, sequence, label, label_train])