From a554a3977bfcd26c681a8c2b8d3b080e1530b433 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Fri, 14 Mar 2025 18:31:53 -0400 Subject: [PATCH 1/5] simple test script from importing sequences using BioPython --- .gitignore | 4 ++++ README.md | 12 ++++++++++++ requirenments.txt | 1 + test_bio.py | 26 ++++++++++++++++++++++++++ test_data/.gitkeep | 0 5 files changed, 43 insertions(+) create mode 100644 requirenments.txt create mode 100644 test_bio.py create mode 100644 test_data/.gitkeep diff --git a/.gitignore b/.gitignore index 15201ac..a225bdb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Data files shouldn't be synced with GitHub +*.fna + + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index da26b24..89f6b01 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,16 @@ Accurate Taxonomic Genomic Computer This research is about classifying microbes by genomic signatures using machine learning +## BioPython +A set of Python tools for computational molecular biology. +Biopython features include +* Parsers for various Bioinformatics file formats (BLAST, Clustalw, FASTA, Genbank,…) +* Access to online services (NCBI, Expasy,…) +* Interfaces to common and not-so-common programs (Clustalw, DSSP, MSMS…) +* A standard sequence class +* Various clustering modules +* A KD tree data structure +* And more... + +This information was from [LabXchange](https://www.labxchange.org/library/items/lb:LabXchange:d64401b3:html:1) diff --git a/requirenments.txt b/requirenments.txt new file mode 100644 index 0000000..e0116bd --- /dev/null +++ b/requirenments.txt @@ -0,0 +1 @@ +biopython diff --git a/test_bio.py b/test_bio.py new file mode 100644 index 0000000..7e1e79f --- /dev/null +++ b/test_bio.py @@ -0,0 +1,26 @@ +import os +from Bio import SeqIO + +def read_fna(file_path): + """Reads a .fna file and returns a list of SeqRecord objects.""" + records = [] + with open(file_path, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + records.append(record) + return records + +# Example usage +sequence_iterators = {} +data_folder_path = "test_data" +for file in os.listdir(data_folder_path): + file_path = os.path.join(data_folder_path, file) + if os.path.isfile(file_path): + sequence_iterators[file] = (read_fna(file_path)) + +for file, sequences in sequence_iterators.items(): + print(f"Info from {file}") +for record in sequences: + print(f"ID: {record.id}") + print(f"Sequence: {record.seq}") + print(f"Description: {record.description}") + print("---") \ No newline at end of file diff --git a/test_data/.gitkeep b/test_data/.gitkeep new file mode 100644 index 0000000..e69de29 From bfcbd9db69e940db134bb4c29a01d6f24a4964ad Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 15 Mar 2025 01:53:18 -0400 Subject: [PATCH 2/5] Excluding .gitkeep --- test_bio.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test_bio.py b/test_bio.py index 7e1e79f..33b5663 100644 --- a/test_bio.py +++ b/test_bio.py @@ -1,5 +1,7 @@ import os from Bio import SeqIO +# from Bio import LogisticRegression + def read_fna(file_path): """Reads a .fna file and returns a list of SeqRecord objects.""" @@ -13,14 +15,16 @@ def read_fna(file_path): sequence_iterators = {} data_folder_path = "test_data" for file in os.listdir(data_folder_path): - file_path = os.path.join(data_folder_path, file) - if os.path.isfile(file_path): - sequence_iterators[file] = (read_fna(file_path)) + if file != ".gitkeep": + file_path = os.path.join(data_folder_path, file) + if os.path.isfile(file_path): + sequence_iterators[file] = (read_fna(file_path)) for file, sequences in sequence_iterators.items(): print(f"Info from {file}") -for record in sequences: - print(f"ID: {record.id}") - print(f"Sequence: {record.seq}") - print(f"Description: {record.description}") - print("---") \ No newline at end of file + + for record in sequences: + print(f"ID: {record.id}") + print(f"Sequence: {record.seq}") + print(f"Description: {record.description}") + print("---") \ No newline at end of file From 0ede0871fa4bcfb55f3841c07a68c70d7f2832ae Mon Sep 17 00:00:00 2001 From: AymanBx Date: Sat, 22 Mar 2025 22:04:59 -0400 Subject: [PATCH 3/5] Cut the sequence short to get the info only and one line of sequence --- test_bio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_bio.py b/test_bio.py index 33b5663..728b998 100644 --- a/test_bio.py +++ b/test_bio.py @@ -25,6 +25,6 @@ def read_fna(file_path): for record in sequences: print(f"ID: {record.id}") - print(f"Sequence: {record.seq}") print(f"Description: {record.description}") + print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}") print("---") \ No newline at end of file From 6d90f93438da62d273a0e85bf0bbd90229dfa2ee Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 27 Mar 2025 07:52:20 +0000 Subject: [PATCH 4/5] Now that all data is imported to unity. This script imports all data files into a dictionary and can print the information --- test_bio.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test_bio.py b/test_bio.py index 728b998..dbe4e51 100644 --- a/test_bio.py +++ b/test_bio.py @@ -13,13 +13,17 @@ def read_fna(file_path): # Example usage sequence_iterators = {} -data_folder_path = "test_data" -for file in os.listdir(data_folder_path): - if file != ".gitkeep": - file_path = os.path.join(data_folder_path, file) - if os.path.isfile(file_path): - sequence_iterators[file] = (read_fna(file_path)) - +data_folder_path = "../data" +for directory in os.listdir(data_folder_path): + if directory != ".gitkeep" and directory != "dataset_catalog.json" and directory != "data_summary.tsv" and directory != "assembly_data_report.jsonl" : + data_directory_path = os.path.join(data_folder_path, directory) + data_file = os.listdir(data_directory_path) + data_file_path = os.path.join(data_directory_path, data_file[0]) + if os.path.isfile(data_file_path): + # print(data_file_path) + sequence_iterators[data_file[0]] = (read_fna(data_file_path)) +print(f"Collected data..\n{len(sequence_iterators)}") +print() for file, sequences in sequence_iterators.items(): print(f"Info from {file}") From 1824a3b6df92dc86ff485c050fbe885fc9ae0805 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Thu, 8 May 2025 04:35:03 +0000 Subject: [PATCH 5/5] Different ways of ingesting the data --- extra.py | 29 +++++++++++++ genus_bio.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ genus_dict.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++ phylum_bio.py | 91 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 352 insertions(+) create mode 100644 extra.py create mode 100644 genus_bio.py create mode 100644 genus_dict.py create mode 100644 phylum_bio.py diff --git a/extra.py b/extra.py new file mode 100644 index 0000000..b544622 --- /dev/null +++ b/extra.py @@ -0,0 +1,29 @@ +"""Reads a .fna file and returns a list of SeqRecord objects.""" +def read_fna(file_path): + global seq_count + records = [] + with open(file_path, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + records.append(record) + print(dir(record)) + print() + print(type(record)) + print(record.description) + for word in record.description.split(" "): + print(word) + print(record.annotations) + print(record.dbxrefs) + print(record.features) + print(record.letter_annotations) + print(record.id) + # print(record.translate()) # Method to turn into protein?? + # print(record.count()) # Method requires arg:sub + # print(record.format()) # Method requires arg:format + # print(record.isupper()) # Method base case + # print(record.upper()) # Method base case + # print(record.islower()) # Method base case + # print(record.lower()) # Method base case + exit() + # records = SeqIO.parse(handle, "fasta") + seq_count += len(records) + return records \ No newline at end of file diff --git a/genus_bio.py b/genus_bio.py new file mode 100644 index 0000000..a5c59ab --- /dev/null +++ b/genus_bio.py @@ -0,0 +1,114 @@ +import os +from Bio import SeqIO +# from Bio import LogisticRegression + +# For debugging purposes +files = 0 +seq_count = 0 +max_length = 0 +genus_repetition = 0 + +"""Reads a .fna file and returns a list of SeqRecord objects.""" +def read_fna(file_path): + global seq_count, max_length + genus="new" + sequences = [] + with open(file_path, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + description = [word for word in record.description.split(" ")] + + # Get the genus of the sequence + if genus == "new": + genus = description[1] + # Confirm that the different sequences in the file are of the same genus + else: assert genus == description[1] + + # Finding maximum length for padding + if (max_length < len(record.seq)): + max_length = len(record.seq) + + sequences.append(record) + + seq_count += len(sequences) + return genus, sequences + + +""" +A module iterates over fna data files that hold DNA sequences sorted in folders of phylums. +The program loads the data into a dictionary that uses genises as keys and lists of +k-mers of those sequeces as values. +This dictionary will be split and used to train a transformers ML model and then test it +on its ability to predict the genis of a genome sucessfully +""" +if __name__ == "__main__": + + + # Hardcoded data source + # This data folder holds genomic data sorted in phylum folders + phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data" + # This data folder holds unsorted genomic data + data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data" + + + # Create empty dictionaries + phylum_dict = {} + genus_dict = {} + + # Start iterating over the folders + for phylum in os.listdir(phylum_data_folder_path): + # Create pyhlum keys with values that are empty lists + phylum_dict[phylum] = [] + + phylum_folder = os.path.join(phylum_data_folder_path, phylum) + number_of_files= len(os.listdir(phylum_folder)) - 3 # We have 3 metadata files in each folder + + # Debugging + print(f"In {phylum} folder, we have {number_of_files} data files.") + files += number_of_files + # Limit the work to 5 sequeces from each phylum to test + # num = 0 # Debugging: Testing. Comment out after confirming + + + # Each directory holds a single fna file + for directory in os.listdir(phylum_folder): + # if num < 5: # Debugging: Testing. Comment out after confirming + data_directory_path = os.path.join(phylum_folder, directory) + if os.path.isdir(data_directory_path): + data_file = os.listdir(data_directory_path) + # Confirm that there's only one file in each folder + assert len(data_file) == 1 + data_file_path = os.path.join(data_directory_path, data_file[0]) + if os.path.isfile(data_file_path): # Just for more safty + # print(data_file_path) + # Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key + genus, some_sequences= read_fna(data_file_path) + phylum_dict[phylum] += some_sequences # + if (genus in genus_dict.keys()): + genus_repetition += 1 + genus_dict[genus] += some_sequences + else: + genus_dict.update({genus: some_sequences}) + # else: break # Debugging: Testing. Comment out after confirming + # num += 1 # Debugging: Testing. Comment out after confirming + + print(f"Collected data..\n") + print(type(genus_dict)) + + # For testing purposes: Outputting the inputted data + # Iterate over the dictionary items + for genus, sequences in genus_dict.items(): + print(f"Info from {genus}") + print(f"This genus contains {len(sequences)} sequence objects.") + for record in sequences: + print(f"ID: {record.id}") + print(f"Description: {record.description}") + print(f"Sequence len: {len(record.seq)}") + print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}") + print("---") + # I've noiticed that some files have more than one sequence in them. This is to confirm that + # AS: Confirmed! Some files have more than one sequence of the same species!!! + print(f"Total sequences: {seq_count}") + print(f"Total files: {files}") + print(f"Total # of genuses: {len(genus_dict)}") + print(f"Total # of genuse repetition: {genus_repetition}") + print(f"Max seq. length: {max_length}") \ No newline at end of file diff --git a/genus_dict.py b/genus_dict.py new file mode 100644 index 0000000..e1d8ca6 --- /dev/null +++ b/genus_dict.py @@ -0,0 +1,118 @@ +import os +from Bio import SeqIO +# from Bio import LogisticRegression + +# For debugging purposes +files = 0 +seq_count = 0 +max_length = 0 +genus_repetition = 0 + +"""Reads a .fna file and returns a list of SeqRecord objects.""" +def read_fna(file_path): + global seq_count, max_length + # genus="new" + sequences = [] + with open(file_path, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + description = [word for word in record.description.split(" ")] + genus = description[1] + + # We have confirmed that each file contains only one genus + # # Get the genus of the sequence + # if genus == "new": + # genus = description[1] + # # Confirm that the different sequences in the file are of the same genus + # else: assert genus == description[1] + + # Finding maximum length for padding + if (max_length < len(record.seq)): + max_length = len(record.seq) + # We only care for the top sequence + seq_count += 1 + return genus, record + + seq_count += len(sequences) + return genus, sequences + + +""" +A module iterates over fna data files that hold DNA sequences sorted in folders of phylums. +The program loads the data into a dictionary that uses genises as keys and lists of +k-mers of those sequeces as values. +This dictionary will be split and used to train a transformers ML model and then test it +on its ability to predict the genis of a genome sucessfully +""" +if __name__ == "__main__": + + + # Hardcoded data source + # This data folder holds genomic data sorted in phylum folders + phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data" + # This data folder holds unsorted genomic data + data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data" + + + # Create empty dictionaries + phylum_dict = {} + genus_dict = {} + + # Start iterating over the folders + for phylum in os.listdir(phylum_data_folder_path): + # Create pyhlum keys with values that are empty lists + phylum_dict[phylum] = [] + + phylum_folder = os.path.join(phylum_data_folder_path, phylum) + number_of_files= len(os.listdir(phylum_folder)) - 3 # We have 3 metadata files in each folder + + # Debugging + print(f"In {phylum} folder, we have {number_of_files} data files.") + files += number_of_files + # Limit the work to 5 sequeces from each phylum to test + # num = 0 # Debugging: Testing. Comment out after confirming + + + # Each directory holds a single fna file + for directory in os.listdir(phylum_folder): + # if num < 5: # Debugging: Testing. Comment out after confirming + data_directory_path = os.path.join(phylum_folder, directory) + if os.path.isdir(data_directory_path): + data_file = os.listdir(data_directory_path) + # Confirm that there's only one file in each folder + assert len(data_file) == 1 + data_file_path = os.path.join(data_directory_path, data_file[0]) + if os.path.isfile(data_file_path): # Just for more safty + # print(data_file_path) + # Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key + + genus, sequence= read_fna(data_file_path) + phylum_dict[phylum].append(sequence) # + if (genus in genus_dict.keys()): + genus_repetition += 1 + genus_dict[genus].append(sequence) + else: + genus_dict.update({genus: [sequence]}) + # else: break # Debugging: Testing. Comment out after confirming + # num += 1 # Debugging: Testing. Comment out after confirming + + print(f"Collected data..\n") + print(type(genus_dict)) + + # For testing purposes: Outputting the inputted data + # Iterate over the dictionary items + for genus, sequences in genus_dict.items(): + print(f"Info from {genus}") + print(f"This genus contains {len(sequences)} sequence objects.") + for record in sequences: + print(f"ID: {record.id}") + print(f"Description: {record.description}") + print(f"Sequence len: {len(record.seq)}") + print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}") + print("---") + # I've noiticed that some files have more than one sequence in them. This is to confirm that + # AS: Confirmed! Some files have more than one sequence of the same species!!! + print(f"Total sequences: {seq_count}") + print(f"Total files: {files}") + print(f"Total # of genuses: {len(genus_dict)}") + print(f"Total # of genuse repetition: {genus_repetition}") + print(f"Max seq. length: {max_length}") \ No newline at end of file diff --git a/phylum_bio.py b/phylum_bio.py new file mode 100644 index 0000000..ac67579 --- /dev/null +++ b/phylum_bio.py @@ -0,0 +1,91 @@ +import os +from Bio import SeqIO +# from Bio import LogisticRegression + +# For debugging purposes +files = 0 +seq_count = 0 + +"""Reads a .fna file and returns a list of SeqRecord objects.""" +def read_fna(file_path): + global seq_count + records = [] + with open(file_path, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + records.append(record) + # records = SeqIO.parse(handle, "fasta") + seq_count += len(records) + return records + + +""" +A module iterates over fna data files that hold DNA sequences sorted in folders of phylums. +The program loads the data into a dictionary that uses genises as keys and lists of +k-mers of those sequeces as values. +This dictionary will be split and used to train a transformers ML model and then test it +on its ability to predict the genis of a genome sucessfully +""" +if __name__ == "__main__": + + + # Hardcoded data source + # This data folder holds genomic data sorted in phylum folders + phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data" + # This data folder holds unsorted genomic data + data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data" + + + # Create empty dictionaries + phylum_dict = {} + genis_dict = {} + + # Start iterating over the folders + for phylum in os.listdir(phylum_data_folder_path): + # Create pyhlum keys with values that are empty lists + phylum_dict[phylum] = [] + + phylum_folder = os.path.join(phylum_data_folder_path, phylum) + number_of_files= len(os.listdir(phylum_folder)) - 3 # We have 3 metadata files in each folder + + # Debugging + print(f"In {phylum} folder, we have {number_of_files} data files.") + files += number_of_files + # Limit the work to 5 sequeces from each phylum to test + num = 0 # Debugging: Testing. Comment out after confirming + + + # Each directory holds a single fna file + for directory in os.listdir(phylum_folder): + if num < 5: # Debugging: Testing. Comment out after confirming + data_directory_path = os.path.join(phylum_folder, directory) + if os.path.isdir(data_directory_path): + data_file = os.listdir(data_directory_path) + + # Confirm that there's only one file in each folder + assert len(data_file) == 1 + data_file_path = os.path.join(data_directory_path, data_file[0]) + if os.path.isfile(data_file_path): # Just for more safty + # print(data_file_path) + # Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key + phylum_dict[phylum] += read_fna(data_file_path) + else: break # Debugging: Testing. Comment out after confirming + num += 1 # Debugging: Testing. Comment out after confirming + + print(f"Collected data..\n") + + # For testing purposes: Outputting the inputted data + # Iterate over the dictionary items + for phylum, sequences in phylum_dict.items(): + print(f"Info from {phylum}") + print(f"This phylum contains {len(sequences)} sequence objects.") + for record in sequences: + print(f"ID: {record.id}") + print(f"Description: {record.description}") + print(f"Sequence len: {len(record.seq)}") + print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}") + print("---") + # I've noiticed that some files have more than one sequence in them. This is to confirm that + # AS: Confirmed! Some files have more than one sequence of the same species!!! + print(f"Total sequences: {seq_count}") + print(f"Total files: {files}") +