From a554a3977bfcd26c681a8c2b8d3b080e1530b433 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Fri, 14 Mar 2025 18:31:53 -0400
Subject: [PATCH 1/5] simple test script from importing sequences using
 BioPython

---
 .gitignore         |  4 ++++
 README.md          | 12 ++++++++++++
 requirenments.txt  |  1 +
 test_bio.py        | 26 ++++++++++++++++++++++++++
 test_data/.gitkeep |  0
 5 files changed, 43 insertions(+)
 create mode 100644 requirenments.txt
 create mode 100644 test_bio.py
 create mode 100644 test_data/.gitkeep

diff --git a/.gitignore b/.gitignore
index 15201ac..a225bdb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# Data files shouldn't be synced with GitHub
+*.fna
+
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
index da26b24..89f6b01 100644
--- a/README.md
+++ b/README.md
@@ -3,4 +3,16 @@ Accurate Taxonomic Genomic Computer
 
 This research is about classifying microbes by genomic signatures using machine learning
 
+## BioPython
+A set of Python tools for computational molecular biology.
 
+Biopython features include
+* Parsers for various Bioinformatics file formats (BLAST, Clustalw, FASTA, Genbank,…)
+* Access to online services (NCBI, Expasy,…)
+* Interfaces to common and not-so-common programs (Clustalw, DSSP, MSMS…)
+* A standard sequence class
+* Various clustering modules
+* A KD tree data structure
+* And more...
+
+This information was from [LabXchange](https://www.labxchange.org/library/items/lb:LabXchange:d64401b3:html:1)
diff --git a/requirenments.txt b/requirenments.txt
new file mode 100644
index 0000000..e0116bd
--- /dev/null
+++ b/requirenments.txt
@@ -0,0 +1 @@
+biopython
diff --git a/test_bio.py b/test_bio.py
new file mode 100644
index 0000000..7e1e79f
--- /dev/null
+++ b/test_bio.py
@@ -0,0 +1,26 @@
+import os
+from Bio import SeqIO
+
+def read_fna(file_path):
+    """Reads a .fna file and returns a list of SeqRecord objects."""
+    records = []
+    with open(file_path, "r") as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            records.append(record)
+    return records
+
+# Example usage
+sequence_iterators = {}
+data_folder_path = "test_data"
+for file in os.listdir(data_folder_path):
+    file_path = os.path.join(data_folder_path, file)
+    if os.path.isfile(file_path):
+        sequence_iterators[file] = (read_fna(file_path))
+
+for file, sequences in sequence_iterators.items():
+    print(f"Info from {file}")
+for record in sequences:
+    print(f"ID: {record.id}")
+    print(f"Sequence: {record.seq}")
+    print(f"Description: {record.description}")
+    print("---")
\ No newline at end of file
diff --git a/test_data/.gitkeep b/test_data/.gitkeep
new file mode 100644
index 0000000..e69de29

From bfcbd9db69e940db134bb4c29a01d6f24a4964ad Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Sat, 15 Mar 2025 01:53:18 -0400
Subject: [PATCH 2/5] Excluding .gitkeep

---
 test_bio.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/test_bio.py b/test_bio.py
index 7e1e79f..33b5663 100644
--- a/test_bio.py
+++ b/test_bio.py
@@ -1,5 +1,7 @@
 import os
 from Bio import SeqIO
+# from Bio import LogisticRegression
+
 
 def read_fna(file_path):
     """Reads a .fna file and returns a list of SeqRecord objects."""
@@ -13,14 +15,16 @@ def read_fna(file_path):
 sequence_iterators = {}
 data_folder_path = "test_data"
 for file in os.listdir(data_folder_path):
-    file_path = os.path.join(data_folder_path, file)
-    if os.path.isfile(file_path):
-        sequence_iterators[file] = (read_fna(file_path))
+    if file != ".gitkeep":
+        file_path = os.path.join(data_folder_path, file)
+        if os.path.isfile(file_path):
+            sequence_iterators[file] = (read_fna(file_path))
 
 for file, sequences in sequence_iterators.items():
     print(f"Info from {file}")
-for record in sequences:
-    print(f"ID: {record.id}")
-    print(f"Sequence: {record.seq}")
-    print(f"Description: {record.description}")
-    print("---")
\ No newline at end of file
+
+    for record in sequences:
+        print(f"ID: {record.id}")
+        print(f"Sequence: {record.seq}")
+        print(f"Description: {record.description}")
+        print("---")
\ No newline at end of file

From 0ede0871fa4bcfb55f3841c07a68c70d7f2832ae Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Sat, 22 Mar 2025 22:04:59 -0400
Subject: [PATCH 3/5] Cut the sequence short to get the info only and one line
 of sequence

---
 test_bio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_bio.py b/test_bio.py
index 33b5663..728b998 100644
--- a/test_bio.py
+++ b/test_bio.py
@@ -25,6 +25,6 @@ def read_fna(file_path):
 
     for record in sequences:
         print(f"ID: {record.id}")
-        print(f"Sequence: {record.seq}")
         print(f"Description: {record.description}")
+        print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
         print("---")
\ No newline at end of file

From 6d90f93438da62d273a0e85bf0bbd90229dfa2ee Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Thu, 27 Mar 2025 07:52:20 +0000
Subject: [PATCH 4/5] Now that all data is imported to unity. This script
 imports all data files into a dictionary and can print the information

---
 test_bio.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/test_bio.py b/test_bio.py
index 728b998..dbe4e51 100644
--- a/test_bio.py
+++ b/test_bio.py
@@ -13,13 +13,17 @@ def read_fna(file_path):
 
 # Example usage
 sequence_iterators = {}
-data_folder_path = "test_data"
-for file in os.listdir(data_folder_path):
-    if file != ".gitkeep":
-        file_path = os.path.join(data_folder_path, file)
-        if os.path.isfile(file_path):
-            sequence_iterators[file] = (read_fna(file_path))
-
+data_folder_path = "../data"
+for directory in os.listdir(data_folder_path):
+    if directory != ".gitkeep" and directory != "dataset_catalog.json" and directory != "data_summary.tsv"  and directory != "assembly_data_report.jsonl" :
+        data_directory_path = os.path.join(data_folder_path, directory)
+        data_file = os.listdir(data_directory_path)
+        data_file_path = os.path.join(data_directory_path, data_file[0])
+        if os.path.isfile(data_file_path):
+            # print(data_file_path)
+            sequence_iterators[data_file[0]] = (read_fna(data_file_path))
+print(f"Collected data..\n{len(sequence_iterators)}")
+print()
 for file, sequences in sequence_iterators.items():
     print(f"Info from {file}")
 

From 1824a3b6df92dc86ff485c050fbe885fc9ae0805 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Thu, 8 May 2025 04:35:03 +0000
Subject: [PATCH 5/5] Different ways of ingesting the data

---
 extra.py      |  29 +++++++++++++
 genus_bio.py  | 114 ++++++++++++++++++++++++++++++++++++++++++++++++
 genus_dict.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++
 phylum_bio.py |  91 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 352 insertions(+)
 create mode 100644 extra.py
 create mode 100644 genus_bio.py
 create mode 100644 genus_dict.py
 create mode 100644 phylum_bio.py

diff --git a/extra.py b/extra.py
new file mode 100644
index 0000000..b544622
--- /dev/null
+++ b/extra.py
@@ -0,0 +1,29 @@
+"""Reads a .fna file and returns a list of SeqRecord objects."""
+def read_fna(file_path):
+    global seq_count
+    records = []
+    with open(file_path, "r") as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            records.append(record)
+            print(dir(record))
+            print()
+            print(type(record))
+            print(record.description)
+            for word in record.description.split(" "):
+                print(word)
+            print(record.annotations)
+            print(record.dbxrefs)
+            print(record.features)
+            print(record.letter_annotations)
+            print(record.id)
+            # print(record.translate()) # Method to turn into protein??
+            # print(record.count()) # Method requires arg:sub
+            # print(record.format()) # Method requires arg:format
+            # print(record.isupper()) # Method base case
+            # print(record.upper())  # Method base case
+            # print(record.islower()) # Method base case
+            # print(record.lower()) # Method base case
+            exit()
+        # records = SeqIO.parse(handle, "fasta") 
+    seq_count += len(records)
+    return records
\ No newline at end of file
diff --git a/genus_bio.py b/genus_bio.py
new file mode 100644
index 0000000..a5c59ab
--- /dev/null
+++ b/genus_bio.py
@@ -0,0 +1,114 @@
+import os
+from Bio import SeqIO
+# from Bio import LogisticRegression
+
+# For debugging purposes
+files = 0
+seq_count = 0
+max_length = 0
+genus_repetition = 0
+
+"""Reads a .fna file and returns a list of SeqRecord objects."""
+def read_fna(file_path):
+    global seq_count, max_length
+    genus="new"
+    sequences = []
+    with open(file_path, "r") as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            description = [word for word in record.description.split(" ")]
+            
+            # Get the genus of the sequence
+            if genus == "new":
+                genus = description[1]
+            # Confirm that the different sequences in the file are of the same genus
+            else: assert genus == description[1]
+
+            # Finding maximum length for padding
+            if (max_length < len(record.seq)): 
+                max_length = len(record.seq)
+
+            sequences.append(record)
+
+    seq_count += len(sequences)
+    return genus, sequences
+
+
+"""
+A module iterates over fna data files that hold DNA sequences sorted in folders of phylums.
+The program loads the data into a dictionary that uses genises as keys and lists of 
+k-mers of those sequeces as values.
+This dictionary will be split and used to train a transformers ML model and then test it 
+on its ability to predict the genis of a genome sucessfully
+"""
+if __name__ == "__main__":
+    
+
+    # Hardcoded data source
+    # This data folder holds genomic data sorted in phylum folders
+    phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data"
+    # This data folder holds unsorted genomic data
+    data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data"
+
+
+    # Create empty dictionaries 
+    phylum_dict = {}
+    genus_dict = {}
+    
+    # Start iterating over the folders
+    for phylum in os.listdir(phylum_data_folder_path):
+        # Create pyhlum keys with values that are empty lists
+        phylum_dict[phylum] = []
+
+        phylum_folder = os.path.join(phylum_data_folder_path, phylum)
+        number_of_files= len(os.listdir(phylum_folder)) - 3     # We have 3 metadata files in each folder
+        
+        # Debugging
+        print(f"In {phylum} folder, we have {number_of_files} data files.") 
+        files += number_of_files
+        # Limit the work to 5 sequeces from each phylum to test
+        # num = 0 # Debugging: Testing. Comment out after confirming
+
+
+        # Each directory holds a single fna file
+        for directory in os.listdir(phylum_folder):
+            # if num < 5: # Debugging: Testing. Comment out after confirming
+            data_directory_path = os.path.join(phylum_folder, directory)
+            if os.path.isdir(data_directory_path):
+                data_file = os.listdir(data_directory_path)
+                # Confirm that there's only one file in each folder
+                assert len(data_file) == 1
+                data_file_path = os.path.join(data_directory_path, data_file[0])
+                if os.path.isfile(data_file_path): # Just for more safty
+                    # print(data_file_path)
+                    # Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key
+                    genus, some_sequences= read_fna(data_file_path)
+                    phylum_dict[phylum] += some_sequences # 
+                    if (genus in genus_dict.keys()):
+                        genus_repetition += 1
+                        genus_dict[genus] += some_sequences
+                    else: 
+                        genus_dict.update({genus: some_sequences})
+            # else: break # Debugging: Testing. Comment out after confirming
+            # num += 1    # Debugging: Testing. Comment out after confirming
+
+    print(f"Collected data..\n")
+    print(type(genus_dict))
+
+    # For testing purposes: Outputting the inputted data
+    # Iterate over the dictionary items 
+    for genus, sequences in genus_dict.items():
+        print(f"Info from {genus}")
+        print(f"This genus contains {len(sequences)} sequence objects.")
+        for record in sequences:
+            print(f"ID: {record.id}")
+            print(f"Description: {record.description}")
+            print(f"Sequence len: {len(record.seq)}")
+            print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
+            print("---")
+    # I've noiticed that some files have more than one sequence in them. This is to confirm that
+    # AS: Confirmed! Some files have more than one sequence of the same species!!!
+    print(f"Total sequences: {seq_count}")
+    print(f"Total files: {files}")
+    print(f"Total # of genuses: {len(genus_dict)}")
+    print(f"Total # of genuse repetition: {genus_repetition}")
+    print(f"Max seq. length: {max_length}")
\ No newline at end of file
diff --git a/genus_dict.py b/genus_dict.py
new file mode 100644
index 0000000..e1d8ca6
--- /dev/null
+++ b/genus_dict.py
@@ -0,0 +1,118 @@
+import os
+from Bio import SeqIO
+# from Bio import LogisticRegression
+
+# For debugging purposes
+files = 0
+seq_count = 0
+max_length = 0
+genus_repetition = 0
+
+"""Reads a .fna file and returns a list of SeqRecord objects."""
+def read_fna(file_path):
+    global seq_count, max_length
+    # genus="new"
+    sequences = []
+    with open(file_path, "r") as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            description = [word for word in record.description.split(" ")]
+            genus = description[1]
+            
+            # We have confirmed that each file contains only one genus
+            # # Get the genus of the sequence
+            # if genus == "new":
+            #     genus = description[1]
+            # # Confirm that the different sequences in the file are of the same genus
+            # else: assert genus == description[1]
+
+            # Finding maximum length for padding
+            if (max_length < len(record.seq)): 
+                max_length = len(record.seq)
+            # We only care for the top sequence
+            seq_count += 1
+            return genus, record
+
+    seq_count += len(sequences)
+    return genus, sequences
+
+
+"""
+A module iterates over fna data files that hold DNA sequences sorted in folders of phylums.
+The program loads the data into a dictionary that uses genises as keys and lists of 
+k-mers of those sequeces as values.
+This dictionary will be split and used to train a transformers ML model and then test it 
+on its ability to predict the genis of a genome sucessfully
+"""
+if __name__ == "__main__":
+    
+
+    # Hardcoded data source
+    # This data folder holds genomic data sorted in phylum folders
+    phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data"
+    # This data folder holds unsorted genomic data
+    data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data"
+
+
+    # Create empty dictionaries 
+    phylum_dict = {}
+    genus_dict = {}
+    
+    # Start iterating over the folders
+    for phylum in os.listdir(phylum_data_folder_path):
+        # Create pyhlum keys with values that are empty lists
+        phylum_dict[phylum] = []
+
+        phylum_folder = os.path.join(phylum_data_folder_path, phylum)
+        number_of_files= len(os.listdir(phylum_folder)) - 3     # We have 3 metadata files in each folder
+        
+        # Debugging
+        print(f"In {phylum} folder, we have {number_of_files} data files.") 
+        files += number_of_files
+        # Limit the work to 5 sequeces from each phylum to test
+        # num = 0 # Debugging: Testing. Comment out after confirming
+
+
+        # Each directory holds a single fna file
+        for directory in os.listdir(phylum_folder):
+            # if num < 5: # Debugging: Testing. Comment out after confirming
+            data_directory_path = os.path.join(phylum_folder, directory)
+            if os.path.isdir(data_directory_path):
+                data_file = os.listdir(data_directory_path)
+                # Confirm that there's only one file in each folder
+                assert len(data_file) == 1
+                data_file_path = os.path.join(data_directory_path, data_file[0])
+                if os.path.isfile(data_file_path): # Just for more safty
+                    # print(data_file_path)
+                    # Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key
+                    
+                    genus, sequence= read_fna(data_file_path)
+                    phylum_dict[phylum].append(sequence) # 
+                    if (genus in genus_dict.keys()):
+                        genus_repetition += 1
+                        genus_dict[genus].append(sequence)
+                    else: 
+                        genus_dict.update({genus: [sequence]})
+            # else: break # Debugging: Testing. Comment out after confirming
+            # num += 1    # Debugging: Testing. Comment out after confirming
+
+    print(f"Collected data..\n")
+    print(type(genus_dict))
+
+    # For testing purposes: Outputting the inputted data
+    # Iterate over the dictionary items 
+    for genus, sequences in genus_dict.items():
+        print(f"Info from {genus}")
+        print(f"This genus contains {len(sequences)} sequence objects.")
+        for record in sequences:
+            print(f"ID: {record.id}")
+            print(f"Description: {record.description}")
+            print(f"Sequence len: {len(record.seq)}")
+            print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
+            print("---")
+    # I've noiticed that some files have more than one sequence in them. This is to confirm that
+    # AS: Confirmed! Some files have more than one sequence of the same species!!!
+    print(f"Total sequences: {seq_count}")
+    print(f"Total files: {files}")
+    print(f"Total # of genuses: {len(genus_dict)}")
+    print(f"Total # of genuse repetition: {genus_repetition}")
+    print(f"Max seq. length: {max_length}")
\ No newline at end of file
diff --git a/phylum_bio.py b/phylum_bio.py
new file mode 100644
index 0000000..ac67579
--- /dev/null
+++ b/phylum_bio.py
@@ -0,0 +1,91 @@
+import os
+from Bio import SeqIO
+# from Bio import LogisticRegression
+
+# For debugging purposes
+files = 0
+seq_count = 0
+
+"""Reads a .fna file and returns a list of SeqRecord objects."""
+def read_fna(file_path):
+    global seq_count
+    records = []
+    with open(file_path, "r") as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            records.append(record)
+        # records = SeqIO.parse(handle, "fasta") 
+    seq_count += len(records)
+    return records
+
+
+"""
+A module iterates over fna data files that hold DNA sequences sorted in folders of phylums.
+The program loads the data into a dictionary that uses genises as keys and lists of 
+k-mers of those sequeces as values.
+This dictionary will be split and used to train a transformers ML model and then test it 
+on its ability to predict the genis of a genome sucessfully
+"""
+if __name__ == "__main__":
+    
+
+    # Hardcoded data source
+    # This data folder holds genomic data sorted in phylum folders
+    phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data"
+    # This data folder holds unsorted genomic data
+    data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data"
+
+
+    # Create empty dictionaries 
+    phylum_dict = {}
+    genis_dict = {}
+    
+    # Start iterating over the folders
+    for phylum in os.listdir(phylum_data_folder_path):
+        # Create pyhlum keys with values that are empty lists
+        phylum_dict[phylum] = []
+
+        phylum_folder = os.path.join(phylum_data_folder_path, phylum)
+        number_of_files= len(os.listdir(phylum_folder)) - 3     # We have 3 metadata files in each folder
+        
+        # Debugging
+        print(f"In {phylum} folder, we have {number_of_files} data files.") 
+        files += number_of_files
+        # Limit the work to 5 sequeces from each phylum to test
+        num = 0 # Debugging: Testing. Comment out after confirming
+
+
+        # Each directory holds a single fna file
+        for directory in os.listdir(phylum_folder):
+            if num < 5: # Debugging: Testing. Comment out after confirming
+                data_directory_path = os.path.join(phylum_folder, directory)
+                if os.path.isdir(data_directory_path):
+                    data_file = os.listdir(data_directory_path)
+
+                    # Confirm that there's only one file in each folder
+                    assert len(data_file) == 1
+                    data_file_path = os.path.join(data_directory_path, data_file[0])
+                    if os.path.isfile(data_file_path): # Just for more safty
+                        # print(data_file_path)
+                        # Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key
+                        phylum_dict[phylum] += read_fna(data_file_path)
+            else: break # Debugging: Testing. Comment out after confirming
+            num += 1    # Debugging: Testing. Comment out after confirming
+
+    print(f"Collected data..\n")
+
+    # For testing purposes: Outputting the inputted data
+    # Iterate over the dictionary items 
+    for phylum, sequences in phylum_dict.items():
+        print(f"Info from {phylum}")
+        print(f"This phylum contains {len(sequences)} sequence objects.")
+        for record in sequences:
+            print(f"ID: {record.id}")
+            print(f"Description: {record.description}")
+            print(f"Sequence len: {len(record.seq)}")
+            print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
+            print("---")
+    # I've noiticed that some files have more than one sequence in them. This is to confirm that
+    # AS: Confirmed! Some files have more than one sequence of the same species!!!
+    print(f"Total sequences: {seq_count}")
+    print(f"Total files: {files}")
+