Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Data files shouldn't be synced with GitHub
*.fna


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,16 @@ Accurate Taxonomic Genomic Computer

This research is about classifying microbes by genomic signatures using machine learning

## BioPython
A set of Python tools for computational molecular biology.

Biopython features include
* Parsers for various Bioinformatics file formats (BLAST, Clustalw, FASTA, Genbank,…)
* Access to online services (NCBI, Expasy,…)
* Interfaces to common and not-so-common programs (Clustalw, DSSP, MSMS…)
* A standard sequence class
* Various clustering modules
* A KD tree data structure
* And more...

This information was from [LabXchange](https://www.labxchange.org/library/items/lb:LabXchange:d64401b3:html:1)
29 changes: 29 additions & 0 deletions extra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Reads a .fna file and returns a list of SeqRecord objects."""
def read_fna(file_path):
global seq_count
records = []
with open(file_path, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
records.append(record)
print(dir(record))
print()
print(type(record))
print(record.description)
for word in record.description.split(" "):
print(word)
print(record.annotations)
print(record.dbxrefs)
print(record.features)
print(record.letter_annotations)
print(record.id)
# print(record.translate()) # Method to turn into protein??
# print(record.count()) # Method requires arg:sub
# print(record.format()) # Method requires arg:format
# print(record.isupper()) # Method base case
# print(record.upper()) # Method base case
# print(record.islower()) # Method base case
# print(record.lower()) # Method base case
exit()
# records = SeqIO.parse(handle, "fasta")
seq_count += len(records)
return records
114 changes: 114 additions & 0 deletions genus_bio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import os
from Bio import SeqIO
# from Bio import LogisticRegression

# For debugging purposes
files = 0
seq_count = 0
max_length = 0
genus_repetition = 0

"""Reads a .fna file and returns a list of SeqRecord objects."""
def read_fna(file_path):
global seq_count, max_length
genus="new"
sequences = []
with open(file_path, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
description = [word for word in record.description.split(" ")]

# Get the genus of the sequence
if genus == "new":
genus = description[1]
# Confirm that the different sequences in the file are of the same genus
else: assert genus == description[1]

# Finding maximum length for padding
if (max_length < len(record.seq)):
max_length = len(record.seq)

sequences.append(record)

seq_count += len(sequences)
return genus, sequences


"""
A module iterates over fna data files that hold DNA sequences sorted in folders of phylums.
The program loads the data into a dictionary that uses genises as keys and lists of
k-mers of those sequeces as values.
This dictionary will be split and used to train a transformers ML model and then test it
on its ability to predict the genis of a genome sucessfully
"""
if __name__ == "__main__":


# Hardcoded data source
# This data folder holds genomic data sorted in phylum folders
phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data"
# This data folder holds unsorted genomic data
data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data"


# Create empty dictionaries
phylum_dict = {}
genus_dict = {}

# Start iterating over the folders
for phylum in os.listdir(phylum_data_folder_path):
# Create pyhlum keys with values that are empty lists
phylum_dict[phylum] = []

phylum_folder = os.path.join(phylum_data_folder_path, phylum)
number_of_files= len(os.listdir(phylum_folder)) - 3 # We have 3 metadata files in each folder

# Debugging
print(f"In {phylum} folder, we have {number_of_files} data files.")
files += number_of_files
# Limit the work to 5 sequeces from each phylum to test
# num = 0 # Debugging: Testing. Comment out after confirming


# Each directory holds a single fna file
for directory in os.listdir(phylum_folder):
# if num < 5: # Debugging: Testing. Comment out after confirming
data_directory_path = os.path.join(phylum_folder, directory)
if os.path.isdir(data_directory_path):
data_file = os.listdir(data_directory_path)
# Confirm that there's only one file in each folder
assert len(data_file) == 1
data_file_path = os.path.join(data_directory_path, data_file[0])
if os.path.isfile(data_file_path): # Just for more safty
# print(data_file_path)
# Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key
genus, some_sequences= read_fna(data_file_path)
phylum_dict[phylum] += some_sequences #
if (genus in genus_dict.keys()):
genus_repetition += 1
genus_dict[genus] += some_sequences
else:
genus_dict.update({genus: some_sequences})
# else: break # Debugging: Testing. Comment out after confirming
# num += 1 # Debugging: Testing. Comment out after confirming

print(f"Collected data..\n")
print(type(genus_dict))

# For testing purposes: Outputting the inputted data
# Iterate over the dictionary items
for genus, sequences in genus_dict.items():
print(f"Info from {genus}")
print(f"This genus contains {len(sequences)} sequence objects.")
for record in sequences:
print(f"ID: {record.id}")
print(f"Description: {record.description}")
print(f"Sequence len: {len(record.seq)}")
print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
print("---")
# I've noiticed that some files have more than one sequence in them. This is to confirm that
# AS: Confirmed! Some files have more than one sequence of the same species!!!
print(f"Total sequences: {seq_count}")
print(f"Total files: {files}")
print(f"Total # of genuses: {len(genus_dict)}")
print(f"Total # of genuse repetition: {genus_repetition}")
print(f"Max seq. length: {max_length}")
118 changes: 118 additions & 0 deletions genus_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
from Bio import SeqIO
# from Bio import LogisticRegression

# For debugging purposes
files = 0
seq_count = 0
max_length = 0
genus_repetition = 0

"""Reads a .fna file and returns a list of SeqRecord objects."""
def read_fna(file_path):
global seq_count, max_length
# genus="new"
sequences = []
with open(file_path, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
description = [word for word in record.description.split(" ")]
genus = description[1]

# We have confirmed that each file contains only one genus
# # Get the genus of the sequence
# if genus == "new":
# genus = description[1]
# # Confirm that the different sequences in the file are of the same genus
# else: assert genus == description[1]

# Finding maximum length for padding
if (max_length < len(record.seq)):
max_length = len(record.seq)
# We only care for the top sequence
seq_count += 1
return genus, record

seq_count += len(sequences)
return genus, sequences


"""
A module iterates over fna data files that hold DNA sequences sorted in folders of phylums.
The program loads the data into a dictionary that uses genises as keys and lists of
k-mers of those sequeces as values.
This dictionary will be split and used to train a transformers ML model and then test it
on its ability to predict the genis of a genome sucessfully
"""
if __name__ == "__main__":


# Hardcoded data source
# This data folder holds genomic data sorted in phylum folders
phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data"
# This data folder holds unsorted genomic data
data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data"


# Create empty dictionaries
phylum_dict = {}
genus_dict = {}

# Start iterating over the folders
for phylum in os.listdir(phylum_data_folder_path):
# Create pyhlum keys with values that are empty lists
phylum_dict[phylum] = []

phylum_folder = os.path.join(phylum_data_folder_path, phylum)
number_of_files= len(os.listdir(phylum_folder)) - 3 # We have 3 metadata files in each folder

# Debugging
print(f"In {phylum} folder, we have {number_of_files} data files.")
files += number_of_files
# Limit the work to 5 sequeces from each phylum to test
# num = 0 # Debugging: Testing. Comment out after confirming


# Each directory holds a single fna file
for directory in os.listdir(phylum_folder):
# if num < 5: # Debugging: Testing. Comment out after confirming
data_directory_path = os.path.join(phylum_folder, directory)
if os.path.isdir(data_directory_path):
data_file = os.listdir(data_directory_path)
# Confirm that there's only one file in each folder
assert len(data_file) == 1
data_file_path = os.path.join(data_directory_path, data_file[0])
if os.path.isfile(data_file_path): # Just for more safty
# print(data_file_path)
# Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key

genus, sequence= read_fna(data_file_path)
phylum_dict[phylum].append(sequence) #
if (genus in genus_dict.keys()):
genus_repetition += 1
genus_dict[genus].append(sequence)
else:
genus_dict.update({genus: [sequence]})
# else: break # Debugging: Testing. Comment out after confirming
# num += 1 # Debugging: Testing. Comment out after confirming

print(f"Collected data..\n")
print(type(genus_dict))

# For testing purposes: Outputting the inputted data
# Iterate over the dictionary items
for genus, sequences in genus_dict.items():
print(f"Info from {genus}")
print(f"This genus contains {len(sequences)} sequence objects.")
for record in sequences:
print(f"ID: {record.id}")
print(f"Description: {record.description}")
print(f"Sequence len: {len(record.seq)}")
print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
print("---")
# I've noiticed that some files have more than one sequence in them. This is to confirm that
# AS: Confirmed! Some files have more than one sequence of the same species!!!
print(f"Total sequences: {seq_count}")
print(f"Total files: {files}")
print(f"Total # of genuses: {len(genus_dict)}")
print(f"Total # of genuse repetition: {genus_repetition}")
print(f"Max seq. length: {max_length}")
91 changes: 91 additions & 0 deletions phylum_bio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
from Bio import SeqIO
# from Bio import LogisticRegression

# For debugging purposes
files = 0
seq_count = 0

"""Reads a .fna file and returns a list of SeqRecord objects."""
def read_fna(file_path):
global seq_count
records = []
with open(file_path, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
records.append(record)
# records = SeqIO.parse(handle, "fasta")
seq_count += len(records)
return records


"""
A module iterates over fna data files that hold DNA sequences sorted in folders of phylums.
The program loads the data into a dictionary that uses genises as keys and lists of
k-mers of those sequeces as values.
This dictionary will be split and used to train a transformers ML model and then test it
on its ability to predict the genis of a genome sucessfully
"""
if __name__ == "__main__":


# Hardcoded data source
# This data folder holds genomic data sorted in phylum folders
phylum_data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/phylum_data"
# This data folder holds unsorted genomic data
data_folder_path = "/scratch/class/ayman_sandouk_uri_edu-bps542/Project/data"


# Create empty dictionaries
phylum_dict = {}
genis_dict = {}

# Start iterating over the folders
for phylum in os.listdir(phylum_data_folder_path):
# Create pyhlum keys with values that are empty lists
phylum_dict[phylum] = []

phylum_folder = os.path.join(phylum_data_folder_path, phylum)
number_of_files= len(os.listdir(phylum_folder)) - 3 # We have 3 metadata files in each folder

# Debugging
print(f"In {phylum} folder, we have {number_of_files} data files.")
files += number_of_files
# Limit the work to 5 sequeces from each phylum to test
num = 0 # Debugging: Testing. Comment out after confirming


# Each directory holds a single fna file
for directory in os.listdir(phylum_folder):
if num < 5: # Debugging: Testing. Comment out after confirming
data_directory_path = os.path.join(phylum_folder, directory)
if os.path.isdir(data_directory_path):
data_file = os.listdir(data_directory_path)

# Confirm that there's only one file in each folder
assert len(data_file) == 1
data_file_path = os.path.join(data_directory_path, data_file[0])
if os.path.isfile(data_file_path): # Just for more safty
# print(data_file_path)
# Get the content of the data file in a bio object, map it as an item of a list of object in its respective phylum key
phylum_dict[phylum] += read_fna(data_file_path)
else: break # Debugging: Testing. Comment out after confirming
num += 1 # Debugging: Testing. Comment out after confirming

print(f"Collected data..\n")

# For testing purposes: Outputting the inputted data
# Iterate over the dictionary items
for phylum, sequences in phylum_dict.items():
print(f"Info from {phylum}")
print(f"This phylum contains {len(sequences)} sequence objects.")
for record in sequences:
print(f"ID: {record.id}")
print(f"Description: {record.description}")
print(f"Sequence len: {len(record.seq)}")
print(f"Sequence: {record.seq[:70]}"+"..." if len(record.seq) > 70 else f"Sequence: {record.seq}")
print("---")
# I've noiticed that some files have more than one sequence in them. This is to confirm that
# AS: Confirmed! Some files have more than one sequence of the same species!!!
print(f"Total sequences: {seq_count}")
print(f"Total files: {files}")

1 change: 1 addition & 0 deletions requirenments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
biopython
Loading