Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 178 additions & 0 deletions extensions/get_replicate_average.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/usr/bin/env python
import argparse


def create_list(current_line, item_to_remove):
return_list = current_line.split("\t")
if item_to_remove != "":
return_list.remove(item_to_remove)
return_list[len(return_list) - 1] = remove_newline(return_list[len(return_list) - 1])

return return_list


def create_replicate_matrix(names_file, score_file, output_file):
replicate_total = 0
sequence_dict = {}
base_sequences = []

# Read in the name, score, and output files; print any errors found
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a nice section to see. This type of error handling will definitely make the script more user friendly.

try:
names_fh = open(names_file, "r")
score_fh = open(score_file, "r")
output_fh = open(output_file, "w")
except FileNotFoundError as f:
print(f"Error: file not found ({f})")
return 1
except NameError as n:
print(f"Error: file not found ({n})")
return 1
except TypeError as t:
print(f"Error: file not found ({t})")
return 1

output_fh.write("Sequence name\t")

# Get the first line's base sequence
current_line = names_fh.readline()
current_line_list = create_list(current_line, "")
base_sequence = current_line_list[0]

# Write the first base sequence
base_sequences.append(base_sequence)
output_fh.write(f"{base_sequence}")
sequence_dict[base_sequence] = []

# Get the rest of the base sequences (if there are any)
while True:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's something of a matter of preference, but I generally find that it's more straightforward to step through a file line by line using a for loop. For example:

for line in names_fh:
cols = line.rstrip("\n").split("\t")
etc...

# Get the current line
current_line = names_fh.readline()
if current_line == "":
break

# Get the base sequence on the current line
current_line_list = create_list(current_line, "")
base_sequence = current_line_list[0]

# Use base sequence for the following:
# 1. Append it to the list of base sequences
# 2. Write it to the output file
# 3. Add it as a key in the sequence dictionary
base_sequences.append(base_sequence)
output_fh.write("\t")
output_fh.write(f"{base_sequence}")
sequence_dict[base_sequence] = []

output_fh.write("\n")

# Get list of sequence names from input score file
sequence_names_list = create_list(score_fh.readline(), "Sequence name")

# Loop through score file & record the replicate averages
while True:
base_sequence_found = False

# Check if at EOF
current_line = score_fh.readline()
if current_line == "":
break

# Create list out of values on current row
current_line_list = create_list(current_line, "")

# Get current peptide and remove from list
current_peptide = current_line_list[0]
current_line_list.pop(0)

# Loop through sequence names
sequence_names_index = 0
while sequence_names_index < len(sequence_names_list):
# Loop through base sequences
base_sequence_index = 0
while base_sequence_index < len(base_sequences):
# Check if sequence name contains base sequence name
if sequence_names_list[sequence_names_index].find(base_sequences[base_sequence_index]) != -1:
base_sequence_found = True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I understanding correctly that you are checking to see if the base string is a substring of the individual sample names? This probably works in this example, but this is NOT something that I want to assume is true. This is the reason the replicate names are explicitly provided in the names input file.


# Create/update sequence dictionary entry with:
# Key: sequence name
# Entry: list of replicate values
sequence_dict[base_sequences[base_sequence_index]].append\
(current_line_list[sequence_names_index])

base_sequence_index += 1

# Check if the base sequence was not found after the search
if not base_sequence_found:
print("Error: base sequence not found")

names_fh.close()
score_fh.close()
output_fh.close()

return 1

sequence_names_index += 1

output_fh.write(f"{current_peptide}")

# For a given list of values at a sequence's entry,
# Total up all the values and find their average
for sequence in sequence_dict:
for replicate in sequence_dict[sequence]:
replicate_total += int(replicate)

# Get the average and, if the average is an integer, write as is.
# Otherwise, include precision for decimal value
replicate_average = replicate_total / len(sequence_dict[sequence])
output_fh.write("\t")
if replicate_average.is_integer():
output_fh.write(f"{int(replicate_average)}")
else:
output_fh.write('%.2f' % replicate_average)
replicate_total = 0

output_fh.write("\n")

# Reset sequence dictionary
for sequence in base_sequences:
sequence_dict[sequence] = []

# Close files
names_fh.close()
score_fh.close()
output_fh.close()

return 0


def main():
arg_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-i', '--input', help="Name of input file that contains matrix with replicate totals")
arg_parser.add_argument('-n', '--rep_names', help="Name of file that contains sequences to be utilized")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually sample names, not "sequences". Also, for arguments in general, and especially ones that this that are going to require non-standard formats, it's important to describe the expected formats in the help message.

arg_parser.add_argument('-a', '--get_avgs', help="Name of output file that matrix with replicate averages will go")

args = arg_parser.parse_args()

# Check for missing files; ensures that operations in create_replicate_matrix run correctly
if args.rep_names is None and args.get_avgs is not None:
print("Error: --get_avgs requires a file in --rep_names to be used")
return 1
elif args.get_avgs is None and args.rep_names is not None:
print("Error: --rep_names requires a file in --get_avgs to be used")
return 1

create_replicate_matrix(args.rep_names, args.input, args.get_avgs)
return 0


def remove_newline(string):
edited_string = string
if "\n" in string:
edited_string = string.strip("\n")

return edited_string


if __name__ == '__main__':
main()