-
Notifications
You must be signed in to change notification settings - Fork 2
Create get_replicate_average.py in response to issue #149 #172
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,178 @@ | ||
| #!/usr/bin/env python | ||
| import argparse | ||
|
|
||
|
|
||
| def create_list(current_line, item_to_remove): | ||
| return_list = current_line.split("\t") | ||
| if item_to_remove != "": | ||
| return_list.remove(item_to_remove) | ||
| return_list[len(return_list) - 1] = remove_newline(return_list[len(return_list) - 1]) | ||
|
|
||
| return return_list | ||
|
|
||
|
|
||
| def create_replicate_matrix(names_file, score_file, output_file): | ||
| replicate_total = 0 | ||
| sequence_dict = {} | ||
| base_sequences = [] | ||
|
|
||
| # Read in the name, score, and output files; print any errors found | ||
| try: | ||
| names_fh = open(names_file, "r") | ||
| score_fh = open(score_file, "r") | ||
| output_fh = open(output_file, "w") | ||
| except FileNotFoundError as f: | ||
| print(f"Error: file not found ({f})") | ||
| return 1 | ||
| except NameError as n: | ||
| print(f"Error: file not found ({n})") | ||
| return 1 | ||
| except TypeError as t: | ||
| print(f"Error: file not found ({t})") | ||
| return 1 | ||
|
|
||
| output_fh.write("Sequence name\t") | ||
|
|
||
| # Get the first line's base sequence | ||
| current_line = names_fh.readline() | ||
| current_line_list = create_list(current_line, "") | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| base_sequence = current_line_list[0] | ||
|
|
||
| # Write the first base sequence | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| base_sequences.append(base_sequence) | ||
| output_fh.write(f"{base_sequence}") | ||
| sequence_dict[base_sequence] = [] | ||
|
|
||
| # Get the rest of the base sequences (if there are any) | ||
| while True: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's something of a matter of preference, but I generally find that it's more straightforward to step through a file line by line using a for loop. For example: for line in names_fh: |
||
| # Get the current line | ||
| current_line = names_fh.readline() | ||
| if current_line == "": | ||
| break | ||
|
|
||
| # Get the base sequence on the current line | ||
| current_line_list = create_list(current_line, "") | ||
| base_sequence = current_line_list[0] | ||
|
|
||
| # Use base sequence for the following: | ||
| # 1. Append it to the list of base sequences | ||
| # 2. Write it to the output file | ||
| # 3. Add it as a key in the sequence dictionary | ||
| base_sequences.append(base_sequence) | ||
| output_fh.write("\t") | ||
| output_fh.write(f"{base_sequence}") | ||
| sequence_dict[base_sequence] = [] | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| output_fh.write("\n") | ||
|
|
||
| # Get list of sequence names from input score file | ||
| sequence_names_list = create_list(score_fh.readline(), "Sequence name") | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # Loop through score file & record the replicate averages | ||
| while True: | ||
| base_sequence_found = False | ||
|
|
||
| # Check if at EOF | ||
| current_line = score_fh.readline() | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if current_line == "": | ||
| break | ||
|
|
||
| # Create list out of values on current row | ||
| current_line_list = create_list(current_line, "") | ||
|
|
||
| # Get current peptide and remove from list | ||
| current_peptide = current_line_list[0] | ||
| current_line_list.pop(0) | ||
|
|
||
| # Loop through sequence names | ||
| sequence_names_index = 0 | ||
| while sequence_names_index < len(sequence_names_list): | ||
| # Loop through base sequences | ||
| base_sequence_index = 0 | ||
| while base_sequence_index < len(base_sequences): | ||
| # Check if sequence name contains base sequence name | ||
| if sequence_names_list[sequence_names_index].find(base_sequences[base_sequence_index]) != -1: | ||
| base_sequence_found = True | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Am I understanding correctly that you are checking to see if the base string is a substring of the individual sample names? This probably works in this example, but this is NOT something that I want to assume is true. This is the reason the replicate names are explicitly provided in the names input file. |
||
|
|
||
| # Create/update sequence dictionary entry with: | ||
| # Key: sequence name | ||
| # Entry: list of replicate values | ||
| sequence_dict[base_sequences[base_sequence_index]].append\ | ||
| (current_line_list[sequence_names_index]) | ||
|
|
||
| base_sequence_index += 1 | ||
|
|
||
| # Check if the base sequence was not found after the search | ||
| if not base_sequence_found: | ||
| print("Error: base sequence not found") | ||
|
|
||
| names_fh.close() | ||
| score_fh.close() | ||
| output_fh.close() | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| return 1 | ||
|
|
||
| sequence_names_index += 1 | ||
|
|
||
| output_fh.write(f"{current_peptide}") | ||
|
|
||
| # For a given list of values at a sequence's entry, | ||
| # Total up all the values and find their average | ||
| for sequence in sequence_dict: | ||
| for replicate in sequence_dict[sequence]: | ||
| replicate_total += int(replicate) | ||
|
|
||
| # Get the average and, if the average is an integer, write as is. | ||
| # Otherwise, include precision for decimal value | ||
| replicate_average = replicate_total / len(sequence_dict[sequence]) | ||
| output_fh.write("\t") | ||
| if replicate_average.is_integer(): | ||
| output_fh.write(f"{int(replicate_average)}") | ||
| else: | ||
| output_fh.write('%.2f' % replicate_average) | ||
| replicate_total = 0 | ||
|
|
||
| output_fh.write("\n") | ||
|
|
||
| # Reset sequence dictionary | ||
| for sequence in base_sequences: | ||
| sequence_dict[sequence] = [] | ||
|
|
||
| # Close files | ||
| names_fh.close() | ||
| score_fh.close() | ||
| output_fh.close() | ||
|
|
||
| return 0 | ||
|
|
||
|
|
||
| def main(): | ||
| arg_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) | ||
| arg_parser.add_argument('-i', '--input', help="Name of input file that contains matrix with replicate totals") | ||
| arg_parser.add_argument('-n', '--rep_names', help="Name of file that contains sequences to be utilized") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's actually sample names, not "sequences". Also, for arguments in general, and especially ones that this that are going to require non-standard formats, it's important to describe the expected formats in the help message. |
||
| arg_parser.add_argument('-a', '--get_avgs', help="Name of output file that matrix with replicate averages will go") | ||
|
|
||
| args = arg_parser.parse_args() | ||
|
|
||
| # Check for missing files; ensures that operations in create_replicate_matrix run correctly | ||
| if args.rep_names is None and args.get_avgs is not None: | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| print("Error: --get_avgs requires a file in --rep_names to be used") | ||
| return 1 | ||
| elif args.get_avgs is None and args.rep_names is not None: | ||
| print("Error: --rep_names requires a file in --get_avgs to be used") | ||
| return 1 | ||
|
|
||
| create_replicate_matrix(args.rep_names, args.input, args.get_avgs) | ||
| return 0 | ||
|
|
||
|
|
||
| def remove_newline(string): | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| edited_string = string | ||
| if "\n" in string: | ||
jdr479 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| edited_string = string.strip("\n") | ||
|
|
||
| return edited_string | ||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| main() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a nice section to see. This type of error handling will definitely make the script more user friendly.