diff --git a/src/CAI/CAI.py b/src/CAI/CAI.py index a08c65e..86672b5 100644 --- a/src/CAI/CAI.py +++ b/src/CAI/CAI.py @@ -10,7 +10,7 @@ warnings.simplefilter("ignore", BiopythonWarning) -def _synonymous_codons(genetic_code_dict): +def _init_synonymous_codons(genetic_code_dict): # invert the genetic code dictionary to map each amino acid to its codons codons_for_amino_acid = {} @@ -27,7 +27,7 @@ def _synonymous_codons(genetic_code_dict): _synonymous_codons = { - k: _synonymous_codons(v.forward_table) for k, v in ct.unambiguous_dna_by_id.items() + k: _init_synonymous_codons(v.forward_table) for k, v in ct.unambiguous_dna_by_id.items() } _non_synonymous_codons = { k: {codon for codon in v.keys() if len(v[codon]) == 1} @@ -35,29 +35,31 @@ def _synonymous_codons(genetic_code_dict): } -def RSCU(sequences, genetic_code=11): - r"""Calculates the relative synonymous codon usage (RSCU) for a set of sequences. +def _init_codonsbyaa(genetic_code_dict): - RSCU is 'the observed frequency of [a] codon divided by the frequency - expected under the assumption of equal usage of the synonymous codons for an - amino acid' (page 1283). + # invert the genetic code dictionary to map each amino acid to its codons + codons_for_amino_acid = {} + for codon, amino_acid in genetic_code_dict.items(): + codons_for_amino_acid[amino_acid] = codons_for_amino_acid.get(amino_acid, []) + codons_for_amino_acid[amino_acid].append(codon) - In math terms, it is + # create dictionary of codons by amino acid + # Example: {'L': ['CTT', 'CTG', 'CTA', 'CTC', 'TTA', 'TTG'], 'M': ['ATG']...} + return codons_for_amino_acid - .. math:: +_codons_by_aa = { + k: _init_codonsbyaa(v.forward_table) for k, v in ct.unambiguous_dna_by_id.items() +} - \frac{X_{ij}}{\frac{1}{n_i}\sum_{j=1}^{n_i}x_{ij}} - "where :math:`X` is the number of occurrences of the :math:`j` th codon for - the :math:`i` th amino acid, and :math:`n` is the number (from one to six) - of alternative codons for the :math:`i` th amino acid" (page 1283). +def Count3mers(sequences): + r"""Counts the 3-mers in frame in a set of sequences. Args: - sequences (list): The reference set of sequences. - genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code. + sequences (list): The set of sequences. Returns: - dict: The relative synonymous codon usage. + dict: Counts of in-frame 3-mers. Raises: ValueError: When an invalid sequence is provided or a list is not provided. @@ -66,7 +68,6 @@ def RSCU(sequences, genetic_code=11): if not isinstance(sequences, (list, tuple)): raise ValueError( "Be sure to pass a list of sequences, not a single sequence. " - "To find the RSCU of a single sequence, pass it as a one element list." ) # ensure all input sequences are divisible by three @@ -81,10 +82,88 @@ def RSCU(sequences, genetic_code=11): (sequence[i : i + 3].upper() for i in range(0, len(sequence), 3)) for sequence in sequences ) - codons = chain.from_iterable( + threemers = chain.from_iterable( sequences ) # flat list of all codons (to be used for counting) - counts = Counter(codons) + counts = Counter(threemers) + + return counts + + +def CodonFrequencyByAA(sequences, genetic_code=11, outputtype="count"): + r"""Calculates the synonymous codon frequency by AA for a set of sequences. + + Args: + sequences (list): The reference set of sequences. + genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code. + outputtype: "count" to return codon counts, "freq" to return frequencies proportional to aa. + + Returns: + dict: For each amino acid, a dict of codon counts or frequencies + + Raises: + ValueError: When an invalid sequence is provided or a list is not provided. + """ + + counts = Count3mers(sequences) + + # determine the synonymous codons for the genetic code + codons_by_aa = _codons_by_aa[genetic_code] + + # hold the result as it is being calculated + counts_by_aa = {} + + # calculate counts organized by aa + for aa, codons in codons_by_aa.items(): + miniresult = {} + for codon in codons: + # note: check what happens if zero? + miniresult[codon] = counts[codon] + counts_by_aa[aa] = miniresult + + if (outputtype == "count") : + return counts_by_aa + elif (outputtype == "freq") : + freqs_by_aa = {} + for aa, codons in codons_by_aa.items(): + minifreqs = {} + miniresult = counts_by_aa[aa] + total_by_aa = sum(miniresult.values()) + for codon in codons: + minifreqs[codon] = miniresult[codon] / total_by_aa + freqs_by_aa[aa] = minifreqs + return freqs_by_aa + + +def RSCU(sequences, genetic_code=11): + r"""Calculates the relative synonymous codon usage (RSCU) for a set of sequences. + + RSCU is 'the observed frequency of [a] codon divided by the frequency + expected under the assumption of equal usage of the synonymous codons for an + amino acid' (page 1283). + + In math terms, it is + + .. math:: + + \frac{X_{ij}}{\frac{1}{n_i}\sum_{j=1}^{n_i}x_{ij}} + + "where :math:`X` is the number of occurrences of the :math:`j` th codon for + the :math:`i` th amino acid, and :math:`n` is the number (from one to six) + of alternative codons for the :math:`i` th amino acid" (page 1283). + + Args: + sequences (list): The reference set of sequences. + genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code. + + Returns: + dict: The relative synonymous codon usage. + + Raises: + ValueError: When an invalid sequence is provided or a list is not provided. + """ + + counts = Count3mers(sequences) # "if a certain codon is never used in the reference set... assign [its # count] a value of 0.5" (page 1285)