From 9b8cc992653acd16cc66bf3d366b268a9c615cb1 Mon Sep 17 00:00:00 2001 From: NorthernPeach <71791413+NorthernPeach@users.noreply.github.com> Date: Tue, 6 Jul 2021 13:35:45 +0300 Subject: [PATCH 1/2] Add files via upload --- SWW Genomics.ipynb | 815 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 815 insertions(+) create mode 100644 SWW Genomics.ipynb diff --git a/SWW Genomics.ipynb b/SWW Genomics.ipynb new file mode 100644 index 0000000..96b9342 --- /dev/null +++ b/SWW Genomics.ipynb @@ -0,0 +1,815 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import numpy as np\n", + "import pprint\n", + "from Bio import Seq, SeqUtils, SeqIO" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "seqs = []\n", + "for seq_record in SeqIO.parse(\"measles.txt\", \"fasta\"):\n", + " seqs.append(seq_record.seq)\n", + "#seqs" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "# DNA codon table\n", + "protein = {\"TTT\" : \"F\", \"CTT\" : \"L\", \"ATT\" : \"I\", \"GTT\" : \"V\",\n", + " \"TTC\" : \"F\", \"CTC\" : \"L\", \"ATC\" : \"I\", \"GTC\" : \"V\",\n", + " \"TTA\" : \"L\", \"CTA\" : \"L\", \"ATA\" : \"I\", \"GTA\" : \"V\",\n", + " \"TTG\" : \"L\", \"CTG\" : \"L\", \"ATG\" : \"M\", \"GTG\" : \"V\",\n", + " \"TCT\" : \"S\", \"CCT\" : \"P\", \"ACT\" : \"T\", \"GCT\" : \"A\",\n", + " \"TCC\" : \"S\", \"CCC\" : \"P\", \"ACC\" : \"T\", \"GCC\" : \"A\",\n", + " \"TCA\" : \"S\", \"CCA\" : \"P\", \"ACA\" : \"T\", \"GCA\" : \"A\",\n", + " \"TCG\" : \"S\", \"CCG\" : \"P\", \"ACG\" : \"T\", \"GCG\" : \"A\",\n", + " \"TAT\" : \"Y\", \"CAT\" : \"H\", \"AAT\" : \"N\", \"GAT\" : \"D\",\n", + " \"TAC\" : \"Y\", \"CAC\" : \"H\", \"AAC\" : \"N\", \"GAC\" : \"D\",\n", + " \"TAA\" : \"STOP\", \"CAA\" : \"Q\", \"AAA\" : \"K\", \"GAA\" : \"E\",\n", + " \"TAG\" : \"STOP\", \"CAG\" : 
\"Q\", \"AAG\" : \"K\", \"GAG\" : \"E\",\n", + " \"TGT\" : \"C\", \"CGT\" : \"R\", \"AGT\" : \"S\", \"GGT\" : \"G\",\n", + " \"TGC\" : \"C\", \"CGC\" : \"R\", \"AGC\" : \"S\", \"GGC\" : \"G\",\n", + " \"TGA\" : \"STOP\", \"CGA\" : \"R\", \"AGA\" : \"R\", \"GGA\" : \"G\",\n", + " \"TGG\" : \"W\", \"CGG\" : \"R\", \"AGG\" : \"R\", \"GGG\" : \"G\" \n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "def count_codons(seq, dict_codons):\n", + " seq = str(seq)\n", + " for i in range(0, len(seq)-3, 3):\n", + " if seq[i:i+3] not in dict_codons.keys():\n", + " dict_codons[seq[i:i+3]] = 1\n", + " else:\n", + " dict_codons[seq[i:i+3]] += 1\n", + " return dict_codons" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: {'ACC': 106,\n", + " 'AAA': 101,\n", + " 'CAA': 130,\n", + " 'AGT': 65,\n", + " 'TGG': 86,\n", + " 'GTA': 48,\n", + " 'AGG': 103,\n", + " 'ATA': 98,\n", + " 'GAT': 87,\n", + " 'TCA': 156,\n", + " 'ATG': 125,\n", + " 'ATC': 117,\n", + " 'TTC': 59,\n", + " 'TAG': 92,\n", + " 'TAC': 48,\n", + " 'ACT': 75,\n", + " 'AGA': 123,\n", + " 'TCC': 110,\n", + " 'TAT': 69,\n", + " 'CAG': 139,\n", + " 'GGA': 120,\n", + " 'GAG': 96,\n", + " 'CCG': 53,\n", + " 'CCA': 124,\n", + " 'CAC': 79,\n", + " 'TTT': 58,\n", + " 'TGA': 101,\n", + " 'GCT': 52,\n", + " 'CAT': 76,\n", + " 'TGT': 74,\n", + " 'GAA': 89,\n", + " 'ACA': 143,\n", + " 'AAC': 87,\n", + " 'TTA': 96,\n", + " 'GTG': 69,\n", + " 'CTG': 100,\n", + " 'ATT': 83,\n", + " 'CCT': 60,\n", + " 'CTC': 92,\n", + " 'GAC': 67,\n", + " 'GGT': 82,\n", + " 'TAA': 91,\n", + " 'TTG': 126,\n", + " 'CGG': 49,\n", + " 'GCG': 27,\n", + " 'GGC': 72,\n", + " 'GGG': 100,\n", + " 'GTC': 48,\n", + " 'AAT': 83,\n", + " 'ACG': 46,\n", + " 'GCA': 90,\n", + " 'GCC': 77,\n", + " 'TCG': 50,\n", + " 'TCT': 73,\n", + " 'AAG': 126,\n", + " 'TGC': 65,\n", + " 'CCC': 106,\n", + " 'AGC': 76,\n", + " 
'CTT': 61,\n", + " 'CTA': 73,\n", + " 'GTT': 52,\n", + " 'CGA': 33,\n", + " 'CGC': 22,\n", + " 'CGT': 13},\n", + " 1: {'ACC': 118,\n", + " 'AAA': 110,\n", + " 'CAA': 113,\n", + " 'AGT': 71,\n", + " 'TGG': 108,\n", + " 'GTA': 33,\n", + " 'AGG': 130,\n", + " 'ATA': 103,\n", + " 'GAT': 72,\n", + " 'CTT': 50,\n", + " 'ACT': 67,\n", + " 'ATG': 124,\n", + " 'GAC': 65,\n", + " 'TAG': 122,\n", + " 'GAG': 117,\n", + " 'TAA': 85,\n", + " 'AGA': 144,\n", + " 'TCC': 104,\n", + " 'TAC': 50,\n", + " 'TGT': 60,\n", + " 'CGG': 47,\n", + " 'GGG': 93,\n", + " 'TGA': 129,\n", + " 'CTA': 68,\n", + " 'CGA': 46,\n", + " 'CTC': 82,\n", + " 'TTC': 59,\n", + " 'TTA': 82,\n", + " 'GCT': 49,\n", + " 'CAT': 72,\n", + " 'TCA': 180,\n", + " 'GGA': 90,\n", + " 'ACA': 150,\n", + " 'AAG': 121,\n", + " 'CGC': 21,\n", + " 'CAG': 151,\n", + " 'GTT': 42,\n", + " 'CCA': 138,\n", + " 'CCG': 77,\n", + " 'CGT': 25,\n", + " 'CCC': 101,\n", + " 'TGC': 56,\n", + " 'TCG': 66,\n", + " 'GGC': 65,\n", + " 'TTG': 139,\n", + " 'ATC': 102,\n", + " 'CTG': 87,\n", + " 'ACG': 56,\n", + " 'GGT': 60,\n", + " 'GCG': 35,\n", + " 'GCA': 83,\n", + " 'TAT': 76,\n", + " 'CAC': 85,\n", + " 'AAT': 59,\n", + " 'AGC': 66,\n", + " 'GAA': 70,\n", + " 'GCC': 71,\n", + " 'CCT': 66,\n", + " 'GTG': 66,\n", + " 'ATT': 80,\n", + " 'TTT': 45,\n", + " 'TCT': 89,\n", + " 'AAC': 64,\n", + " 'GTC': 60},\n", + " 2: {'ACC': 91,\n", + " 'AAA': 150,\n", + " 'CAA': 163,\n", + " 'AGT': 73,\n", + " 'TGG': 86,\n", + " 'CTA': 104,\n", + " 'AGG': 78,\n", + " 'ATG': 135,\n", + " 'GAT': 100,\n", + " 'TAA': 127,\n", + " 'TTA': 132,\n", + " 'AAC': 81,\n", + " 'ATT': 120,\n", + " 'TCG': 44,\n", + " 'ACT': 83,\n", + " 'TAG': 115,\n", + " 'GGT': 61,\n", + " 'TGA': 139,\n", + " 'TCC': 60,\n", + " 'TGT': 82,\n", + " 'CTC': 87,\n", + " 'AAG': 138,\n", + " 'GGA': 99,\n", + " 'GAA': 86,\n", + " 'CAG': 112,\n", + " 'TCA': 151,\n", + " 'GAC': 54,\n", + " 'CCA': 90,\n", + " 'GTC': 61,\n", + " 'TTC': 79,\n", + " 'GCC': 35,\n", + " 'GCT': 53,\n", + " 'AGA': 106,\n", + 
" 'GGG': 60,\n", + " 'CCC': 42,\n", + " 'CAT': 89,\n", + " 'CTT': 82,\n", + " 'CTG': 106,\n", + " 'GCG': 12,\n", + " 'GAG': 82,\n", + " 'TCT': 89,\n", + " 'GTG': 70,\n", + " 'CGA': 24,\n", + " 'GCA': 66,\n", + " 'TTG': 141,\n", + " 'TAT': 86,\n", + " 'ACA': 94,\n", + " 'CCG': 29,\n", + " 'AAT': 114,\n", + " 'ATC': 123,\n", + " 'GTA': 53,\n", + " 'TAC': 64,\n", + " 'CCT': 57,\n", + " 'TTT': 76,\n", + " 'ACG': 39,\n", + " 'GTT': 61,\n", + " 'ATA': 122,\n", + " 'AGC': 60,\n", + " 'CGT': 21,\n", + " 'TGC': 58,\n", + " 'CGG': 23,\n", + " 'GGC': 43,\n", + " 'CAC': 57,\n", + " 'CGC': 13},\n", + " 3: {'ACC': 73,\n", + " 'AGA': 121,\n", + " 'CAA': 165,\n", + " 'TGT': 104,\n", + " 'TTG': 149,\n", + " 'TGA': 131,\n", + " 'CCT': 41,\n", + " 'ATT': 204,\n", + " 'CTA': 119,\n", + " 'ACG': 21,\n", + " 'ACA': 102,\n", + " 'TTA': 205,\n", + " 'AAT': 164,\n", + " 'TAG': 125,\n", + " 'GAA': 90,\n", + " 'TAA': 159,\n", + " 'CGA': 35,\n", + " 'TTC': 83,\n", + " 'CAT': 104,\n", + " 'GGT': 64,\n", + " 'GAG': 73,\n", + " 'GGG': 59,\n", + " 'TCA': 119,\n", + " 'CCA': 65,\n", + " 'GTC': 52,\n", + " 'TAT': 135,\n", + " 'CAC': 68,\n", + " 'CTG': 87,\n", + " 'GAC': 43,\n", + " 'ATA': 205,\n", + " 'AGC': 34,\n", + " 'AAC': 79,\n", + " 'CAG': 100,\n", + " 'CGT': 18,\n", + " 'GTT': 68,\n", + " 'CGG': 16,\n", + " 'GTG': 62,\n", + " 'AAG': 112,\n", + " 'GAT': 107,\n", + " 'AAA': 163,\n", + " 'TTT': 104,\n", + " 'GCA': 44,\n", + " 'ATC': 122,\n", + " 'CTT': 78,\n", + " 'TCG': 24,\n", + " 'CTC': 58,\n", + " 'ATG': 167,\n", + " 'TGG': 86,\n", + " 'AGG': 72,\n", + " 'GTA': 83,\n", + " 'GGA': 64,\n", + " 'ACT': 77,\n", + " 'GCT': 39,\n", + " 'TCC': 39,\n", + " 'TAC': 73,\n", + " 'CCG': 7,\n", + " 'TGC': 50,\n", + " 'GGC': 23,\n", + " 'TCT': 64,\n", + " 'AGT': 85,\n", + " 'GCC': 31,\n", + " 'CCC': 25,\n", + " 'CGC': 3,\n", + " 'GCG': 7},\n", + " 4: {'ACC': 100,\n", + " 'AGA': 122,\n", + " 'CAA': 142,\n", + " 'AGT': 68,\n", + " 'TGG': 103,\n", + " 'CTA': 101,\n", + " 'AGG': 85,\n", + " 'ATA': 101,\n", + " 
'GTT': 61,\n", + " 'AAA': 145,\n", + " 'TTA': 114,\n", + " 'TTG': 126,\n", + " 'AAT': 106,\n", + " 'ATT': 102,\n", + " 'ACT': 95,\n", + " 'TAG': 122,\n", + " 'GGT': 71,\n", + " 'TGA': 127,\n", + " 'TCC': 80,\n", + " 'TAC': 54,\n", + " 'CTT': 89,\n", + " 'GAA': 99,\n", + " 'GGC': 52,\n", + " 'TCA': 172,\n", + " 'GAC': 59,\n", + " 'CCA': 99,\n", + " 'GCC': 42,\n", + " 'TTC': 68,\n", + " 'CAC': 56,\n", + " 'TGT': 73,\n", + " 'GGA': 99,\n", + " 'CTC': 81,\n", + " 'GGG': 50,\n", + " 'AAC': 78,\n", + " 'CCC': 73,\n", + " 'CCT': 69,\n", + " 'CTG': 98,\n", + " 'GCT': 47,\n", + " 'CCG': 41,\n", + " 'GAG': 89,\n", + " 'TAA': 90,\n", + " 'AGC': 65,\n", + " 'ATG': 132,\n", + " 'CGG': 27,\n", + " 'GTG': 77,\n", + " 'GCA': 58,\n", + " 'GAT': 86,\n", + " 'TAT': 95,\n", + " 'ATC': 97,\n", + " 'ACG': 40,\n", + " 'TCT': 78,\n", + " 'GTA': 53,\n", + " 'GCG': 20,\n", + " 'GTC': 62,\n", + " 'CAT': 87,\n", + " 'CAG': 112,\n", + " 'AAG': 131,\n", + " 'CGA': 35,\n", + " 'ACA': 117,\n", + " 'TGC': 58,\n", + " 'TTT': 77,\n", + " 'TCG': 46,\n", + " 'CGC': 16,\n", + " 'CGT': 11},\n", + " 5: {'ACC': 130,\n", + " 'AAA': 108,\n", + " 'CAA': 130,\n", + " 'AGT': 81,\n", + " 'TGG': 95,\n", + " 'GTA': 47,\n", + " 'AGG': 107,\n", + " 'ATC': 113,\n", + " 'GGT': 72,\n", + " 'CTA': 81,\n", + " 'TCA': 162,\n", + " 'ATG': 116,\n", + " 'ATT': 92,\n", + " 'TAG': 102,\n", + " 'CAC': 90,\n", + " 'ACT': 83,\n", + " 'GAT': 79,\n", + " 'AGA': 132,\n", + " 'TCC': 112,\n", + " 'TAT': 63,\n", + " 'CGA': 54,\n", + " 'CTG': 106,\n", + " 'GAG': 100,\n", + " 'CAG': 143,\n", + " 'GCT': 69,\n", + " 'TAA': 74,\n", + " 'TCT': 90,\n", + " 'TTA': 68,\n", + " 'CTT': 60,\n", + " 'CTC': 74,\n", + " 'TGA': 117,\n", + " 'TGT': 76,\n", + " 'GGG': 92,\n", + " 'CCA': 123,\n", + " 'AAG': 112,\n", + " 'ACA': 125,\n", + " 'AGC': 85,\n", + " 'CCC': 96,\n", + " 'TTG': 97,\n", + " 'GTG': 78,\n", + " 'GAA': 69,\n", + " 'AAC': 76,\n", + " 'TTC': 58,\n", + " 'CCT': 57,\n", + " 'GCC': 70,\n", + " 'GTC': 47,\n", + " 'TCG': 59,\n", + " 'AAT': 
68,\n", + " 'CCG': 63,\n", + " 'GCA': 96,\n", + " 'ATA': 105,\n", + " 'GGC': 66,\n", + " 'CAT': 89,\n", + " 'ACG': 40,\n", + " 'TAC': 58,\n", + " 'GGA': 107,\n", + " 'GAC': 78,\n", + " 'TGC': 50,\n", + " 'GTT': 52,\n", + " 'CGT': 17,\n", + " 'TTT': 42,\n", + " 'GCG': 24,\n", + " 'CGC': 34,\n", + " 'CGG': 34},\n", + " 6: {'ACC': 99,\n", + " 'AGA': 147,\n", + " 'CAA': 140,\n", + " 'AGC': 60,\n", + " 'TGG': 94,\n", + " 'CTA': 93,\n", + " 'GGG': 51,\n", + " 'GTA': 57,\n", + " 'GAA': 89,\n", + " 'TAA': 120,\n", + " 'CAG': 129,\n", + " 'ATA': 123,\n", + " 'ATG': 129,\n", + " 'AAT': 96,\n", + " 'TAT': 104,\n", + " 'CAT': 98,\n", + " 'ACT': 80,\n", + " 'TAG': 137,\n", + " 'GAT': 91,\n", + " 'TGA': 136,\n", + " 'TCC': 82,\n", + " 'TTG': 133,\n", + " 'GCA': 60,\n", + " 'AAA': 123,\n", + " 'GGT': 82,\n", + " 'TCA': 150,\n", + " 'CGA': 57,\n", + " 'CAC': 60,\n", + " 'TTC': 64,\n", + " 'GGA': 106,\n", + " 'GTC': 59,\n", + " 'CTC': 78,\n", + " 'TGT': 56,\n", + " 'ACA': 100,\n", + " 'AAG': 131,\n", + " 'CCC': 56,\n", + " 'GTT': 51,\n", + " 'GAG': 104,\n", + " 'CCA': 105,\n", + " 'TTA': 124,\n", + " 'TCG': 44,\n", + " 'TAC': 70,\n", + " 'CCG': 35,\n", + " 'GTG': 71,\n", + " 'ATT': 103,\n", + " 'GAC': 52,\n", + " 'CTG': 97,\n", + " 'CTT': 73,\n", + " 'GCG': 27,\n", + " 'AAC': 65,\n", + " 'ACG': 39,\n", + " 'AGT': 88,\n", + " 'GGC': 43,\n", + " 'AGG': 89,\n", + " 'CGG': 30,\n", + " 'TCT': 84,\n", + " 'TTT': 62,\n", + " 'GCC': 34,\n", + " 'ATC': 103,\n", + " 'CGT': 20,\n", + " 'CCT': 46,\n", + " 'GCT': 43,\n", + " 'TGC': 41,\n", + " 'CGC': 20}}" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# with zero shift\n", + "cdns_frq = {}\n", + "for id_sq, seq in enumerate(seqs):\n", + " cdns_frq[id_sq] = {}\n", + " cdns_frq[id_sq] = count_codons(seq, cdns_frq[id_sq])\n", + "cdns_frq" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "# with one shift\n", + "cdns_frq1 
= {}\n", + "for id_sq, seq in enumerate(seqs):\n", + " cdns_frq1[id_sq] = {}\n", + " cdns_frq1[id_sq] = count_codons(seq[1:], cdns_frq1[id_sq])\n", + "#cdns_frq1" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# with two shift\n", + "cdns_frq2 = {}\n", + "for id_sq, seq in enumerate(seqs):\n", + " cdns_frq2[id_sq] = {}\n", + " cdns_frq2[id_sq] = count_codons(seq[2:], cdns_frq2[id_sq])\n", + "#cdns_frq2" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# Counting aminas task \n", + "# translating the seq\n", + "def translate(seq, protein):\n", + " trans_seq = ''\n", + " for i in range(0, len(seq)-3, 3):\n", + " trans_seq += protein[seq[i: i+3]] \n", + " return trans_seq" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "def count_aminas(seq, list_aminas, aminas_dict):\n", + "\n", + " for amina in list_aminas:\n", + " aminas_dict[amina] = seq.count(amina)\n", + " return aminas_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "aminas, aminas1, aminas2 = {}, {}, {}\n", + "list_aminas = np.unique(list(protein.values()))\n", + "\n", + "# with zero shift\n", + "for id_sq, seq in enumerate(seqs):\n", + " aminas[id_sq] = {}\n", + " trans_seq = translate(seq, protein)\n", + " aminas[id_sq] = count_aminas(trans_seq, list_aminas, aminas[id_sq])\n", + "\n", + " # with one shift\n", + " aminas1[id_sq] = {}\n", + " trans_seq = translate(seq[1:], protein)\n", + " aminas1[id_sq] = count_aminas(trans_seq, list_aminas, aminas1[id_sq])\n", + " \n", + " # with one shift\n", + " aminas2[id_sq] = {}\n", + " trans_seq = translate(seq[2:], protein)\n", + " aminas2[id_sq] = count_aminas(trans_seq, list_aminas, aminas2[id_sq])" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "{0: {'A': 246,\n", + " 'C': 139,\n", + " 'D': 154,\n", + " 'E': 185,\n", + " 'F': 117,\n", + " 'G': 374,\n", + " 'H': 155,\n", + " 'I': 298,\n", + " 'K': 227,\n", + " 'L': 548,\n", + " 'M': 125,\n", + " 'N': 170,\n", + " 'P': 627,\n", + " 'Q': 269,\n", + " 'R': 343,\n", + " 'S': 814,\n", + " 'STOP': 284,\n", + " 'T': 654,\n", + " 'V': 217,\n", + " 'W': 86,\n", + " 'Y': 117},\n", + " 1: {'A': 238,\n", + " 'C': 116,\n", + " 'D': 137,\n", + " 'E': 187,\n", + " 'F': 104,\n", + " 'G': 308,\n", + " 'H': 157,\n", + " 'I': 285,\n", + " 'K': 231,\n", + " 'L': 508,\n", + " 'M': 124,\n", + " 'N': 123,\n", + " 'P': 718,\n", + " 'Q': 264,\n", + " 'R': 413,\n", + " 'S': 912,\n", + " 'STOP': 336,\n", + " 'T': 727,\n", + " 'V': 201,\n", + " 'W': 108,\n", + " 'Y': 126},\n", + " 2: {'A': 166,\n", + " 'C': 140,\n", + " 'D': 154,\n", + " 'E': 168,\n", + " 'F': 155,\n", + " 'G': 263,\n", + " 'H': 146,\n", + " 'I': 365,\n", + " 'K': 288,\n", + " 'L': 652,\n", + " 'M': 135,\n", + " 'N': 195,\n", + " 'P': 599,\n", + " 'Q': 275,\n", + " 'R': 265,\n", + " 'S': 858,\n", + " 'STOP': 381,\n", + " 'T': 688,\n", + " 'V': 245,\n", + " 'W': 86,\n", + " 'Y': 150},\n", + " 3: {'A': 121,\n", + " 'C': 154,\n", + " 'D': 150,\n", + " 'E': 163,\n", + " 'F': 187,\n", + " 'G': 210,\n", + " 'H': 172,\n", + " 'I': 531,\n", + " 'K': 275,\n", + " 'L': 696,\n", + " 'M': 167,\n", + " 'N': 243,\n", + " 'P': 553,\n", + " 'Q': 265,\n", + " 'R': 265,\n", + " 'S': 780,\n", + " 'STOP': 415,\n", + " 'T': 688,\n", + " 'V': 265,\n", + " 'W': 86,\n", + " 'Y': 208},\n", + " 4: {'A': 167,\n", + " 'C': 131,\n", + " 'D': 145,\n", + " 'E': 188,\n", + " 'F': 145,\n", + " 'G': 272,\n", + " 'H': 143,\n", + " 'I': 300,\n", + " 'K': 276,\n", + " 'L': 609,\n", + " 'M': 132,\n", + " 'N': 184,\n", + " 'P': 621,\n", + " 'Q': 254,\n", + " 'R': 296,\n", + " 'S': 848,\n", + " 'STOP': 339,\n", + " 'T': 691,\n", + " 'V': 253,\n", + " 'W': 103,\n", + " 'Y': 149},\n", + " 5: {'A': 259,\n", + 
" 'C': 126,\n", + " 'D': 157,\n", + " 'E': 169,\n", + " 'F': 100,\n", + " 'G': 337,\n", + " 'H': 179,\n", + " 'I': 310,\n", + " 'K': 220,\n", + " 'L': 486,\n", + " 'M': 116,\n", + " 'N': 144,\n", + " 'P': 632,\n", + " 'Q': 273,\n", + " 'R': 378,\n", + " 'S': 882,\n", + " 'STOP': 293,\n", + " 'T': 671,\n", + " 'V': 224,\n", + " 'W': 95,\n", + " 'Y': 121},\n", + " 6: {'A': 164,\n", + " 'C': 97,\n", + " 'D': 143,\n", + " 'E': 193,\n", + " 'F': 126,\n", + " 'G': 282,\n", + " 'H': 158,\n", + " 'I': 329,\n", + " 'K': 254,\n", + " 'L': 598,\n", + " 'M': 129,\n", + " 'N': 161,\n", + " 'P': 635,\n", + " 'Q': 269,\n", + " 'R': 363,\n", + " 'S': 901,\n", + " 'STOP': 393,\n", + " 'T': 711,\n", + " 'V': 238,\n", + " 'W': 94,\n", + " 'Y': 174}}\n" + ] + } + ], + "source": [ + "pprint.pprint(aminas)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From de426c299b0d4e6ff6c7b829d7b4c83bb0d1fb52 Mon Sep 17 00:00:00 2001 From: NorthernPeach <71791413+NorthernPeach@users.noreply.github.com> Date: Tue, 6 Jul 2021 14:52:30 +0300 Subject: [PATCH 2/2] Update noname_tool.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Даня, добавила кусок кода с 22 по 110 строчку не совсем уверена, что это то, что должно быть, посмотри пожалуйста --- noname_tool.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/noname_tool.py b/noname_tool.py index 8d6be80..bb6d6ea 100644 --- a/noname_tool.py +++ b/noname_tool.py @@ -19,7 +19,96 
import pprint
import re

import numpy as np
from Bio import Seq, SeqUtils, SeqIO

# Every record of the FASTA file is read once; all frame loops below reuse
# this list.
seqs = [seq_record.seq for seq_record in SeqIO.parse("measles.txt", "fasta")]

# DNA codon table: triplet -> one-letter amino acid code, or "STOP".
protein = {
    "TTT": "F", "CTT": "L", "ATT": "I", "GTT": "V",
    "TTC": "F", "CTC": "L", "ATC": "I", "GTC": "V",
    "TTA": "L", "CTA": "L", "ATA": "I", "GTA": "V",
    "TTG": "L", "CTG": "L", "ATG": "M", "GTG": "V",
    "TCT": "S", "CCT": "P", "ACT": "T", "GCT": "A",
    "TCC": "S", "CCC": "P", "ACC": "T", "GCC": "A",
    "TCA": "S", "CCA": "P", "ACA": "T", "GCA": "A",
    "TCG": "S", "CCG": "P", "ACG": "T", "GCG": "A",
    "TAT": "Y", "CAT": "H", "AAT": "N", "GAT": "D",
    "TAC": "Y", "CAC": "H", "AAC": "N", "GAC": "D",
    "TAA": "STOP", "CAA": "Q", "AAA": "K", "GAA": "E",
    "TAG": "STOP", "CAG": "Q", "AAG": "K", "GAG": "E",
    "TGT": "C", "CGT": "R", "AGT": "S", "GGT": "G",
    "TGC": "C", "CGC": "R", "AGC": "S", "GGC": "G",
    "TGA": "STOP", "CGA": "R", "AGA": "R", "GGA": "G",
    "TGG": "W", "CGG": "R", "AGG": "R", "GGG": "G",
}


def count_codons(seq, dict_codons):
    '''
    Tally the non-overlapping triplets of a sequence.

    The sequence is read three characters at a time starting at its first
    character; a trailing remainder of one or two characters is ignored.

    seq         -- nucleotide sequence; it is converted with str(), so a
                   plain string or a Biopython Seq both work
    dict_codons -- dict mapping codon -> count; it is updated in place, so
                   counts already stored in it are added to

    Returns the same dict_codons object.
    '''
    seq = str(seq)
    # len(seq) - 2 is one past the last index at which a complete triplet
    # can start. The previous bound, len(seq) - 3, silently dropped the final
    # codon whenever the length was a multiple of three.
    for start in range(0, len(seq) - 2, 3):
        codon = seq[start:start + 3]
        dict_codons[codon] = dict_codons.get(codon, 0) + 1
    return dict_codons


def translate(seq, protein):
    '''
    Translate a nucleotide sequence codon by codon.

    seq     -- nucleotide sequence (converted with str(), as in
               count_codons)
    protein -- dict mapping each codon to the text emitted for it

    Returns the concatenation of protein[codon] for every complete triplet,
    read from the first character. With the table defined in this module a
    stop codon contributes the four characters "STOP", and translation
    continues past it.

    Raises KeyError if a triplet is not a key of protein.
    '''
    seq = str(seq)
    # Same bound as count_codons, so both functions visit the same codons.
    return ''.join(
        protein[seq[start:start + 3]] for start in range(0, len(seq) - 2, 3)
    )


def count_aminas(seq, list_aminas, aminas_dict):
    '''
    Count the symbols of list_aminas in a translated sequence.

    seq         -- string produced by translate()
    list_aminas -- iterable of symbols to count (one-letter codes and
                   multi-character tokens such as "STOP")
    aminas_dict -- dict that receives symbol -> count; updated in place

    Returns the same aminas_dict object.

    Multi-character tokens are counted first and removed from a working copy
    before the one-letter codes are counted. Counting directly on seq would
    register every "STOP" as an additional S, T and P.
    '''
    residues = seq
    token_counts = {}
    long_tokens = sorted(
        (symbol for symbol in list_aminas if len(symbol) > 1),
        key=len,
        reverse=True,
    )
    for token in long_tokens:
        token_counts[token] = residues.count(token)
        residues = residues.replace(token, '')

    # Fill the result in the caller's symbol order.
    for amina in list_aminas:
        if len(amina) > 1:
            aminas_dict[amina] = token_counts[amina]
        else:
            aminas_dict[amina] = residues.count(amina)
    return aminas_dict


# Codon usage per record, keyed by the record's position in the file, for the
# three forward reading frames (offset 0, 1 and 2).
cdns_frq, cdns_frq1, cdns_frq2 = {}, {}, {}
for id_sq, seq in enumerate(seqs):
    cdns_frq[id_sq] = count_codons(seq, {})
    cdns_frq1[id_sq] = count_codons(seq[1:], {})
    cdns_frq2[id_sq] = count_codons(seq[2:], {})

# Amino acid usage per record for the same three reading frames.
aminas, aminas1, aminas2 = {}, {}, {}
list_aminas = np.unique(list(protein.values()))

for id_sq, seq in enumerate(seqs):
    # with zero shift
    trans_seq = translate(seq, protein)
    aminas[id_sq] = count_aminas(trans_seq, list_aminas, {})

    # with one shift
    trans_seq = translate(seq[1:], protein)
    aminas1[id_sq] = count_aminas(trans_seq, list_aminas, {})

    # with two shifts
    trans_seq = translate(seq[2:], protein)
    aminas2[id_sq] = count_aminas(trans_seq, list_aminas, {})