diff --git a/Chapter01/Multi_Head_Attention_Sub_Layer.ipynb b/Chapter01/Multi_Head_Attention_Sub_Layer.ipynb index 9dcb907..3a2eb47 100644 --- a/Chapter01/Multi_Head_Attention_Sub_Layer.ipynb +++ b/Chapter01/Multi_Head_Attention_Sub_Layer.ipynb @@ -1,273 +1,10 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Multi-Head Attention Sub-Layer.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "946c90b82f7f46caa25c885668b75eab": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_view_name": "HBoxView", - "_dom_classes": [], - "_model_name": "HBoxModel", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.5.0", - "box_style": "", - "layout": "IPY_MODEL_4191af78535e4da8bb797690eff84e00", - "_model_module": "@jupyter-widgets/controls", - "children": [ - "IPY_MODEL_9ce3d57b96b64da0b15e3f3626bacb30", - "IPY_MODEL_f8da2c91156342a69d9b262f4f993aa4" - ] - } - }, - "4191af78535e4da8bb797690eff84e00": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "9ce3d57b96b64da0b15e3f3626bacb30": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_view_name": "ProgressView", - "style": "IPY_MODEL_97370923218945c5b80ab468751ac8a7", - "_dom_classes": [], - "description": "Downloading: 100%", - "_model_name": "FloatProgressModel", - "bar_style": "success", - "max": 230, - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": 230, - "_view_count": null, - "_view_module_version": "1.5.0", - "orientation": "horizontal", - "min": 0, - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_0ba4a91f472e4c41ba80ab4025288446" - } - }, - "f8da2c91156342a69d9b262f4f993aa4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_15aa4b6f8f784c74804107be249126b9", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": " 230/230 [00:01<00:00, 185B/s]", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_edea457617ed4792aeeb65292019ceb4" - } - }, - "97370923218945c5b80ab468751ac8a7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_view_name": "StyleView", - "_model_name": "ProgressStyleModel", - "description_width": "initial", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "bar_color": null, - "_model_module": "@jupyter-widgets/controls" - } - }, - "0ba4a91f472e4c41ba80ab4025288446": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "15aa4b6f8f784c74804107be249126b9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - "edea457617ed4792aeeb65292019ceb4": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - } - } - } - }, "cells": [ { "cell_type": "markdown", "metadata": { - "id": "aXACkAtfNpG0", - "colab_type": "text" + "colab_type": "text", + "id": "aXACkAtfNpG0" }, "source": [ "# The Attention Mechanism\n", @@ -280,73 +17,65 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "veRoFjFRNXwJ", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "veRoFjFRNXwJ" }, + "outputs": [], "source": [ "import numpy as np\n", "from scipy.special import softmax" - ], - "execution_count": 0, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "JLe9lWCJNogW", - "colab_type": "code", - "outputId": "733e039b-343e-4161-9919-19b3a1ec130f", "colab": { "base_uri": "https://localhost:8080/", "height": 90 - } + }, + "colab_type": "code", + "id": "JLe9lWCJNogW", + "outputId": "733e039b-343e-4161-9919-19b3a1ec130f" }, - "source": [ - "print(\"Step 1: Input : 3 inputs, d_model=4\")\n", - "x =np.array([[1.0, 0.0, 1.0, 0.0], # Input 1\n", - " [0.0, 2.0, 0.0, 2.0], # Input 2\n", - " [1.0, 1.0, 1.0, 1.0]]) # Input 3\n", - "print(x)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 1: Input : 3 inputs, d_model=4\n", "[[1. 0. 1. 0.]\n", " [0. 2. 0. 2.]\n", " [1. 1. 1. 1.]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 1: Input : 3 inputs, d_model=4\")\n", + "x = np.array([[1.0, 0.0, 1.0, 0.0], # Input 1\n", + " [0.0, 2.0, 0.0, 2.0], # Input 2\n", + " [1.0, 1.0, 1.0, 1.0]]) # Input 3\n", + "print(x)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "JZImwtHPN91V", - "colab_type": "code", - "outputId": "07706940-e200-4956-b957-fe9681139d0d", "colab": { "base_uri": "https://localhost:8080/", "height": 126 - } + }, + "colab_type": "code", + "id": "JZImwtHPN91V", + "outputId": "07706940-e200-4956-b957-fe9681139d0d" }, - "source": [ - "print(\"Step 2: weights 3 dimensions x d_model=4\")\n", - "print(\"w_query\")\n", - "w_query =np.array([[1, 0, 1],\n", - " [1, 0, 0],\n", - " [0, 0, 1],\n", - " [0, 1, 1]])\n", - "print(w_query)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 2: weights 3 dimensions x d_model=4\n", @@ -355,33 +84,34 @@ " [1 0 0]\n", " [0 0 1]\n", " [0 1 1]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 2: weights 3 dimensions x d_model=4\")\n", + "print(\"w_query\")\n", + "w_query = np.array([[1, 0, 1],\n", + " [1, 0, 0],\n", + " [0, 0, 1],\n", + " [0, 1, 1]])\n", + "print(w_query)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "7kRBS7MUOFgV", - "colab_type": "code", - "outputId": "8b0bcc03-88b1-4e8d-a483-dacc91ffa9ee", "colab": { "base_uri": "https://localhost:8080/", "height": 108 - } + }, + "colab_type": "code", + "id": "7kRBS7MUOFgV", + "outputId": "8b0bcc03-88b1-4e8d-a483-dacc91ffa9ee" }, - "source": [ - "print(\"w_key\")\n", - "w_key =np.array([[0, 0, 1],\n", - " [1, 1, 0],\n", - " [0, 1, 0],\n", - " [1, 1, 0]])\n", - "print(w_key)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "w_key\n", @@ -389,33 +119,33 @@ " [1 1 0]\n", " [0 1 0]\n", " [1 1 0]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"w_key\")\n", + "w_key = np.array([[0, 0, 1],\n", + " [1, 1, 0],\n", + " [0, 1, 0],\n", + " [1, 1, 0]])\n", + "print(w_key)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "Napm2VtkOIEN", - "colab_type": "code", - "outputId": "7331eb08-64d5-4a36-eeef-0a0a556f130b", "colab": { "base_uri": "https://localhost:8080/", "height": 108 - } + }, + "colab_type": "code", + "id": "Napm2VtkOIEN", + "outputId": "7331eb08-64d5-4a36-eeef-0a0a556f130b" }, - "source": [ - "print(\"w_value\")\n", - "w_value = np.array([[0, 2, 0],\n", - " [0, 3, 0],\n", - " [1, 0, 3],\n", - " [1, 1, 0]])\n", - "print(w_value)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "w_value\n", @@ -423,32 +153,33 @@ " [0 3 0]\n", " [1 0 3]\n", " [1 1 0]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"w_value\")\n", + "w_value = np.array([[0, 2, 0],\n", + " [0, 3, 0],\n", + " [1, 0, 3],\n", + " [1, 1, 0]])\n", + "print(w_value)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "JqapIgfDOQ7d", - "colab_type": "code", - "outputId": "fd610d7a-968a-47e6-d614-40ad03c1d172", "colab": { "base_uri": "https://localhost:8080/", "height": 108 - } + }, + "colab_type": "code", + "id": "JqapIgfDOQ7d", + "outputId": "fd610d7a-968a-47e6-d614-40ad03c1d172" }, - "source": [ - "print(\"Step 3: Matrix multiplication to obtain Q,K,V\")\n", - "\n", - "print(\"Queries: x * w_query\")\n", - "Q=np.matmul(x,w_query)\n", - "print(Q)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 3: Matrix multiplication to obtain Q,K,V\n", @@ -456,32 +187,32 @@ "[[1. 0. 2.]\n", " [2. 2. 2.]\n", " [2. 1. 3.]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 3: Matrix multiplication to obtain Q,K,V\")\n", + "\n", + "print(\"Queries: x * w_query\")\n", + "Q = np.matmul(x, w_query)\n", + "print(Q)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "NmfMln1Wmv73", - "colab_type": "code", - "outputId": "065b63ba-7584-4302-97cd-d5e1765470ed", "colab": { "base_uri": "https://localhost:8080/", "height": 108 - } + }, + "colab_type": "code", + "id": "NmfMln1Wmv73", + "outputId": "065b63ba-7584-4302-97cd-d5e1765470ed" }, - "source": [ - "print(\"Step 3: Matrix multiplication to obtain Q,K,V\")\n", - "\n", - "print(\"Keys: x * w_key\")\n", - "K=np.matmul(x,w_key)\n", - "print(K)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 3: Matrix multiplication to obtain Q,K,V\n", @@ -489,138 +220,127 @@ "[[0. 1. 1.]\n", " [4. 4. 0.]\n", " [2. 3. 1.]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 3: Matrix multiplication to obtain Q,K,V\")\n", + "\n", + "print(\"Keys: x * w_key\")\n", + "K= np.matmul(x, w_key)\n", + "print(K)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "v3Asv-8mOWkN", - "colab_type": "code", - "outputId": "2ec71310-0486-46f4-d9f5-d12a1a6ad0e6", "colab": { "base_uri": "https://localhost:8080/", "height": 90 - } + }, + "colab_type": "code", + "id": "v3Asv-8mOWkN", + "outputId": "2ec71310-0486-46f4-d9f5-d12a1a6ad0e6" }, - "source": [ - "print(\"Values: x * w_value\")\n", - "V=np.matmul(x,w_value)\n", - "print(V)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Values: x * w_value\n", "[[1. 2. 3.]\n", " [2. 8. 0.]\n", " [2. 6. 3.]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Values: x * w_value\")\n", + "V = np.matmul(x, w_value)\n", + "print(V)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "gfgRAHUuOp5c", - "colab_type": "code", - "outputId": "ad02f055-11e0-4b9a-eb15-b66e4846c95e", "colab": { "base_uri": "https://localhost:8080/", "height": 90 - } + }, + "colab_type": "code", + "id": "gfgRAHUuOp5c", + "outputId": "ad02f055-11e0-4b9a-eb15-b66e4846c95e" }, - "source": [ - "print(\"Step 4: Scaled Attention Scores\")\n", - "k_d=1 #square root of k_d=3 rounded down to 1 for this example\n", - "attention_scores = (Q @ K.transpose())/k_d\n", - "print(attention_scores)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 4: Scaled Attention Scores\n", "[[ 2. 4. 4.]\n", " [ 4. 16. 12.]\n", " [ 4. 12. 10.]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 4: Scaled Attention Scores\")\n", + "k_d = 1 #square root of k_d=3 rounded down to 1 for this example\n", + "attention_scores = np.matmul(Q, K.T) / k_d\n", + "print(attention_scores)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "hg2t6KuNOjzM", - "colab_type": "code", - "outputId": "c0610f91-cd1d-4b0f-b5ce-f6445481186a", "colab": { "base_uri": "https://localhost:8080/", "height": 90 - } + }, + "colab_type": "code", + "id": "hg2t6KuNOjzM", + "outputId": "c0610f91-cd1d-4b0f-b5ce-f6445481186a" }, - "source": [ - "print(\"Step 5: Scaled softmax attention_scores for each vector\")\n", - "attention_scores[0]=softmax(attention_scores[0])\n", - "attention_scores[1]=softmax(attention_scores[1])\n", - "attention_scores[2]=softmax(attention_scores[2])\n", - "print(attention_scores[0])\n", - "print(attention_scores[1])\n", - "print(attention_scores[2])" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 5: Scaled softmax attention_scores for each vector\n", "[0.06337894 0.46831053 0.46831053]\n", "[6.03366485e-06 9.82007865e-01 1.79861014e-02]\n", "[2.95387223e-04 8.80536902e-01 1.19167711e-01]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 5: Scaled softmax attention_scores for each vector\")\n", + "attention_scores[0] = softmax(attention_scores[0])\n", + "attention_scores[1] = softmax(attention_scores[1])\n", + "attention_scores[2] = softmax(attention_scores[2])\n", + "print(attention_scores[0])\n", + "print(attention_scores[1])\n", + "print(attention_scores[2])" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "R4Es7A7NOvjD", - "colab_type": "code", - "outputId": "b86060fe-1292-47c5-93f6-ddeeca1bfb62", "colab": { "base_uri": "https://localhost:8080/", "height": 199 - } + }, + "colab_type": "code", + "id": "R4Es7A7NOvjD", + "outputId": "b86060fe-1292-47c5-93f6-ddeeca1bfb62" }, - "source": [ - "print(\"Step 6: attention value obtained by score1/k_d * V\")\n", - "print(V[0])\n", - "print(V[1])\n", - "print(V[2])\n", - "print(\"Attention 1\")\n", - "attention1=attention_scores[0].reshape(-1,1)\n", - "attention1=attention_scores[0][0]*V[0]\n", - "print(attention1)\n", - "\n", - "print(\"Attention 2\")\n", - "attention2=attention_scores[0][1]*V[1]\n", - "print(attention2)\n", - "\n", - "print(\"Attention 3\")\n", - "attention3=attention_scores[0][2]*V[2]\n", - "print(attention3)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 6: attention value obtained by score1/k_d * V\n", @@ -633,60 +353,71 @@ "[0.93662106 3.74648425 0. ]\n", "Attention 3\n", "[0.93662106 2.80986319 1.40493159]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 6: attention value obtained by score1/k_d * V\")\n", + "print(V[0])\n", + "print(V[1])\n", + "print(V[2])\n", + "print(\"Attention 1\")\n", + "attention1 = attention_scores[0].reshape(-1,1)\n", + "attention1 = attention_scores[0][0] * V[0]\n", + "print(attention1)\n", + "\n", + "print(\"Attention 2\")\n", + "attention2 = attention_scores[0][1] * V[1]\n", + "print(attention2)\n", + "\n", + "print(\"Attention 3\")\n", + "attention3 = attention_scores[0][2] * V[2]\n", + "print(attention3)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "uBDKhaCvOzXj", - "colab_type": "code", - "outputId": "138901d8-0aa9-4db9-b8b1-76ad557e6688", "colab": { "base_uri": "https://localhost:8080/", "height": 54 - } + }, + "colab_type": "code", + "id": "uBDKhaCvOzXj", + "outputId": "138901d8-0aa9-4db9-b8b1-76ad557e6688" }, - "source": [ - "print(\"Step 7: summed the results to create the first line of the output matrix\")\n", - "attention_input1=attention1+attention2+attention3\n", - "print(attention_input1)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 7: summed the results to create the first line of the output matrix\n", "[1.93662106 6.68310531 1.59506841]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 7: summed the results to create the first line of the output matrix\")\n", + "attention_input1 = attention1 + attention2 + attention3\n", + "print(attention_input1)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "iEjgRcqHO4ik", - "colab_type": "code", - "outputId": "675a154b-a305-4c0c-e314-353541abfd3e", "colab": { "base_uri": "https://localhost:8080/", "height": 635 - } + }, + "colab_type": "code", + "id": "iEjgRcqHO4ik", + "outputId": "675a154b-a305-4c0c-e314-353541abfd3e" }, - "source": [ - "print(\"Step 8: Step 1 to 7 for inputs 1 to 3\")\n", - "#We assume we have 3 results with learned weights (they were not trained in this example)\n", - "#We assume we are implementing the original Transformer paper. We will have 3 results of 64 dimensions each\n", - "attention_head1=np.random.random((3, 64))\n", - "print(attention_head1)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 8: Step 1 to 7 for inputs 1 to 3\n", @@ -723,81 +454,87 @@ " 0.04674047 0.97762416 0.72747288 0.75616534 0.68105477 0.06914679\n", " 0.14054312 0.42816012 0.66792325 0.03168237 0.68685758 0.43487164\n", " 0.08064005 0.23444144 0.60360253 0.21423994]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 8: Step 1 to 7 for inputs 1 to 3\")\n", + "#We assume we have 3 results with learned weights (they were not trained in this example)\n", + "#We assume we are implementing the original Transformer paper. We will have 3 results of 64 dimensions each\n", + "attention_head1 = np.random.random((3, 64))\n", + "print(attention_head1)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "QI50dkZ1O630", - "colab_type": "code", - "outputId": "7d467842-f837-4e41-e099-534549b6fc05", "colab": { "base_uri": "https://localhost:8080/", "height": 54 - } + }, + "colab_type": "code", + "id": "QI50dkZ1O630", + "outputId": "7d467842-f837-4e41-e099-534549b6fc05" }, - "source": [ - "print(\"Step 9: We assume we have trained the 8 heads of the attention sub-layer\")\n", - "z0h1=np.random.random((3, 64))\n", - "z1h2=np.random.random((3, 64))\n", - "z2h3=np.random.random((3, 64))\n", - "z3h4=np.random.random((3, 64))\n", - "z4h5=np.random.random((3, 64))\n", - "z5h6=np.random.random((3, 64))\n", - "z6h7=np.random.random((3, 64))\n", - "z7h8=np.random.random((3, 64))\n", - "print(\"shape of one head\",z0h1.shape,\"dimension of 8 heads\",64*8)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 9: We assume we have trained the 8 heads of the attention sub-layer\n", "shape of one head (3, 64) dimension of 8 heads 512\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 9: We assume we have trained the 8 heads of the attention sub-layer\")\n", + "z0h1 = np.random.random((3, 64))\n", + "z1h2 = np.random.random((3, 64))\n", + "z2h3 = np.random.random((3, 64))\n", + "z3h4 = np.random.random((3, 64))\n", + "z4h5 = np.random.random((3, 64))\n", + "z5h6 = np.random.random((3, 64))\n", + "z6h7 = np.random.random((3, 64))\n", + "z7h8 = np.random.random((3, 64))\n", + "print(\"shape of one head\", z0h1.shape, \"dimension of 8 heads\", 64 * 8)" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "3n87LE92_Puf", - "colab_type": "code", - "outputId": "55d00415-ebea-43a6-b4c5-ff13e02c3052", "colab": { "base_uri": "https://localhost:8080/", "height": 90 - } + }, + "colab_type": "code", + "id": "3n87LE92_Puf", + "outputId": "55d00415-ebea-43a6-b4c5-ff13e02c3052" }, - "source": [ - "print(\"Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model\")\n", - "output_attention=np.hstack((z0h1,z1h2,z2h3,z3h4,z4h5,z5h6,z6h7,z7h8))\n", - "print(output_attention)" - ], - "execution_count": 0, "outputs": [ { + "name": "stdout", "output_type": "stream", "text": [ "Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model\n", "[[0.46950893 0.88546586 0.47615937 ... 0.08285802 0.16577096 0.61094461]\n", " [0.31638247 0.24246402 0.30390966 ... 0.42283366 0.62127905 0.64414042]\n", " [0.1922683 0.7017995 0.60116595 ... 0.20012387 0.16264044 0.93645276]]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "print(\"Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model\")\n", + "output_attention = np.hstack((z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8))\n", + "print(output_attention)" ] }, { "cell_type": "markdown", "metadata": { - "id": "PJLl4Jf3fPLh", - "colab_type": "text" + "colab_type": "text", + "id": "PJLl4Jf3fPLh" }, "source": [ "And now with Hugging Face in one line!" @@ -805,24 +542,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "CZIRvcRmfTPb", + "colab": {}, "colab_type": "code", - "colab": {} + "id": "CZIRvcRmfTPb" }, + "outputs": [], "source": [ "#@title Transformer Installation\n", "!pip -qq install transformers" - ], - "execution_count": 0, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "cNwLYc-SfXdF", - "colab_type": "code", - "outputId": "d1314cc6-74d6-45cf-b8d6-0a903e58ac60", "colab": { "base_uri": "https://localhost:8080/", "height": 85, @@ -836,24 +571,18 @@ "15aa4b6f8f784c74804107be249126b9", "edea457617ed4792aeeb65292019ceb4" ] - } + }, + "colab_type": "code", + "id": "cNwLYc-SfXdF", + "outputId": "d1314cc6-74d6-45cf-b8d6-0a903e58ac60" }, - "source": [ - "#@title Retrieve pipeline of modules and choose English to French translation\n", - "from transformers import pipeline\n", - "translator = pipeline(\"translation_en_to_fr\")\n", - "#One line of code!\n", - "print(translator(\"It is easy to translate languages with transformers\", max_length=40))" - ], - "execution_count": 0, "outputs": [ { - "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "946c90b82f7f46caa25c885668b75eab", - "version_minor": 0, - "version_major": 2 + "version_major": 2, + "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…" @@ -861,17 +590,298 @@ }, "metadata": { "tags": [] - } + }, + "output_type": "display_data" }, { + "name": "stdout", "output_type": "stream", "text": [ "\n", "[{'translation_text': 'Il est facile de traduire des langues avec des transformateurs.'}]\n" - ], - "name": "stdout" + ] } + ], + "source": [ + "#@title Retrieve pipeline of modules and choose English to French translation\n", + "from transformers import pipeline\n", + "translator = pipeline(\"translation_en_to_fr\")\n", + "#One line of code!\n", + "print(translator(\"It is easy to translate languages with transformers\", max_length=40))" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Multi-Head Attention Sub-Layer.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6 (main, Mar 10 2023, 10:55:28) [GCC 11.3.0]" + }, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0ba4a91f472e4c41ba80ab4025288446": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "15aa4b6f8f784c74804107be249126b9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4191af78535e4da8bb797690eff84e00": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "946c90b82f7f46caa25c885668b75eab": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9ce3d57b96b64da0b15e3f3626bacb30", + "IPY_MODEL_f8da2c91156342a69d9b262f4f993aa4" + ], + "layout": "IPY_MODEL_4191af78535e4da8bb797690eff84e00" + } + }, + "97370923218945c5b80ab468751ac8a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "9ce3d57b96b64da0b15e3f3626bacb30": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": "IPY_MODEL_0ba4a91f472e4c41ba80ab4025288446", + "max": 230, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_97370923218945c5b80ab468751ac8a7", + "value": 230 + } + }, + "edea457617ed4792aeeb65292019ceb4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f8da2c91156342a69d9b262f4f993aa4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_edea457617ed4792aeeb65292019ceb4", + "placeholder": "​", + "style": "IPY_MODEL_15aa4b6f8f784c74804107be249126b9", + "value": " 230/230 [00:01<00:00, 185B/s]" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}