From 2d12afbc344ce29ef83cf4ac6418080ac905a7fc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 16 Feb 2026 18:03:15 +0000 Subject: [PATCH 1/5] adding tutorials from pruna pro --- docs/tutorials/computer_vision.ipynb | 177 +++++++++++++++++++++++ docs/tutorials/index.rst | 18 +++ docs/tutorials/recovery.ipynb | 205 +++++++++++++++++++++++++++ docs/tutorials/ring_attn.ipynb | 176 +++++++++++++++++++++++ 4 files changed, 576 insertions(+) create mode 100644 docs/tutorials/computer_vision.ipynb create mode 100644 docs/tutorials/recovery.ipynb create mode 100644 docs/tutorials/ring_attn.ipynb diff --git a/docs/tutorials/computer_vision.ipynb b/docs/tutorials/computer_vision.ipynb new file mode 100644 index 00000000..0e43690b --- /dev/null +++ b/docs/tutorials/computer_vision.ipynb @@ -0,0 +1,177 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Blazingly fast Computer Vision Models" + ] + }, + { + "cell_type": "raw", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the `pruna` package to optimize any custom computer vision model. We will use the `vit_b_16` model as an example. Any execution times given below are measured on a T4 GPU." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Loading the CV Model\n", + "\n", + "First, load your ViT model.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torchvision\n", + "\n", + "model = torchvision.models.vit_b_16(weights=\"ViT_B_16_Weights.DEFAULT\").cuda()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Initializing the Smash Config\n", + "\n", + "Next, initialize the smash_config." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import SmashConfig\n", + "\n", + "# Initialize the SmashConfig\n", + "smash_config = SmashConfig([\"x_fast\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Smashing the Model\n", + "\n", + "Now, you can smash the model, which will take around 5 seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import smash\n", + "\n", + "# Smash the model\n", + "smashed_model = smash(\n", + " model=model,\n", + " smash_config=smash_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Preparing the Input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from torchvision import transforms\n", + "\n", + "# Generating a random image\n", + "image = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)\n", + "input_tensor = transforms.ToTensor()(image).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Running the Model\n", + "\n", + "After the model has been compiled, we run inference for a few iterations as warm-up. This will take around 8 seconds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run some warm-up iterations\n", + "for _ in range(5):\n", + " smashed_model(input_tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, run the model to transcribe the audio file with accelerated inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the result\n", + "smashed_model(input_tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrap Up\n", + "\n", + "Congratulations! You have successfully smashed a CV model. You can now use the `pruna` package to optimize any custom CV model. The only parts that you should modify are step 1, 4 and 5 to fit your use case" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pruna", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index 68873a6e..02d35834 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -87,6 +87,24 @@ These tutorials will guide you through the process of using |pruna| to optimize Learn how to use the ``target_modules`` parameter to target specific modules in your model. + .. grid-item-card:: Blazingly Fast Computer Vision + :text-align: center + :link: ./computer_vision.ipynb + + Optimize any ``computer vision`` model with ``x_fast`` ``compilation``. + + .. grid-item-card:: Recover Quality after Quantization + :text-align: center + :link: ./recovery.ipynb + + Recover quality using ``text_to_image_perp`` after ``diffusers_int8`` ``quantization``. + + .. grid-item-card:: Distribute across GPUs with Ring Attention + :text-align: center + :link: ./ring_attn.ipynb + + Distribute your ``Flux`` model across multiple GPUs with ``ring_attn`` and ``torch_compile``. + .. 
toctree:: :hidden: :maxdepth: 1 diff --git a/docs/tutorials/recovery.ipynb b/docs/tutorials/recovery.ipynb new file mode 100644 index 00000000..2bfcf6aa --- /dev/null +++ b/docs/tutorials/recovery.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Recovering Quality after Quantizing Models to 4 Bits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "\n", + "\n", + "This tutorial demonstrates how to use the ``pruna`` package to use our experimental \"recovery\" feature to recover the model quality after quantization. This option allows you to push quantization or other compression techniques to the limit without compromising quality.\n", + "\n", + "We will use :doc:`PERP ` on the Sana model as an example, but you can also use Stable Diffusion and Flux models depending on your device. Any execution times given below are measured on a L40S GPU.\n", + "\n", + "Note that recovery is available in the ``pruna`` package." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Loading the Sana Model\n", + "\n", + "First, load the Sana model, and generate an image for quality reference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from diffusers import SanaPipeline\n", + "\n", + "pipe = SanaPipeline.from_pretrained(\n", + " \"Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers\",\n", + " torch_dtype=torch.bfloat16,\n", + ").to(\"cuda\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We generate an image to have a reference for quality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"A crow walking along a river near a foggy cliff, with cute yellow ducklings following it in a line, at sunset.\"\n", + "pipe(prompt).images[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Initializing the SmashConfig" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "Next, initialize the SmashConfig. We'll use :doc:`bitsandbytes' quantization ` to 4 bits, and recover quality by finetuning with PERP on a text-to-image dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import SmashConfig\n", + "\n", + "smash_config = SmashConfig({\n", + " # Quantize the model to 4-bits\n", + " \"diffusers_int8\": {\n", + " \"weight_bits\": 4\n", + " },\n", + " # Recover, allowing you to push quantization to lower bit rates without compromising quality\n", + " \"text_to_image_perp\": {\n", + " # you can increase or reduce 'batch_size' depending on your GPU, or use 'gradient_accumulation_steps' with it\n", + " \"batch_size\": 8,\n", + " \"num_epochs\": 4,\n", + " \"validate_every_n_epoch\": 0.5 # run validation every half epoch\n", + " }\n", + "})\n", + "# Attach a text-to-image dataset, used for recovery\n", + "smash_config.add_data(\"COCO\")\n", + "smash_config.data.limit_datasets((256, 64, 1)) # training on 256 samples and validating on 64" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Smashing the Model\n", + "\n", + "Now, smash the model. This takes about 9 minutes on an L40S GPU, but it depends on how many samples are used for recovery.\n", + "Recovery logging is handled through __Weights & Biases__, make sure you have it installed and set up in your environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import smash\n", + "\n", + "smashed_model = smash(\n", + " model=pipe,\n", + " smash_config=smash_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Running the Model\n", + "Finally, we run the model which has been quantized and recovered. It has a lower memory footprint than the original because of the quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "smashed_model(prompt).images[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrap up" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext", + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "Congratulations! You have successfully recovered quality on your compressed Sana model. You can now use the ``pruna`` package to its limit by using aggressive compression alongside recovery. The only parts you should modify are steps 1 and 4 to fit your use case." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "prunatree", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/ring_attn.ipynb b/docs/tutorials/ring_attn.ipynb new file mode 100644 index 00000000..e31ad05d --- /dev/null +++ b/docs/tutorials/ring_attn.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributing Flux on Multiple GPUs" + ] + }, + { + "cell_type": "raw", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will walk you through how to use the `pruna` package to optimize your Flux model for faster inference on multiple GPUs. Any execution times below are measured on a set of 2 H100 PCIes.\n", + "Note that the `pruna` distributers are also compatible with `torchrun`, simply convert this tutorial to a script and run with `torchrun --nproc_per_node=2 flux_tutorial.py`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Loading the Flux Model\n", + "\n", + "First, load your Flux model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from diffusers import FluxPipeline\n", + "\n", + "pipe = FluxPipeline.from_pretrained(\"black-forest-labs/FLUX.1-dev\", torch_dtype=torch.bfloat16)\n", + "pipe.to(\"cuda\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Initializing the Smash Config\n", + "\n", + "Next, initialize the `smash_config`. 
For this tutorial, we will select our `ring_attn` distributer and `torch_compile`. If this is not enough for you, you can play around with additionally activating e.g. the quantizer, factorizer and pruner below!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pruna import SmashConfig, smash\n", + "\n", + "# Initialize the SmashConfig and configure the algorithms\n", + "smash_config = SmashConfig([\"ring_attn\", \"torch_compile\"])\n", + "# Additionally configure suitable hyperparameters\n", + "smash_config.add({\n", + " \"torch_compile_target\": \"module_list\"\n", + "})\n", + "\n", + "# You can choose to activate further algorithms compatible with the ring_attn distributer!\n", + "# smash_config.add([\"qkv_diffusers\", \"fp8\", \"padding_pruning\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Smashing the Model\n", + "\n", + "Now, you can smash the model, which can take up to one minute. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe = smash(\n", + " model=pipe,\n", + " smash_config=smash_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Running the Model\n", + "\n", + "After the model has been distributed and compiled, we run inference for a few iterations as warm-up. The initial inference time of 10.4 seconds has now been reduced to around 2.7 seconds!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = (\n", + " \"An anime illustration of Sydney Opera House sitting next to Eiffel tower, under a blue night sky of \"\n", + " \"roiling energy, exploding yellow stars, and radiating swirls of blue.\"\n", + ")\n", + "\n", + "for _ in range(5):\n", + " output = pipe(prompt, num_inference_steps=50).images[0]\n", + "output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Clean-Up\n", + "\n", + "To properly clean up the distributed model, make sure to call the `destroy` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.destroy()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrap Up\n", + "\n", + "Congratulations! You have successfully distributed a Flux model on multiple GPUs and combined it with other `pruna` algorithms - it is that easy." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pruna", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From bf3d9ea67a86c5c90c88668a09069f8cf220783e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 17 Feb 2026 17:44:38 +0000 Subject: [PATCH 2/5] small changes to ring_attn tutorial --- docs/tutorials/ring_attn.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/ring_attn.ipynb b/docs/tutorials/ring_attn.ipynb index e31ad05d..cc2ffa1d 100644 --- a/docs/tutorials/ring_attn.ipynb +++ b/docs/tutorials/ring_attn.ipynb @@ -75,7 +75,7 @@ "})\n", "\n", "# You can choose to activate further algorithms compatible with the ring_attn distributer!\n", - "# smash_config.add([\"qkv_diffusers\", \"fp8\", \"padding_pruning\"])" + "# smash_config.add([\"qkv_diffusers\", \"padding_pruning\"])" ] }, { @@ -173,4 +173,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From d047e5df060f17525962a9dd0a049eab482a23db Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 19 Feb 2026 14:15:25 +0000 Subject: [PATCH 3/5] make tutorials toctree explicit --- docs/tutorials/index.rst | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index 02d35834..11f0da6a 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -109,6 +109,22 @@ These tutorials will guide you through the process of using |pruna| to optimize :hidden: :maxdepth: 1 :caption: Pruna - :glob: - - ./* + + image_generation + video_generation + llms + reasoning_llm + asr_tutorial + cv_cpu + diffusion_quantization_acceleration + 
evaluation_agent_cmmd + sana_diffusers_int8 + flux2klein4b_tutorial + sd_deepcache + deploying_sana_tutorial + target_modules_quanto + portable_compilation + llm_quantization_compilation_acceleration + computer_vision + recovery + ring_attn From 7ac3af95fe965a6594daee63c0a2c36b4a460121 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 20 Feb 2026 15:43:35 +0000 Subject: [PATCH 4/5] fixin colab link --- docs/tutorials/computer_vision.ipynb | 4 ++-- docs/tutorials/recovery.ipynb | 4 ++-- docs/tutorials/ring_attn.ipynb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/computer_vision.ipynb b/docs/tutorials/computer_vision.ipynb index 0e43690b..1e10f7d4 100644 --- a/docs/tutorials/computer_vision.ipynb +++ b/docs/tutorials/computer_vision.ipynb @@ -15,7 +15,7 @@ } }, "source": [ - "\n", + "\n", " \"Open\n", "" ] @@ -174,4 +174,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/docs/tutorials/recovery.ipynb b/docs/tutorials/recovery.ipynb index 2bfcf6aa..6e2f3a16 100644 --- a/docs/tutorials/recovery.ipynb +++ b/docs/tutorials/recovery.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", " \"Open\n", "" ] @@ -202,4 +202,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/docs/tutorials/ring_attn.ipynb b/docs/tutorials/ring_attn.ipynb index cc2ffa1d..da37c4fc 100644 --- a/docs/tutorials/ring_attn.ipynb +++ b/docs/tutorials/ring_attn.ipynb @@ -15,7 +15,7 @@ } }, "source": [ - "\n", + "\n", " \"Open\n", "" ] From 649ddf5b57a3a2e479079c074787178d8e25bfc9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 20 Feb 2026 16:05:19 +0000 Subject: [PATCH 5/5] fixed grid cards --- docs/tutorials/computer_vision.ipynb | 348 +++++++++++++-------------- docs/tutorials/index.rst | 36 ++- 2 files changed, 191 insertions(+), 193 deletions(-) diff --git a/docs/tutorials/computer_vision.ipynb b/docs/tutorials/computer_vision.ipynb index 
1e10f7d4..8611a2d7 100644 --- a/docs/tutorials/computer_vision.ipynb +++ b/docs/tutorials/computer_vision.ipynb @@ -1,177 +1,177 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Blazingly fast Computer Vision Models" - ] - }, - { - "cell_type": "raw", - "metadata": { - "vscode": { - "languageId": "raw" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Blazingly fast Computer Vision Models" + ] + }, + { + "cell_type": "raw", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the `pruna` package to optimize any custom computer vision model. We will use the `vit_b_16` model as an example. Any execution times given below are measured on a T4 GPU." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Loading the CV Model\n", + "\n", + "First, load your ViT model.\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import torchvision\n", + "\n", + "model = torchvision.models.vit_b_16(weights=\"ViT_B_16_Weights.DEFAULT\").cuda()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Initializing the Smash Config\n", + "\n", + "Next, initialize the smash_config." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from pruna import SmashConfig\n", + "\n", + "# Initialize the SmashConfig\n", + "smash_config = SmashConfig([\"x_fast\"])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Smashing the Model\n", + "\n", + "Now, you can smash the model, which will take around 5 seconds." 
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from pruna import smash\n", + "\n", + "# Smash the model\n", + "smashed_model = smash(\n", + " model=model,\n", + " smash_config=smash_config,\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Preparing the Input" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import numpy as np\n", + "from torchvision import transforms\n", + "\n", + "# Generating a random image\n", + "image = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)\n", + "input_tensor = transforms.ToTensor()(image).unsqueeze(0).cuda()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Running the Model\n", + "\n", + "After the model has been compiled, we run inference for a few iterations as warm-up. This will take around 8 seconds." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# run some warm-up iterations\n", + "for _ in range(5):\n", + " smashed_model(input_tensor)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, run the model with accelerated inference." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Display the result\n", + "smashed_model(input_tensor)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrap Up\n", + "\n", + "Congratulations! You have successfully smashed a CV model. You can now use the `pruna` package to optimize any custom CV model. 
The only parts that you should modify are step 1, 4 and 5 to fit your use case" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pruna", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.15" } - }, - "source": [ - "\n", - " \"Open\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This tutorial demonstrates how to use the `pruna` package to optimize any custom computer vision model. We will use the `vit_b_16` model as an example. Any execution times given below are measured on a T4 GPU." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. Loading the CV Model\n", - "\n", - "First, load your ViT model.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torchvision\n", - "\n", - "model = torchvision.models.vit_b_16(weights=\"ViT_B_16_Weights.DEFAULT\").cuda()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. Initializing the Smash Config\n", - "\n", - "Next, initialize the smash_config." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pruna import SmashConfig\n", - "\n", - "# Initialize the SmashConfig\n", - "smash_config = SmashConfig([\"x_fast\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Smashing the Model\n", - "\n", - "Now, you can smash the model, which will take around 5 seconds." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pruna import smash\n", - "\n", - "# Smash the model\n", - "smashed_model = smash(\n", - " model=model,\n", - " smash_config=smash_config,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4. 
Preparing the Input" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from torchvision import transforms\n", - "\n", - "# Generating a random image\n", - "image = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)\n", - "input_tensor = transforms.ToTensor()(image).unsqueeze(0).cuda()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5. Running the Model\n", - "\n", - "After the model has been compiled, we run inference for a few iterations as warm-up. This will take around 8 seconds." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# run some warm-up iterations\n", - "for _ in range(5):\n", - " smashed_model(input_tensor)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, run the model to transcribe the audio file with accelerated inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display the result\n", - "smashed_model(input_tensor)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Wrap Up\n", - "\n", - "Congratulations! You have successfully smashed a CV model. You can now use the `pruna` package to optimize any custom CV model. 
The only parts that you should modify are step 1, 4 and 5 to fit your use case" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pruna", - "language": "python", - "name": "python3" }, - "language_info": { - "name": "python", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index 11f0da6a..5464d991 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -75,6 +75,7 @@ These tutorials will guide you through the process of using |pruna| to optimize :link: ./sd_deepcache.ipynb Optimize your ``diffusion`` model with ``deepcache`` ``caching``. + .. grid-item-card:: Optimize and Deploy Sana diffusers with Pruna and Hugging Face :text-align: center :link: ./deploying_sana_tutorial.ipynb @@ -105,26 +106,23 @@ These tutorials will guide you through the process of using |pruna| to optimize Distribute your ``Flux`` model across multiple GPUs with ``ring_attn`` and ``torch_compile``. + .. grid-item-card:: Reducing Warm-up Time for Compilation + :text-align: center + :link: ./portable_compilation.ipynb + + Reduce warm-up time significantly when re-loading a ``torch_compile`` compiled model on a new machine. + + .. grid-item-card:: Quantize and Speedup any LLM + :text-align: center + :link: ./llm_quantization_compilation_acceleration.ipynb + + Optimize latency and memory footprint of any LLM with ``hqq`` ``quantization`` and ``torch_compile`` ``compilation``. + .. toctree:: :hidden: :maxdepth: 1 :caption: Pruna - - image_generation - video_generation - llms - reasoning_llm - asr_tutorial - cv_cpu - diffusion_quantization_acceleration - evaluation_agent_cmmd - sana_diffusers_int8 - flux2klein4b_tutorial - sd_deepcache - deploying_sana_tutorial - target_modules_quanto - portable_compilation - llm_quantization_compilation_acceleration - computer_vision - recovery - ring_attn + :glob: + + ./* +