|
1 | 1 | """ |
2 | | -Vision Transformer (ViT) |
3 | | -======================== |
| 2 | +Vision Transformer (ViT) Module |
| 3 | +================================ |
4 | 4 |
|
5 | | -This module demonstrates how to use a pretrained Vision Transformer (ViT) |
6 | | -for image classification using Hugging Face's Transformers library. |
| 5 | +Classify images using a pretrained Vision Transformer (ViT) |
| 6 | +from Hugging Face Transformers. |
| 7 | +
|
| 8 | +Can be used as a demo or imported in other scripts. |
7 | 9 |
|
8 | 10 | Source: |
9 | 11 | https://huggingface.co/docs/transformers/model_doc/vit |
10 | 12 | """ |
11 | 13 |
|
| 14 | +try: |
| 15 | + import requests |
| 16 | + import torch |
| 17 | + from io import BytesIO |
| 18 | + from PIL import Image |
| 19 | + from transformers import ViTForImageClassification, ViTImageProcessor |
| 20 | +except ImportError as e: |
| 21 | + raise ImportError( |
| 22 | + "This module requires 'torch', 'transformers', 'PIL', and 'requests'. " |
| 23 | + "Install them with: pip install torch transformers pillow requests" |
| 24 | + ) from e |
12 | 25 |
|
13 | | -def vision_transformer_demo() -> None: |
14 | | - """ |
15 | | - Demonstrates Vision Transformer (ViT) on a sample image. |
16 | 26 |
|
17 | | - Example: |
18 | | - >>> vision_transformer_demo() # doctest: +SKIP |
19 | | - Predicted label: tabby, tabby cat |
20 | | - """ |
21 | | - try: |
22 | | - import requests |
23 | | - import torch |
24 | | - from PIL import Image |
25 | | - from transformers import ViTForImageClassification, ViTImageProcessor |
26 | | - except ImportError as e: |
27 | | - raise ImportError( |
28 | | - "This demo requires 'torch', 'transformers', 'PIL', and 'requests' packages. " |
29 | | - "Install them with: pip install torch transformers pillow requests" |
30 | | - ) from e |
31 | | - |
32 | | - # Load a sample image |
33 | | - url = ( |
34 | | - "https://huggingface.co/datasets/huggingface/documentation-images/" |
35 | | - "resolve/main/cat_sample.jpeg" |
36 | | - ) |
37 | | - image = Image.open(requests.get(url, stream=True, timeout=10).raw) |
38 | | - |
39 | | - # Load pretrained model and processor |
def classify_image(image: Image.Image) -> str:
    """Return the predicted ImageNet label for *image*.

    Uses the pretrained ``google/vit-base-patch16-224`` checkpoint from
    Hugging Face Transformers.

    Args:
        image: A PIL image to classify.

    Returns:
        The human-readable label of the top-scoring class.
    """
    checkpoint = "google/vit-base-patch16-224"
    processor = ViTImageProcessor.from_pretrained(checkpoint)
    model = ViTForImageClassification.from_pretrained(checkpoint)

    # Resize/normalize the image and pack it into a batched tensor.
    encoded = processor(images=image, return_tensors="pt")

    # Inference only -- gradients are not needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    best_idx = logits.argmax(-1).item()
    return model.config.id2label[best_idx]
| 40 | + |
| 41 | + |
def demo(url: str | None = None) -> None:
    """
    Run a demo: download an image and print its predicted ViT label.

    Args:
        url: URL of the image to classify. If ``None``, a default sample
            cat photo is used.
    """
    if url is None:
        # default example image
        url = "https://images.unsplash.com/photo-1592194996308-7b43878e84a6"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
    except Exception as e:  # best-effort demo: report the failure, don't crash
        print(f"Failed to load image from {url}. Error: {e}")
        return

    label = classify_image(image)
    print(f"Predicted label: {label}")
55 | 62 |
|
56 | 63 |
|
# Run the demo with the default sample image when executed as a script.
if __name__ == "__main__":
    demo()
0 commit comments