Skip to content

Commit 6262fee

Browse files
committed
inference with fastapi
1 parent 92dca5c commit 6262fee

5 files changed

Lines changed: 80 additions & 124 deletions

File tree

deploy_dataflow_streaming.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
# Deploy Dataflow streaming job for real-time Iris inference
3+
# Deploy Dataflow streaming job for real-time Iris inference using FastAPI service
44
set -e
55

66
# Configuration
@@ -12,9 +12,10 @@ OUTPUT_TABLE="$PROJECT_ID:ml_dataset.iris_predictions_streaming"
1212
TEMP_LOCATION="gs://sb-vertex/temp"
1313
STAGING_LOCATION="gs://sb-vertex/staging"
1414
SERVICE_ACCOUNT="kfp-mlops@deeplearning-sahil.iam.gserviceaccount.com"
15-
ENDPOINT_NAME="Iris-Classifier-XGBoost"
15+
SERVICE_URL="https://iris-classifier-xgboost-service-zoxyfmo73q-uc.a.run.app"
1616

17-
echo "Deploying Dataflow streaming job for real-time inference..."
17+
echo "Deploying Dataflow streaming job for real-time inference using FastAPI service..."
18+
echo "Note: Update SERVICE_URL with the actual Cloud Run service URL after deployment"
1819

1920
# Run the Dataflow job
2021
echo "Starting Dataflow streaming job: $JOB_NAME"
@@ -23,7 +24,7 @@ python src/ml_pipelines_kfp/dataflow/iris_streaming_pipeline.py \
2324
--output_table $OUTPUT_TABLE \
2425
--project_id $PROJECT_ID \
2526
--region $REGION \
26-
--endpoint_name $ENDPOINT_NAME \
27+
--service_url $SERVICE_URL \
2728
--runner DataflowRunner \
2829
--job_name $JOB_NAME \
2930
--temp_location $TEMP_LOCATION \

pipeline.yaml

Lines changed: 34 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,7 @@ deploymentSpec:
257257
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
258258
\ python3 -m pip install --quiet --no-warn-script-location 'google-cloud-aiplatform>=1.59.0'\
259259
\ 'google-cloud-run>=0.10.0' 'google-cloud-storage>=2.10.0' 'requests>=2.31.0'\
260-
\ 'joblib>=1.4.2' 'scikit-learn>=1.3.0' 'pandas>=2.0.0' 'numpy>=1.24.0'\
261-
\ 'grpcio-status>=1.62.3' && \"$0\" \"$@\"\n"
260+
\ 'joblib>=1.4.2' 'grpcio-status>=1.62.3' && \"$0\" \"$@\"\n"
262261
- sh
263262
- -ec
264263
- 'program_path=$(mktemp -d)
@@ -286,17 +285,15 @@ deploymentSpec:
286285
\ credentials\n client = aiplatform_v1.ModelServiceClient(\n credentials=credentials,\n\
287286
\ client_options={\"api_endpoint\": f\"{location}-aiplatform.googleapis.com\"\
288287
}\n )\n\n print(f\"Searching for blessed model with name: {model_name}\"\
289-
)\n\n # Use the high-level aiplatform library to list all model versions\n\
290-
\ # models = aiplatform.Model.list(filter=f\"display_name={model_name}\"\
291-
)\n # blessed_model = None\n\n request = {\n \"parent\"\
292-
: f\"projects/{project_id}/locations/{location}\",\n \"filter\"\
293-
: f\"display_name={model_name}\"\n }\n\n models = list(client.list_models(request=request))\n\
294-
\ blessed_model = None\n\n print(f\"Found {len(models)} model versions\
295-
\ with name {model_name}\")\n\n # Search through all model versions (each\
296-
\ item in models is already a version)\n for parent_model in models:\n\
297-
\ print(f\"Checking parent model: {parent_model.name}\")\n\n \
298-
\ # List all versions of this model\n versions_request = {\"name\"\
299-
: parent_model.name}\n versions = list(client.list_model_versions(request=versions_request))\n\
288+
)\n\n request = {\n \"parent\": f\"projects/{project_id}/locations/{location}\"\
289+
,\n \"filter\": f\"display_name={model_name}\"\n }\n\n\
290+
\ models = list(client.list_models(request=request))\n blessed_model\
291+
\ = None\n\n print(f\"Found {len(models)} model versions with name {model_name}\"\
292+
)\n\n # Search through all model versions (each item in models is already\
293+
\ a version)\n for parent_model in models:\n print(f\"Checking\
294+
\ parent model: {parent_model.name}\")\n\n # List all versions of\
295+
\ this model\n versions_request = {\"name\": parent_model.name}\n\
296+
\ versions = list(client.list_model_versions(request=versions_request))\n\
300297
\n print(f\"Found {len(versions)} versions for this model\")\n\n\
301298
\ for version in versions:\n print(f\"Version {version.version_id}:\
302299
\ Aliases = {list(version.version_aliases)}\")\n if \"blessed\"\
@@ -308,34 +305,29 @@ deploymentSpec:
308305
\ ValueError(f\"No blessed version found for model {model_name}. Available\
309306
\ versions: {available_versions}\")\n\n print(f\"Found blessed model:\
310307
\ {blessed_model.name}\")\n print(f\"Model URI: {blessed_model.artifact_uri}\"\
311-
)\n\n # 2. Download joblib model from blessed version\n gcs_uri =\
312-
\ blessed_model.artifact_uri\n if not gcs_uri.startswith('gs://'):\n\
313-
\ raise ValueError(f\"Expected GCS URI, got: {gcs_uri}\")\n\n \
314-
\ bucket_name = gcs_uri.replace('gs://', '').split('/')[0]\n model_path\
315-
\ = '/'.join(gcs_uri.replace('gs://', '').split('/')[1:])\n\n print(f\"\
316-
Downloading model from gs://{bucket_name}/{model_path}\")\n\n storage_client\
317-
\ = storage.Client()\n bucket = storage_client.bucket(bucket_name)\n\n\
318-
\ # Download and validate the model\n model_blob_path = f\"{model_path}/model.joblib\"\
308+
)\n\n # Download joblib model from blessed version\n gcs_uri = blessed_model.artifact_uri\n\
309+
\ if not gcs_uri.startswith('gs://'):\n raise ValueError(f\"Expected\
310+
\ GCS URI, got: {gcs_uri}\")\n\n bucket_name = gcs_uri.replace('gs://',\
311+
\ '').split('/')[0]\n model_path = '/'.join(gcs_uri.replace('gs://',\
312+
\ '').split('/')[1:])\n\n print(f\"Downloading model from gs://{bucket_name}/{model_path}\"\
313+
)\n\n storage_client = storage.Client()\n bucket = storage_client.bucket(bucket_name)\n\
314+
\n # Download and validate the model\n model_blob_path = f\"{model_path}/model.joblib\"\
319315
\n blob = bucket.blob(model_blob_path)\n\n if not blob.exists():\n\
320316
\ raise ValueError(f\"Model file not found at gs://{bucket_name}/{model_blob_path}\"\
321317
)\n\n with tempfile.NamedTemporaryFile(suffix='.joblib', delete=False)\
322318
\ as temp_file:\n blob.download_to_filename(temp_file.name)\n \
323319
\ local_model_path = temp_file.name\n\n print(f\"Downloaded model\
324-
\ to: {local_model_path}\")\n\n # 3. Validate model can be loaded\n \
325-
\ try:\n model_obj = joblib.load(local_model_path)\n print(f\"\
326-
Model type: {type(model_obj)}\")\n print(f\"Model validation successful\"\
327-
)\n except Exception as e:\n os.unlink(local_model_path)\n \
328-
\ raise ValueError(f\"Model validation failed: {e}\")\n\n # 4. Copy\
329-
\ model to standard deployment location\n deployment_model_path = f\"\
330-
deployed-models/{service_name}/model.joblib\"\n deployment_blob = bucket.blob(deployment_model_path)\n\
331-
\n print(f\"Copying model to deployment location: gs://{bucket_name}/{deployment_model_path}\"\
320+
\ to: {local_model_path}\")\n\n # Copy model to standard deployment location\n\
321+
\ deployment_model_path = f\"deployed-models/{service_name}/model.joblib\"\
322+
\n deployment_blob = bucket.blob(deployment_model_path)\n\n print(f\"\
323+
Copying model to deployment location: gs://{bucket_name}/{deployment_model_path}\"\
332324
)\n deployment_blob.upload_from_filename(local_model_path)\n\n model_gcs_path\
333325
\ = f\"gs://{bucket_name}/{deployment_model_path}\"\n print(f\"Model\
334-
\ available at: {model_gcs_path}\")\n\n # 5. Deploy to Cloud Run using\
335-
\ pre-built generic image\n print(f\"Deploying to Cloud Run service:\
336-
\ {service_name}\")\n\n run_client = run_v2.ServicesClient()\n\n #\
337-
\ Use pre-built generic FastAPI image from CI/CD\n generic_image = fastapi_image_name\n\
338-
\n service_config = {\n \"parent\": f\"projects/{project_id}/locations/{location}\"\
326+
\ available at: {model_gcs_path}\")\n\n # Deploy to Cloud Run using pre-built\
327+
\ generic image\n print(f\"Deploying to Cloud Run service: {service_name}\"\
328+
)\n\n run_client = run_v2.ServicesClient()\n\n # Use pre-built generic\
329+
\ FastAPI image from CI/CD\n generic_image = fastapi_image_name\n\n \
330+
\ service_config = {\n \"parent\": f\"projects/{project_id}/locations/{location}\"\
339331
,\n \"service_id\": service_name,\n \"service\": {\n \
340332
\ \"template\": {\n \"containers\": [{\n \
341333
\ \"image\": generic_image,\n \"ports\": [{\"\
@@ -370,14 +362,14 @@ deploymentSpec:
370362
\ resource=result.name, # This should be the full resource name\n\
371363
\ policy=policy\n )\n run_client.set_iam_policy(request=iam_request)\n\
372364
\n service_url = result.uri\n print(f\"Service deployed successfully\
373-
\ to: {service_url}\")\n\n # 6. Test deployment\n print(\"\
374-
Testing deployment...\")\n time.sleep(30) # Wait for service to\
375-
\ be ready\n\n test_payload = {\n \"instances\": [\n \
376-
\ {\"SepalLengthCm\": 5.1, \"SepalWidthCm\": 3.5, \"PetalLengthCm\"\
377-
: 1.4, \"PetalWidthCm\": 0.2}\n ]\n }\n\n try:\n\
378-
\ # Test health endpoint first\n health_response =\
379-
\ requests.get(f\"{service_url}/health\", timeout=30)\n print(f\"\
380-
Health check status: {health_response.status_code}\")\n if health_response.status_code\
365+
\ to: {service_url}\")\n\n # Test deployment\n print(\"Testing\
366+
\ deployment...\")\n time.sleep(30) # Wait for service to be ready\n\
367+
\n test_payload = {\n \"instances\": [\n \
368+
\ {\"SepalLengthCm\": 5.1, \"SepalWidthCm\": 3.5, \"PetalLengthCm\": 1.4,\
369+
\ \"PetalWidthCm\": 0.2}\n ]\n }\n\n try:\n \
370+
\ # Test health endpoint first\n health_response = requests.get(f\"\
371+
{service_url}/health\", timeout=30)\n print(f\"Health check status:\
372+
\ {health_response.status_code}\")\n if health_response.status_code\
381373
\ == 200:\n print(f\"Health check response: {health_response.json()}\"\
382374
)\n\n # Test prediction endpoint\n response = requests.post(\n\
383375
\ f\"{service_url}/predict\", \n json=test_payload,\n\

src/ml_pipelines_kfp/dataflow/iris_streaming_pipeline.py

Lines changed: 40 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,24 @@
11
"""
22
Dataflow streaming pipeline for real-time Iris inference.
3-
Reads from Pub/Sub, calls Vertex AI endpoint, writes predictions to BigQuery.
3+
Reads from Pub/Sub, calls FastAPI ML service deployed via Kubeflow, writes predictions to BigQuery.
44
"""
55
import json
66
import logging
77
import argparse
88
from typing import Any, Dict, List
9+
import requests
10+
import time
911

1012
import apache_beam as beam
1113
from apache_beam.options.pipeline_options import PipelineOptions
1214
from apache_beam.transforms import window
1315
from apache_beam.io import ReadFromPubSub, WriteToBigQuery
14-
from google.cloud import aiplatform
15-
from google.oauth2 import service_account
1616

1717
# Constants
1818
PROJECT_ID = "deeplearning-sahil"
1919
REGION = "us-central1"
2020
MODEL_NAME = "Iris-Classifier-XGBoost"
21-
ENDPOINT_NAME = "Iris-Classifier-XGBoost"
21+
FASTAPI_SERVICE_NAME = "iris-classifier-xgboost-service"
2222

2323
# BigQuery schema for predictions
2424
PREDICTION_SCHEMA = {
@@ -30,10 +30,10 @@
3030
{'name': 'timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
3131
{'name': 'sample_id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
3232
{'name': 'prediction', 'type': 'STRING', 'mode': 'REQUIRED'},
33-
{'name': 'prediction_confidence', 'type': 'FLOAT', 'mode': 'NULLABLE'},
3433
{'name': 'prediction_timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
35-
{'name': 'model_endpoint', 'type': 'STRING', 'mode': 'REQUIRED'},
36-
{'name': 'processing_time', 'type': 'FLOAT', 'mode': 'NULLABLE'}
34+
{'name': 'model_service', 'type': 'STRING', 'mode': 'REQUIRED'},
35+
{'name': 'processing_time', 'type': 'FLOAT', 'mode': 'NULLABLE'},
36+
{'name': 'dataflow_processing_time', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
3737
]
3838
}
3939

@@ -57,75 +57,44 @@ def process(self, element):
5757
logging.error(f"Error parsing message: {e}, message: {element}")
5858

5959

60-
class CallVertexAIEndpoint(beam.DoFn):
61-
"""Call Vertex AI model endpoint for inference."""
60+
class CallFastAPIService(beam.DoFn):
61+
"""Call FastAPI ML service for inference."""
6262

63-
def __init__(self, project: str, region: str, endpoint_name: str):
64-
self.project = project
65-
self.region = region
66-
self.endpoint_name = endpoint_name
67-
self.client = None
68-
self.endpoint = None
69-
70-
def setup(self):
71-
"""Initialize Vertex AI client."""
72-
aiplatform.init(project=self.project, location=self.region)
73-
74-
# Get the endpoint
75-
endpoints = aiplatform.Endpoint.list(
76-
filter=f'display_name="{self.endpoint_name}"'
77-
)
78-
79-
if endpoints:
80-
# If multiple endpoints exist with same name, prioritize by:
81-
# 1. Most recently created (newest first)
82-
# 2. Then by resource name (for consistency)
83-
sorted_endpoints = sorted(
84-
endpoints,
85-
key=lambda ep: (ep.create_time, ep.resource_name),
86-
reverse=True
87-
)
88-
89-
self.endpoint = sorted_endpoints[0]
90-
91-
else:
92-
raise RuntimeError(f"Endpoint '{self.endpoint_name}' not found")
63+
def __init__(self, service_url: str):
64+
self.service_url = service_url
65+
self.predict_url = f"{service_url}/predict"
9366

9467
def process(self, element):
9568
import time
9669
from datetime import datetime
70+
import requests
9771

9872
start_time = time.time()
9973

10074
try:
101-
# Prepare features for prediction
102-
features = [
103-
element['sepal_length'],
104-
element['sepal_width'],
105-
element['petal_length'],
106-
element['petal_width']
107-
]
75+
# Prepare payload for FastAPI
76+
payload = {
77+
"instances": [{
78+
"SepalLengthCm": element['sepal_length'],
79+
"SepalWidthCm": element['sepal_width'],
80+
"PetalLengthCm": element['petal_length'],
81+
"PetalWidthCm": element['petal_width']
82+
}]
83+
}
10884

109-
# Call the endpoint
110-
predictions = self.endpoint.predict(instances=[features])
85+
# Call FastAPI service
86+
response = requests.post(self.predict_url, json=payload, timeout=30)
87+
response.raise_for_status()
11188

112-
# Extract prediction result
113-
prediction_result = predictions.predictions[0]
114-
115-
logging.info(f"Prediction result: {prediction_result}")
89+
# Parse response
90+
result_data = response.json()
91+
predictions = result_data.get('predictions', [])
11692

117-
# Handle different prediction formats
118-
if isinstance(prediction_result, list):
119-
predicted_class = prediction_result[0]
120-
confidence = max(prediction_result) if len(prediction_result) > 1 else None
93+
if predictions:
94+
prediction_result = predictions[0]
95+
predicted_class = str(prediction_result.get('prediction', 'unknown'))
12196
else:
122-
predicted_class = str(prediction_result)
123-
confidence = None
124-
125-
# Map numeric prediction to class name if needed
126-
class_mapping = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
127-
if str(predicted_class).isdigit():
128-
predicted_class = class_mapping.get(int(predicted_class), str(predicted_class))
97+
predicted_class = 'unknown'
12998

13099
processing_time = time.time() - start_time
131100

@@ -137,18 +106,17 @@ def process(self, element):
137106
'petal_width': element['petal_width'],
138107
'timestamp': element.get('timestamp', datetime.utcnow().isoformat()),
139108
'sample_id': element.get('sample_id', 0),
140-
'prediction': str(predicted_class),
141-
'prediction_confidence': confidence,
109+
'prediction': predicted_class,
142110
'prediction_timestamp': datetime.utcnow().isoformat(),
143-
'model_endpoint': f"{self.project}/{self.region}/{self.endpoint_name}",
111+
'model_service': self.service_url,
144112
'processing_time': processing_time
145113
}
146114

147115
logging.info(f"Prediction for sample {element.get('sample_id')}: {predicted_class}")
148116
yield result
149117

150118
except Exception as e:
151-
logging.error(f"Error calling endpoint: {e}, element: {element}")
119+
logging.error(f"Error calling FastAPI service: {e}, element: {element}")
152120
# Yield error record for monitoring
153121
yield {
154122
'sepal_length': element.get('sepal_length', 0.0),
@@ -158,9 +126,8 @@ def process(self, element):
158126
'timestamp': element.get('timestamp', datetime.utcnow().isoformat()),
159127
'sample_id': element.get('sample_id', 0),
160128
'prediction': 'ERROR',
161-
'prediction_confidence': None,
162129
'prediction_timestamp': datetime.utcnow().isoformat(),
163-
'model_endpoint': f"ERROR: {str(e)}",
130+
'model_service': f"ERROR: {str(e)}",
164131
'processing_time': time.time() - start_time
165132
}
166133

@@ -173,7 +140,6 @@ def process(self, element):
173140

174141
# Add additional metadata
175142
element['dataflow_processing_time'] = datetime.utcnow().isoformat()
176-
element['pipeline_version'] = '1.0.0'
177143

178144
yield element
179145

@@ -203,9 +169,9 @@ def run_pipeline(argv=None):
203169
help='GCP Region'
204170
)
205171
parser.add_argument(
206-
'--endpoint_name',
172+
'--service_url',
207173
required=True,
208-
help='Vertex AI endpoint name'
174+
help='FastAPI service URL'
209175
)
210176

211177
known_args, pipeline_args = parser.parse_known_args(argv)
@@ -227,11 +193,8 @@ def run_pipeline(argv=None):
227193
pipeline
228194
| 'Read from Pub/Sub' >> ReadFromPubSub(topic=known_args.input_topic)
229195
| 'Parse JSON' >> beam.ParDo(ParsePubSubMessage())
230-
| 'Add Window' >> beam.WindowInto(window.FixedWindows(60)) # 1-minute windows
231-
| 'Call Vertex AI' >> beam.ParDo(CallVertexAIEndpoint(
232-
known_args.project_id,
233-
known_args.region,
234-
known_args.endpoint_name))
196+
| 'Call FastAPI Service' >> beam.ParDo(CallFastAPIService(
197+
known_args.service_url))
235198
| 'Add Metadata' >> beam.ParDo(AddProcessingMetadata())
236199
| 'Write to BigQuery' >> WriteToBigQuery(
237200
table=known_args.output_table,

src/ml_pipelines_kfp/iris_xgboost/pipelines/components/fastapi/Dockerfile.fastapi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ COPY requirements.fastapi.txt requirements.txt
1515
RUN pip install --no-cache-dir -r requirements.txt
1616

1717
# Copy FastAPI application
18-
COPY fastapi_server_template.py main.py
18+
COPY fastapi_server.py main.py
1919

2020
# Create directory for models
2121
RUN mkdir -p /app/models

src/ml_pipelines_kfp/iris_xgboost/pipelines/components/fastapi/fastapi_server_template.py renamed to src/ml_pipelines_kfp/iris_xgboost/pipelines/components/fastapi/fastapi_server.py

File renamed without changes.

0 commit comments

Comments (0)