diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index 0822ee4c2db..bb5733bde26 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -61,7 +61,8 @@ "3. Conduct image transformations\n", "4. Use LLM models to ask questions and generate embeddings on images\n", "5. PDF chunking function\n", - "6. Transcribe audio" + "6. Transcribe audio\n", + "7. Extract EXIF metadata from images" ] }, { @@ -104,6 +105,11 @@ "PROJECT = \"bigframes-dev\" # replace with your project. \n", "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", "\n", + "LOCATION = \"us\" # replace with your location.\n", + "\n", + "# Dataset where the UDF will be created.\n", + "DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n", + "\n", "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n", "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", @@ -112,12 +118,14 @@ "import bigframes\n", "# Setup project\n", "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.bigquery.location = LOCATION\n", "\n", "# Display options\n", "bigframes.options.display.blob_display_width = 300\n", "bigframes.options.display.progress_bar = None\n", "\n", - "import bigframes.pandas as bpd" + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq" ] }, { @@ -1546,6 +1554,88 @@ "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", "transcribed_series_verbose" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. 
Extract EXIF metadata from images"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Construct the canonical connection ID\n",
+    "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
+    "\n",
+    "@bpd.udf(\n",
+    "    input_types=[str],\n",
+    "    output_type=str,\n",
+    "    dataset=DATASET_ID,\n",
+    "    name=\"extract_exif\",\n",
+    "    bigquery_connection=FULL_CONNECTION_ID,\n",
+    "    packages=[\"pillow\", \"requests\"],\n",
+    "    max_batching_rows=8192,\n",
+    "    container_cpu=0.33,\n",
+    "    container_memory=\"512Mi\"\n",
+    ")\n",
+    "def extract_exif(src_obj_ref_rt: str) -> str:\n",
+    "    \"\"\"Return the EXIF tags of the image at the input's `access_urls.read_url`, as a JSON string.\"\"\"\n",
+    "    import io\n",
+    "    import json\n",
+    "    from PIL import ExifTags, Image\n",
+    "    import requests\n",
+    "    from requests import adapters\n",
+    "\n",
+    "    src_url = json.loads(src_obj_ref_rt)[\"access_urls\"][\"read_url\"]\n",
+    "    # Close the session after the request so its connection pool is released.\n",
+    "    with requests.Session() as session:\n",
+    "        session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+    "        response = session.get(src_url, timeout=30)\n",
+    "    # Surface HTTP failures here rather than passing an error body to Pillow.\n",
+    "    response.raise_for_status()\n",
+    "    image = Image.open(io.BytesIO(response.content))\n",
+    "    # Tags missing from ExifTags.TAGS keep their numeric ID as the key.\n",
+    "    exif_dict = {ExifTags.TAGS.get(tag, tag): value for tag, value in image.getexif().items()}\n",
+    "    # default=str is deliberate: EXIF values such as bytes or IFDRational are not JSON serializable.\n",
+    "    return json.dumps(exif_dict, default=str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a Multimodal DataFrame from the sample image URIs\n",
+    "exif_image_df = bpd.from_glob_path(\n",
+    "    \"gs://bigframes_blob_test/images_exif/*\",\n",
+    "    name=\"blob_col\",\n",
+    ")\n",
+    "\n",
+    "# Generate a JSON string containing the runtime information (including signed read URLs)\n",
+    "# This allows the UDF to download the images from Google Cloud Storage\n",
+    "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
+    "\n",
+    "# Apply the BigQuery Python UDF to the runtime JSON strings\n",
+    "# We cast to string to ensure the input matches the UDF's signature\n",
+    "exif_json = access_urls.astype(str).apply(extract_exif)\n",
+    "\n",
+    "# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
+    "exif_data = bbq.parse_json(exif_json)\n",
+    "\n",
+    "exif_data"
+   ]
   }
  ],
  "metadata": {