Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 92 additions & 2 deletions notebooks/multimodal/multimodal_dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"3. Conduct image transformations\n",
"4. Use LLM models to ask questions and generate embeddings on images\n",
"5. PDF chunking function\n",
"6. Transcribe audio"
"6. Transcribe audio\n",
"7. Extract EXIF metadata from images"
]
},
{
Expand Down Expand Up @@ -104,6 +105,11 @@
"PROJECT = \"bigframes-dev\" # replace with your project. \n",
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
"\n",
"LOCATION = \"us\" # replace with your location.\n",
"\n",
"# Dataset where the UDF will be created.\n",
"DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
"\n",
"OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
"# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
Expand All @@ -112,12 +118,14 @@
"import bigframes\n",
"# Setup project\n",
"bigframes.options.bigquery.project = PROJECT\n",
"bigframes.options.bigquery.location = LOCATION\n",
"\n",
"# Display options\n",
"bigframes.options.display.blob_display_width = 300\n",
"bigframes.options.display.progress_bar = None\n",
"\n",
"import bigframes.pandas as bpd"
"import bigframes.pandas as bpd\n",
"import bigframes.bigquery as bbq"
]
},
{
Expand Down Expand Up @@ -1546,6 +1554,88 @@
"transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
"transcribed_series_verbose"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7. Extract EXIF metadata from images"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Construct the canonical connection ID\n",
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
"\n",
"@bpd.udf(\n",
"    input_types=[str],\n",
"    output_type=str,\n",
"    dataset=DATASET_ID,\n",
"    name=\"extract_exif\",\n",
"    bigquery_connection=FULL_CONNECTION_ID,\n",
"    packages=[\"pillow\", \"requests\"],\n",
"    max_batching_rows=8192,\n",
"    container_cpu=0.33,\n",
"    container_memory=\"512Mi\"\n",
")\n",
"def extract_exif(src_obj_ref_rt: str) -> str:\n",
"    \"\"\"Download an image via its signed read URL and return its EXIF tags as a JSON string.\n",
"\n",
"    Args:\n",
"        src_obj_ref_rt: Runtime JSON produced by blob.get_runtime_json_str(mode=\"R\"),\n",
"            containing access_urls.read_url for the image object.\n",
"\n",
"    Returns:\n",
"        A JSON object string mapping EXIF tag names (or numeric IDs for unknown tags)\n",
"        to their values; \"{}\" when the image carries no EXIF data.\n",
"    \"\"\"\n",
"    import io\n",
"    import json\n",
"    from PIL import ExifTags, Image\n",
"    import requests\n",
"    from requests import adapters\n",
"\n",
"    # Retry transient network failures when fetching from the signed GCS URL.\n",
"    session = requests.Session()\n",
"    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
"    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
"    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
"    response = session.get(src_url, timeout=30)\n",
"    # Fail fast with a clear HTTP error instead of handing an error page to PIL.\n",
"    response.raise_for_status()\n",
"    image = Image.open(io.BytesIO(response.content))\n",
"    exif_dict = {}\n",
"    # Iterating an empty Exif mapping is a no-op, so no emptiness guard is needed.\n",
"    for tag, value in image.getexif().items():\n",
"        # Map numeric tag IDs to human-readable names where known.\n",
"        tag_name = ExifTags.TAGS.get(tag, tag)\n",
"        exif_dict[tag_name] = value\n",
"    # default=str: EXIF values are often not JSON-serializable (bytes, PIL IFDRational);\n",
"    # without it json.dumps raises TypeError on many real-world images.\n",
"    return json.dumps(exif_dict, default=str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a Multimodal DataFrame from the sample image URIs\n",
"exif_image_df = bpd.from_glob_path(\n",
" \"gs://bigframes_blob_test/images_exif/*\",\n",
" name=\"blob_col\",\n",
")\n",
"\n",
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
"# This allows the UDF to download the images from Google Cloud Storage\n",
"access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
"\n",
"# Apply the BigQuery Python UDF to the runtime JSON strings\n",
"# We cast to string to ensure the input matches the UDF's signature\n",
"exif_json = access_urls.astype(str).apply(extract_exif)\n",
"\n",
"# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
"exif_data = bbq.parse_json(exif_json)\n",
"\n",
"exif_data"
]
}
],
"metadata": {
Expand Down
Loading