
Commit cd0c405

feat(graphrag-vectors): add filtering, timestamps, and CRUD operations (microsoft#2236)
* feat(graphrag-vectors): add filtering, timestamps, and CRUD operations

  Implement the vector store enhancements from the graphrag-vectors-design spec.

  New modules:
  - filtering.py: Pydantic-based filter expression system with F builder, operator overloads, JSON serialization, client-side evaluate(), and per-backend compilation (SQL for LanceDB/CosmosDB, OData for Azure AI Search)
  - timestamp.py: ISO 8601 timestamp explosion into filterable component fields

  Enhanced VectorStoreDocument:
  - data: dict for user-defined metadata fields
  - create_date / update_date: automatic ISO 8601 timestamps

  Enhanced VectorStore base class:
  - fields config for typed metadata columns
  - insert / count / remove / update CRUD methods
  - select, filters, include_vectors params on search methods
  - Automatic timestamp explosion on insert/update
  - User-defined date field explosion

  Backend implementations (LanceDB, Azure AI Search, CosmosDB):
  - Full filter compilation to native query languages
  - Typed schema creation with user-defined fields
  - All new CRUD operations

  Breaking changes:
  - search_by_id raises IndexError when document not found
  - Updated indexer_adapters.py caller to handle the new exception

  Tests:
  - 54 unit tests for filtering and timestamp modules
  - 28 LanceDB integration tests covering CRUD, filters, timestamps, select, include_vectors, and user-defined date field explosion

* fix: resolve CI build failures (formatting, lint, pyright, test mocks)
  - Fix ruff formatting and lint errors across all changed files
  - Refactor filtering.py: move operator overloads from monkey-patching to direct class methods for pyright visibility
  - Use validation_alias/serialization_alias with populate_by_name for Pydantic AND/OR/NOT models (pyright + runtime compatible)
  - Use Operator enum members instead of string literals in FieldRef
  - Add missing abstract methods (insert, count, remove, update) to test mock VectorStore classes
  - Update mock method signatures to match base class (select, filters, include_vectors params)
  - Add docstrings to FieldRef magic methods (ruff D105)
  - Fix noqa:S608 placement in cosmosdb.py

* feat: add top-level vector_size to VectorStoreConfig
  Add a vector_size field (default 3072) to VectorStoreConfig so users can set it once instead of on every individual index schema. The value is propagated to new IndexSchema entries during validation.

* chore: add semversioner patch entry

* chore: add ismatch and ftype to spellcheck dictionary

* Add example notebooks for LanceDB, Azure AI Search, and CosmosDB vector stores
  - Three notebooks demonstrating: document loading, similarity search, metadata filtering with F builder, timestamp filtering, document update/removal
  - Sample data files (text_units.parquet, embeddings.text_unit_text.parquet)
  - Add CPY001, SLF001, DTZ005 to notebook lint ignores in pyproject.toml

* refactor: extract model/tokenizer creation from generate_text_embeddings into callers
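The commit message describes an `F` builder whose operator overloads produce filter expression trees that can be evaluated client-side or compiled per backend. A minimal stdlib sketch of that idea, assuming illustrative class names (the real filtering.py is Pydantic-based with JSON serialization and OData/SQL compilers):

```python
import operator


class Expr:
    """Base filter node: combine with & (AND), | (OR), ~ (NOT)."""

    def __and__(self, other):
        return BoolOp("and", self, other)

    def __or__(self, other):
        return BoolOp("or", self, other)

    def __invert__(self):
        return NotOp(self)


class Comparison(Expr):
    _OPS = {"eq": operator.eq, "ne": operator.ne, "gt": operator.gt,
            "ge": operator.ge, "lt": operator.lt, "le": operator.le}

    def __init__(self, field, op, value):
        self.field, self.op, self.value = field, op, value

    def evaluate(self, doc):
        if self.field not in doc:
            return False  # a missing field never matches
        return self._OPS[self.op](doc[self.field], self.value)


class BoolOp(Expr):
    def __init__(self, kind, left, right):
        self.kind, self.left, self.right = kind, left, right

    def evaluate(self, doc):
        if self.kind == "and":
            return self.left.evaluate(doc) and self.right.evaluate(doc)
        return self.left.evaluate(doc) or self.right.evaluate(doc)


class NotOp(Expr):
    def __init__(self, inner):
        self.inner = inner

    def evaluate(self, doc):
        return not self.inner.evaluate(doc)


class FieldRef:
    """F.some_field yields a FieldRef; comparisons build Comparison nodes."""

    def __init__(self, name):
        self.name = name

    def __eq__(self, value):
        return Comparison(self.name, "eq", value)

    def __ne__(self, value):
        return Comparison(self.name, "ne", value)

    def __gt__(self, value):
        return Comparison(self.name, "gt", value)

    def __ge__(self, value):
        return Comparison(self.name, "ge", value)

    def __lt__(self, value):
        return Comparison(self.name, "lt", value)

    def __le__(self, value):
        return Comparison(self.name, "le", value)


class _FBuilder:
    def __getattr__(self, name):
        return FieldRef(name)


F = _FBuilder()

expr = (F.os == "windows") & ~(F.category == "bug")
print(expr.evaluate({"os": "windows", "category": "feature"}))  # True
print(expr.evaluate({"os": "windows", "category": "bug"}))  # False
```

Defining comparisons directly on the class (rather than monkey-patching) is what the follow-up fix commit switched to, since static checkers like pyright cannot see patched-in methods.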
1 parent 97045b5 commit cd0c405

27 files changed

Lines changed: 3271 additions & 188 deletions
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Add filtering, timestamp explosion, insert/count/remove/update operations to vector store API. Add top-level vector_size config to VectorStoreConfig."
+}

dictionary.txt

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ dtypes
 ints
 genid
 isinstance
+ismatch
+ftype
 
 # Azure
 abfs
Lines changed: 361 additions & 0 deletions
@@ -0,0 +1,361 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7fb27b941602401d91542211134fc71a",
+   "metadata": {},
+   "source": [
+    "# Azure AI Search Vector Store Example\n",
+    "\n",
+    "This notebook demonstrates the `AzureAISearchVectorStore` from `graphrag_vectors`, including:\n",
+    "- Loading documents with metadata and embeddings\n",
+    "- Similarity search with field selection\n",
+    "- Metadata filtering using the `F` filter builder (compiled to OData)\n",
+    "- Timestamp-based filtering on exploded date fields\n",
+    "- Document update and removal\n",
+    "\n",
+    "**Prerequisites**: Set `AZURE_AI_SEARCH_URL` in your `.env` file (and optionally `AZURE_AI_SEARCH_API_KEY`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acae54e37e7d407bbb7b55eff062a284",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from dotenv import load_dotenv\n",
+    "from graphrag_vectors import F, VectorStoreDocument\n",
+    "from graphrag_vectors.azure_ai_search import AzureAISearchVectorStore\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Load sample data (text units with embeddings)\n",
+    "data_dir = Path(\"data\")\n",
+    "text_units = pd.read_parquet(data_dir / \"text_units.parquet\")\n",
+    "embeddings = pd.read_parquet(data_dir / \"embeddings.text_unit_text.parquet\")\n",
+    "text_units = text_units.merge(embeddings, on=\"id\")\n",
+    "\n",
+    "print(\n",
+    "    f\"Loaded {len(text_units)} text units with columns: {text_units.columns.tolist()}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a63283cbaf04dbcab1f6479b197f3a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create and connect to an Azure AI Search vector store\n",
+    "url = os.environ[\"AZURE_AI_SEARCH_URL\"]\n",
+    "api_key = os.environ.get(\"AZURE_AI_SEARCH_API_KEY\")\n",
+    "\n",
+    "store = AzureAISearchVectorStore(\n",
+    "    url=url,\n",
+    "    api_key=api_key,\n",
+    "    index_name=\"text_units\",\n",
+    "    fields={\n",
+    "        \"os\": \"str\",\n",
+    "        \"category\": \"str\",\n",
+    "        \"timestamp\": \"date\",\n",
+    "    },\n",
+    ")\n",
+    "store.connect()\n",
+    "store.create_index()\n",
+    "\n",
+    "# Load documents\n",
+    "docs = [\n",
+    "    VectorStoreDocument(\n",
+    "        id=row[\"id\"],\n",
+    "        vector=row[\"embedding\"].tolist(),\n",
+    "        data=row.to_dict(),\n",
+    "        create_date=row.get(\"timestamp\"),\n",
+    "    )\n",
+    "    for _, row in text_units.iterrows()\n",
+    "]\n",
+    "store.load_documents(docs)\n",
+    "print(f\"Loaded {len(docs)} documents into store\")\n",
+    "\n",
+    "# Allow time for Azure AI Search to propagate\n",
+    "time.sleep(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8dd0d8092fe74a7c96281538738b07e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test count\n",
+    "count = store.count()\n",
+    "print(f\"Document count: {count}\")\n",
+    "assert count == 42, f\"Expected 42, got {count}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72eea5119410473aa328ad9291626812",
+   "metadata": {},
+   "source": [
+    "## Vector Similarity Search\n",
+    "\n",
+    "Use `similarity_search_by_vector` to find the closest documents to a query embedding.\n",
+    "The `select` parameter controls which metadata fields are returned in results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8edb47106e1a46a883d545849b8ab81b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use the first document's embedding as a query vector\n",
+    "query_vector = text_units.iloc[0][\"embedding\"].tolist()\n",
+    "\n",
+    "# Basic search - returns all fields\n",
+    "results = store.similarity_search_by_vector(query_vector, k=3)\n",
+    "print(f\"Found {len(results)} results:\")\n",
+    "for r in results:\n",
+    "    print(\n",
+    "        f\"  - {r.document.id}: score={r.score:.4f}, data keys={list(r.document.data.keys())}\"\n",
+    "    )\n",
+    "\n",
+    "# Search with select - only return 'os' field\n",
+    "results = store.similarity_search_by_vector(query_vector, k=1, select=[\"os\"])\n",
+    "result = results[0]\n",
+    "print(\"\\nWith select=['os']:\")\n",
+    "print(f\"  Data fields: {result.document.data}\")\n",
+    "assert \"os\" in result.document.data, \"Expected 'os' field in data\"\n",
+    "assert \"category\" not in result.document.data, \"Expected 'category' to be excluded\"\n",
+    "print(\"  Select parameter confirmed - only 'os' field returned.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10185d26023b46108eb7d9f57d49d2b3",
+   "metadata": {},
+   "source": [
+    "## Metadata Filtering\n",
+    "\n",
+    "Use the `F` filter builder to construct filter expressions with `==`, `!=`, `>`, `<`, `>=`, `<=`.\n",
+    "Combine with `&` (AND), `|` (OR), and `~` (NOT). Filters are compiled to OData expressions for Azure AI Search."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8763a12b2bbd4a93a75aff182afb95dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter by a single field\n",
+    "print(\"=== Filter: os == 'windows' ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector, k=5, filters=F.os == \"windows\"\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(f\"  - {r.document.id}: os={r.document.data.get('os')}, score={r.score:.4f}\")\n",
+    "\n",
+    "# Compound filter with AND\n",
+    "print(\"\\n=== Filter: os == 'windows' AND category == 'bug' ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=5,\n",
+    "    filters=(F.os == \"windows\") & (F.category == \"bug\"),\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(\n",
+    "        f\"  - {r.document.id}: os={r.document.data.get('os')}, category={r.document.data.get('category')}\"\n",
+    "    )\n",
+    "\n",
+    "# OR filter\n",
+    "print(\"\\n=== Filter: category == 'bug' OR category == 'feature' ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=5,\n",
+    "    filters=(F.category == \"bug\") | (F.category == \"feature\"),\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(f\"  - {r.document.id}: category={r.document.data.get('category')}\")\n",
+    "\n",
+    "# NOT filter\n",
+    "print(\"\\n=== Filter: NOT os == 'linux' ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=3,\n",
+    "    filters=~(F.os == \"linux\"),\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(f\"  - {r.document.id}: os={r.document.data.get('os')}\")\n",
+    "\n",
+    "# Show the compiled OData filter string for debugging\n",
+    "filter_expr = (F.os == \"windows\") & (F.category == \"bug\")\n",
+    "print(f\"\\nCompiled OData filter: {store._compile_filter(filter_expr)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7623eae2785240b9bd12b16a66d81610",
+   "metadata": {},
+   "source": [
+    "## Timestamp Filtering\n",
+    "\n",
+    "Date fields (declared as `\"date\"` in the `fields` dict) are automatically exploded into filterable components:\n",
+    "`_year`, `_month`, `_day`, `_hour`, `_day_of_week`, `_quarter`.\n",
+    "\n",
+    "The built-in `create_date` and `update_date` fields are also exploded automatically."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cdc8c89c7104fffa095e18ddfef8986",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "# Filter by exploded field: documents created in December\n",
+    "print(\"=== Filter: create_date_month == 12 (December) ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=5,\n",
+    "    filters=F.create_date_month == 12,\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(\n",
+    "        f\"  - {r.document.id}: create_date={r.document.create_date}, month={r.document.data.get('create_date_month')}\"\n",
+    "    )\n",
+    "\n",
+    "# Filter by day of week\n",
+    "print(\"\\n=== Filter: create_date_day_of_week == 'Monday' ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=5,\n",
+    "    filters=F.create_date_day_of_week == \"Monday\",\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(f\"  - {r.document.id}: day={r.document.data.get('create_date_day_of_week')}\")\n",
+    "\n",
+    "# Filter by quarter\n",
+    "print(\"\\n=== Filter: create_date_quarter == 4 (Q4) ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=5,\n",
+    "    filters=F.create_date_quarter == 4,\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(f\"  - {r.document.id}: quarter={r.document.data.get('create_date_quarter')}\")\n",
+    "\n",
+    "# Range query on the raw create_date\n",
+    "cutoff = (datetime.now() - timedelta(days=90)).isoformat()\n",
+    "print(f\"\\n=== Filter: create_date >= '{cutoff[:10]}...' (last 90 days) ===\")\n",
+    "filtered = store.similarity_search_by_vector(\n",
+    "    query_vector,\n",
+    "    k=5,\n",
+    "    filters=F.create_date >= cutoff,\n",
+    ")\n",
+    "print(f\"Found {len(filtered)} results:\")\n",
+    "for r in filtered:\n",
+    "    print(f\"  - {r.document.id}: create_date={r.document.create_date}\")\n",
+    "\n",
+    "# Show compiled OData filter strings\n",
+    "print(f\"\\nCompiled month filter: {store._compile_filter(F.create_date_month == 12)}\")\n",
+    "print(f\"Compiled range filter: {store._compile_filter(F.create_date >= cutoff)}\")\n",
+    "print(\n",
+    "    f\"Compiled compound filter: {store._compile_filter((F.create_date_quarter == 4) & (F.update_date_day_of_week == 'Monday'))}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b118ea5561624da68c537baed56e602f",
+   "metadata": {},
+   "source": [
+    "## Document Update and Removal\n",
+    "\n",
+    "Use `update()` to modify a document's metadata and `remove()` to delete documents by ID.\n",
+    "Azure AI Search operations may require a brief delay for propagation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "938c804e27f84196a10c8828c723f798",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Update a document\n",
+    "doc_id = text_units[\"id\"].iloc[0]\n",
+    "original = store.search_by_id(doc_id)\n",
+    "print(f\"Original os: {original.data.get('os')}\")\n",
+    "\n",
+    "updated_doc = VectorStoreDocument(\n",
+    "    id=doc_id,\n",
+    "    vector=None,\n",
+    "    data={\"os\": \"updated-os-value\"},\n",
+    ")\n",
+    "store.update(updated_doc)\n",
+    "\n",
+    "# Allow time for Azure AI Search to propagate\n",
+    "time.sleep(2)\n",
+    "\n",
+    "result = store.search_by_id(doc_id)\n",
+    "print(f\"Updated os: {result.data.get('os')}\")\n",
+    "assert result.data.get(\"os\") == \"updated-os-value\", \"Update failed\"\n",
+    "print(\"Update confirmed.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "504fb2a444614c0babb325280ed9130a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove documents\n",
+    "ids_to_delete = text_units[\"id\"].head(5).tolist()\n",
+    "print(f\"Deleting {len(ids_to_delete)} documents...\")\n",
+    "\n",
+    "store.remove(ids_to_delete)\n",
+    "\n",
+    "# Allow time for Azure AI Search to propagate\n",
+    "time.sleep(3)\n",
+    "\n",
+    "new_count = store.count()\n",
+    "print(f\"Document count after delete: {new_count}\")\n",
+    "assert new_count == 37, f\"Expected 37, got {new_count}\"\n",
+    "print(\"Remove confirmed.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
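The exploded date components queried in the notebook (`create_date_month`, `create_date_day_of_week`, `create_date_quarter`) come from the timestamp-explosion step described in the commit message. A minimal sketch of that transformation, with an assumed function name and the component set the notebook documents (`_year`, `_month`, `_day`, `_hour`, `_day_of_week`, `_quarter`):

```python
from datetime import datetime


def explode_timestamp(field_name: str, iso_value: str) -> dict:
    """Explode an ISO 8601 timestamp into filterable component fields."""
    dt = datetime.fromisoformat(iso_value)
    return {
        f"{field_name}_year": dt.year,
        f"{field_name}_month": dt.month,
        f"{field_name}_day": dt.day,
        f"{field_name}_hour": dt.hour,
        f"{field_name}_day_of_week": dt.strftime("%A"),  # e.g. "Monday"
        f"{field_name}_quarter": (dt.month - 1) // 3 + 1,  # 1..4
    }


fields = explode_timestamp("create_date", "2024-12-02T09:30:00")
print(fields["create_date_day_of_week"], fields["create_date_quarter"])  # Monday 4
```

Storing these as typed columns at insert/update time is what lets backends answer queries like `F.create_date_quarter == 4` without any date arithmetic in the filter language.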
