diff --git a/.changeset/README.md b/.changeset/README.md new file mode 100644 index 0000000..2b95bfb --- /dev/null +++ b/.changeset/README.md @@ -0,0 +1,22 @@ +# Changesets + +Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works +with multi-package repos, or single-package repos to help you version and publish your code. You can +find the full documentation for it [in the readme](https://github.com/changesets/changesets/blob/main/README.md) + +## Usage + +### Adding a changeset + +Run `pnpm changeset` to create a new changeset. You'll be prompted to: +1. Select which packages have changed +2. Choose a bump type (major/minor/patch) +3. Write a summary of the changes + +### Versioning + +Run `pnpm changeset:version` to consume all pending changesets, bump versions, and update changelogs. + +### Publishing + +Run `pnpm release` to build and publish all packages to npm. diff --git a/.changeset/config.json b/.changeset/config.json new file mode 100644 index 0000000..b0b05fb --- /dev/null +++ b/.changeset/config.json @@ -0,0 +1,15 @@ +{ + "$schema": "https://unpkg.com/@changesets/config@3.1.1/schema.json", + "changelog": [ + "@changesets/changelog-github", + { "repo": "techiejd/payloadcms-vectorize" } + ], + "commit": false, + "fixed": [ + ["payloadcms-vectorize", "@payloadcms-vectorize/pg", "@payloadcms-vectorize/cf"] + ], + "access": "public", + "baseBranch": "main", + "updateInternalDependencies": "patch", + "ignore": [] +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9b4d41..84532f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,9 +4,29 @@ on: push: branches: [main, develop] pull_request: - branches: [main] + branches: [main, split_db_adapter] jobs: + typecheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 
'20' + cache: 'pnpm' + + - name: Install dependencies + run: pnpm install + + - name: Type check all packages + run: pnpm build:types:all + test_int: runs-on: ubuntu-latest @@ -33,8 +53,8 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: "20" - cache: "pnpm" + node-version: '20' + cache: 'pnpm' - name: Install dependencies run: pnpm install @@ -53,6 +73,90 @@ jobs: IVFFLATLISTS: 1 TEST_ENV: 1 + test_adapters_pg: + runs-on: ubuntu-latest + + services: + postgres: + image: pgvector/pgvector:pg15 + env: + POSTGRES_PASSWORD: password + POSTGRES_DB: payload_test + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5433:5432 + + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + + - name: Install dependencies + run: pnpm install + + - name: Install pgvector extension + run: | + sudo apt-get update + sudo apt-get install -y postgresql-client + PGPASSWORD=password psql -h localhost -p 5433 -U postgres -d payload_test -c "CREATE EXTENSION IF NOT EXISTS vector;" + + - name: Run pg adapter tests + run: pnpm test:adapters:pg + env: + PAYLOAD_SECRET: test-secret-key + DIMS: 8 + IVFFLATLISTS: 1 + TEST_ENV: 1 + + test_adapters_cf: + runs-on: ubuntu-latest + + services: + postgres: + image: pgvector/pgvector:pg15 + env: + POSTGRES_PASSWORD: password + POSTGRES_DB: payload_test + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5433:5432 + + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + + - name: Install dependencies + run: pnpm install + + - name: Run cf adapter tests + run: pnpm test:adapters:cf + env: + PAYLOAD_SECRET: 
test-secret-key + TEST_ENV: 1 + test_e2e: runs-on: ubuntu-latest @@ -79,8 +183,8 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: "20" - cache: "pnpm" + node-version: '20' + cache: 'pnpm' - name: Install dependencies run: pnpm install @@ -101,3 +205,19 @@ jobs: DIMS: 8 IVFFLATLISTS: 1 TEST_ENV: 1 + + test: + runs-on: ubuntu-latest + needs: [typecheck, test_int, test_adapters_pg, test_adapters_cf, test_e2e] + if: always() + steps: + - name: Check required jobs + run: | + if [ "${{ needs.typecheck.result }}" != "success" ] || \ + [ "${{ needs.test_int.result }}" != "success" ] || \ + [ "${{ needs.test_adapters_pg.result }}" != "success" ] || \ + [ "${{ needs.test_adapters_cf.result }}" != "success" ] || \ + [ "${{ needs.test_e2e.result }}" != "success" ]; then + echo "One or more required jobs failed" + exit 1 + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..7743b8b --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,44 @@ +name: Release + +on: + push: + branches: + - main + +concurrency: ${{ github.workflow }}-${{ github.ref }} + +jobs: + release: + name: Release + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + id-token: write + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + registry-url: 'https://registry.npmjs.org' + + - name: Install dependencies + run: pnpm install + + - name: Create Release Pull Request or Publish + id: changesets + uses: changesets/action@v1 + with: + publish: pnpm release + version: pnpm changeset:version + title: 'chore: version packages' + commit: 'chore: version packages' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NPM_CONFIG_PROVENANCE: true diff --git a/.gitignore b/.gitignore index d2757c1..4e8441e 100644 --- a/.gitignore +++ b/.gitignore @@ 
-1,7 +1,7 @@ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. # dependencies -/node_modules +node_modules/ /.pnp .pnp.js .yarn/install-state.gz @@ -20,6 +20,8 @@ # production /build /dist +/adapters/pg/dist +/adapters/cf/dist # misc .DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md index a9f21d2..2434e16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,121 @@ All notable changes to this project will be documented in this file. +## 0.6.0-beta.5 - 2026-02-25 + +- Merges main into split_db_adapter (per-batch polling, coordinator/worker architecture, destroyPayload cleanup). + +## 0.6.0-beta.4 - 2026-02-20 + +- Merges main with should embed changes. + +## 0.6.0-beta - 2026-02-01 + +### Breaking Changes + +- **Database Adapter Architecture**: The plugin now uses a pluggable database adapter system. You must install a database adapter package (e.g., `@payloadcms-vectorize/pg`) separately from the core plugin. +- **`createVectorizeIntegration` removed from core**: Use the adapter-specific integration factory instead (e.g., `createPostgresVectorIntegration` from `@payloadcms-vectorize/pg`). +- **`dbAdapter` option required**: The `payloadcmsVectorize()` plugin now requires a `dbAdapter` option pointing to your adapter's implementation. +- **`similarity` renamed to `score`**: The `VectorSearchResult.similarity` field has been renamed to `score` to be more generic across different distance metrics. + +### Added + +- **`@payloadcms-vectorize/pg` package**: PostgreSQL adapter for pgvector, extracted from the core plugin. +- **`@payloadcms-vectorize/cf` package**: Cloudflare Vectorize adapter for edge-native vector search. +- **`DbAdapter` interface**: New interface for implementing custom database adapters. See `adapters/README.md`. +- **`deleteEmbeddings` on `DbAdapter`**: Adapters can now delete vectors when a document is deleted or re-indexed. Implemented in both the `pg` and `cf` adapters. 
+- **Adapter documentation**: Added `adapters/README.md` explaining how to create custom adapters. + +### Migration + +**Before (0.5.x)** + +```typescript +import { createVectorizeIntegration } from 'payloadcms-vectorize' + +const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ + main: { dims: 1536, ivfflatLists: 100 }, +}) + +export default buildConfig({ + db: postgresAdapter({ + afterSchemaInit: [afterSchemaInitHook], + }), + plugins: [ + payloadcmsVectorize({ + knowledgePools: { + main: { + /* ... */ + }, + }, + }), + ], +}) +``` + +**After (0.6.0+)** + +```typescript +import { createPostgresVectorIntegration } from '@payloadcms-vectorize/pg' +import payloadcmsVectorize from 'payloadcms-vectorize' + +const integration = createPostgresVectorIntegration({ + main: { dims: 1536, ivfflatLists: 100 }, +}) + +export default buildConfig({ + db: postgresAdapter({ + afterSchemaInit: [integration.afterSchemaInitHook], + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + main: { + /* ... */ + }, + }, + }), + ], +}) +``` + +**Updating search result handling:** + +```typescript +// Before +const score = result.similarity + +// After +const score = result.score +``` + +## 0.5.5 - 2026-02-24 + +### Added + +- **`batchLimit` option on `CollectionVectorizeOption`** – limits the number of documents fetched per bulk-embed worker job. When set, each page of results queues a continuation job for the next page, preventing serverless time-limit issues on large collections. Defaults to 1000. + +### Changed + +- **Coordinator / worker architecture for `prepare-bulk-embedding`** – the initial job now acts as a coordinator that fans out one worker job per collection. Each worker processes a single page of documents, making bulk embedding parallelizable and more resilient to timeouts. +- **Per-batch polling via `poll-or-complete-single-batch`** – replaced the monolithic `poll-or-complete-bulk-embedding` task. 
Each provider batch now has its own polling job, improving observability and reducing memory usage. +- **Memory-efficient incremental aggregation** – `finalizeRunIfComplete` now scans batch records page-by-page instead of loading all batches into memory at once. + +### Removed + +- `poll-or-complete-bulk-embedding` task (replaced by `poll-or-complete-single-batch`). + +### Upgrade Notes + +- **Ensure no bulk embedding run is in progress when upgrading.** The `poll-or-complete-bulk-embedding` task has been removed and replaced by `poll-or-complete-single-batch`. Any in-flight bulk run that still has pending `poll-or-complete-bulk-embedding` jobs will fail because the task slug no longer exists. Wait for all active runs to complete (or cancel them) before deploying this version. + +## 0.5.4 - 2026-02-20 + +### Added + +- **`shouldEmbedFn` filter**: Optional function on `CollectionVectorizeOption` that runs before a document is queued for embedding. Return `false` to skip the document entirely — no job is created and `toKnowledgePool` is never called. Works for both real-time and bulk embedding. Useful for skipping drafts, archived documents, or any custom criteria. + ## 0.5.3 - 2026-01-24 ### Changed diff --git a/README.md b/README.md index 110da8d..7b621b0 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,43 @@ # PayloadCMS Vectorize -A Payload CMS plugin that adds vector search capabilities to your collections using PostgreSQL's pgvector extension. Perfect for building RAG (Retrieval-Augmented Generation) applications and semantic search features. +A Payload CMS plugin that adds vector search capabilities to your collections. Perfect for building RAG (Retrieval-Augmented Generation) applications and semantic search features. 
## Features - 🔍 **Semantic Search**: Vectorize any collection for intelligent content discovery - 🚀 **Realtime**: Documents are automatically vectorized when created or updated in realtime, and vectors are deleted as soon as the document is deleted. -- 🧵 **Bulk embedding**: Run “Embed all” batches that backfill only documents missing the current `embeddingVersion` since the last bulk run in order to save money. -- 📊 **PostgreSQL Integration**: Built on pgvector for high-performance vector operations +- 🧵 **Bulk embedding**: Run "Embed all" batches that backfill only documents missing the current `embeddingVersion` since the last bulk run in order to save money. +- 🔌 **Database Adapters**: Pluggable architecture supporting different database backends - ⚡ **Background Processing**: Uses Payload's job system for non-blocking vectorization - 🎯 **Flexible Chunking**: Drive chunk creation yourself with `toKnowledgePool` functions so you can combine any fields or content types - 🧩 **Extensible Schema**: Attach custom `extensionFields` to the embeddings collection and persist values per chunk and use for querying. - 🌐 **REST API**: Built-in vector-search endpoint with Payload-style `where` filtering and configurable limits -- 🏊 **Multiple Knowledge Pools**: Separate knowledge pools with independent configurations (dims, ivfflatLists, embedding functions) and needs. +- 🏊 **Multiple Knowledge Pools**: Separate knowledge pools with independent configurations and needs. + +## Database Adapters + +This plugin requires a database adapter for vector storage. Available adapters: + +| Adapter | Package | Database | Documentation | +|---------|---------|----------|---------------| +| PostgreSQL | `@payloadcms-vectorize/pg` | PostgreSQL with pgvector | [README](./adapters/pg/README.md) | + +See [adapters/README.md](./adapters/README.md) for information on creating custom adapters. 
## Prerequisites - Payload CMS 3.x (tested on 3.69.0, previously tested on 3.37.0) -- PostgreSQL with pgvector extension +- A supported database with vector capabilities (see adapters above) - Node.js 18+ ## Installation ```bash +# Install the core plugin pnpm add payloadcms-vectorize + +# Install a database adapter (e.g., PostgreSQL) +pnpm add @payloadcms-vectorize/pg ``` ## Quick Start @@ -38,23 +52,20 @@ pnpm add payloadcms-vectorize [![Bulk embedding](https://img.youtube.com/vi/oIcqu08k1Ok/0.jpg)](https://youtu.be/oIcqu08k1Ok) -### 0. Have pgvector permissions - -The plugin expects `vector` extension to be configured (`db: postgresAdapter({extensions: ['vector'],...})`) when Payload initializes. Your PostgreSQL database user must have permission to create extensions. If your user doesn't have these permissions, someone with permissions may need to manually create the extension once: +### 1. Set Up Your Database Adapter -```sql -CREATE EXTENSION IF NOT EXISTS vector; -``` +First, configure your database adapter. See the adapter-specific documentation: -**Note:** Most managed PostgreSQL services (like AWS RDS, Supabase, etc.) require superuser privileges or specific extension permissions. If you encounter permission errors, contact your database administrator or check your service's documentation. +- **PostgreSQL**: See [@payloadcms-vectorize/pg README](./adapters/pg/README.md) for pgvector setup, schema initialization, and migration handling. -### 1. Configure the Plugin +### 2. 
Configure the Plugin ```typescript import { buildConfig } from 'payload' import type { Payload } from 'payload' import { postgresAdapter } from '@payloadcms/db-postgres' -import { createVectorizeIntegration } from 'payloadcms-vectorize' +import { createPostgresVectorIntegration } from '@payloadcms-vectorize/pg' +import payloadcmsVectorize from 'payloadcms-vectorize' import type { ToKnowledgePoolFn } from 'payloadcms-vectorize' // Configure your embedding functions @@ -102,26 +113,25 @@ const postsToKnowledgePool: ToKnowledgePoolFn = async (doc, payload) => { return entries } -// Create the integration with static configs (dims, ivfflatLists) -const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ - // Note limitation: Changing these values requires a migration. +// Create the database adapter integration +// See adapter documentation for configuration options +const integration = createPostgresVectorIntegration({ mainKnowledgePool: { - dims: 1536, // Vector dimensions - ivfflatLists: 100, // IVFFLAT index parameter + dims: 1536, // Vector dimensions + ivfflatLists: 100, // Index parameter }, }) export default buildConfig({ // ... your existing config db: postgresAdapter({ - // configure the 'vector' extension. extensions: ['vector'], - // afterSchemaInitHook adds 'vector' to your schema - afterSchemaInit: [afterSchemaInitHook], + afterSchemaInit: [integration.afterSchemaInitHook], // ... your database config }), plugins: [ payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { mainKnowledgePool: { collections: { @@ -177,33 +187,13 @@ pnpm run generate:importmap - **For production builds**: You MUST run `pnpm run generate:importmap` BEFORE running `pnpm build`, otherwise custom components won't be found during the build process. 
- **If client components don't appear**: Try manually generating the import map: `pnpm run generate:importmap` -**⚠️ Important:** Run this command: - -- After initial plugin setup -- If the "Embed all" button doesn't appear in the admin UI - -The import map tells Payload how to resolve component paths (like `'payloadcms-vectorize/client#EmbedAllButton'`) to actual React components. Without it, client components referenced in your collection configs won't render. - -### 2. Initial Migration Setup - -After configuring the plugin, create and apply your initial migration. The IVFFLAT indexes are created automatically via the `afterSchemaInitHook` using Drizzle's `extraConfig`. - -**For new setups:** +### 3. Run Migrations -1. Create your initial Payload migration (this will include both embedding columns and IVFFLAT indexes via Drizzle schema): +See your database adapter's documentation for migration instructions: - ```bash - pnpm payload migrate:create --name initial - ``` +- **PostgreSQL**: See [@payloadcms-vectorize/pg README](./adapters/pg/README.md#migrations) -2. Review and apply the migration: - ```bash - pnpm payload migrate - ``` - -**Note:** Both the embedding columns and IVFFLAT indexes are created automatically by Drizzle via the `afterSchemaInitHook`. No additional CLI steps are required for initial setup or when changing `ivfflatLists`. However, if you change `dims` after initial setup, you must run `pnpm payload vectorize:migrate` to add the required TRUNCATE statement to your migration (see "Changing dims" section below). There is no need to run `vectorize:migrate` on the first migration. - -### 3. Search Your Content +### 4. 
Search Your Content The plugin automatically creates a `/api/vector-search` endpoint: @@ -222,7 +212,7 @@ const response = await fetch('/api/vector-search', { }) const { results } = await response.json() -// Each result contains: id, similarity, sourceCollection, docId, chunkIndex, chunkText, +// Each result contains: id, score, sourceCollection, docId, chunkIndex, chunkText, // embeddingVersion, and any extensionFields you attached (e.g., category, priority). ``` @@ -262,18 +252,18 @@ if (vectorizedPayload) { ### Knowledge Pool Config -Knowledge pools are configured in two steps. The static configs define the database schema (migration required), while dynamic configs define runtime behavior (no migration required). +Knowledge pools are configured in two steps: -**1. Static Config** (passed to `createVectorizeIntegration`): +**1. Static Config** (passed to your database adapter's integration factory): -- `dims`: `number` - Vector dimensions for pgvector column -- `ivfflatLists`: `number` - IVFFLAT index parameter +Static configuration options vary by database adapter. See your adapter's documentation for available options: +- **PostgreSQL**: `dims`, `ivfflatLists` - See [@payloadcms-vectorize/pg README](./adapters/pg/README.md#static-configuration) The embeddings collection name will be the same as the knowledge pool name. **2. 
Dynamic Config** (passed to `payloadcmsVectorize`): -- `collections`: `Record` - Collections and their chunking configs +- `collections`: `Record<string, CollectionVectorizeOption>` - Collections and their configs (optional `shouldEmbedFn` filter + required `toKnowledgePool` chunker) - `extensionFields?`: `Field[]` - Optional fields to extend the embeddings collection schema - `embeddingConfig`: Embedding configuration object: - `version`: `string` - Version string for tracking model changes @@ -436,8 +426,8 @@ type OnBulkErrorArgs = { The plugin uses separate Payload jobs for reliability with long-running providers: -- **`prepare-bulk-embedding`**: Streams through documents, calls your `addChunk` for each chunk, creates batch records. -- **`poll-or-complete-bulk-embedding`**: Polls all batches, requeues itself until done, then writes all successful embeddings (partial chunk failures are allowed). +- **`prepare-bulk-embedding`**: A coordinator job fans out one worker per collection. Each worker streams through documents, calls your `addChunk` for each chunk, and creates batch records. When `batchLimit` is set on a collection, workers paginate and queue continuation jobs. +- **`poll-or-complete-single-batch`**: Polls a single batch, requeues itself until done, then writes successful embeddings. When all batches for a run are terminal, the run is finalized (partial chunk failures are allowed). ### Queue Configuration @@ -464,64 +454,11 @@ jobs: { } ``` -## Changing Static Config (ivfflatLists or dims) & Migrations - -**⚠️ Important:** Changing `dims` is **DESTRUCTIVE** - it requires re-embedding all your data. Changing `ivfflatLists` rebuilds the index (non-destructive but may take time). - -When you change static config values (`dims` or `ivfflatLists`): - -1. 
**Update your static config** in `payload.config.ts`: - - ```typescript - const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ - mainKnowledgePool: { - dims: 1536, // Changed from previous value - ivfflatLists: 200, // Changed from previous value - }, - }) - ``` +## Changing Static Config & Migrations -2. **Create a migration**: +Static configuration changes (like vector dimensions) may require migrations. See your database adapter's documentation for specific instructions: - ```bash - pnpm payload migrate:create --name migration_name - ``` - - Drizzle will automatically generate the index rebuild SQL. - -3. **Changing `dims`(Destructive, Optional)**: - -Skip this step if you did not change the `dims`. - -Changing `dims` requires truncating the embeddings table because existing vectors are incompatible with the new dimensions. You must use the `vectorize:migrate` CLI to add the TRUNCATE statement: - -```bash -pnpm payload vectorize:migrate -``` - -The CLI will: - -- Detect the dims change -- Patch the migration with TRUNCATE TABLE (with CASCADE) -- Add appropriate down migration to restore the old column type - -4. **Review the migration file** in `src/migrations/` - -5. **Apply the migration**: - - ```bash - pnpm payload migrate - ``` - -6. **If `dims` changed**: Re-embed all your documents using the bulk embed feature. - -**Schema name qualification:** - -The CLI automatically uses the `schemaName` from your Postgres adapter configuration. If you use a custom schema (e.g., `postgresAdapter({ schemaName: 'custom' })`), all SQL in the migration will be properly qualified with that schema name. - -**Idempotency:** - -Running `pnpm payload vectorize:migrate` multiple times with no config changes will not create duplicate migrations. The CLI detects when no changes are needed and exits early. 
+- **PostgreSQL**: See [@payloadcms-vectorize/pg README](./adapters/pg/README.md#migrations) ### Endpoints @@ -622,10 +559,23 @@ curl -X POST http://localhost:3000/api/vector-retry-failed-batch \ #### CollectionVectorizeOption +- `shouldEmbedFn? (doc, payload)` – optional filter that runs **before** the document is queued for embedding. Return `false` to skip the document entirely (no job is created and `toKnowledgePool` is never called). Works for both real-time and bulk embedding. Defaults to embedding all documents when omitted. - `toKnowledgePool (doc, payload)` – return an array of `{ chunk, ...extensionFieldValues }`. Each object becomes one embedding row and the index in the array determines `chunkIndex`. +- `batchLimit? (number)` – max documents to fetch per bulk-embed worker job. When set, each page of results becomes a separate job that queues a continuation for the next page. Useful for large collections that would exceed serverless time limits in a single job. Defaults to 1000. Reserved column names: `sourceCollection`, `docId`, `chunkIndex`, `chunkText`, `embeddingVersion`. Avoid reusing them in `extensionFields`. +**Example – skip draft documents:** + +```typescript +collections: { + posts: { + shouldEmbedFn: async (doc) => doc._status === 'published', + toKnowledgePool: postsToKnowledgePool, + }, +} +``` + ## Chunkers Use chunker helpers (see `dev/helpers/chunkers.ts`) to keep `toKnowledgePool` implementations focused on orchestration. A `toKnowledgePool` can combine multiple chunkers, enrich each chunk with metadata, and return everything the embeddings collection needs. @@ -649,14 +599,6 @@ Because you control the output, you can mix different field types, discard empty - If any entry is malformed, the vectorize job fails with `hasError = true` and an error message listing invalid indices. 
- To retry after fixing your `toKnowledgePool` logic, clear `hasError` and `completedAt` (and set `processing` to `false` if needed) on the failed `payload-jobs` row. The queue runner will pick it up on the next interval. -## PostgreSQL Custom Schema Support - -The plugin reads the `schemaName` configuration from your Postgres adapter within the Payload config. - -When you configure a custom schema via `postgresAdapter({ schemaName: 'custom' })`, all plugin SQL queries (for vector columns, indexes, and embeddings) are qualified with that schema name. This is useful for multi-tenant setups or when content tables live in a dedicated schema. - -Where schemaName is not specified within the postgresAdapter in the Payload config, the plugin falls back to `public` as is default adapter behaviour. - ## Example ### Using with Voyage AI @@ -694,7 +636,7 @@ You can see more examples in `dev/helpers/embed.ts` **POST** `/api/vector-search` -Search for similar content using vector similarity. +Search for similar content using vector search. **Request Body:** @@ -724,7 +666,7 @@ Search for similar content using vector similarity. "results": [ { "id": "embedding_id", - "similarity": 0.85, + "score": 0.85, "sourceCollection": "posts", "docId": "post_id", "chunkIndex": 0, @@ -1019,13 +961,14 @@ The more detailed your issue, the better I can understand and address your needs Thank you for the stars! The following updates have been completed: -- **Multiple Knowledge Pools**: You can create separate knowledge pools with independent configurations (dims, ivfflatLists, embedding functions) and needs. Each pool operates independently, allowing you to organize your vectorized content by domain, use case, or any other criteria that makes sense for your application. +- **Multiple Knowledge Pools**: You can create separate knowledge pools with independent configurations and embedding functions. 
Each pool operates independently, allowing you to organize your vectorized content by domain, use case, or any other criteria that makes sense for your application. +- **Database Adapter Architecture**: Pluggable adapter system allowing support for different database backends. - **More expressive queries**: Added ability to change query limit, search on certain collections or certain fields - **Bulk embed all**: Batch backfills with admin button, provider callbacks, and run tracking. The following features are planned for future releases based on community interest and stars: -- **MongoDB support**: Extend vector search capabilities to MongoDB databases +- **MongoDB adapter**: Add a `@payloadcms-vectorize/mongodb` adapter for MongoDB Atlas Vector Search - **Vercel support**: Optimized deployment and configuration for Vercel hosting **Want to see these features sooner?** Star this repository and open issues for the features you need most! diff --git a/adapters/README.md b/adapters/README.md new file mode 100644 index 0000000..832a40b --- /dev/null +++ b/adapters/README.md @@ -0,0 +1,217 @@ +# PayloadCMS Vectorize - Database Adapters + +The `payloadcms-vectorize` plugin uses a database adapter architecture to support different database backends for vector storage and search. This document explains how adapters work and how to create your own. + +## Available Adapters + +| Adapter | Package | Database | +|---------|---------|----------| +| PostgreSQL | `@payloadcms-vectorize/pg` | PostgreSQL with pgvector | + +## DbAdapter Interface + +All database adapters must implement the `DbAdapter` interface exported from `payloadcms-vectorize`: + +```typescript +import type { Config, BasePayload, Payload, Where } from 'payload' +import type { KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' + +export type DbAdapter = { + /** + * Extends the Payload config with adapter-specific configuration. + * Called during plugin initialization. 
+ * + * @param payloadCmsConfig - The Payload CMS config object + * @returns Configuration extension with optional bins (CLI commands) and custom data + */ + getConfigExtension: (payloadCmsConfig: Config) => { + bins?: { key: string; scriptPath: string }[] + custom?: Record<string, any> + } + + /** + * Performs a vector search. + * + * @param payload - The Payload instance + * @param queryEmbedding - The query vector to search for + * @param poolName - The knowledge pool to search in + * @param limit - Maximum number of results (optional) + * @param where - Payload-style where clause for filtering (optional) + * @returns Array of search results with relevance scores + */ + search: ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit?: number, + where?: Where, + ) => Promise<Array<VectorSearchResult>> + + /** + * Stores an embedding vector for a document chunk. + * + * @param payload - The Payload instance + * @param poolName - The knowledge pool to store in + * @param id - The embedding record ID + * @param embedding - The vector to store + */ + storeEmbedding: ( + payload: Payload, + poolName: KnowledgePoolName, + id: string, + embedding: number[] | Float32Array, + ) => Promise<void> +} +``` + +## Creating a Custom Adapter + +To create a custom adapter for a new database backend: + +### 1. Create the adapter package + +``` +my-adapter/ +├── src/ +│ ├── index.ts # Main entry point, exports integration factory +│ ├── search.ts # Vector search implementation +│ └── embed.ts # Embedding storage implementation (storeEmbedding) +├── package.json +└── README.md +``` + +### 2. Implement the integration factory + +Your adapter should export a factory function that returns: +- Any database-specific hooks (e.g., schema initialization) +- The `DbAdapter` implementation + +```typescript +import type { DbAdapter } from 'payloadcms-vectorize' + +export type MyAdapterConfig = { + [poolName: string]: { + dims: number + // ... 
other database-specific options + } +} + +export const createMyVectorIntegration = ( + config: MyAdapterConfig, +): { + // Database-specific hooks (optional) + someHook?: SomeHookType + // Required: the adapter implementation + adapter: DbAdapter +} => { + const adapter: DbAdapter = { + getConfigExtension: (payloadCmsConfig) => { + return { + // Optional: register CLI commands + bins: [ + { + key: 'vectorize:my-command', + scriptPath: '/path/to/script.js', + }, + ], + // Optional: store adapter-specific data + custom: { + _staticConfigs: config, + }, + } + }, + + search: async (payload, queryEmbedding, poolName, limit, where) => { + // Implement vector search for your database + // Must return Array<VectorSearchResult> + }, + + storeEmbedding: async (payload, poolName, id, embedding) => { + // Implement embedding storage for your database + }, + } + + return { adapter } +} +``` + +### 3. Define peer dependencies + +Your adapter should have peer dependencies on: +- `payload` - The Payload CMS package +- `payloadcms-vectorize` - The vectorize plugin +- Your database adapter (e.g., `@payloadcms/db-mongodb`) + +```json +{ + "peerDependencies": { + "payload": ">=3.0.0 <4.0.0", + "payloadcms-vectorize": ">=0.5.4 <1.0.0", + "@payloadcms/db-your-db": ">=3.0.0 <4.0.0" + } +} +``` + +### 4. Usage in Payload config + +Users will use your adapter like this: + +```typescript +import { buildConfig } from 'payload' +import { myDbAdapter } from '@payloadcms/db-my-db' +import { createMyVectorIntegration } from 'my-vectorize-adapter' +import payloadcmsVectorize from 'payloadcms-vectorize' + +const integration = createMyVectorIntegration({ + default: { + dims: 1536, + // ... other options + }, +}) + +export default buildConfig({ + db: myDbAdapter({ + // ... your database config + // Include any hooks from the integration if needed + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + default: { + // ... 
pool config + }, + }, + }), + ], +}) +``` + +## VectorSearchResult Type + +The `search` method must return results conforming to this type: + +```typescript +export type VectorSearchResult = { + /** The embedding record ID */ + id: string + /** Relevance score (higher = more relevant). Range depends on adapter implementation. */ + score: number + /** Source collection slug */ + sourceCollection: string + /** Source document ID */ + docId: string + /** Chunk index within the document */ + chunkIndex: number + /** The text content of the chunk */ + chunkText: string + /** Embedding version string */ + embeddingVersion: string + /** Any extension field values */ + [key: string]: any +} +``` + +## Contributing + +Want to add support for a new database? We welcome contributions! Please open an issue first to discuss the implementation approach. diff --git a/adapters/cf/README.md b/adapters/cf/README.md new file mode 100644 index 0000000..252ef8d --- /dev/null +++ b/adapters/cf/README.md @@ -0,0 +1,214 @@ +# @payloadcms-vectorize/cf + +Cloudflare Vectorize adapter for [payloadcms-vectorize](https://github.com/techiejd/payloadcms-vectorize). Enables vector search capabilities using Cloudflare Vectorize. + +## Prerequisites + +- Cloudflare account with Vectorize index configured +- Payload CMS 3.x with any supported database adapter +- Node.js 18+ + +## Installation + +```bash +pnpm add @payloadcms-vectorize/cf payloadcms-vectorize +``` + +## Quick Start + +### 1. Create Vectorize Index + +Create a Vectorize index in your Cloudflare dashboard or via Wrangler: + +```bash +wrangler vectorize create my-vectorize-index --dimensions=384 --metric=cosine +``` + +### 2. 
Configure the Plugin + +```typescript +import { buildConfig } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { createCloudflareVectorizeIntegration } from '@payloadcms-vectorize/cf' +import payloadcmsVectorize from 'payloadcms-vectorize' + +// Create the integration +const integration = createCloudflareVectorizeIntegration({ + config: { + default: { + dims: 384, // Vector dimensions (must match your embedding model and Vectorize index) + }, + }, + binding: env.VECTORIZE, // Cloudflare Vectorize binding +}) + +export default buildConfig({ + // ... your existing config + db: postgresAdapter({ + pool: { + connectionString: process.env.DATABASE_URL, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: 'v1.0.0', + queryFn: embedQuery, + realTimeIngestionFn: embedDocs, + }, + }, + }, + }), + ], +}) +``` + +## Configuration + +The `createCloudflareVectorizeIntegration` function accepts a configuration object with `config` and `binding` properties: + +```typescript +const integration = createCloudflareVectorizeIntegration({ + config: { + poolName: { + dims: number, // Required: Vector dimensions + }, + // ... additional pools + }, + binding: vectorizeBinding, // Required: Cloudflare Vectorize binding +}) +``` + +### Configuration Options + +| Option | Type | Required | Description | +| ------ | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dims` | `number` | Yes | Vector dimensions for the Vectorize index. Must match your embedding model's output dimensions and your Cloudflare Vectorize index configuration. 
| + +### Cloudflare Bindings + +| Property | Type | Required | Description | +| ----------- | ---------------- | -------- | ------------------------------------------------------------------------------------------------- | +| `binding` | `VectorizeIndex` | Yes | Cloudflare Vectorize binding for vector storage. Configured in `wrangler.toml` for Workers/Pages. | + +## Integration Return Value + +`createCloudflareVectorizeIntegration` returns an object with: + +| Property | Type | Description | +| --------- | ----------- | ------------------------------------------------------------------------- | +| `adapter` | `DbAdapter` | The database adapter to pass to `payloadcmsVectorize({ dbAdapter: ... })` | + +## Multiple Knowledge Pools + +You can configure multiple knowledge pools with different dimensions: + +```typescript +const integration = createCloudflareVectorizeIntegration({ + config: { + documents: { + dims: 1536, + }, + images: { + dims: 512, + }, + }, + binding: env.VECTORIZE, +}) + +export default buildConfig({ + // ... + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + documents: { + collections: { + /* ... */ + }, + embeddingConfig: { + /* ... */ + }, + }, + images: { + collections: { + /* ... */ + }, + embeddingConfig: { + /* ... */ + }, + }, + }, + }), + ], +}) +``` + +**Note:** Each knowledge pool requires a separate Vectorize index with matching dimensions. 
+ +## Using with Cloudflare AI + +```typescript +export const embedDocs = async (texts: string[]): Promise<number[][]> => { + const results = await Promise.all( + texts.map((text) => + env.AI.run('@cf/baai/bge-small-en-v1.5', { + text, + }), + ), + ) + return results.map((r) => r.data[0]) +} + +export const embedQuery = async (text: string): Promise<number[]> => { + const result = await env.AI.run('@cf/baai/bge-small-en-v1.5', { + text, + }) + return result.data[0] +} +``` + +## Using with Voyage AI + +```typescript +import { embed, embedMany } from 'ai' +import { voyage } from 'voyage-ai-provider' + +export const embedDocs = async (texts: string[]): Promise<number[][]> => { + const embedResult = await embedMany({ + model: voyage.textEmbeddingModel('voyage-3.5-lite'), + values: texts, + providerOptions: { + voyage: { inputType: 'document' }, + }, + }) + return embedResult.embeddings +} + +export const embedQuery = async (text: string): Promise<number[]> => { + const embedResult = await embed({ + model: voyage.textEmbeddingModel('voyage-3.5-lite'), + value: text, + providerOptions: { + voyage: { inputType: 'query' }, + }, + }) + return embedResult.embedding +} +``` + +## Known Limitations + +- **Search `limit` with `where` filtering:** When a `where` clause is provided, filtering is applied after fetching results from Cloudflare Vectorize. This means you may receive fewer results than the requested `limit` even when more matching vectors exist. + +## License + +MIT diff --git a/adapters/cf/dev/specs/adapter.spec.ts b/adapters/cf/dev/specs/adapter.spec.ts new file mode 100644 index 0000000..5afdb44 --- /dev/null +++ b/adapters/cf/dev/specs/adapter.spec.ts @@ -0,0 +1,336 @@ +/** + * Unit tests for the Cloudflare Vectorize adapter. + * + * These tests verify adapter functionality using mocked Cloudflare bindings + * without requiring a real Payload instance. 
+ */ +import { describe, expect, test, vi } from 'vitest' +import { createCloudflareVectorizeIntegration } from '../../src/index.js' + +const DIMS = 8 + +// Mock Cloudflare binding +function createMockCloudflareBinding() { + const storage = new Map() + + return { + query: vi.fn(async (queryVector: number[], options: any) => { + const { topK = 10, returnMetadata = false, where } = options + + // Simple in-memory search using cosine similarity + const results = Array.from(storage.values()) + .filter((item) => { + // Basic metadata filtering + if (where?.and) { + return where.and.every((condition: any) => { + const key = condition.key + const value = condition.value + return item.metadata?.[key] === value + }) + } + return true + }) + .map((item) => { + // Calculate cosine similarity + const dotProduct = item.values.reduce((sum, v, i) => sum + v * queryVector[i], 0) + const normA = Math.sqrt(queryVector.reduce((sum, v) => sum + v * v, 0)) + const normB = Math.sqrt(item.values.reduce((sum, v) => sum + v * v, 0)) + const score = normA === 0 || normB === 0 ? 0 : dotProduct / (normA * normB) + + return { + id: item.id, + score, + metadata: returnMetadata ? item.metadata : undefined, + } + }) + .sort((a, b) => b.score - a.score) + .slice(0, topK) + + return { matches: results } + }), + + upsert: vi.fn(async (vectors: any[]) => { + for (const vector of vectors) { + storage.set(vector.id, { + id: vector.id, + values: vector.values, + metadata: vector.metadata || {}, + }) + } + }), + + deleteByIds: vi.fn(async (ids: string[]) => { + for (const id of ids) { + storage.delete(id) + } + }), + + list: vi.fn(async (options: any) => { + const vectors = Array.from(storage.values()).map((item) => ({ + id: item.id, + values: item.values, + metadata: options?.returnMetadata ? 
item.metadata : undefined, + })) + return { vectors } + }), + + // Helper to get storage for assertions + __getStorage: () => storage, + } +} + +function createMockPayloadForEmbed(mockBinding: any) { + return { + config: { + custom: { + createVectorizedPayloadObject: () => ({ + getDbAdapterCustom: () => ({ _vectorizeBinding: mockBinding }), + }), + }, + }, + create: vi.fn().mockResolvedValue({ id: 'mapping-1' }), + logger: { error: vi.fn() }, + } as any +} + +describe('createCloudflareVectorizeIntegration', () => { + describe('validation', () => { + test('should throw if vectorize binding is missing', () => { + expect(() => { + createCloudflareVectorizeIntegration({ + config: { default: { dims: 384 } }, + binding: undefined as any, + }) + }).toThrow('Cloudflare Vectorize binding is required') + }) + + test('should create integration with valid config', () => { + const mockVectorize = { query: vi.fn(), upsert: vi.fn(), deleteByIds: vi.fn() } + + const integration = createCloudflareVectorizeIntegration({ + config: { default: { dims: 384 } }, + binding: mockVectorize, + }) + + expect(integration).toBeDefined() + expect(integration.adapter).toBeDefined() + expect(integration.adapter.storeEmbedding).toBeDefined() + expect(integration.adapter.search).toBeDefined() + expect(integration.adapter.deleteEmbeddings).toBeDefined() + expect(integration.adapter.getConfigExtension).toBeDefined() + }) + }) + + describe('getConfigExtension', () => { + test('should return config with pool configurations', () => { + const poolConfigs = { mainPool: { dims: 384 }, secondaryPool: { dims: 768 } } + const mockVectorize = { query: vi.fn(), upsert: vi.fn(), deleteByIds: vi.fn() } + + const { adapter } = createCloudflareVectorizeIntegration({ + config: poolConfigs, + binding: mockVectorize, + }) + const extension = adapter.getConfigExtension({} as any) + + expect(extension.custom?._cfVectorizeAdapter).toBe(true) + expect(extension.custom?._poolConfigs).toEqual(poolConfigs) + }) + + 
test('should return collections with cfMappings', () => { + const mockVectorize = { query: vi.fn(), upsert: vi.fn(), deleteByIds: vi.fn() } + + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 384 } }, + binding: mockVectorize, + }) + const extension = adapter.getConfigExtension({} as any) + + expect(extension.collections).toBeDefined() + expect(extension.collections!['vector-cf-mappings']).toBeDefined() + expect(extension.collections!['vector-cf-mappings'].slug).toBe('vector-cf-mappings') + }) + }) + + describe('storeEmbedding', () => { + test('should convert Float32Array to regular array', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const embedding = new Float32Array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + const mockPayload = createMockPayloadForEmbed(mockBinding) + + await adapter.storeEmbedding( + mockPayload, + 'default', + 'test-collection', + 'doc-1', + 'test-id', + embedding, + ) + + expect(mockBinding.upsert).toHaveBeenCalledWith([ + { + id: 'test-id', + values: Array.from(embedding), + }, + ]) + }) + + test('should create a mapping row', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = createMockPayloadForEmbed(mockBinding) + const embedding = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + + await adapter.storeEmbedding( + mockPayload, + 'default', + 'test-collection', + 'doc-1', + 'test-id', + embedding, + ) + + expect(mockPayload.create).toHaveBeenCalledWith({ + collection: 'vector-cf-mappings', + data: { + vectorId: 'test-id', + poolName: 'default', + sourceCollection: 'test-collection', + docId: 'doc-1', + }, + }) + }) + }) + + describe('deleteEmbeddings', () => { + test('should look up 
mappings with correct where clause', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ docs: [], hasNextPage: false }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockPayload.find).toHaveBeenCalledWith( + expect.objectContaining({ + collection: 'vector-cf-mappings', + where: { + and: [ + { poolName: { equals: 'default' } }, + { sourceCollection: { equals: 'test-collection' } }, + { docId: { equals: 'doc-123' } }, + ], + }, + }), + ) + }) + + test('should delete matching vectors via mappings', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ + docs: [ + { id: 'map-1', vectorId: 'vec-1' }, + { id: 'map-2', vectorId: 'vec-2' }, + ], + hasNextPage: false, + }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockBinding.deleteByIds).toHaveBeenCalledWith(['vec-1', 'vec-2']) + }) + + test('should clean up mapping rows after deleting vectors', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ + docs: [{ id: 'map-1', vectorId: 'vec-1' }], + hasNextPage: false, + }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await 
adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockPayload.delete).toHaveBeenCalledWith( + expect.objectContaining({ + collection: 'vector-cf-mappings', + where: { + and: [ + { poolName: { equals: 'default' } }, + { sourceCollection: { equals: 'test-collection' } }, + { docId: { equals: 'doc-123' } }, + ], + }, + }), + ) + }) + + test('should handle empty results gracefully', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ docs: [], hasNextPage: false }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockBinding.deleteByIds).not.toHaveBeenCalled() + }) + + test('should handle errors', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockRejectedValue(new Error('Query failed')), + logger: { error: vi.fn() }, + } as any + + await expect( + adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123'), + ).rejects.toThrow('Failed to delete embeddings') + + expect(mockPayload.logger.error).toHaveBeenCalled() + }) + }) +}) diff --git a/adapters/cf/dev/specs/compliance.spec.ts b/adapters/cf/dev/specs/compliance.spec.ts new file mode 100644 index 0000000..2fae60b --- /dev/null +++ b/adapters/cf/dev/specs/compliance.spec.ts @@ -0,0 +1,432 @@ +/** + * Adapter compliance tests for the Cloudflare Vectorize adapter. + * + * These tests verify that the Cloudflare adapter correctly implements + * the DbAdapter interface as defined in payloadcms-vectorize. 
+ * + * Note: Uses mocked Cloudflare bindings since there's no local Vectorize emulator. + */ +import { beforeAll, afterAll, describe, expect, test, vi } from 'vitest' +import type { Payload, SanitizedConfig } from 'payload' +import { buildConfig, getPayload } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { Client } from 'pg' +import { createCloudflareVectorizeIntegration } from '../../src/index.js' +import payloadcmsVectorize from 'payloadcms-vectorize' +import type { DbAdapter } from 'payloadcms-vectorize' + +const DIMS = 8 +const dbName = `cf_compliance_test_${Date.now()}` + +// Mock Cloudflare Vectorize binding +function createMockVectorizeBinding() { + const storage = new Map< + string, + { id: string; values: number[]; metadata?: Record } + >() + + return { + query: vi.fn(async (vector: number[], options?: any) => { + const topK = options?.topK || 10 + const allVectors = Array.from(storage.values()) + + // Apply WHERE filter if present + let filtered = allVectors + if (options?.where?.and) { + filtered = allVectors.filter((vec) => { + return options.where.and.every((condition: any) => { + return vec.metadata?.[condition.key] === condition.value + }) + }) + } + + // Calculate cosine similarity + const results = filtered.map((vec) => { + let dotProduct = 0 + let normA = 0 + let normB = 0 + + for (let i = 0; i < vector.length; i++) { + dotProduct += vector[i] * vec.values[i] + normA += vector[i] * vector[i] + normB += vec.values[i] * vec.values[i] + } + + const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) + + return { + id: vec.id, + score: similarity, + metadata: vec.metadata, + } + }) + + // Sort by score descending and limit + results.sort((a, b) => b.score - a.score) + return { matches: results.slice(0, topK) } + }), + + upsert: vi.fn(async (vectors: Array<{ id: string; values: number[]; metadata?: any }>) => { + for (const vec of vectors) { + 
storage.set(vec.id, vec) + } + }), + + deleteByIds: vi.fn(async (ids: string[]) => { + for (const id of ids) { + storage.delete(id) + } + }), + + list: vi.fn(async () => { + return Array.from(storage.values()) + }), + + // Test helper + __getStorage: () => storage, + } +} + +// Helper to create test database +async function createTestDb(name: string) { + const adminUri = + process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' + const client = new Client({ connectionString: adminUri }) + await client.connect() + + const exists = await client.query('SELECT 1 FROM pg_database WHERE datname = $1', [name]) + if (exists.rowCount === 0) { + await client.query(`CREATE DATABASE ${name}`) + } + await client.end() +} + +describe('Cloudflare Adapter Compliance Tests', () => { + let adapter: DbAdapter + let payload: Payload + let config: SanitizedConfig + let mockVectorize: ReturnType + + beforeAll(async () => { + await createTestDb(dbName) + + mockVectorize = createMockVectorizeBinding() + + const { adapter: cfAdapter } = createCloudflareVectorizeIntegration({ + config: { + default: { + dims: DIMS, + }, + }, + binding: mockVectorize as any, + }) + adapter = cfAdapter + + config = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [], + db: postgresAdapter({ + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: adapter, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: 'test-v1', + queryFn: async () => Array(DIMS).fill(0.5), + realTimeIngestionFn: async (texts) => texts.map(() => Array(DIMS).fill(0.5)), + }, + }, + }, + }), + ], + }) + + payload = await getPayload({ + config, + key: `cf-compliance-${Date.now()}`, + cron: false, + }) + }) + + afterAll(async () => { + // Cleanup is handled by test isolation + }) + + describe('getConfigExtension()', () => { + test('returns a valid 
config extension object', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension).toBeDefined() + expect(typeof extension).toBe('object') + }) + + test('custom property contains adapter metadata', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension.custom).toBeDefined() + expect(extension.custom!._cfVectorizeAdapter).toBe(true) + expect(extension.custom!._poolConfigs).toBeDefined() + expect(extension.custom!._poolConfigs.default).toBeDefined() + expect(extension.custom!._poolConfigs.default.dims).toBe(DIMS) + }) + + test('collections property contains cfMappings collection', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension.collections).toBeDefined() + expect(extension.collections!['vector-cf-mappings']).toBeDefined() + expect(extension.collections!['vector-cf-mappings'].slug).toBe('vector-cf-mappings') + }) + }) + + describe('storeEmbedding()', () => { + test('persists embedding without error (number[])', async () => { + const embedding = Array(DIMS) + .fill(0) + .map(() => Math.random()) + + const sourceDocId = `test-embed-1-${Date.now()}` + + // Create a document first + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'test text for embedding', + embeddingVersion: 'v1-test', + }, + }) + + await expect( + adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + String(doc.id), + embedding, + ), + ).resolves.not.toThrow() + + expect(mockVectorize.upsert).toHaveBeenCalled() + }) + + test('persists embedding without error (Float32Array)', async () => { + const embedding = new Float32Array( + Array(DIMS) + .fill(0) + .map(() => Math.random()), + ) + + const sourceDocId = `test-embed-2-${Date.now()}` + + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + 
docId: sourceDocId, + chunkIndex: 0, + chunkText: 'test text for Float32Array', + embeddingVersion: 'v1-test', + }, + }) + + await expect( + adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + String(doc.id), + embedding, + ), + ).resolves.not.toThrow() + + expect(mockVectorize.upsert).toHaveBeenCalled() + }) + + test('stores embedding in Vectorize with correct ID', async () => { + const embedding = Array(DIMS).fill(0.5) + + const sourceDocId = `test-embed-id-${Date.now()}` + + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'test text', + embeddingVersion: 'v1-test', + }, + }) + + const embeddingId = String(doc.id) + await adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + embeddingId, + embedding, + ) + + const storage = mockVectorize.__getStorage() + expect(storage.has(embeddingId)).toBe(true) + expect(storage.get(embeddingId)?.values).toEqual(embedding) + }) + }) + + describe('search()', () => { + let targetEmbedding: number[] + let similarDocId: string + + beforeAll(async () => { + // Create test documents with known embeddings + targetEmbedding = Array(DIMS).fill(0.5) + const similarEmbedding = Array(DIMS) + .fill(0.5) + .map((v) => v + Math.random() * 0.05) + + const sourceDocId = `test-search-similar-${Date.now()}` + + // Create and embed a document + const similarDoc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'similar document for search test', + embeddingVersion: 'v1-test', + }, + }) + similarDocId = String(similarDoc.id) + await adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + similarDocId, + similarEmbedding, + ) + }) + + test('returns an array of results', async () => { + const results = await adapter.search(payload, 
targetEmbedding, 'default') + + expect(Array.isArray(results)).toBe(true) + }) + + test('results contain required fields', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default') + + for (const result of results) { + expect(result).toHaveProperty('id') + expect(result).toHaveProperty('score') + expect(result).toHaveProperty('sourceCollection') + expect(result).toHaveProperty('docId') + expect(result).toHaveProperty('chunkIndex') + expect(result).toHaveProperty('chunkText') + expect(result).toHaveProperty('embeddingVersion') + + expect(typeof result.id).toBe('string') + expect(typeof result.score).toBe('number') + expect(typeof result.sourceCollection).toBe('string') + expect(typeof result.docId).toBe('string') + expect(typeof result.chunkIndex).toBe('number') + expect(typeof result.chunkText).toBe('string') + expect(typeof result.embeddingVersion).toBe('string') + } + }) + + test('results are ordered by score (highest first)', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 10) + + for (let i = 1; i < results.length; i++) { + expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score) + } + }) + + test('respects limit parameter', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 1) + + expect(results.length).toBeLessThanOrEqual(1) + }) + + test('calls Vectorize query with correct parameters', async () => { + await adapter.search(payload, targetEmbedding, 'default', 5) + + expect(mockVectorize.query).toHaveBeenCalledWith(targetEmbedding, expect.any(Object)) + }) + }) + + describe('deleteEmbeddings()', () => { + test('removes embeddings from Vectorize via mapping', async () => { + const embedding = Array(DIMS).fill(0.7) + + const sourceDocId = `doc-to-delete-${Date.now()}` + + // Create and embed a document + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'delete-test', + docId: sourceDocId, + 
chunkIndex: 0, + chunkText: 'document to delete', + embeddingVersion: 'v1-test', + }, + }) + + const embeddingId = String(doc.id) + await adapter.storeEmbedding( + payload, + 'default', + 'delete-test', + sourceDocId, + embeddingId, + embedding, + ) + + // Verify it's stored in Vectorize + const storage = mockVectorize.__getStorage() + expect(storage.has(embeddingId)).toBe(true) + + // Delete it + await adapter.deleteEmbeddings?.(payload, 'default', 'delete-test', sourceDocId) + + // Verify deleteByIds was called with the correct vector ID + expect(mockVectorize.deleteByIds).toHaveBeenCalledWith([embeddingId]) + + // Verify mapping rows are cleaned up + const remainingMappings = await payload.find({ + collection: 'vector-cf-mappings' as any, + where: { + and: [ + { poolName: { equals: 'default' } }, + { sourceCollection: { equals: 'delete-test' } }, + { docId: { equals: sourceDocId } }, + ], + }, + }) + expect(remainingMappings.totalDocs).toBe(0) + }) + + test('handles non-existent embeddings gracefully', async () => { + await expect( + adapter.deleteEmbeddings?.(payload, 'default', 'non-existent', 'fake-id'), + ).resolves.not.toThrow() + }) + }) +}) diff --git a/adapters/cf/package.json b/adapters/cf/package.json new file mode 100644 index 0000000..4c2a07d --- /dev/null +++ b/adapters/cf/package.json @@ -0,0 +1,27 @@ +{ + "name": "@payloadcms-vectorize/cf", + "version": "0.6.0-beta.4", + "description": "Cloudflare Vectorize adapter for payloadcms-vectorize", + "license": "MIT", + "type": "module", + "files": [ + "dist" + ], + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "peerDependencies": { + "payload": ">=3.0.0 <4.0.0", + "payloadcms-vectorize": ">=0.6.0-beta <1.0.0" + }, + "devDependencies": { + "payloadcms-vectorize": "workspace:*" + }, + "engines": { + "node": "^18.20.2 || >=20.9.0", + "pnpm": "^9 || ^10" + }, + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts" + } +} diff --git 
a/adapters/cf/src/collections/cfMappings.ts b/adapters/cf/src/collections/cfMappings.ts new file mode 100644 index 0000000..ea9743b --- /dev/null +++ b/adapters/cf/src/collections/cfMappings.ts @@ -0,0 +1,49 @@ +import type { CollectionConfig } from 'payload' + +export const CF_MAPPINGS_SLUG = 'vector-cf-mappings' + +// This collection maps Cloudflare Vectorize vector IDs to source documents, +// so we can find and delete vectors when the source document is deleted. +const CFMappingsCollection: CollectionConfig = { + slug: CF_MAPPINGS_SLUG, + admin: { + hidden: true, + description: + 'Maps Cloudflare Vectorize vector IDs to source documents. Managed by the CF adapter.', + }, + access: { + read: () => true, + create: ({ req }) => req?.payloadAPI === 'local', + update: ({ req }) => req?.payloadAPI === 'local', + delete: ({ req }) => req?.payloadAPI === 'local', + }, + fields: [ + { + name: 'vectorId', + type: 'text', + required: true, + index: true, + }, + { + name: 'poolName', + type: 'text', + required: true, + index: true, + }, + { + name: 'sourceCollection', + type: 'text', + required: true, + index: true, + }, + { + name: 'docId', + type: 'text', + required: true, + index: true, + }, + ], + timestamps: true, +} + +export default CFMappingsCollection diff --git a/adapters/cf/src/embed.ts b/adapters/cf/src/embed.ts new file mode 100644 index 0000000..0f93bd0 --- /dev/null +++ b/adapters/cf/src/embed.ts @@ -0,0 +1,44 @@ +import { CollectionSlug, Payload } from 'payload' +import { getVectorizeBinding } from './types.js' +import { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' + +/** + * Store an embedding vector in Cloudflare Vectorize + */ +export default async ( + payload: Payload, + poolName: string, + sourceCollection: string, + sourceDocId: string, + id: string, + embedding: number[] | Float32Array, +) => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + const vector = Array.isArray(embedding) ? 
embedding : Array.from(embedding) + + // Upsert the vector in Cloudflare Vectorize + await vectorizeBinding.upsert([ + { + id, + values: vector, + }, + ]) + + // Create a mapping row so we can find this vector during deletion + await payload.create({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + data: { + vectorId: id, + poolName, + sourceCollection, + docId: sourceDocId, + }, + }) + } catch (e) { + const errorMessage = e instanceof Error ? e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] Failed to store embedding: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] Failed to store embedding: ${errorMessage}`) + } +} diff --git a/adapters/cf/src/index.ts b/adapters/cf/src/index.ts new file mode 100644 index 0000000..8a658e4 --- /dev/null +++ b/adapters/cf/src/index.ts @@ -0,0 +1,127 @@ +import type { CollectionSlug } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { getVectorizeBinding } from './types.js' +import type { CloudflareVectorizeBinding, KnowledgePoolsConfig } from './types.js' +import cfMappingsCollection, { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' +import embed from './embed.js' +import search from './search.js' + +/** + * Configuration for Cloudflare Vectorize integration + */ +interface CloudflareVectorizeConfig { + /** Knowledge pools configuration with their dimensions */ + config: KnowledgePoolsConfig + /** Cloudflare Vectorize binding for vector storage */ + binding: CloudflareVectorizeBinding +} + +/** + * Create a Cloudflare Vectorize integration for payloadcms-vectorize + * + * @param options Configuration object with knowledge pools and Vectorize binding + * @returns Object containing the DbAdapter instance + * + * @example + * ```typescript + * import { createCloudflareVectorizeIntegration } from '@payloadcms-vectorize/cf' + * + * const { adapter } = createCloudflareVectorizeIntegration({ + * config: { + * default: { + * dims: 384, + * }, + * }, + * binding: 
env.VECTORIZE, + * }) + * ``` + */ +export const createCloudflareVectorizeIntegration = ( + options: CloudflareVectorizeConfig, +): { adapter: DbAdapter } => { + if (!options.binding) { + throw new Error('[@payloadcms-vectorize/cf] Cloudflare Vectorize binding is required') + } + + const poolConfig = options.config + + const adapter: DbAdapter = { + getConfigExtension: () => { + return { + collections: { + [CF_MAPPINGS_SLUG]: cfMappingsCollection, + }, + custom: { + _cfVectorizeAdapter: true, + _poolConfigs: poolConfig, + _vectorizeBinding: options.binding, + }, + } + }, + + search, + + storeEmbedding: embed, + + deleteEmbeddings: async (payload, poolName, sourceCollection, docId) => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + // Paginate through all mapping rows for this document+pool + const allVectorIds: string[] = [] + let page = 1 + let hasNextPage = true + + while (hasNextPage) { + const mappings = await payload.find({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + where: { + and: [ + { poolName: { equals: poolName } }, + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + ], + }, + page, + }) + + for (const mapping of mappings.docs) { + allVectorIds.push((mapping as Record).vectorId as string) + } + + hasNextPage = mappings.hasNextPage + page++ + } + + if (allVectorIds.length === 0) { + return + } + // Delete vectors from Cloudflare Vectorize + await vectorizeBinding.deleteByIds(allVectorIds) + // Delete mapping rows + await payload.delete({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + where: { + and: [ + { poolName: { equals: poolName } }, + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + ], + }, + }) + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error) + payload.logger.error( + `[@payloadcms-vectorize/cf] Failed to delete embeddings: ${errorMessage}`, + ) + throw new Error(`[@payloadcms-vectorize/cf] Failed to delete embeddings: ${errorMessage}`) + } + }, + } + + return { adapter } +} + +export { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' +export type { CloudflareVectorizeBinding, KnowledgePoolsConfig } +export type { KnowledgePoolsConfig as KnowledgePoolConfig } diff --git a/adapters/cf/src/search.ts b/adapters/cf/src/search.ts new file mode 100644 index 0000000..060db87 --- /dev/null +++ b/adapters/cf/src/search.ts @@ -0,0 +1,109 @@ +import { BasePayload, CollectionSlug, Where } from 'payload' +import { KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' +import { getVectorizeBinding } from './types.js' + +/** + * Search for similar vectors in Cloudflare Vectorize + */ +export default async ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit: number = 10, + where?: Where, +): Promise> => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + // Query Cloudflare Vectorize + // The query returns the top-k most similar vectors + const results = await vectorizeBinding.query(queryEmbedding, { + topK: limit, + returnMetadata: true, + }) + + if (!results.matches) { + return [] + } + + // Batch-fetch all matched documents, paginating through results + const matchIds = results.matches.map((m) => m.id) + const scoreById = new Map(results.matches.map((m) => [m.id, m.score || 0])) + + const docsById = new Map>() + let page = 1 + let hasNextPage = true + while (hasNextPage) { + const found = await payload.find({ + collection: poolName as CollectionSlug, + where: { id: { in: matchIds } }, + page, + }) + for (const doc of found.docs as Record[]) { + docsById.set(String(doc.id), doc) + } + hasNextPage = found.hasNextPage + page++ + } + + // Build results preserving the original similarity-score order + const 
searchResults: VectorSearchResult[] = [] + for (const matchId of matchIds) { + const doc = docsById.get(matchId) + if (!doc || (where && !matchesWhere(doc, where))) continue + + const { id: _id, createdAt: _createdAt, updatedAt: _updatedAt, ...docFields } = doc + searchResults.push({ + id: matchId, + score: scoreById.get(matchId) || 0, + ...docFields, + } as VectorSearchResult) + } + + return searchResults + } catch (e) { + const errorMessage = e instanceof Error ? e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] Search failed: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] Search failed: ${errorMessage}`) + } +} + +/** + * Simple WHERE clause matcher for basic filtering. + * Supports: equals, in, exists, and, or + */ +function matchesWhere(doc: Record, where: Where): boolean { + if (!where || Object.keys(where).length === 0) return true + + // Handle 'and' operator + if ('and' in where && Array.isArray(where.and)) { + return where.and.every((clause: Where) => matchesWhere(doc, clause)) + } + + // Handle 'or' operator + if ('or' in where && Array.isArray(where.or)) { + return where.or.some((clause: Where) => matchesWhere(doc, clause)) + } + + // Handle field-level conditions + for (const [field, condition] of Object.entries(where)) { + if (field === 'and' || field === 'or') continue + + const value = doc[field] + + if (typeof condition === 'object' && condition !== null) { + if ('equals' in condition && value !== condition.equals) { + return false + } + if ('in' in condition && Array.isArray(condition.in) && !condition.in.includes(value)) { + return false + } + if ('exists' in condition) { + const exists = value !== undefined && value !== null + if (condition.exists !== exists) return false + } + } + } + + return true +} diff --git a/adapters/cf/src/types.ts b/adapters/cf/src/types.ts new file mode 100644 index 0000000..92786d4 --- /dev/null +++ b/adapters/cf/src/types.ts @@ -0,0 +1,65 @@ +import type { BasePayload } from 
'payload' +import { getVectorizedPayload } from 'payloadcms-vectorize' + +/** + * Retrieve the Cloudflare Vectorize binding from a Payload instance. + * Throws if the binding is not found. + */ +export function getVectorizeBinding(payload: BasePayload): CloudflareVectorizeBinding { + const binding = getVectorizedPayload(payload)?.getDbAdapterCustom() + ?._vectorizeBinding as CloudflareVectorizeBinding | undefined + if (!binding) { + throw new Error('[@payloadcms-vectorize/cf] Cloudflare Vectorize binding not found') + } + return binding +} + +/** + * Configuration for a knowledge pool in Cloudflare Vectorize + */ +export interface CloudflareVectorizePoolConfig { + /** Vector dimensions for this pool (must match embedding model output) */ + dims: number +} + +/** + * All knowledge pools configuration for Cloudflare Vectorize + */ +export type KnowledgePoolsConfig = Record + +/** A single vector match returned by a Vectorize query */ +export interface VectorizeMatch { + id: string + score?: number + metadata?: Record +} + +/** Result of a Vectorize query */ +export interface VectorizeQueryResult { + matches: VectorizeMatch[] + count: number +} + +/** Vector to upsert into Vectorize */ +export interface VectorizeVector { + id: string + values: number[] + metadata?: Record +} + +/** + * Cloudflare Vectorize binding interface. + * Mirrors the subset of the Vectorize API we use. + * For the full type, install `@cloudflare/workers-types`. 
+ */ +export interface CloudflareVectorizeBinding { + query(vector: number[], options?: { + topK?: number + returnMetadata?: boolean | 'indexed' | 'all' + filter?: Record + /** Vectorize metadata filtering */ + where?: Record + }): Promise + upsert(vectors: VectorizeVector[]): Promise + deleteByIds(ids: string[]): Promise +} diff --git a/adapters/cf/vitest.config.ts b/adapters/cf/vitest.config.ts new file mode 100644 index 0000000..8baf261 --- /dev/null +++ b/adapters/cf/vitest.config.ts @@ -0,0 +1,38 @@ +import path from 'path' +import { loadEnv } from 'payload/node' +import { fileURLToPath } from 'url' +import tsconfigPaths from 'vite-tsconfig-paths' +import { defineConfig } from 'vitest/config' + +const filename = fileURLToPath(import.meta.url) +const dirname = path.dirname(filename) + +export default defineConfig(() => { + loadEnv(path.resolve(dirname, '../../dev')) + + return { + plugins: [ + tsconfigPaths({ + ignoreConfigErrors: true, + }), + ], + resolve: { + alias: { + '@shared-test/utils': path.resolve(dirname, '../../dev/specs/utils.ts'), + '@shared-test/helpers/chunkers': path.resolve(dirname, '../../dev/helpers/chunkers.ts'), + '@shared-test/helpers/embed': path.resolve(dirname, '../../dev/helpers/embed.ts'), + '@shared-test/constants': path.resolve(dirname, '../../dev/specs/constants.ts'), + }, + }, + test: { + environment: 'node', + hookTimeout: 30_000, + testTimeout: 30_000, + include: ['dev/specs/**/*.spec.ts'], + exclude: ['**/e2e.spec.{ts,js}', '**/node_modules/**'], + // Run test files sequentially to avoid global state interference + // (embeddingsTables map and Payload instance caching) + fileParallelism: false, + }, + } +}) diff --git a/adapters/pg/README.md b/adapters/pg/README.md new file mode 100644 index 0000000..ed36565 --- /dev/null +++ b/adapters/pg/README.md @@ -0,0 +1,254 @@ +# @payloadcms-vectorize/pg + +PostgreSQL adapter for [payloadcms-vectorize](https://github.com/techiejd/payloadcms-vectorize). 
Enables vector search capabilities using PostgreSQL's pgvector extension. + +## Prerequisites + +- PostgreSQL with pgvector extension +- Payload CMS 3.x with `@payloadcms/db-postgres` +- Node.js 18+ + +## Installation + +```bash +pnpm add @payloadcms-vectorize/pg payloadcms-vectorize +``` + +## Quick Start + +### 1. Ensure pgvector permissions + +The plugin expects the `vector` extension to be configured when Payload initializes. Your PostgreSQL database user must have permission to create extensions. If your user doesn't have these permissions, someone with permissions may need to manually create the extension once: + +```sql +CREATE EXTENSION IF NOT EXISTS vector; +``` + +**Note:** Most managed PostgreSQL services (like AWS RDS, Supabase, etc.) require superuser privileges or specific extension permissions. If you encounter permission errors, contact your database administrator or check your service's documentation. + +### 2. Configure the Plugin + +```typescript +import { buildConfig } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { createPostgresVectorIntegration } from '@payloadcms-vectorize/pg' +import payloadcmsVectorize from 'payloadcms-vectorize' + +// Create the integration with static configs (dims, ivfflatLists) +const integration = createPostgresVectorIntegration({ + // Note: Changing dims requires a migration with TRUNCATE. + // Changing ivfflatLists rebuilds the index (non-destructive). + default: { + dims: 1536, // Vector dimensions (must match your embedding model) + ivfflatLists: 100, // IVFFLAT index parameter + }, +}) + +export default buildConfig({ + // ... 
your existing config + db: postgresAdapter({ + // Configure the 'vector' extension + extensions: ['vector'], + // afterSchemaInitHook adds vector columns and IVFFLAT indexes to your schema + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: process.env.DATABASE_URL, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: 'v1.0.0', + queryFn: embedQuery, + realTimeIngestionFn: embedDocs, + }, + }, + }, + }), + ], +}) +``` + +## Static Configuration + +The `createPostgresVectorIntegration` function accepts a configuration object where each key is a knowledge pool name: + +```typescript +const integration = createPostgresVectorIntegration({ + poolName: { + dims: number, // Required: Vector dimensions + ivfflatLists: number // Required: IVFFLAT index lists parameter + }, + // ... additional pools +}) +``` + +### Configuration Options + +| Option | Type | Required | Description | +|--------|------|----------|-------------| +| `dims` | `number` | Yes | Vector dimensions for the pgvector column. Must match your embedding model's output dimensions. | +| `ivfflatLists` | `number` | Yes | Number of lists for the IVFFLAT index. Higher values = faster queries but slower index builds. Recommended: `sqrt(num_rows)` to `num_rows / 1000`. | + +## Integration Return Value + +`createPostgresVectorIntegration` returns an object with: + +| Property | Type | Description | +|----------|------|-------------| +| `afterSchemaInitHook` | Function | Hook for `postgresAdapter.afterSchemaInit` that adds vector columns and IVFFLAT indexes | +| `adapter` | `DbAdapter` | The database adapter to pass to `payloadcmsVectorize({ dbAdapter: ... })` | + +## Migrations + +### Initial Setup + +After configuring the plugin, create and apply your initial migration. 
The IVFFLAT indexes are created automatically via the `afterSchemaInitHook` using Drizzle's `extraConfig`. + +```bash +# Create migration (includes embedding columns and IVFFLAT indexes) +pnpm payload migrate:create --name initial + +# Review the migration file in src/migrations/ + +# Apply the migration +pnpm payload migrate +``` + +### Changing `ivfflatLists` + +Changing `ivfflatLists` is **non-destructive**. Simply update the config and create a new migration: + +```bash +pnpm payload migrate:create --name update_ivfflat_lists +pnpm payload migrate +``` + +Drizzle will automatically generate SQL to rebuild the index with the new lists parameter. + +### Changing `dims` (Destructive) + +**Warning:** Changing `dims` is **DESTRUCTIVE** - it requires truncating the embeddings table and re-embedding all your data. + +1. Update your static config with the new `dims` value + +2. Create a migration: + ```bash + pnpm payload migrate:create --name change_dims + ``` + +3. Run the vectorize:migrate CLI to add the TRUNCATE statement: + ```bash + pnpm payload vectorize:migrate + ``` + + The CLI will: + - Detect the dims change + - Patch the migration with `TRUNCATE TABLE ... CASCADE` + - Add appropriate down migration to restore the old column type + +4. Review the migration file + +5. Apply the migration: + ```bash + pnpm payload migrate + ``` + +6. Re-embed all documents using the bulk embed feature + +### Schema Name Qualification + +The CLI automatically uses the `schemaName` from your Postgres adapter configuration. If you use a custom schema (e.g., `postgresAdapter({ schemaName: 'custom' })`), all SQL in the migration will be properly qualified with that schema name. + +### Idempotency + +Running `pnpm payload vectorize:migrate` multiple times with no config changes will not create duplicate migrations. The CLI detects when no changes are needed and exits early. 
+ +## PostgreSQL Custom Schema Support + +The adapter reads the `schemaName` configuration from your Postgres adapter. + +When you configure a custom schema via `postgresAdapter({ schemaName: 'custom' })`, all plugin SQL queries (for vector columns, indexes, and embeddings) are qualified with that schema name. This is useful for multi-tenant setups or when content tables live in a dedicated schema. + +When `schemaName` is not specified, the adapter falls back to `public`, matching the Postgres adapter's default behaviour. + +## Multiple Knowledge Pools + +You can configure multiple knowledge pools with different dimensions and index parameters: + +```typescript +const integration = createPostgresVectorIntegration({ + documents: { + dims: 1536, + ivfflatLists: 100, + }, + images: { + dims: 512, + ivfflatLists: 50, + }, +}) + +export default buildConfig({ + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + // ... + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + documents: { + collections: { /* ... */ }, + embeddingConfig: { /* ... */ }, + }, + images: { + collections: { /* ... */ }, + embeddingConfig: { /* ... 
*/ }, + }, + }, + }), + ], +}) +``` + +## Using with Voyage AI + +```typescript +import { embed, embedMany } from 'ai' +import { voyage } from 'voyage-ai-provider' + +export const embedDocs = async (texts: string[]): Promise => { + const embedResult = await embedMany({ + model: voyage.textEmbeddingModel('voyage-3.5-lite'), + values: texts, + providerOptions: { + voyage: { inputType: 'document' }, + }, + }) + return embedResult.embeddings +} + +export const embedQuery = async (text: string): Promise => { + const embedResult = await embed({ + model: voyage.textEmbeddingModel('voyage-3.5-lite'), + value: text, + providerOptions: { + voyage: { inputType: 'query' }, + }, + }) + return embedResult.embedding +} +``` + +## License + +MIT diff --git a/adapters/pg/dev/specs/compliance.spec.ts b/adapters/pg/dev/specs/compliance.spec.ts new file mode 100644 index 0000000..5b9e1fe --- /dev/null +++ b/adapters/pg/dev/specs/compliance.spec.ts @@ -0,0 +1,234 @@ +/** + * Adapter compliance tests for the Postgres adapter. + * + * These tests verify that the Postgres adapter correctly implements + * the DbAdapter interface as defined in payloadcms-vectorize. 
+ */ +import { beforeAll, afterAll, describe, expect, test } from 'vitest' +import type { Payload, SanitizedConfig } from 'payload' +import { buildConfig, getPayload } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { Client } from 'pg' +import { createPostgresVectorIntegration } from '../../src/index.js' +import payloadcmsVectorize from 'payloadcms-vectorize' +import type { DbAdapter } from 'payloadcms-vectorize' + +const DIMS = 8 +const dbName = `pg_compliance_test_${Date.now()}` + +// Helper to create test database +async function createTestDb(name: string) { + const adminUri = + process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' + const client = new Client({ connectionString: adminUri }) + await client.connect() + + const exists = await client.query('SELECT 1 FROM pg_database WHERE datname = $1', [name]) + if (exists.rowCount === 0) { + await client.query(`CREATE DATABASE ${name}`) + } + await client.end() +} + +describe('Postgres Adapter Compliance Tests', () => { + let adapter: DbAdapter + let payload: Payload + let config: SanitizedConfig + + beforeAll(async () => { + await createTestDb(dbName) + + const { afterSchemaInitHook, adapter: pgAdapter } = createPostgresVectorIntegration({ + default: { + dims: DIMS, + ivfflatLists: 1, + }, + }) + adapter = pgAdapter + + config = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: adapter, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: 'test-v1', + queryFn: async () => Array(DIMS).fill(0.5), + realTimeIngestionFn: async (texts) => texts.map(() => Array(DIMS).fill(0.5)), 
+ }, + }, + }, + }), + ], + }) + + payload = await getPayload({ + config, + key: `pg-compliance-${Date.now()}`, + cron: false, + }) + }) + + afterAll(async () => { + // Cleanup is handled by test isolation + }) + + describe('getConfigExtension()', () => { + test('returns a valid config extension object', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension).toBeDefined() + expect(typeof extension).toBe('object') + }) + + test('bins property contains vectorize:migrate script', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension.bins).toBeDefined() + expect(Array.isArray(extension.bins)).toBe(true) + expect(extension.bins!.length).toBeGreaterThan(0) + + const migrateScript = extension.bins!.find((b) => b.key === 'vectorize:migrate') + expect(migrateScript).toBeDefined() + expect(migrateScript!.scriptPath).toBeTruthy() + }) + + test('custom property contains _staticConfigs', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension.custom).toBeDefined() + expect(extension.custom!._staticConfigs).toBeDefined() + expect(extension.custom!._staticConfigs.default).toBeDefined() + expect(extension.custom!._staticConfigs.default.dims).toBe(DIMS) + }) + }) + + describe('storeEmbedding()', () => { + test('persists embedding without error (number[])', async () => { + const embedding = Array(DIMS) + .fill(0) + .map(() => Math.random()) + + // Create a document first + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: `test-embed-1-${Date.now()}`, + chunkIndex: 0, + chunkText: 'test text for embedding', + embeddingVersion: 'v1-test', + }, + }) + + await expect( + adapter.storeEmbedding(payload, 'default', String(doc.id), embedding), + ).resolves.not.toThrow() + }) + + test('persists embedding without error (Float32Array)', async () => { + const embedding = new Float32Array( + Array(DIMS) + .fill(0) + .map(() 
=> Math.random()), + ) + + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: `test-embed-2-${Date.now()}`, + chunkIndex: 0, + chunkText: 'test text for Float32Array', + embeddingVersion: 'v1-test', + }, + }) + + await expect( + adapter.storeEmbedding(payload, 'default', String(doc.id), embedding), + ).resolves.not.toThrow() + }) + }) + + describe('search()', () => { + let targetEmbedding: number[] + let similarDocId: string + + beforeAll(async () => { + // Create test documents with known embeddings + targetEmbedding = Array(DIMS).fill(0.5) + const similarEmbedding = Array(DIMS) + .fill(0.5) + .map((v) => v + Math.random() * 0.05) + + // Create and embed a document + const similarDoc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: `test-search-similar-${Date.now()}`, + chunkIndex: 0, + chunkText: 'similar document for search test', + embeddingVersion: 'v1-test', + }, + }) + similarDocId = String(similarDoc.id) + await adapter.storeEmbedding(payload, 'default', similarDocId, similarEmbedding) + }) + + test('returns an array of results', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default') + + expect(Array.isArray(results)).toBe(true) + }) + + test('results contain required fields', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default') + + for (const result of results) { + expect(result).toHaveProperty('id') + expect(result).toHaveProperty('score') + expect(result).toHaveProperty('sourceCollection') + expect(result).toHaveProperty('docId') + expect(result).toHaveProperty('chunkIndex') + expect(result).toHaveProperty('chunkText') + expect(result).toHaveProperty('embeddingVersion') + + expect(typeof result.id).toBe('string') + expect(typeof result.score).toBe('number') + expect(typeof result.sourceCollection).toBe('string') + expect(typeof 
result.docId).toBe('string') + expect(typeof result.chunkIndex).toBe('number') + expect(typeof result.chunkText).toBe('string') + expect(typeof result.embeddingVersion).toBe('string') + } + }) + + test('results are ordered by score (highest first)', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 10) + + for (let i = 1; i < results.length; i++) { + expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score) + } + }) + + test('respects limit parameter', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 1) + + expect(results.length).toBeLessThanOrEqual(1) + }) + }) +}) diff --git a/adapters/pg/dev/specs/constants.ts b/adapters/pg/dev/specs/constants.ts new file mode 100644 index 0000000..2b186bd --- /dev/null +++ b/adapters/pg/dev/specs/constants.ts @@ -0,0 +1,38 @@ +import type { Config, SanitizedConfig } from 'payload' +import { buildConfig } from 'payload' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { createPostgresVectorIntegration } from '../../src/index.js' +import payloadcmsVectorize from 'payloadcms-vectorize' + +import { DIMS } from '@shared-test/constants' +export { DIMS } from '@shared-test/constants' +export { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' + +const integrationResult = createPostgresVectorIntegration({ + default: { + dims: DIMS, + ivfflatLists: 1, + }, +}) + +export const integration = integrationResult + +/** Create the plugin with the pg adapter pre-configured */ +export const plugin = ( + options: Omit[0], 'dbAdapter'>, +): ReturnType => { + return payloadcmsVectorize({ + ...options, + dbAdapter: integrationResult.adapter, + }) +} + +export async function buildDummyConfig(cfg: Partial): Promise { + const built = await buildConfig({ + secret: process.env.PAYLOAD_SECRET || 'test-secret', + collections: [], + editor: lexicalEditor(), + ...cfg, + }) + return built +} diff 
--git a/adapters/pg/dev/specs/extensionFields.spec.ts b/adapters/pg/dev/specs/extensionFields.spec.ts new file mode 100644 index 0000000..43ebf2c --- /dev/null +++ b/adapters/pg/dev/specs/extensionFields.spec.ts @@ -0,0 +1,154 @@ +import type { Payload } from 'payload' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { buildDummyConfig, integration, plugin } from './constants.js' +import { createTestDb, destroyPayload, waitForVectorizationJobs } from './utils.js' +import { getPayload } from 'payload' +import { PostgresPayload } from '../../src/types.js' +import { chunkText, chunkRichTextSimple as chunkRichText } from '@shared-test/helpers/chunkers' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' +import { DIMS } from './constants.js' + +describe('Extension fields integration tests', () => { + let payload: Payload + const dbName = 'extension_fields_test' + + beforeAll(async () => { + await createTestDb({ dbName }) + + const config = await buildDummyConfig({ + jobs: { + tasks: [], + autoRun: [ + { + cron: '*/5 * * * * *', // Run every 5 seconds + limit: 10, + }, + ], + }, + collections: [ + { + slug: 'posts', + fields: [ + { name: 'title', type: 'text' }, + { name: 'content', type: 'richText' }, + { name: 'category', type: 'text' }, + { name: 'priority', type: 'number' }, + ], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + plugin({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc, payload) => { + const chunks: Array<{ chunk: string; category?: string; priority?: number }> = + [] + // Process title + if (doc.title) { + const titleChunks = chunkText(doc.title) + chunks.push( + ...titleChunks.map((chunk) 
=> ({ + chunk, + category: doc.category || 'general', + priority: doc.priority || 0, + })), + ) + } + // Process content + if (doc.content) { + const contentChunks = await chunkRichText(doc.content, payload) + chunks.push( + ...contentChunks.map((chunk) => ({ + chunk, + category: doc.category || 'general', + priority: doc.priority || 0, + })), + ) + } + return chunks + }, + }, + }, + extensionFields: [ + { + name: 'category', + type: 'text', + admin: { + description: 'Category for filtering embeddings', + }, + }, + { + name: 'priority', + type: 'number', + admin: { + description: 'Priority level for the embedding', + }, + }, + ], + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + }) + + payload = await getPayload({ + config, + key: `extension-fields-test-${Date.now()}`, + cron: true, + }) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + test('extension fields are added to the embeddings table schema', async () => { + const db = (payload as PostgresPayload).db + const sql = ` + SELECT column_name, data_type, udt_name + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'default' + ORDER BY column_name + ` + + let rows: any[] = [] + if (db?.pool?.query) { + const res = await db.pool.query(sql) + rows = res?.rows || [] + } else if (db?.drizzle?.execute) { + const res = await db.drizzle.execute(sql) + rows = Array.isArray(res) ? 
res : res?.rows || [] + } + + const columnsByName = Object.fromEntries(rows.map((r: any) => [r.column_name, r])) + + // Check that reserved fields exist + expect(columnsByName.source_collection).toBeDefined() + expect(columnsByName.doc_id).toBeDefined() + expect(columnsByName.chunk_index).toBeDefined() + expect(columnsByName.chunk_text).toBeDefined() + expect(columnsByName.embedding_version).toBeDefined() + expect(columnsByName.embedding).toBeDefined() + + // Check that extension fields exist + expect(columnsByName.category).toBeDefined() + expect(columnsByName.category.data_type).toBe('character varying') + expect(columnsByName.priority).toBeDefined() + expect(['numeric', 'integer']).toContain(columnsByName.priority.data_type) + }) +}) diff --git a/adapters/pg/dev/specs/integration.spec.ts b/adapters/pg/dev/specs/integration.spec.ts new file mode 100644 index 0000000..0090c3f --- /dev/null +++ b/adapters/pg/dev/specs/integration.spec.ts @@ -0,0 +1,124 @@ +/** + * Postgres-specific integration tests. + * + * These tests verify Postgres-specific functionality like + * vector column creation, schema modifications, etc. 
+ */ +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import type { Payload, SanitizedConfig } from 'payload' +import { buildConfig, getPayload } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { Client } from 'pg' +import { createPostgresVectorIntegration } from '../../src/index.js' +import { destroyPayload } from './utils.js' +import payloadcmsVectorize from 'payloadcms-vectorize' + +const DIMS = 8 +const embeddingsCollection = 'default' + +// Helper to create test database +async function createTestDb(name: string) { + const adminUri = + process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' + const client = new Client({ connectionString: adminUri }) + await client.connect() + + const exists = await client.query('SELECT 1 FROM pg_database WHERE datname = $1', [name]) + if (exists.rowCount === 0) { + await client.query(`CREATE DATABASE ${name}`) + } + await client.end() +} + +describe('Postgres-specific integration tests', () => { + let payload: Payload + let config: SanitizedConfig + const dbName = `pg_int_test_${Date.now()}` + + beforeAll(async () => { + await createTestDb(dbName) + + const { afterSchemaInitHook, adapter } = createPostgresVectorIntegration({ + default: { + dims: DIMS, + ivfflatLists: 1, + }, + }) + config = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: adapter, + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: 'test-v1', + 
queryFn: async () => Array(DIMS).fill(0.5), + realTimeIngestionFn: async (texts) => texts.map(() => Array(DIMS).fill(0.5)), + }, + }, + }, + }), + ], + }) + + payload = await getPayload({ + config, + key: `pg-int-test-${Date.now()}`, + cron: false, + }) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + test('adds embeddings collection with vector column', async () => { + // Check schema for embeddings collection + const collections = payload.collections + expect(collections).toHaveProperty(embeddingsCollection) + + // Query Postgres information_schema to verify vector column exists + const db = (payload as any).db + const sql = ` + SELECT column_name, udt_name, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = '${embeddingsCollection}' + ` + + let rows: any[] = [] + if (db?.pool?.query) { + const res = await db.pool.query(sql) + rows = res?.rows || [] + } else if (db?.drizzle?.execute) { + const res = await db.drizzle.execute(sql) + rows = Array.isArray(res) ? 
res : res?.rows || [] + } + + const columnsByName = Object.fromEntries(rows.map((r: any) => [r.column_name, r])) + + expect(columnsByName.embedding).toBeDefined() + // pgvector columns report udt_name = 'vector' + expect(columnsByName.embedding.udt_name).toBe('vector') + }) +}) diff --git a/dev/specs/migrationCli.spec.ts b/adapters/pg/dev/specs/migrationCli.spec.ts similarity index 94% rename from dev/specs/migrationCli.spec.ts rename to adapters/pg/dev/specs/migrationCli.spec.ts index 015706f..221f84f 100644 --- a/dev/specs/migrationCli.spec.ts +++ b/adapters/pg/dev/specs/migrationCli.spec.ts @@ -2,12 +2,15 @@ import type { Payload, SanitizedConfig } from 'payload' import { beforeAll, describe, expect, test, afterAll, vi } from 'vitest' import { postgresAdapter } from '@payloadcms/db-postgres' import { buildConfig, getPayload } from 'payload' -import { createVectorizeIntegration } from 'payloadcms-vectorize' -import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '../helpers/embed.js' -import { createTestDb } from './utils.js' +import { createPostgresVectorIntegration } from '../../src/index.js' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' +import { createTestDb, destroyPayload } from './utils.js' import { DIMS } from './constants.js' + +const createVectorizeIntegration = createPostgresVectorIntegration +import payloadcmsVectorize from 'payloadcms-vectorize' import type { PostgresPayload } from '../../src/types.js' -import { script as vectorizeMigrateScript } from '../../src/bin/vectorize-migrate.js' +import { script as vectorizeMigrateScript } from '../../src/bin-vectorize-migrate.js' import { readdirSync, statSync, existsSync, readFileSync, writeFileSync, rmSync } from 'fs' import { join, resolve } from 'path' @@ -42,7 +45,8 @@ describe('Migration CLI integration tests', () => { }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: 
integration.adapter, knowledgePools: { default: { collections: { @@ -73,15 +77,21 @@ describe('Migration CLI integration tests', () => { payload = await getPayload({ config, cron: true }) }) - test('VectorizedPayload has _staticConfigs', async () => { + afterAll(async () => { + await destroyPayload(payload) + }) + + test('VectorizedPayload has _staticConfigs via getDbAdapterCustom', async () => { const { getVectorizedPayload } = await import('payloadcms-vectorize') const vectorizedPayload = getVectorizedPayload(payload) expect(vectorizedPayload).toBeTruthy() - expect(vectorizedPayload?._staticConfigs).toBeDefined() - expect(vectorizedPayload?._staticConfigs.default).toBeDefined() - expect(vectorizedPayload?._staticConfigs.default.dims).toBe(DIMS) - expect(vectorizedPayload?._staticConfigs.default.ivfflatLists).toBe(10) + const adapterCustom = vectorizedPayload?.getDbAdapterCustom() + expect(adapterCustom).toBeDefined() + expect(adapterCustom?._staticConfigs).toBeDefined() + expect(adapterCustom?._staticConfigs.default).toBeDefined() + expect(adapterCustom?._staticConfigs.default.dims).toBe(DIMS) + expect(adapterCustom?._staticConfigs.default.ivfflatLists).toBe(10) }) }) @@ -117,7 +127,8 @@ describe('Migration CLI integration tests', () => { push: false, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { default: { collections: { @@ -152,6 +163,10 @@ describe('Migration CLI integration tests', () => { }) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('vector search fails with descriptive error when embedding column missing', async () => { const { getVectorizedPayload } = await import('payloadcms-vectorize') const vectorizedPayload = getVectorizedPayload(payload) @@ -200,7 +215,7 @@ describe('Migration CLI integration tests', () => { }) afterAll(async () => { - // Cleanup: remove test migrations directory + await destroyPayload(autoPayload) if 
(existsSync(migrationsDir)) { rmSync(migrationsDir, { recursive: true, force: true }) } @@ -232,7 +247,8 @@ describe('Migration CLI integration tests', () => { }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { default: { collections: { @@ -321,7 +337,8 @@ describe('Migration CLI integration tests', () => { }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { default: { collections: { @@ -420,6 +437,7 @@ describe('Migration CLI integration tests', () => { }) afterAll(async () => { + await destroyPayload(dimsPayload) if (existsSync(migrationsDir)) { rmSync(migrationsDir, { recursive: true, force: true }) } @@ -450,7 +468,8 @@ describe('Migration CLI integration tests', () => { }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { default: { collections: { @@ -540,7 +559,8 @@ describe('Migration CLI integration tests', () => { }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { default: { collections: { @@ -718,6 +738,7 @@ describe('Migration CLI integration tests', () => { }) afterAll(async () => { + await destroyPayload(multiPayload) if (existsSync(migrationsDir)) { rmSync(migrationsDir, { recursive: true, force: true }) } @@ -757,7 +778,8 @@ describe('Migration CLI integration tests', () => { }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: integration.adapter, knowledgePools: { default: { collections: { diff --git a/dev/specs/multipools.spec.ts b/adapters/pg/dev/specs/multipools.spec.ts similarity index 64% rename from dev/specs/multipools.spec.ts rename to adapters/pg/dev/specs/multipools.spec.ts index 9695953..04bef3c 100644 --- a/dev/specs/multipools.spec.ts +++ b/adapters/pg/dev/specs/multipools.spec.ts @@ -1,14 
+1,17 @@ import type { Payload, SanitizedConfig } from 'payload' import { buildConfig } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' -import { createVectorizeIntegration } from 'payloadcms-vectorize' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { lexicalEditor } from '@payloadcms/richtext-lexical' import { postgresAdapter } from '@payloadcms/db-postgres' -import { createTestDb } from './utils.js' +import { createTestDb, destroyPayload } from './utils.js' import { getPayload } from 'payload' +import { createPostgresVectorIntegration } from '../../src/index.js' +import payloadcmsVectorize from 'payloadcms-vectorize' import type { PostgresPayload } from '../../src/types.js' +const createVectorizeIntegration = createPostgresVectorIntegration + const DIMS_POOL1 = 8 const DIMS_POOL2 = 16 @@ -31,29 +34,6 @@ describe('Multiple knowledge pools', () => { }, }) - const multiPoolPluginOptions = { - knowledgePools: { - pool1: { - collections: {}, - embeddingConfig: { - version: 'test-pool1', - queryFn: async () => new Array(DIMS_POOL1).fill(0), - realTimeIngestionFn: async (texts: string[]) => - texts.map(() => new Array(DIMS_POOL1).fill(0)), - }, - }, - pool2: { - collections: {}, - embeddingConfig: { - version: 'test-pool2', - queryFn: async () => new Array(DIMS_POOL2).fill(0), - realTimeIngestionFn: async (texts: string[]) => - texts.map(() => new Array(DIMS_POOL2).fill(0)), - }, - }, - }, - } - config = await buildConfig({ secret: process.env.PAYLOAD_SECRET || 'test-secret', collections: [], @@ -65,7 +45,31 @@ describe('Multiple knowledge pools', () => { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), - plugins: [multiPoolIntegration.payloadcmsVectorize(multiPoolPluginOptions)], + plugins: [ + payloadcmsVectorize({ + dbAdapter: multiPoolIntegration.adapter, + knowledgePools: { + pool1: { + collections: {}, + embeddingConfig: { + version: 'test-pool1', + queryFn: async () => new 
Array(DIMS_POOL1).fill(0), + realTimeIngestionFn: async (texts: string[]) => + texts.map(() => new Array(DIMS_POOL1).fill(0)), + }, + }, + pool2: { + collections: {}, + embeddingConfig: { + version: 'test-pool2', + queryFn: async () => new Array(DIMS_POOL2).fill(0), + realTimeIngestionFn: async (texts: string[]) => + texts.map(() => new Array(DIMS_POOL2).fill(0)), + }, + }, + }, + }), + ], }) payload = await getPayload({ @@ -75,6 +79,10 @@ describe('Multiple knowledge pools', () => { }) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('creates two embeddings collections with vector columns', async () => { const collections = payload.collections expect(collections).toHaveProperty('pool1') diff --git a/dev/specs/schemaName.spec.ts b/adapters/pg/dev/specs/schemaName.spec.ts similarity index 82% rename from dev/specs/schemaName.spec.ts rename to adapters/pg/dev/specs/schemaName.spec.ts index d038d2d..438a634 100644 --- a/dev/specs/schemaName.spec.ts +++ b/adapters/pg/dev/specs/schemaName.spec.ts @@ -1,20 +1,20 @@ import type { Payload } from 'payload' import { postgresAdapter } from '@payloadcms/db-postgres' -import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' import { Client } from 'pg' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import type { PostgresPayload } from '../../src/types.js' import { buildDummyConfig, DIMS, integration, plugin } from './constants.js' import { createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' import { getPayload } from 'payload' -import { createVectorSearchHandlers } from '../../src/endpoints/vectorSearch.js' -import type { KnowledgePoolDynamicConfig } from 'payloadcms-vectorize' +import { getVectorizedPayload } from 'payloadcms-vectorize' const 
CUSTOM_SCHEMA = 'custom' describe('Custom schemaName support', () => { @@ -95,6 +95,10 @@ describe('Custom schemaName support', () => { }) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('embeddings table is created in custom schema', async () => { const db = (payload as PostgresPayload).db const tablesRes = await db.pool?.query( @@ -173,38 +177,21 @@ describe('Custom schemaName support', () => { // Wait for vectorization jobs to complete await waitForVectorizationJobs(payload) - // Perform vector search using the search handler - const knowledgePools: Record = { - default: { - collections: {}, - embeddingConfig: { - version: testEmbeddingVersion, - queryFn: makeDummyEmbedQuery(DIMS), - realTimeIngestionFn: makeDummyEmbedDocs(DIMS), - }, - }, - } - const searchHandler = createVectorSearchHandlers(knowledgePools).requestHandler + // Perform vector search using VectorizedPayload API + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() - const mockRequest = { - json: async () => ({ - query: 'Test Post Title', - knowledgePool: 'default', - }), - payload, - } as any - - const response = await searchHandler(mockRequest) - const json = await response.json() + const results = await vectorizedPayload!.search({ + query: 'Test Post Title', + knowledgePool: 'default', + }) // Verify search works and returns results from custom schema - expect(response.status).toBe(200) - expect(json).toHaveProperty('results') - expect(Array.isArray(json.results)).toBe(true) - expect(json.results.length).toBeGreaterThan(0) + expect(Array.isArray(results)).toBe(true) + expect(results.length).toBeGreaterThan(0) // Verify the results match what we created - expect(json.results).toEqual( + expect(results).toEqual( expect.arrayContaining([ expect.objectContaining({ sourceCollection: 'posts', diff --git a/adapters/pg/dev/specs/utils.ts b/adapters/pg/dev/specs/utils.ts new file mode 100644 index 0000000..d7b465b --- /dev/null +++ 
b/adapters/pg/dev/specs/utils.ts @@ -0,0 +1,16 @@ +import { Client } from 'pg' + +export { waitForVectorizationJobs, destroyPayload } from '@shared-test/utils' + +export const createTestDb = async ({ dbName }: { dbName: string }) => { + const adminUri = + process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' + const client = new Client({ connectionString: adminUri }) + await client.connect() + + const exists = await client.query('SELECT 1 FROM pg_database WHERE datname = $1', [dbName]) + if (exists.rowCount === 0) { + await client.query(`CREATE DATABASE ${dbName}`) + } + await client.end() +} diff --git a/adapters/pg/package.json b/adapters/pg/package.json new file mode 100644 index 0000000..aaa85fe --- /dev/null +++ b/adapters/pg/package.json @@ -0,0 +1,31 @@ +{ + "name": "@payloadcms-vectorize/pg", + "version": "0.6.0-beta.4", + "description": "PostgreSQL adapter for payloadcms-vectorize", + "license": "MIT", + "type": "module", + "files": [ + "dist" + ], + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "peerDependencies": { + "payload": ">=3.0.0 <4.0.0", + "payloadcms-vectorize": ">=0.6.0-beta <1.0.0", + "@payloadcms/db-postgres": ">=3.0.0 <4.0.0" + }, + "devDependencies": { + "payloadcms-vectorize": "workspace:*" + }, + "dependencies": { + "to-snake-case": "1.0.0" + }, + "engines": { + "node": "^18.20.2 || >=20.9.0", + "pnpm": "^9 || ^10" + }, + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts" + } +} diff --git a/src/bin/vectorize-migrate.ts b/adapters/pg/src/bin-vectorize-migrate.ts similarity index 85% rename from src/bin/vectorize-migrate.ts rename to adapters/pg/src/bin-vectorize-migrate.ts index 849f6d9..9d9f101 100644 --- a/src/bin/vectorize-migrate.ts +++ b/adapters/pg/src/bin-vectorize-migrate.ts @@ -4,8 +4,19 @@ import { readFileSync, writeFileSync, readdirSync, statSync, existsSync } from ' import { join, resolve } from 'path' import toSnakeCase from 'to-snake-case' 
-import { getVectorizedPayload } from '../types.js' -import type { KnowledgePoolStaticConfig } from '../types.js' +import { getVectorizedPayload } from 'payloadcms-vectorize' +import { KnowledgePoolsConfig } from './types.js' + +function listMigrationFiles(migrationsDir: string) { + return readdirSync(migrationsDir) + .filter((f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js') + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) +} /** * Get prior dims state from existing migrations @@ -26,15 +37,7 @@ function getPriorDimsFromMigrations( } // Find all migration files and read them in reverse order (newest first) - // Exclude index.ts/index.js as those are not migration files - const migrationFiles = readdirSync(migrationsDir) - .filter((f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js') - .map((f) => ({ - name: f, - path: join(migrationsDir, f), - mtime: statSync(join(migrationsDir, f)).mtime, - })) - .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + const migrationFiles = listMigrationFiles(migrationsDir) // Skip the most recent migration when determining prior dims, since it may contain // the pending dims change that we're trying to detect @@ -55,19 +58,24 @@ function getPriorDimsFromMigrations( for (const poolName of poolNames) { const tableName = toSnakeCase(poolName) - const dimsMatch = - upContent.match( - new RegExp(`ALTER\\s+TABLE[^;]*?"${tableName}"[^;]*?vector\\((\\d+)\\)`, 'is'), - ) || - upContent.match( - new RegExp( - `CREATE\\s+TABLE[^;]*?"${tableName}"[^;]*?embedding[^;]*?vector\\((\\d+)\\)`, - 'is', - ), - ) || - upContent.match( - new RegExp(`"${tableName}"\\s*\\([^)]*embedding[^)]*vector\\((\\d+)\\)`, 'is'), - ) + const pattern1 = new RegExp( + `ALTER\\s+TABLE[^;]*?"${tableName}"[^;]*?vector\\((\\d+)\\)`, + 'is', + ) + const pattern2 = new RegExp( + 
`CREATE\\s+TABLE[^;]*?"${tableName}"[^;]*?embedding[^;]*?vector\\((\\d+)\\)`, + 'is', + ) + const pattern3 = new RegExp( + `"${tableName}"\\s*\\([^)]*embedding[^)]*vector\\((\\d+)\\)`, + 'is', + ) + + const match1 = upContent.match(pattern1) + const match2 = upContent.match(pattern2) + const match3 = upContent.match(pattern3) + + const dimsMatch = match1 || match2 || match3 if (dimsMatch && !state.get(poolName)) { const dims = parseInt(dimsMatch[1], 10) @@ -93,10 +101,10 @@ function generateDimsChangeTruncateCode( newDims: number, ): string { return ` // payloadcms-vectorize: WARNING - Changing dims from ${oldDims} to ${newDims} is DESTRUCTIVE - // All existing embeddings will be deleted. You must re-embed all documents after this migration. - // Truncate table (destructive - all embeddings are lost) - // Use CASCADE to handle foreign key constraints - await db.execute(sql.raw(\`TRUNCATE TABLE "${schemaName}"."${tableName}" CASCADE\`));` + // All existing embeddings will be deleted. You must re-embed all documents after this migration. + // Truncate table (destructive - all embeddings are lost) + // Use CASCADE to handle foreign key constraints + await db.execute(sql.raw(\`TRUNCATE TABLE "${schemaName}"."${tableName}" CASCADE\`));` } /** @@ -108,9 +116,9 @@ function generateDimsChangeDownCode( oldDims: number, ): string { return ` // payloadcms-vectorize: Revert column type to old dimensions - // WARNING: Data was truncated during up migration and cannot be restored. - // You will need to re-embed all documents after rolling back. - await db.execute(sql.raw(\`ALTER TABLE "${schemaName}"."${tableName}" ALTER COLUMN embedding TYPE vector(${oldDims})\`));` + // WARNING: Data was truncated during up migration and cannot be restored. + // You will need to re-embed all documents after rolling back. 
+ await db.execute(sql.raw(\`ALTER TABLE "${schemaName}"."${tableName}" ALTER COLUMN embedding TYPE vector(${oldDims})\`));` } /** @@ -219,7 +227,9 @@ export const script = async (config: SanitizedConfig): Promise => { ) } - const staticConfigs = vectorizedPayload._staticConfigs + const staticConfigs = ( + vectorizedPayload.getDbAdapterCustom() as { _staticConfigs: KnowledgePoolsConfig } + )._staticConfigs if (!staticConfigs || Object.keys(staticConfigs).length === 0) { throw new Error('[payloadcms-vectorize] No static configs found') } @@ -243,7 +253,7 @@ export const script = async (config: SanitizedConfig): Promise => { }> = [] for (const poolName of poolNames) { - const currentConfig = staticConfigs[poolName] as KnowledgePoolStaticConfig + const currentConfig = staticConfigs[poolName] const priorDimsValue = priorDims.get(poolName) const currentDims = currentConfig.dims @@ -282,14 +292,7 @@ export const script = async (config: SanitizedConfig): Promise => { ) } - const migrationFiles = readdirSync(migrationsDir) - .filter((f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js') - .map((f) => ({ - name: f, - path: join(migrationsDir, f), - mtime: statSync(join(migrationsDir, f)).mtime, - })) - .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + const migrationFiles = listMigrationFiles(migrationsDir) if (migrationFiles.length === 0) { throw new Error( diff --git a/src/drizzle/tables.ts b/adapters/pg/src/drizzle.ts similarity index 90% rename from src/drizzle/tables.ts rename to adapters/pg/src/drizzle.ts index 70d4a10..b0cc761 100644 --- a/src/drizzle/tables.ts +++ b/adapters/pg/src/drizzle.ts @@ -1,4 +1,4 @@ -import type { KnowledgePoolName } from '../types.js' +import type { KnowledgePoolName } from 'payloadcms-vectorize' import type { Table } from '@payloadcms/db-postgres/drizzle' // Extend Table to allow dynamic column access (for extension fields) diff --git a/adapters/pg/src/embed.ts b/adapters/pg/src/embed.ts new file 
mode 100644 index 0000000..4c7505b --- /dev/null +++ b/adapters/pg/src/embed.ts @@ -0,0 +1,35 @@ +import { Payload } from 'payload' +import { isPostgresPayload } from './types.js' +import toSnakeCase from 'to-snake-case' + +export default async ( + payload: Payload, + poolName: string, + _sourceCollection: string, + _sourceDocId: string, + id: string, + embedding: number[] | Float32Array, +) => { + if (!isPostgresPayload(payload)) { + throw new Error('[@payloadcms-vectorize/pg] Only works with Postgres') + } + // After the type guard, payload is narrowed to PostgresPayload + const runSQL = async (sql: string, params?: unknown[]) => { + if (payload.db.pool?.query) return payload.db.pool.query(sql, params) + if (payload.db.drizzle?.execute) return payload.db.drizzle.execute(sql) + throw new Error('[@payloadcms-vectorize/pg] Failed to persist vector column') + } + const pgVectorLiteral = `[${Array.from(embedding).join(',')}]` + const schemaName = payload.db.schemaName || 'public' + // Drizzle converts camelCase collection slugs to snake_case table names + const sqlStatement = `UPDATE "${schemaName}"."${toSnakeCase(poolName)}" SET embedding = $1 WHERE id = $2` + try { + await runSQL(sqlStatement, [pgVectorLiteral, id]) + } catch (e) { + const errorMessage = e instanceof Error ? 
e.message : String(e) + payload.logger.error( + `[@payloadcms-vectorize/pg] Failed to persist vector column: ${errorMessage}`, + ) + throw new Error(`[@payloadcms-vectorize/pg] Failed to persist vector column: ${errorMessage}`) + } +} diff --git a/adapters/pg/src/index.ts b/adapters/pg/src/index.ts new file mode 100644 index 0000000..8b4242a --- /dev/null +++ b/adapters/pg/src/index.ts @@ -0,0 +1,93 @@ +import { KnowledgePoolsConfig } from './types.js' +import type { PostgresAdapterArgs } from '@payloadcms/db-postgres' +import { clearEmbeddingsTables, registerEmbeddingsTable } from './drizzle.js' +import { customType, index } from '@payloadcms/db-postgres/drizzle/pg-core' +import toSnakeCase from 'to-snake-case' +import type { DbAdapter } from 'payloadcms-vectorize' +import { fileURLToPath } from 'url' +import { dirname, resolve } from 'path' +import embed from './embed.js' +import search from './search.js' + +export type { KnowledgePoolsConfig as KnowledgePoolConfig } + +export const createPostgresVectorIntegration = ( + config: KnowledgePoolsConfig, +): { + afterSchemaInitHook: Required['afterSchemaInit'][number] + adapter: DbAdapter +} => { + // Augment the generated schema so push/migrations are aware of our custom columns + const afterSchemaInitHook: Required['afterSchemaInit'][number] = async ({ + schema, + extendTable, + }) => { + // Ensure registry reflects the latest schema + clearEmbeddingsTables() + + // Extend schema for each knowledge pool + for (const poolName in config) { + const staticConfig = config[poolName] + const dims = staticConfig.dims + + const vectorType = customType({ + dataType() { + return `vector(${dims})` + }, + }) + + // Drizzle converts camelCase collection slugs to snake_case table names + const tableName = toSnakeCase(poolName) + const table = schema?.tables?.[tableName] + if (!table) { + throw new Error( + `[@payloadcms-vectorize/pg] Embeddings table "${poolName}" (table: "${tableName}") not found during schema initialization. 
Ensure the collection has been registered.`, + ) + } + + if (typeof extendTable === 'function') { + extendTable({ + table, + columns: { + embedding: vectorType('embedding'), + }, + extraConfig: (cols) => ({ + embeddingIvfflatIndex: index(`${tableName}_embedding_ivfflat`) + .using('ivfflat', cols.embedding.op('vector_cosine_ops')) + .with({ lists: staticConfig.ivfflatLists }), + }), + }) + } + + registerEmbeddingsTable(poolName, table) + } + + return schema + } + + const adapter: DbAdapter = { + getConfigExtension: () => { + // Register bin script for migration helper + const __filename = fileURLToPath(import.meta.url) + const __dirname = dirname(__filename) + const binScriptPath = resolve(__dirname, 'bin-vectorize-migrate.js') + + return { + bins: [ + { + // Register bin script for migration helper + key: 'vectorize:migrate', + scriptPath: binScriptPath, + }, + ], + custom: { + _staticConfigs: config, + }, + } + }, + search, + storeEmbedding: embed, + } + + return { afterSchemaInitHook, adapter } +} diff --git a/adapters/pg/src/search.ts b/adapters/pg/src/search.ts new file mode 100644 index 0000000..f52d4e5 --- /dev/null +++ b/adapters/pg/src/search.ts @@ -0,0 +1,325 @@ +import { + sql, + cosineDistance, + inArray, + eq, + and, + or, + not, + like, + gt, + gte, + lt, + lte, + ne, + isNull, + isNotNull, +} from '@payloadcms/db-postgres/drizzle' +import { BasePayload, Where, SanitizedCollectionConfig, FlattenedField } from 'payload' +import { KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' +import toSnakeCase from 'to-snake-case' +import { getEmbeddingsTable } from './drizzle.js' + +export default async ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit: number = 10, + where?: Where, +): Promise> => { + const isPostgres = payload.db?.pool?.query || payload.db?.drizzle + + if (!isPostgres) { + throw new Error('Only works with Postgres') + } + + // In PayloadCMS, payload.db IS the adapter, and drizzle is at 
payload.db.drizzle + const adapter = payload.db + if (!adapter) { + throw new Error('Drizzle adapter not found') + } + + // Get drizzle instance + const drizzle = adapter.drizzle + if (!drizzle) { + throw new Error('Drizzle instance not found in adapter') + } + + // Get collection config and table name + const collectionConfig = payload.collections[poolName]?.config + if (!collectionConfig) { + throw new Error(`Collection ${poolName} not found`) + } + + const table = getEmbeddingsTable(poolName) + if (!table) { + throw new Error( + `[payloadcms-vectorize] Embeddings table for knowledge pool "${poolName}" not registered. Ensure the plugin's afterSchemaInit hook ran and the pool exists.`, + ) + } + + // Use Drizzle's query builder with cosineDistance function + // cosineDistance returns distance, so we calculate score as 1 - distance + // The table from fullSchema should have columns as direct properties + const embeddingColumn = table.embedding + if (!embeddingColumn) { + throw new Error( + `Embedding column not found in table for pool "${poolName}". Available properties: ${Object.keys(table).join(', ')}`, + ) + } + + // Convert WHERE clause to Drizzle conditions + let drizzleWhere: any = undefined + if (where) { + drizzleWhere = convertWhereToDrizzle(where, table, collectionConfig.flattenedFields) + if (drizzleWhere === null) { + // WHERE clause resulted in an empty condition (e.g., empty 'and' or 'or' array) + // This semantically means "match nothing", so return empty results + throw new Error( + `[payloadcms-vectorize] WHERE clause resulted in no valid conditions. This typically occurs when using empty 'and' or 'or' arrays, or when all field conditions reference non-existent columns.`, + ) + } + if (drizzleWhere === undefined) { + // WHERE clause could not be converted (invalid structure or unsupported operators) + throw new Error( + `[payloadcms-vectorize] WHERE clause could not be converted to Drizzle conditions. 
Please check that all field names exist and operators are supported.`, + ) + } + } + + // Build query using Drizzle's query builder + // Column names in the table are camelCase (docId, chunkText, etc.) + // but their database names are snake_case (doc_id, chunk_text, etc.) + // The table from fullSchema should have columns as direct properties + // Calculate score: 1 - cosineDistance (distance) + // Need to cast 1 to numeric to avoid "integer - vector" error + const distanceExpr = cosineDistance(embeddingColumn, queryEmbedding) + + // Build select object with score + const selectObj: Record = { + id: table.id, // ensure we select id explicitly + score: sql`1 - (${distanceExpr})`, + } + + // Add reserved + extension fields from collection config + for (const field of collectionConfig.fields ?? []) { + if (typeof field === 'object' && 'name' in field) { + const name = field.name as string + if (name in table) { + selectObj[name] = table[name] + } else if (toSnakeCase(name) in table) { + selectObj[name] = table[toSnakeCase(name)] + } + } + } + + let query: any = drizzle.select(selectObj).from(table) + + // Add WHERE clause if provided + if (drizzleWhere) { + query = query.where(drizzleWhere) + } + + // Order by cosine distance (ascending = most similar first) and limit + // Reuse the same distance expression for ordering + query = query.orderBy(distanceExpr).limit(limit) + + // Execute the query + const result = await query + + return mapRowsToResults(result, collectionConfig) +} + +/** + * Drizzle table — dynamically registered, so typed loosely. + * We use `any` here because Drizzle column types (SQLWrapper, Column) are + * not directly expressible for tables that are registered at runtime. + */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +type DrizzleTable = Record + +/** + * Convert Payload WHERE clause to Drizzle conditions. + * Returns a drizzle SQL condition, null (empty/no-op), or undefined (invalid). 
+ */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function convertWhereToDrizzle(where: Where, table: DrizzleTable, fields: FlattenedField[]): any { + if (!where || typeof where !== 'object') { + return undefined + } + + // Handle 'and' operator + if ('and' in where && Array.isArray(where.and)) { + const conditions = where.and + .map((condition) => convertWhereToDrizzle(condition, table, fields)) + .filter((c) => c !== undefined && c !== null) + if (conditions.length === 0) return null + if (conditions.length === 1) return conditions[0] + return and(...conditions) + } + + // Handle 'or' operator + if ('or' in where && Array.isArray(where.or)) { + const conditions = where.or + .map((condition) => convertWhereToDrizzle(condition, table, fields)) + .filter((c) => c !== undefined && c !== null) + if (conditions.length === 0) return null + if (conditions.length === 1) return conditions[0] + return or(...conditions) + } + + // Handle field conditions - collect all field conditions and combine with AND + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const fieldConditions: any[] = [] + for (const [fieldName, condition] of Object.entries(where)) { + if (fieldName === 'and' || fieldName === 'or') continue + + // Get the column from the table (try camelCase first, then snake_case) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + let column: any = undefined + if (fieldName in table) { + column = table[fieldName] + } else if (toSnakeCase(fieldName) in table) { + column = table[toSnakeCase(fieldName)] + } else if (table.columns) { + // Fallback to table.columns if it exists + if (fieldName in table.columns) { + column = table.columns[fieldName] + } else if (toSnakeCase(fieldName) in table.columns) { + column = table.columns[toSnakeCase(fieldName)] + } + } + + if (!column) { + // Field not found, skip (could be a nested field we don't support) + continue + } + + if (typeof condition !== 'object' || condition === null || 
Array.isArray(condition)) { + continue + } + + const cond = condition as Record + + // Handle equals + if ('equals' in cond) { + fieldConditions.push(eq(column, cond.equals)) + continue + } + + // Handle not_equals / notEquals + if ('not_equals' in cond || 'notEquals' in cond) { + fieldConditions.push(ne(column, cond.not_equals ?? cond.notEquals)) + continue + } + + // Handle in + if ('in' in cond && Array.isArray(cond.in)) { + fieldConditions.push(inArray(column, cond.in)) + continue + } + + // Handle not_in / notIn + if ('not_in' in cond || 'notIn' in cond) { + const values = cond.not_in ?? cond.notIn + if (Array.isArray(values)) { + fieldConditions.push(not(inArray(column, values))) + } + continue + } + + // Handle like + if ('like' in cond && typeof cond.like === 'string') { + fieldConditions.push(like(column, cond.like)) + continue + } + + // Handle contains + if ('contains' in cond && typeof cond.contains === 'string') { + fieldConditions.push(like(column, `%${cond.contains}%`)) + continue + } + + // Handle greater_than / greaterThan + if ('greater_than' in cond || 'greaterThan' in cond) { + fieldConditions.push(gt(column, cond.greater_than ?? cond.greaterThan)) + continue + } + + // Handle greater_than_equal / greaterThanEqual + if ('greater_than_equal' in cond || 'greaterThanEqual' in cond) { + fieldConditions.push(gte(column, cond.greater_than_equal ?? cond.greaterThanEqual)) + continue + } + + // Handle less_than / lessThan + if ('less_than' in cond || 'lessThan' in cond) { + fieldConditions.push(lt(column, cond.less_than ?? cond.lessThan)) + continue + } + + // Handle less_than_equal / lessThanEqual + if ('less_than_equal' in cond || 'lessThanEqual' in cond) { + fieldConditions.push(lte(column, cond.less_than_equal ?? cond.lessThanEqual)) + continue + } + + // Handle exists (null check) + if ('exists' in cond && typeof cond.exists === 'boolean') { + fieldConditions.push(cond.exists ? 
isNotNull(column) : isNull(column)) + continue + } + } + + // Combine all field conditions with AND + if (fieldConditions.length === 0) { + return undefined + } + if (fieldConditions.length === 1) { + return fieldConditions[0] + } + return and(...fieldConditions) +} + +function mapRowsToResults( + rows: Record[], + collectionConfig: SanitizedCollectionConfig, +): Array { + // Collect names of fields that are typed as number on the collection + const numberFields = new Set() + for (const field of collectionConfig.fields) { + if (typeof field === 'object' && 'name' in field && field.type === 'number') { + numberFields.add(field.name) + } + } + + return rows.map((row) => { + // Drizzle returns columns with the names we selected (camelCase) + // Handle both camelCase and snake_case for robustness + const rawDocId = row.docId ?? row.doc_id + const rawChunkIndex = row.chunkIndex ?? row.chunk_index + const rawScore = row.score + + const result = { + ...row, + id: String(row.id), + docId: String(rawDocId), + score: typeof rawScore === 'number' ? rawScore : parseFloat(String(rawScore)), + chunkIndex: + typeof rawChunkIndex === 'number' ? 
rawChunkIndex : parseInt(String(rawChunkIndex), 10), + } as VectorSearchResult + + // Ensure any number fields from the schema are numbers in the result + for (const fieldName of numberFields) { + const value = result[fieldName] + if (value != null && typeof value !== 'number') { + const parsed = parseFloat(String(value)) + if (!Number.isNaN(parsed)) { + result[fieldName] = parsed + } + } + } + + return result + }) +} diff --git a/adapters/pg/src/types.ts b/adapters/pg/src/types.ts new file mode 100644 index 0000000..337633b --- /dev/null +++ b/adapters/pg/src/types.ts @@ -0,0 +1,35 @@ +/** Configuration for a knowledge pool */ + +import { KnowledgePoolName } from 'payloadcms-vectorize' +import type { Payload } from 'payload' + +/** Note current limitation: needs a migration in order to change */ +export type KnowledgePoolsConfig = Record< + KnowledgePoolName, + { + /** Vector dimensions for pgvector column */ + dims: number + /** IVFFLAT lists parameter used when creating the index */ + ivfflatLists: number + } +> + +/** Shape of the Postgres-specific db properties we need */ +export interface PostgresDb { + pool?: { query: (sql: string, params?: unknown[]) => Promise } + drizzle?: Record & { execute?: (sql: string) => Promise } + schemaName?: string +} + +/** Payload instance with a Postgres database adapter */ +export type PostgresPayload = Payload & { + db: PostgresDb +} + +/** Type guard to check if Payload is using Postgres adapter */ +export function isPostgresPayload(payload: Payload): payload is PostgresPayload { + const db = payload.db as unknown as Record + const pool = db?.pool as Record | undefined + const drizzle = db?.drizzle as Record | undefined + return typeof pool?.query === 'function' || typeof drizzle?.execute === 'function' +} diff --git a/adapters/pg/vitest.config.js b/adapters/pg/vitest.config.js new file mode 100644 index 0000000..8baf261 --- /dev/null +++ b/adapters/pg/vitest.config.js @@ -0,0 +1,38 @@ +import path from 'path' +import { 
loadEnv } from 'payload/node' +import { fileURLToPath } from 'url' +import tsconfigPaths from 'vite-tsconfig-paths' +import { defineConfig } from 'vitest/config' + +const filename = fileURLToPath(import.meta.url) +const dirname = path.dirname(filename) + +export default defineConfig(() => { + loadEnv(path.resolve(dirname, '../../dev')) + + return { + plugins: [ + tsconfigPaths({ + ignoreConfigErrors: true, + }), + ], + resolve: { + alias: { + '@shared-test/utils': path.resolve(dirname, '../../dev/specs/utils.ts'), + '@shared-test/helpers/chunkers': path.resolve(dirname, '../../dev/helpers/chunkers.ts'), + '@shared-test/helpers/embed': path.resolve(dirname, '../../dev/helpers/embed.ts'), + '@shared-test/constants': path.resolve(dirname, '../../dev/specs/constants.ts'), + }, + }, + test: { + environment: 'node', + hookTimeout: 30_000, + testTimeout: 30_000, + include: ['dev/specs/**/*.spec.ts'], + exclude: ['**/e2e.spec.{ts,js}', '**/node_modules/**'], + // Run test files sequentially to avoid global state interference + // (embeddingsTables map and Payload instance caching) + fileParallelism: false, + }, + } +}) diff --git a/dev/app/(frontend)/page.tsx b/dev/app/(frontend)/page.tsx index 30049b9..43e5f58 100644 --- a/dev/app/(frontend)/page.tsx +++ b/dev/app/(frontend)/page.tsx @@ -6,7 +6,7 @@ interface SearchResult { id: string title: string content: any - similarity: number + score: number chunkText: string fieldPath: string } @@ -99,7 +99,7 @@ export default function SearchPage() {

{result.title}

- {Math.round(result.similarity * 100)}% match + {Math.round(result.score * 100)}% match
diff --git a/dev/helpers/chunkers.ts b/dev/helpers/chunkers.ts index 548b030..84ac878 100644 --- a/dev/helpers/chunkers.ts +++ b/dev/helpers/chunkers.ts @@ -139,28 +139,56 @@ export const createRichTextChunker = async (config: SanitizedConfig) => { // Rich text chunker specifically for SerializedEditorState export const chunkRichText = async ( richText: SerializedEditorState, - payload: Payload, + config: SanitizedConfig, ): Promise => { // Create chunker with payload config and chunk the rich text - const chunk = await createRichTextChunker(payload.config) + const chunk = await createRichTextChunker(config) return await chunk(richText) } +/** + * Simplified rich text chunker for adapter tests that don't need Lexical parsing. + * Extracts text content from SerializedEditorState by walking the node tree. + */ +export const chunkRichTextSimple = async ( + richText: SerializedEditorState, +): Promise => { + const root = richText?.root + if (!root || !root.children) { + return [] + } + + const chunks: string[] = [] + for (const node of (root as any).children) { + const text = extractText(node) + if (text) { + chunks.push(text) + } + } + return chunks +} + +function extractText(node: any): string { + if (!node) return '' + if (node.text) return node.text + if (node.children && Array.isArray(node.children)) { + return node.children.map(extractText).join(' ') + } + return '' +} + // Simple text chunker export const chunkText = (text: string): string[] => { const maxChars = 1000 const sentences = text.match(/[^.!?]+[.!?](?:\s+|$)|[^.!?]+$/g) || [] const chunks = [] let currentChunk = '' - let numSentences = 0 for (const sentence of sentences) { if (currentChunk.length + sentence.length >= maxChars) { chunks.push(currentChunk) currentChunk = sentence - numSentences = 0 } else { currentChunk += sentence - numSentences++ } } if (currentChunk) { diff --git a/dev/helpers/mockAdapter.ts b/dev/helpers/mockAdapter.ts new file mode 100644 index 0000000..3499457 --- /dev/null +++ 
b/dev/helpers/mockAdapter.ts @@ -0,0 +1,166 @@ +import type { DbAdapter, KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' +import type { Payload, BasePayload, Where, Config } from 'payload' + +type StoredEmbedding = { + poolName: string + id: string + embedding: number[] +} + +type MockAdapterOptions = { + /** Custom bins to return from getConfigExtension */ + bins?: { key: string; scriptPath: string }[] + /** Custom data to return from getConfigExtension */ + custom?: Record +} + +/** + * Cosine similarity between two vectors + */ +function cosineSimilarity(a: number[], b: number[]): number { + if (a.length !== b.length) { + throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`) + } + let dot = 0 + let normA = 0 + let normB = 0 + for (let i = 0; i < a.length; i++) { + dot += a[i] * b[i] + normA += a[i] * a[i] + normB += b[i] * b[i] + } + if (normA === 0 || normB === 0) return 0 + return dot / (Math.sqrt(normA) * Math.sqrt(normB)) +} + +/** + * Creates a mock DbAdapter for testing that stores embeddings in memory. + * This allows testing the core plugin without requiring a database. + */ +export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter => { + const { bins = [], custom = {} } = options + // In-memory storage for embeddings, keyed by `${poolName}:${id}` + const storage = new Map() + + return { + getConfigExtension: (_config: Config) => ({ + bins, + custom: { _isMockAdapter: true, ...custom }, + }), + + storeEmbedding: async ( + _payload: Payload, + poolName: KnowledgePoolName, + _sourceCollection: string, + _sourceDocId: string, + id: string, + embedding: number[] | Float32Array, + ): Promise => { + const key = `${poolName}:${id}` + const embeddingArray = Array.isArray(embedding) ? 
embedding : Array.from(embedding) + + storage.set(key, { + poolName, + id, + embedding: embeddingArray, + }) + }, + + search: async ( + payload: BasePayload, + queryEmbedding: number[], + poolName: string, + limit: number = 10, + where?: Where, + ): Promise => { + const results: Array = [] + + // Find all embeddings for this pool + for (const [_key, stored] of storage) { + if (stored.poolName !== poolName) continue + + // Calculate score using cosine similarity + const score = cosineSimilarity(queryEmbedding, stored.embedding) + + // Fetch the document from Payload to get metadata + try { + const doc = await payload.findByID({ + collection: poolName as any, + id: stored.id, + }) + + if (doc) { + // Apply basic where filtering if provided + if (where && !matchesWhere(doc, where)) { + continue + } + + // Extract all fields except internal ones, including extension fields + const { + id: _id, + createdAt: _createdAt, + updatedAt: _updatedAt, + embedding: _embedding, + ...docFields + } = doc as any + + results.push({ + id: stored.id, + score, + _score: score, // For sorting + ...docFields, // Includes sourceCollection, docId, chunkText, embeddingVersion, AND extension fields + }) + } + } catch (_e) { + // Document not found, skip + } + } + + // Sort by score descending and apply limit + return results + .sort((a, b) => b._score - a._score) + .slice(0, limit) + .map(({ _score, ...rest }) => rest) + }, + } +} + +/** + * Simple WHERE clause matcher for basic filtering + * Supports: equals, in, exists, and, or + */ +function matchesWhere(doc: Record, where: Where): boolean { + if (!where || Object.keys(where).length === 0) return true + + // Handle 'and' operator + if ('and' in where && Array.isArray(where.and)) { + return where.and.every((clause: Where) => matchesWhere(doc, clause)) + } + + // Handle 'or' operator + if ('or' in where && Array.isArray(where.or)) { + return where.or.some((clause: Where) => matchesWhere(doc, clause)) + } + + // Handle field-level conditions 
+ for (const [field, condition] of Object.entries(where)) { + if (field === 'and' || field === 'or') continue + + const value = doc[field] + + if (typeof condition === 'object' && condition !== null) { + if ('equals' in condition && value !== condition.equals) { + return false + } + if ('in' in condition && Array.isArray(condition.in) && !condition.in.includes(value)) { + return false + } + if ('exists' in condition) { + const exists = value !== undefined && value !== null + if (condition.exists !== exists) return false + } + } + } + + return true +} diff --git a/dev/payload.config.ts b/dev/payload.config.ts index f14e576..a39ae0d 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -2,7 +2,7 @@ import { postgresAdapter } from '@payloadcms/db-postgres' import { lexicalEditor } from '@payloadcms/richtext-lexical' import path from 'path' import { buildConfig } from 'payload' -import { createVectorizeIntegration } from 'payloadcms-vectorize' +import payloadcmsVectorize from 'payloadcms-vectorize' import { makeDummyEmbedDocs, testEmbeddingVersion, @@ -18,6 +18,7 @@ import { testEmailAdapter } from './helpers/testEmailAdapter.js' import { seed } from './seed.js' import { chunkRichText, chunkText } from './helpers/chunkers.js' import { createMockBulkEmbeddings } from './specs/utils.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const filename = fileURLToPath(import.meta.url) const dirname = path.dirname(filename) @@ -32,7 +33,6 @@ if (process.env.NODE_ENV === 'production') { } const dims = Number(process.env.DIMS) -const ivfflatLists = Number(process.env.IVFFLATLISTS) const embedDocs = process.env.USE_VOYAGE !== undefined ? voyageEmbedDocs : makeDummyEmbedDocs(dims) const embedQuery = process.env.USE_VOYAGE !== undefined ? 
voyageEmbedQuery : makeDummyEmbedQuery(dims) @@ -55,20 +55,7 @@ const ssl = } : undefined -const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ - default: { - dims, - ivfflatLists, // Rule of thumb: ivfflatLists = sqrt(total_number_of_vectors). Helps with working memory usage. - }, - bulkDefault: { - dims, - ivfflatLists, - }, - failingBulkDefault: { - dims, - ivfflatLists, - }, -}) +const adapter = createMockAdapter() const buildConfigWithPostgres = async () => { return buildConfig({ @@ -87,8 +74,6 @@ const buildConfigWithPostgres = async () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [afterSchemaInitHook], pool: { connectionString: process.env.DATABASE_URI || 'postgresql://postgres:password@localhost:5433/payload_test', @@ -130,6 +115,7 @@ const buildConfigWithPostgres = async () => { }, plugins: [ payloadcmsVectorize({ + dbAdapter: adapter, knowledgePools: { default: { collections: { @@ -143,7 +129,7 @@ const buildConfigWithPostgres = async () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -168,7 +154,7 @@ const buildConfigWithPostgres = async () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -193,7 +179,7 @@ const buildConfigWithPostgres = async () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks diff --git a/dev/specs/bulkEmbed/basic.spec.ts 
b/dev/specs/bulkEmbed/basic.spec.ts index bb0219f..174f3c1 100644 --- a/dev/specs/bulkEmbed/basic.spec.ts +++ b/dev/specs/bulkEmbed/basic.spec.ts @@ -1,5 +1,5 @@ import type { Payload, SanitizedConfig } from 'payload' -import { afterEach, beforeAll, beforeEach, describe, expect, test, vi } from 'vitest' +import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, test, vi } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../../src/collections/bulkEmbeddingInputMetadata.js' @@ -10,16 +10,19 @@ import { clearAllCollections, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_basic_${Date.now()}` const basePluginOptions = { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -54,6 +57,10 @@ describe('Bulk embed - basic tests', () => { vectorizedPayload = getVectorizedPayload(payload) }) + afterAll(async () => { + await destroyPayload(payload) + }) + beforeEach(async () => { await clearAllCollections(payload) }) diff --git a/dev/specs/bulkEmbed/batchLimit.spec.ts b/dev/specs/bulkEmbed/batchLimit.spec.ts new file mode 100644 index 0000000..d9170a8 --- /dev/null +++ b/dev/specs/bulkEmbed/batchLimit.spec.ts @@ -0,0 +1,117 @@ +import type { Payload } from 'payload' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from 
'../../../src/collections/bulkEmbeddingsBatches.js' +import { + BULK_QUEUE_NAMES, + DEFAULT_DIMS, + buildPayloadWithIntegration, + createMockBulkEmbeddings, + createTestDb, + destroyPayload, + waitForBulkJobs, +} from '../utils.js' +import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' + +const DIMS = DEFAULT_DIMS +const dbName = `bulk_batchlimit_${Date.now()}` + +describe('Bulk embed - batchLimit', () => { + let payload: Payload + let vectorizedPayload: VectorizedPayload | null = null + + beforeAll(async () => { + await createTestDb({ dbName }) + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + dbAdapter: createMockAdapter(), + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + batchLimit: 2, + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ + statusSequence: ['succeeded'], + }), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + key: `batchlimit-${Date.now()}`, + }) + payload = built.payload + vectorizedPayload = getVectorizedPayload(payload) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + test('batchLimit splits docs across continuation jobs and all get embedded', async () => { + // Create 5 posts with batchLimit: 2 + // Should result in 3 prepare jobs (2 docs, 2 docs, 1 doc) + for (let i = 0; i < 5; i++) { + await payload.create({ collection: 'posts', data: { title: `BatchLimit Post ${i}` } as any }) + } + + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) + + await waitForBulkJobs(payload, 30000) + + // All 5 posts should have embeddings + const embeds = await 
payload.find({ collection: 'default' }) + expect(embeds.totalDocs).toBe(5) + + // Run should be succeeded + const runDoc = ( + await (payload as any).find({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + where: { id: { equals: result!.runId } }, + }) + ).docs[0] + expect(runDoc.status).toBe('succeeded') + expect(runDoc.inputs).toBe(5) + }) + + test('batchLimit equal to doc count does not create extra continuations', async () => { + // Clean up from prior test: delete all posts and embeddings + await payload.delete({ collection: 'posts', where: {} }) + await payload.delete({ collection: 'default' as any, where: {} }) + + // Create exactly 2 posts (matching batchLimit: 2) + for (let i = 0; i < 2; i++) { + await payload.create({ + collection: 'posts', + data: { title: `Exact Post ${i}` } as any, + }) + } + + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) + + await waitForBulkJobs(payload, 20000) + + const embeds = await payload.find({ collection: 'default' }) + expect(embeds.totalDocs).toBe(2) + + const runDoc = ( + await (payload as any).find({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + where: { id: { equals: result!.runId } }, + }) + ).docs[0] + expect(runDoc.status).toBe('succeeded') + }) +}) diff --git a/dev/specs/bulkEmbed/canceledBatch.spec.ts b/dev/specs/bulkEmbed/canceledBatch.spec.ts index 2b99b88..ade1df2 100644 --- a/dev/specs/bulkEmbed/canceledBatch.spec.ts +++ b/dev/specs/bulkEmbed/canceledBatch.spec.ts @@ -1,16 +1,18 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 
'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_canceled_${Date.now()}` @@ -27,6 +29,7 @@ describe('Bulk embed - canceled batch', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -49,6 +52,10 @@ describe('Bulk embed - canceled batch', () => { vectorizedPayload = getVectorizedPayload(payload) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('canceled batch marks entire run as failed', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Cancel' } as any }) const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) diff --git a/dev/specs/bulkEmbed/concurrentRuns.spec.ts b/dev/specs/bulkEmbed/concurrentRuns.spec.ts index c03f212..27023bd 100644 --- a/dev/specs/bulkEmbed/concurrentRuns.spec.ts +++ b/dev/specs/bulkEmbed/concurrentRuns.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { getVectorizedPayload } from '../../../src/types.js' import { @@ -8,8 +8,10 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_concurrent_${Date.now()}` @@ -22,6 +24,7 @@ describe('Bulk embed - concurrent runs prevention', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ 
-45,6 +48,10 @@ describe('Bulk embed - concurrent runs prevention', () => { payload = built.payload }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('cannot start concurrent bulk embed runs for the same pool', async () => { const vectorizedPayload = getVectorizedPayload<'default'>(payload)! // Create a test post first diff --git a/dev/specs/bulkEmbed/extensionFields.spec.ts b/dev/specs/bulkEmbed/extensionFields.spec.ts index c564bea..865aed5 100644 --- a/dev/specs/bulkEmbed/extensionFields.spec.ts +++ b/dev/specs/bulkEmbed/extensionFields.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, @@ -7,11 +7,13 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_extfields_${Date.now()}` @@ -25,6 +27,7 @@ describe('Bulk embed - extension fields', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -53,6 +56,10 @@ describe('Bulk embed - extension fields', () => { vectorizedPayload = getVectorizedPayload(payload) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('extension fields are merged when writing embeddings', async () => { const post = await payload.create({ collection: 'posts', diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts 
b/dev/specs/bulkEmbed/failedBatch.spec.ts index 037fdb5..8313167 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../../src/collections/bulkEmbeddingInputMetadata.js' @@ -10,10 +10,12 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_failed_${Date.now()}` @@ -27,6 +29,7 @@ describe('Bulk embed - failed batch', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -49,6 +52,10 @@ describe('Bulk embed - failed batch', () => { vectorizedPayload = getVectorizedPayload(payload) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('failed batch marks entire run as failed', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Fail' } as any }) diff --git a/dev/specs/bulkEmbed/ingestionFailure.spec.ts b/dev/specs/bulkEmbed/ingestionFailure.spec.ts index e73d3b9..20fa09e 100644 --- a/dev/specs/bulkEmbed/ingestionFailure.spec.ts +++ b/dev/specs/bulkEmbed/ingestionFailure.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, 
describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { @@ -8,11 +8,13 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_ingestion_failure_${Date.now()}` @@ -24,6 +26,10 @@ describe('Bulk embed - ingestion validation failures', () => { await createTestDb({ dbName }) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('malformed chunk entry fails the bulk embedding run', async () => { // Use unique version to ensure this test only processes its own data const testVersion = `${testEmbeddingVersion}-ingestion-fail-${Date.now()}` @@ -31,6 +37,7 @@ describe('Bulk embed - ingestion validation failures', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { diff --git a/dev/specs/bulkEmbed/multipleBatches.spec.ts b/dev/specs/bulkEmbed/multipleBatches.spec.ts index aa2de86..b038e1d 100644 --- a/dev/specs/bulkEmbed/multipleBatches.spec.ts +++ b/dev/specs/bulkEmbed/multipleBatches.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { @@ -8,11 +8,13 @@ import { 
buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_multibatch_${Date.now()}` @@ -26,6 +28,7 @@ describe('Bulk embed - multiple batches', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -51,6 +54,10 @@ describe('Bulk embed - multiple batches', () => { vectorizedPayload = getVectorizedPayload(payload) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('multiple batches are created when flushing after N chunks', async () => { // Create 5 posts (should result in 3 batches: 2, 2, 1) for (let i = 0; i < 5; i++) { diff --git a/dev/specs/bulkEmbed/multipleChunks.spec.ts b/dev/specs/bulkEmbed/multipleChunks.spec.ts index 0f99eab..8d1cd64 100644 --- a/dev/specs/bulkEmbed/multipleChunks.spec.ts +++ b/dev/specs/bulkEmbed/multipleChunks.spec.ts @@ -1,16 +1,18 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_multichunk_${Date.now()}` @@ -23,6 +25,7 @@ describe('Bulk embed - multiple 
chunks with extension fields', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -51,6 +54,10 @@ describe('Bulk embed - multiple chunks with extension fields', () => { payload = built.payload }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('multiple chunks keep their respective extension fields', async () => { const post = await payload.create({ collection: 'posts', diff --git a/dev/specs/bulkEmbed/onError.spec.ts b/dev/specs/bulkEmbed/onError.spec.ts index f128009..c6da88c 100644 --- a/dev/specs/bulkEmbed/onError.spec.ts +++ b/dev/specs/bulkEmbed/onError.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, @@ -7,9 +7,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_onerror_${Date.now()}` @@ -29,6 +31,7 @@ describe('Bulk embed - onError callback', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -56,6 +59,10 @@ describe('Bulk embed - onError callback', () => { payload = built.payload }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('onError callback is called when batch fails', async () => { await payload.create({ collection: 'posts', data: { title: 'Error Test' } as any }) diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts 
index d3ef57e..499dba8 100644 --- a/dev/specs/bulkEmbed/partialFailure.spec.ts +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, @@ -7,11 +7,13 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_partial_failure_${Date.now()}` @@ -30,10 +32,15 @@ describe('Bulk embed - partial chunk failures', () => { await createTestDb({ dbName }) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('partial chunk failures are tracked and passed to onError', async () => { // Reset state onErrorCalled = false onErrorArgs = null + const adapter = createMockAdapter() // Use unique version to ensure this test only processes its own data const testVersion = `${testEmbeddingVersion}-partial-${Date.now()}` @@ -43,6 +50,7 @@ describe('Bulk embed - partial chunk failures', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: adapter, knowledgePools: { default: { collections: { diff --git a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts index 35e877f..7ff483a 100644 --- a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts +++ b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, 
beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, @@ -7,11 +7,13 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_partial_failure_nofail_${Date.now()}` @@ -30,6 +32,10 @@ describe('Bulk embed - partial failures', () => { await createTestDb({ dbName }) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('run with no partial failures does not call onError', async () => { // Reset state onErrorCalled = false @@ -41,6 +47,7 @@ describe('Bulk embed - partial failures', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { diff --git a/dev/specs/bulkEmbed/polling.spec.ts b/dev/specs/bulkEmbed/polling.spec.ts index eedd32a..50e283b 100644 --- a/dev/specs/bulkEmbed/polling.spec.ts +++ b/dev/specs/bulkEmbed/polling.spec.ts @@ -1,5 +1,5 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test, vi } from 'vitest' +import { afterAll, beforeAll, describe, expect, test, vi } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, @@ -7,11 +7,13 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { getVectorizedPayload } from 'payloadcms-vectorize' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { expectGoodResult } from 
'../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_polling_${Date.now()}` @@ -24,6 +26,7 @@ describe('Bulk embed - polling requeue', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -47,6 +50,10 @@ describe('Bulk embed - polling requeue', () => { payload = built.payload }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('polling requeues when non-terminal then succeeds', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Loop' } as any }) const queueSpy = vi.spyOn(payload.jobs, 'queue') @@ -57,12 +64,12 @@ describe('Bulk embed - polling requeue', () => { await waitForBulkJobs(payload, 15000) expect(queueSpy).toHaveBeenNthCalledWith( - 2, // 2nd call - expect.objectContaining({ task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding' }), + 3, // 3rd call - per-batch task queued from worker (1=coordinator, 2=worker) + expect.objectContaining({ task: 'payloadcms-vectorize:poll-or-complete-single-batch' }), ) expect(queueSpy).toHaveBeenNthCalledWith( - 3, // 3rd call - expect.objectContaining({ task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding' }), + 4, // 4th call - per-batch task re-queued after 'running' status + expect.objectContaining({ task: 'payloadcms-vectorize:poll-or-complete-single-batch' }), ) const embeds = await payload.find({ diff --git a/dev/specs/bulkEmbed/realtimeMode.spec.ts b/dev/specs/bulkEmbed/realtimeMode.spec.ts index 8e4c224..9f0ecd6 100644 --- a/dev/specs/bulkEmbed/realtimeMode.spec.ts +++ b/dev/specs/bulkEmbed/realtimeMode.spec.ts @@ -1,14 +1,16 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, 
createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForVectorizationJobs, } from '../utils.js' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_realtime_${Date.now()}` @@ -19,6 +21,7 @@ describe('Bulk embed - realtime mode', () => { beforeAll(async () => { realtimeOptions = { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -46,6 +49,10 @@ describe('Bulk embed - realtime mode', () => { payload = built.payload }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('realtime mode queues vectorize jobs when realTimeIngestionFn is provided', async () => { const post = await payload.create({ collection: 'posts', diff --git a/dev/specs/bulkEmbed/shouldEmbedFn.spec.ts b/dev/specs/bulkEmbed/shouldEmbedFn.spec.ts new file mode 100644 index 0000000..c7ee29b --- /dev/null +++ b/dev/specs/bulkEmbed/shouldEmbedFn.spec.ts @@ -0,0 +1,111 @@ +import type { Payload, SanitizedConfig } from 'payload' +import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, test, vi } from 'vitest' +import { + BULK_QUEUE_NAMES, + DEFAULT_DIMS, + buildPayloadWithIntegration, + clearAllCollections, + createMockBulkEmbeddings, + createTestDb, + destroyPayload, + waitForBulkJobs, +} from '../utils.js' +import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' + +const DIMS = DEFAULT_DIMS +const dbName = `bulk_should_embed_fn_${Date.now()}` + +const basePluginOptions = { + dbAdapter: createMockAdapter(), + knowledgePools: { + default: { + collections: { + posts: { + shouldEmbedFn: async (doc: any) => !doc.title?.startsWith('SKIP'), + toKnowledgePool: async (doc: any) => 
[{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, +} + +describe('Bulk embed - shouldEmbedFn', () => { + let payload: Payload + let config: SanitizedConfig + let vectorizedPayload: VectorizedPayload | null = null + + beforeAll(async () => { + await createTestDb({ dbName }) + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: basePluginOptions, + key: `bulk-should-embed-${Date.now()}`, + }) + payload = built.payload + config = built.config + vectorizedPayload = getVectorizedPayload(payload) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + beforeEach(async () => { + await clearAllCollections(payload) + }) + + afterEach(async () => { + vi.restoreAllMocks() + }) + + test('filtered-out document is not embedded during bulk run', async () => { + await payload.create({ collection: 'posts', data: { title: 'SKIP me' } as any }) + const embeddedPost = await payload.create({ + collection: 'posts', + data: { title: 'Embed me' } as any, + }) + + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) + + await waitForBulkJobs(payload) + + // Only the allowed post should have embeddings + const allEmbeddings = await payload.find({ + collection: 'default', + where: { sourceCollection: { equals: 'posts' } }, + }) + expect(allEmbeddings.totalDocs).toBe(1) + expect(allEmbeddings.docs[0]).toHaveProperty('docId', String(embeddedPost.id)) + }) + + test('multiple filtered-out documents produce no embeddings while allowed ones do', async () => { + await payload.create({ collection: 'posts', data: { title: 'SKIP first' } as any }) + await payload.create({ collection: 'posts', data: { title: 'SKIP second' } as any }) + const allowedPost = await payload.create({ + collection: 'posts', + 
data: { title: 'Allowed post' } as any, + }) + + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) + + await waitForBulkJobs(payload) + + const allEmbeddings = await payload.find({ + collection: 'default', + where: { sourceCollection: { equals: 'posts' } }, + }) + expect(allEmbeddings.totalDocs).toBe(1) + expect(allEmbeddings.docs[0]).toHaveProperty('docId', String(allowedPost.id)) + }) +}) diff --git a/dev/specs/bulkEmbed/versionBump.spec.ts b/dev/specs/bulkEmbed/versionBump.spec.ts index 5d3dcde..b60afbe 100644 --- a/dev/specs/bulkEmbed/versionBump.spec.ts +++ b/dev/specs/bulkEmbed/versionBump.spec.ts @@ -1,111 +1,93 @@ -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery } from 'helpers/embed.js' import { getVectorizedPayload } from '../../../src/types.js' import { expectGoodResult } from '../utils.vitest.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' +import type { Payload } from 'payload' const DIMS = DEFAULT_DIMS const dbName = `bulk_version_${Date.now()}` -// Use distinct bulk queue names per payload instance so that -// the second payload's cron worker handles its own bulk runs, -// instead of the first payload instance continuing to process them. 
-const BULK_QUEUE_NAMES_0 = BULK_QUEUE_NAMES -const BULK_QUEUE_NAMES_1 = { - prepareBulkEmbedQueueName: `${BULK_QUEUE_NAMES.prepareBulkEmbedQueueName}-v2`, - pollOrCompleteQueueName: `${BULK_QUEUE_NAMES.pollOrCompleteQueueName}-v2`, -} - describe('Bulk embed - version bump', () => { - let post: any + let payload: Payload + let knowledgePools: any + beforeAll(async () => { await createTestDb({ dbName }) - }) - test('version bump re-embeds all even without updates', async () => { - const payload0 = ( + knowledgePools = { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: 'old-version', + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), + }, + }, + } + + payload = ( await buildPayloadWithIntegration({ dbName, pluginOpts: { - knowledgePools: { - default: { - collections: { - posts: { - toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], - }, - }, - embeddingConfig: { - version: 'old-version', - queryFn: makeDummyEmbedQuery(DIMS), - bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), - }, - }, - }, - bulkQueueNames: BULK_QUEUE_NAMES_0, + dbAdapter: createMockAdapter(), + knowledgePools, + bulkQueueNames: BULK_QUEUE_NAMES, }, - key: `payload0`, + key: `version-bump-${Date.now()}`, }) ).payload + }) + + afterAll(async () => { + await destroyPayload(payload) + }) - post = await payload0.create({ collection: 'posts', data: { title: 'Old' } as any }) + test('version bump re-embeds all even without updates', async () => { + // Phase 1: Bulk embed with old-version + const post = await payload.create({ collection: 'posts', data: { title: 'Old' } as any }) - const vectorizedPayload0 = getVectorizedPayload(payload0) - const result0 = await vectorizedPayload0?.bulkEmbed({ knowledgePool: 'default' }) + const vp = getVectorizedPayload(payload) + const result0 = await vp?.bulkEmbed({ 
knowledgePool: 'default' }) expectGoodResult(result0) - await waitForBulkJobs(payload0) + await waitForBulkJobs(payload, 30000) - // Debug: log embeddings after first run - const embeds0 = await payload0.find({ + const embeds0 = await payload.find({ collection: 'default', where: { docId: { equals: String(post.id) } }, }) expect(embeds0.totalDocs).toBe(1) expect(embeds0.docs[0].embeddingVersion).toBe('old-version') - const payload1 = ( - await buildPayloadWithIntegration({ - dbName, - pluginOpts: { - knowledgePools: { - default: { - collections: { - posts: { - toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], - }, - }, - embeddingConfig: { - version: 'new-version', - queryFn: makeDummyEmbedQuery(DIMS), - bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), - }, - }, - }, - bulkQueueNames: BULK_QUEUE_NAMES_1, - }, - key: `payload1`, - skipMigrations: true, - }) - ).payload + // Phase 2: Mutate config to new-version and re-embed + knowledgePools.default.embeddingConfig.version = 'new-version' + knowledgePools.default.embeddingConfig.bulkEmbeddingsFns = createMockBulkEmbeddings({ + statusSequence: ['succeeded'], + }) - const vectorizedPayload1 = getVectorizedPayload(payload1) - const result1 = await vectorizedPayload1?.bulkEmbed({ knowledgePool: 'default' }) + const result1 = await vp?.bulkEmbed({ knowledgePool: 'default' }) expectGoodResult(result1) - await waitForBulkJobs(payload1) + await waitForBulkJobs(payload, 30000) - const embeds1 = await payload1.find({ + const embeds1 = await payload.find({ collection: 'default', where: { docId: { equals: String(post.id) } }, }) - expect(embeds1.totalDocs).toBe(1) expect(embeds1.docs[0].embeddingVersion).toBe('new-version') }) diff --git a/dev/specs/chunkers.spec.ts b/dev/specs/chunkers.spec.ts index 445a6c7..cbf7848 100644 --- a/dev/specs/chunkers.spec.ts +++ b/dev/specs/chunkers.spec.ts @@ -1,9 +1,6 @@ import { describe, expect, test } from 'vitest' import { chunkText, chunkRichText } 
from 'helpers/chunkers.js' -import { postgresAdapter } from '@payloadcms/db-postgres' -import { buildDummyConfig, getInitialMarkdownContent, integration } from './constants.js' -import { createTestDb } from './utils.js' -import { getPayload } from 'payload' +import { buildDummyConfig, getInitialMarkdownContent } from './constants.js' describe('Chunkers', () => { test('textChunker', () => { @@ -17,26 +14,14 @@ describe('Chunkers', () => { }) test('richTextChunker splits by H2', async () => { - const dbName = 'chunkers_test' - await createTestDb({ dbName }) - const cfg = await buildDummyConfig({ - db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], - pool: { - connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, - }, - }), + db: {} as any, }) const markdownContent = await getInitialMarkdownContent(cfg) - const thisPayload = await getPayload({ - config: cfg, - key: `chunkers-test-${Date.now()}`, - cron: true, - }) - const chunks = await chunkRichText(markdownContent, thisPayload) + // chunkRichText only needs the SanitizedConfig for Lexical editor setup, + // no real db required + const chunks = await chunkRichText(markdownContent, cfg) expect(chunks.length).toBe(3) diff --git a/dev/specs/config.spec.ts b/dev/specs/config.spec.ts index b183af7..7818cc4 100644 --- a/dev/specs/config.spec.ts +++ b/dev/specs/config.spec.ts @@ -1,5 +1,11 @@ import { describe, expect, test } from 'vitest' -import { buildDummyConfig, dummyPluginOptions, plugin } from './constants.js' +import { buildConfig, getPayload } from 'payload' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { buildDummyConfig, dummyPluginOptions } from './constants.js' +import payloadcmsVectorize, { getVectorizedPayload } from 'payloadcms-vectorize' +import { createMockAdapter } from 'helpers/mockAdapter.js' +import { createTestDb } from './utils.js' 
describe('jobs.tasks merging', () => { test('adds tasks when none provided', async () => { @@ -11,7 +17,7 @@ describe('jobs.tasks merging', () => { { slug: 'payloadcms-vectorize:vectorize', handler: expect.any(Function) }, { slug: 'payloadcms-vectorize:prepare-bulk-embedding', handler: expect.any(Function) }, { - slug: 'payloadcms-vectorize:poll-or-complete-bulk-embedding', + slug: 'payloadcms-vectorize:poll-or-complete-single-batch', handler: expect.any(Function), }, ]), @@ -41,7 +47,9 @@ describe('endpoints: /vector-search, /vector-bulk-embed', () => { }) test('does not add the endpoint when disabled', async () => { const cfg = await buildDummyConfig({ - plugins: [plugin({ ...dummyPluginOptions, endpointOverrides: { enabled: false } })], + plugins: [ + payloadcmsVectorize({ ...dummyPluginOptions, endpointOverrides: { enabled: false } }), + ], }) const endpoints = cfg.endpoints expect(Array.isArray(endpoints)).toBe(true) @@ -68,7 +76,9 @@ describe('endpoints: /vector-search, /vector-bulk-embed', () => { test('uses the custom path when provided', async () => { // TODO: Add test for custom path for bulk embed and retry failed batch const cfg = await buildDummyConfig({ - plugins: [plugin({ ...dummyPluginOptions, endpointOverrides: { path: '/custom-path' } })], + plugins: [ + payloadcmsVectorize({ ...dummyPluginOptions, endpointOverrides: { path: '/custom-path' } }), + ], }) const endpoints = cfg.endpoints expect(Array.isArray(endpoints)).toBe(true) @@ -82,4 +92,142 @@ describe('endpoints: /vector-search, /vector-bulk-embed', () => { ]), ) }) + + test('bins are added to the config', async () => { + const testBins = [ + { key: 'test:bin', scriptPath: '/path/to/script.js' }, + { key: 'another:bin', scriptPath: '/path/to/another.js' }, + ] + + const dbAdapter = createMockAdapter({ bins: testBins }) + + const cfg = await buildConfig({ + secret: 'test-secret', + collections: [], + editor: lexicalEditor(), + db: {} as any, + plugins: [ + payloadcmsVectorize({ + dbAdapter, + 
knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: 'test', + queryFn: async () => [0, 0, 0, 0, 0, 0, 0, 0], + realTimeIngestionFn: async (texts) => texts.map(() => [0, 0, 0, 0, 0, 0, 0, 0]), + }, + }, + }, + }), + ], + }) + + expect(cfg.bin).toBeDefined() + expect(cfg.bin).toEqual(expect.arrayContaining(testBins)) + }) + + test('custom dict is retrievable when provided', async () => { + const dbName = 'config_custom_dict_test' + await createTestDb({ dbName }) + + const testCustom = { + myKey: 'myValue', + anotherKey: { nested: true }, + } + + const dbAdapter = createMockAdapter({ custom: testCustom }) + + const cfg = await buildConfig({ + secret: 'test-secret', + collections: [], + editor: lexicalEditor(), + db: postgresAdapter({ + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter, + knowledgePools: { + default: { + collections: {}, + extensionFields: [{ type: 'text', name: 'textField' }], + embeddingConfig: { + version: 'test', + queryFn: async () => [0, 0, 0, 0, 0, 0, 0, 0], + realTimeIngestionFn: async (texts) => texts.map(() => [0, 0, 0, 0, 0, 0, 0, 0]), + }, + }, + }, + }), + ], + }) + + const vectorizedPayload = getVectorizedPayload(await getPayload({ config: cfg })) + expect(vectorizedPayload).toBeDefined() + expect(vectorizedPayload!.getDbAdapterCustom()).toEqual(expect.objectContaining(testCustom)) + }) + + test('infra collections (bulk embedding runs, batches, input metadata) are added to list of collections', async () => { + const cfg = await buildDummyConfig({}) + + const collectionSlugs = cfg.collections.map((c) => c.slug) + + // Bulk embedding runs collection + expect(collectionSlugs).toContain('vector-bulk-embeddings-runs') + + // Bulk embedding batches collection + expect(collectionSlugs).toContain('vector-bulk-embeddings-batches') + + // Bulk embedding input metadata collection + 
expect(collectionSlugs).toContain('vector-bulk-embedding-input-metadata') + }) + + test('embedding collection w/ extensionFields are added to list of collections', async () => { + const dbAdapter = createMockAdapter() + + const cfg = await buildConfig({ + secret: 'test-secret', + collections: [], + editor: lexicalEditor(), + db: {} as any, + plugins: [ + payloadcmsVectorize({ + dbAdapter, + knowledgePools: { + default: { + collections: {}, + extensionFields: [ + { name: 'customField', type: 'text' }, + { name: 'anotherField', type: 'number' }, + ], + embeddingConfig: { + version: 'test', + queryFn: async () => [0, 0, 0, 0, 0, 0, 0, 0], + realTimeIngestionFn: async (texts) => texts.map(() => [0, 0, 0, 0, 0, 0, 0, 0]), + }, + }, + }, + }), + ], + }) + + // Find the default embedding collection + const embeddingCollection = cfg.collections.find((c) => c.slug === 'default') + expect(embeddingCollection).toBeDefined() + + // Check that extension fields are present + const fieldNames = embeddingCollection!.fields.map((f: any) => f.name).filter(Boolean) + expect(fieldNames).toContain('customField') + expect(fieldNames).toContain('anotherField') + + // Also verify the built-in fields are present + expect(fieldNames).toContain('sourceCollection') + expect(fieldNames).toContain('docId') + expect(fieldNames).toContain('chunkIndex') + expect(fieldNames).toContain('chunkText') + expect(fieldNames).toContain('embeddingVersion') + }) }) diff --git a/dev/specs/constants.ts b/dev/specs/constants.ts index e695599..ee349ad 100644 --- a/dev/specs/constants.ts +++ b/dev/specs/constants.ts @@ -10,7 +10,8 @@ import { } from '@payloadcms/richtext-lexical/lexical' import { $createHeadingNode } from '@payloadcms/richtext-lexical/lexical/rich-text' import { editorConfigFactory, getEnabledNodes, lexicalEditor } from '@payloadcms/richtext-lexical' -import { createVectorizeIntegration } from 'payloadcms-vectorize' +import payloadcmsVectorize from 'payloadcms-vectorize' +import { 
createMockAdapter } from 'helpers/mockAdapter.js' export const DIMS = 8 @@ -48,22 +49,16 @@ export const getInitialMarkdownContent = async ( export const embeddingsCollection = 'default' -export const integration = createVectorizeIntegration({ - default: { - dims: DIMS, - ivfflatLists: 1, - }, -}) export const vectorizeCronJob = { cron: '*/10 * * * * *', limit: 5, queue: 'default' } -export const plugin = integration.payloadcmsVectorize export const dummyPluginOptions = { + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: {}, embeddingConfig: { version: 'test', - queryFn: async (text: string) => [0, 0, 0, 0, 0, 0, 0, 0], + queryFn: async (_text: string) => [0, 0, 0, 0, 0, 0, 0, 0], realTimeIngestionFn: async (texts: string[]) => texts.map(() => [0, 0, 0, 0, 0, 0, 0, 0]), }, }, @@ -78,7 +73,7 @@ export async function buildDummyConfig(cfg: Partial) { editor: lexicalEditor(), // Provide a dummy db adapter to satisfy types; not used by these tests db: {} as any, - plugins: [plugin(dummyPluginOptions)], + plugins: [payloadcmsVectorize(dummyPluginOptions)], ...cfg, }) return built diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index 5b8bf85..7d81b5e 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -3,7 +3,7 @@ import type { Payload, SanitizedConfig } from 'payload' import config from '@payload-config' import { getPayload } from 'payload' import { getInitialMarkdownContent } from './constants.js' -import { waitForVectorizationJobs, waitForBulkJobs } from './utils.js' +import { destroyPayload, waitForVectorizationJobs, waitForBulkJobs } from './utils.js' import { testEmbeddingVersion } from 'helpers/embed.js' import { devUser } from 'helpers/credentials.js' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' @@ -66,6 +66,10 @@ test.describe('Vector embedding e2e tests', () => { payload = await getPayload({ config: _config, key: `e2e-test-${Date.now()}` }) }) + test.afterAll(async () 
=> { + await destroyPayload(payload) + }) + test('querying the endpoint should return the title with testEmbeddingVersion', async ({ request, }) => { diff --git a/dev/specs/extensionFields.spec.ts b/dev/specs/extensionFields.spec.ts index 80a91fe..911056c 100644 --- a/dev/specs/extensionFields.spec.ts +++ b/dev/specs/extensionFields.spec.ts @@ -1,16 +1,18 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { postgresAdapter } from '@payloadcms/db-postgres' -import { buildDummyConfig, integration, plugin } from './constants.js' import { createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' -import { getPayload } from 'payload' -import { PostgresPayload } from '../../src/types.js' +import { getPayload, buildConfig } from 'payload' import { chunkText, chunkRichText } from 'helpers/chunkers.js' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { DIMS } from './constants.js' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { createMockAdapter } from 'helpers/mockAdapter.js' +import payloadcmsVectorize from 'payloadcms-vectorize' describe('Extension fields integration tests', () => { let payload: Payload @@ -19,7 +21,12 @@ describe('Extension fields integration tests', () => { beforeAll(async () => { await createTestDb({ dbName }) - const config = await buildDummyConfig({ + // Create mock adapter for testing without requiring pg vector extension + const dbAdapter = createMockAdapter() + + const config = await buildConfig({ + secret: process.env.PAYLOAD_SECRET || 'test-secret', + editor: lexicalEditor(), jobs: { tasks: [], autoRun: [ @@ -41,19 +48,18 @@ describe('Extension fields integration tests', () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: 
`postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ - plugin({ + payloadcmsVectorize({ + dbAdapter, knowledgePools: { default: { collections: { posts: { - toKnowledgePool: async (doc, payload) => { + toKnowledgePool: async (doc: any, payload: Payload) => { const chunks: Array<{ chunk: string; category?: string; priority?: number }> = [] // Process title @@ -69,7 +75,7 @@ describe('Extension fields integration tests', () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push( ...contentChunks.map((chunk) => ({ chunk, @@ -116,39 +122,8 @@ describe('Extension fields integration tests', () => { }) }) - test('extension fields are added to the embeddings table schema', async () => { - const db = (payload as PostgresPayload).db - const sql = ` - SELECT column_name, data_type, udt_name - FROM information_schema.columns - WHERE table_schema = 'public' AND table_name = 'default' - ORDER BY column_name - ` - - let rows: any[] = [] - if (db?.pool?.query) { - const res = await db.pool.query(sql) - rows = res?.rows || [] - } else if (db?.drizzle?.execute) { - const res = await db.drizzle.execute(sql) - rows = Array.isArray(res) ? 
res : res?.rows || [] - } - - const columnsByName = Object.fromEntries(rows.map((r: any) => [r.column_name, r])) - - // Check that reserved fields exist - expect(columnsByName.source_collection).toBeDefined() - expect(columnsByName.doc_id).toBeDefined() - expect(columnsByName.chunk_index).toBeDefined() - expect(columnsByName.chunk_text).toBeDefined() - expect(columnsByName.embedding_version).toBeDefined() - expect(columnsByName.embedding).toBeDefined() - - // Check that extension fields exist - expect(columnsByName.category).toBeDefined() - expect(columnsByName.category.data_type).toBe('character varying') - expect(columnsByName.priority).toBeDefined() - expect(['numeric', 'integer']).toContain(columnsByName.priority.data_type) + afterAll(async () => { + await destroyPayload(payload) }) test('extension field values are stored with embeddings', async () => { diff --git a/dev/specs/extensionFieldsVectorSearch.spec.ts b/dev/specs/extensionFieldsVectorSearch.spec.ts index 825798a..0eada79 100644 --- a/dev/specs/extensionFieldsVectorSearch.spec.ts +++ b/dev/specs/extensionFieldsVectorSearch.spec.ts @@ -1,8 +1,9 @@ import { describe, expect, test } from 'vitest' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' -import { buildDummyConfig, DIMS, integration, plugin } from './constants.js' +import { buildDummyConfig, DIMS } from './constants.js' import { createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' import { getPayload } from 'payload' @@ -10,12 +11,15 @@ import { postgresAdapter } from '@payloadcms/db-postgres' import { chunkRichText, chunkText } from 'helpers/chunkers.js' import { createVectorSearchHandlers } from '../../src/endpoints/vectorSearch.js' import type { KnowledgePoolDynamicConfig } from 'payloadcms-vectorize' +import payloadcmsVectorize from 'payloadcms-vectorize' +import { createMockAdapter } from 'helpers/mockAdapter.js' describe('extensionFields', () => { test('returns 
extensionFields in search results with correct types', async () => { // Create a new payload instance with extensionFields const dbName = 'endpoint_test_extension' await createTestDb({ dbName }) + const adapter = createMockAdapter() const defaultKnowledgePool: KnowledgePoolDynamicConfig = { collections: { posts: { @@ -34,7 +38,7 @@ describe('extensionFields', () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push( ...contentChunks.map((chunk) => ({ chunk, @@ -91,14 +95,13 @@ describe('extensionFields', () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ - plugin({ + payloadcmsVectorize({ + dbAdapter: adapter, knowledgePools: { default: defaultKnowledgePool, }, @@ -112,55 +115,59 @@ describe('extensionFields', () => { cron: true, }) - // Create a post with extension field values - const testQuery = 'Extension fields test content' - const post = await payloadWithExtensions.create({ - collection: 'posts', - data: { - title: testQuery, - content: null, - category: 'tech', - priorityLevel: 42, - } as unknown as any, - }) + try { + // Create a post with extension field values + const testQuery = 'Extension fields test content' + const post = await payloadWithExtensions.create({ + collection: 'posts', + data: { + title: testQuery, + content: null, + category: 'tech', + priorityLevel: 42, + } as unknown as any, + }) - // Wait for vectorization jobs to complete - await waitForVectorizationJobs(payloadWithExtensions) + // Wait for vectorization jobs to complete + await waitForVectorizationJobs(payloadWithExtensions) - // Perform vector search - const knowledgePools: Record = { - default: defaultKnowledgePool, - } - const searchHandler = 
createVectorSearchHandlers(knowledgePools).requestHandler - const mockRequest = { - json: async () => ({ - query: testQuery, - knowledgePool: 'default', - }), - payload: payloadWithExtensions, - } as any - const response = await searchHandler(mockRequest) - const json = await response.json() + // Perform vector search + const knowledgePools: Record = { + default: defaultKnowledgePool, + } + const searchHandler = createVectorSearchHandlers(knowledgePools, adapter).requestHandler + const mockRequest = { + json: async () => ({ + query: testQuery, + knowledgePool: 'default', + }), + payload: payloadWithExtensions, + } as any + const response = await searchHandler(mockRequest) + const json = await response.json() - // Verify results contain extensionFields - expect(json).toHaveProperty('results') - expect(Array.isArray(json.results)).toBe(true) - expect(json.results.length).toBeGreaterThan(0) + // Verify results contain extensionFields + expect(json).toHaveProperty('results') + expect(Array.isArray(json.results)).toBe(true) + expect(json.results.length).toBeGreaterThan(0) - // Find a result that matches our post - const matchingResult = json.results.find( - (r: any) => r.docId === String(post.id) && r.chunkText === testQuery, - ) - expect(matchingResult).toBeDefined() + // Find a result that matches our post + const matchingResult = json.results.find( + (r: any) => r.docId === String(post.id) && r.chunkText === testQuery, + ) + expect(matchingResult).toBeDefined() - // Verify extensionFields are present - expect(matchingResult).toHaveProperty('category') - expect(matchingResult).toHaveProperty('priorityLevel') + // Verify extensionFields are present + expect(matchingResult).toHaveProperty('category') + expect(matchingResult).toHaveProperty('priorityLevel') - // Verify types are correct - expect(typeof matchingResult.category).toBe('string') - expect(matchingResult.category).toBe('tech') - expect(typeof matchingResult.priorityLevel).toBe('number') - 
expect(matchingResult.priorityLevel).toBe(42) + // Verify types are correct + expect(typeof matchingResult.category).toBe('string') + expect(matchingResult.category).toBe('tech') + expect(typeof matchingResult.priorityLevel).toBe('number') + expect(matchingResult.priorityLevel).toBe(42) + } finally { + await destroyPayload(payloadWithExtensions) + } }) }) diff --git a/dev/specs/failedValidation.spec.ts b/dev/specs/failedValidation.spec.ts index 558c74e..9dd44c4 100644 --- a/dev/specs/failedValidation.spec.ts +++ b/dev/specs/failedValidation.spec.ts @@ -2,12 +2,14 @@ import { postgresAdapter } from '@payloadcms/db-postgres' import { buildConfig } from 'payload' import { describe, expect, test } from 'vitest' -import { createVectorizeIntegration } from '../../src/index.js' +import payloadcmsVectorize from '../../src/index.js' import { createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' import { getPayload } from 'payload' +import { createMockAdapter } from 'helpers/mockAdapter.js' const DIMS = 8 const dbName = 'failed_validation_test' @@ -15,13 +17,6 @@ const dbName = 'failed_validation_test' const embedDocs = async (texts: string[]) => texts.map(() => Array(DIMS).fill(0)) const embedQuery = async (_text: string) => Array(DIMS).fill(0) -const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ - default: { - dims: DIMS, - ivfflatLists: 1, - }, -}) - const buildMalformedConfig = async () => { return buildConfig({ jobs: { @@ -40,16 +35,14 @@ const buildMalformedConfig = async () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [afterSchemaInitHook], pool: { connectionString: - process.env.DATABASE_URI || - `postgresql://postgres:password@localhost:5433/${dbName}`, + process.env.DATABASE_URI || `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ payloadcmsVectorize({ + dbAdapter: createMockAdapter(), knowledgePools: { default: { collections: { @@ -82,31 +75,35 @@ 
describe('Validation failures mark jobs as errored', () => { cron: true, }) - await payload.create({ - collection: 'posts', - data: { title: 'bad chunks' }, - }) + try { + await payload.create({ + collection: 'posts', + data: { title: 'bad chunks' }, + }) - // Wait for the queued job to finish (success or failure) - await waitForVectorizationJobs(payload, 30000) + // Wait for the queued job to finish (success or failure) + await waitForVectorizationJobs(payload, 30000) - // Then assert failure - const res = await payload.find({ - collection: 'payload-jobs', - where: { - and: [{ taskSlug: { equals: 'payloadcms-vectorize:vectorize' } }], - }, - limit: 1, - sort: '-createdAt', - }) - const failedJob = (res as any)?.docs?.[0] - expect(failedJob.hasError).toBe(true) - const errMsg = failedJob.error.message - expect(errMsg).toMatch(/chunk/i) - expect(errMsg).toMatch(/Invalid indices: 1/) + // Then assert failure + const res = await payload.find({ + collection: 'payload-jobs', + where: { + and: [{ taskSlug: { equals: 'payloadcms-vectorize:vectorize' } }], + }, + limit: 1, + sort: '-createdAt', + }) + const failedJob = (res as any)?.docs?.[0] + expect(failedJob.hasError).toBe(true) + const errMsg = failedJob.error.message + expect(errMsg).toMatch(/chunk/i) + expect(errMsg).toMatch(/Invalid indices: 1/) - // Ensure no embeddings were created (all-or-nothing validation) - const embeddingsCount = await payload.count({ collection: 'default' }) - expect(embeddingsCount.totalDocs).toBe(0) + // Ensure no embeddings were created (all-or-nothing validation) + const embeddingsCount = await payload.count({ collection: 'default' }) + expect(embeddingsCount.totalDocs).toBe(0) + } finally { + await destroyPayload(payload) + } }, 60000) }) diff --git a/dev/specs/helpers/vectorSearchExpectations.ts b/dev/specs/helpers/vectorSearchExpectations.ts index a97b267..15541ad 100644 --- a/dev/specs/helpers/vectorSearchExpectations.ts +++ b/dev/specs/helpers/vectorSearchExpectations.ts @@ -12,7 
+12,7 @@ export function expectVectorSearchResults(results: VectorSearchResult[]) { export function expectVectorSearchResultShape(result: VectorSearchResult) { expect(result).toHaveProperty('id') - expect(result).toHaveProperty('similarity') + expect(result).toHaveProperty('score') expect(result).toHaveProperty('sourceCollection') expect(result).toHaveProperty('docId') expect(result).toHaveProperty('chunkIndex') @@ -20,11 +20,11 @@ export function expectVectorSearchResultShape(result: VectorSearchResult) { expect(result).toHaveProperty('embeddingVersion') } -export function expectResultsOrderedBySimilarity(results: VectorSearchResult[]) { +export function expectResultsOrderedByScore(results: VectorSearchResult[]) { expect(results.length).toBeGreaterThan(1) for (let i = 0; i < results.length - 1; i++) { - expect(results[i].similarity).toBeGreaterThanOrEqual(results[i + 1].similarity) + expect(results[i].score).toBeGreaterThanOrEqual(results[i + 1].score) } } @@ -81,7 +81,7 @@ export function expectValidVectorSearchResults( } if (options?.checkOrdering && results.length > 1) { - expectResultsOrderedBySimilarity(results) + expectResultsOrderedByScore(results) } if (options?.expectedTitle) { diff --git a/dev/specs/int.spec.ts b/dev/specs/int.spec.ts index 0521deb..9fede10 100644 --- a/dev/specs/int.spec.ts +++ b/dev/specs/int.spec.ts @@ -1,6 +1,6 @@ import type { Payload, SanitizedConfig } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { chunkRichText, chunkText } from 'helpers/chunkers.js' import { createHeadlessEditor } from '@payloadcms/richtext-lexical/lexical/headless' @@ -11,20 +11,19 @@ import { type SerializedEditorState, } from '@payloadcms/richtext-lexical/lexical' import { $createHeadingNode } from '@payloadcms/richtext-lexical/lexical/rich-text' -import { 
PostgresPayload } from '../../src/types.js' import { editorConfigFactory, getEnabledNodes, lexicalEditor } from '@payloadcms/richtext-lexical' import { DIMS, getInitialMarkdownContent } from './constants.js' import { createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' import { getPayload } from 'payload' import { postgresAdapter } from '@payloadcms/db-postgres' import { buildConfig } from 'payload' -import { createVectorizeIntegration } from 'payloadcms-vectorize' - -const embedFn = makeDummyEmbedDocs(DIMS) +import { createMockAdapter } from 'helpers/mockAdapter.js' const embeddingsCollection = 'default' +import payloadcmsVectorize from 'payloadcms-vectorize' describe('Plugin integration tests', () => { let payload: Payload @@ -32,18 +31,10 @@ describe('Plugin integration tests', () => { let postId: string let markdownContent: SerializedEditorState const dbName = `int_test_${Date.now()}` - + const adapter = createMockAdapter() beforeAll(async () => { await createTestDb({ dbName }) - // Create isolated integration for this test suite - const integration = createVectorizeIntegration({ - default: { - dims: DIMS, - ivfflatLists: 1, - }, - }) - config = await buildConfig({ secret: process.env.PAYLOAD_SECRET || 'test-secret', editor: lexicalEditor(), @@ -57,14 +48,13 @@ describe('Plugin integration tests', () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ - integration.payloadcmsVectorize({ + payloadcmsVectorize({ + dbAdapter: adapter, knowledgePools: { default: { collections: { @@ -76,7 +66,7 @@ describe('Plugin integration tests', () => { chunks.push(...titleChunks.map((chunk) => ({ chunk }))) } if (doc.content) { - const contentChunks = await chunkRichText(doc.content, pl) + const contentChunks = await chunkRichText(doc.content, pl.config) 
chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -112,59 +102,10 @@ describe('Plugin integration tests', () => { markdownContent = await getInitialMarkdownContent(config) }) - test('adds embeddings collection with vector column', async () => { - // Check schema for embeddings collection - const collections = payload.collections - expect(collections).toHaveProperty(embeddingsCollection) - - // Do sql check for vector column - const db = (payload as PostgresPayload).db - const sql = ` - SELECT column_name, udt_name, data_type - FROM information_schema.columns - WHERE table_schema = 'public' AND table_name = '${embeddingsCollection}' - ` - - let rows: any[] = [] - if (db?.pool?.query) { - const res = await db.pool.query(sql) - rows = res?.rows || [] - } else if (db?.drizzle?.execute) { - const res = await db.drizzle.execute(sql) - rows = Array.isArray(res) ? res : res?.rows || [] - } - - const columnsByName = Object.fromEntries(rows.map((r: any) => [r.column_name, r])) - - expect(columnsByName.embedding).toBeDefined() - // pgvector columns report udt_name = 'vector' - expect(columnsByName.embedding.udt_name).toBe('vector') + afterAll(async () => { + await destroyPayload(payload) }) - const getSQLRow = async ( - db: { - pool?: { query: (sql: string, params?: any[]) => Promise } - drizzle?: { execute: (sql: string) => Promise } - }, - id: string, - ) => { - if (db?.pool?.query) { - const sql = ` - SELECT embedding, pg_typeof(embedding) AS t - FROM "${embeddingsCollection}" - WHERE id = $1 - ` - const res = await db.pool.query(sql, [id]) - return res.rows[0] - } else if (db?.drizzle?.execute) { - // drizzle.execute may not support params; inline if needed - const res = await db.drizzle.execute( - `SELECT embedding, pg_typeof(embedding) AS t FROM "${embeddingsCollection}" WHERE id = '${id}'`, - ) - return Array.isArray(res) ? 
res[0] : res.rows?.[0] - } - } - test('creates embeddings on create', async () => { const title = 'Hello world' const post = await payload.create({ @@ -179,7 +120,7 @@ describe('Plugin integration tests', () => { await waitForVectorizationJobs(payload) // Get the actual content chunks to create proper expectations - const contentChunks = await chunkRichText(markdownContent, payload) + const contentChunks = await chunkRichText(markdownContent, payload.config) const expectedTitleDoc = { sourceCollection: 'posts', @@ -220,24 +161,6 @@ describe('Plugin integration tests', () => { expect.arrayContaining(expectedContentDocs.map((doc) => expect.objectContaining(doc))), ) - const expectedEmbeddings = await embedFn(embeddings.docs.map((doc) => doc.chunkText as string)) - await Promise.all( - embeddings.docs.map(async (doc, index) => { - expect(doc.chunkText).toBeDefined() - const id = String(doc.id) - const expectedEmbedding = expectedEmbeddings[index] - const row = await getSQLRow((payload as any).db, id) - - expect(row).toBeDefined() - expect(row.embedding).toBeDefined() - expect(row.t).toBe('vector') - const received = JSON.parse(row.embedding) - for (let i = 0; i < expectedEmbedding.length; i++) { - expect(received[i]).toBeCloseTo(expectedEmbedding[i], 5) // 5 decimal places is typical for float4 - } - }), - ) - // Save for follow-up tests postId = String(post.id) }) @@ -287,7 +210,7 @@ describe('Plugin integration tests', () => { await waitForVectorizationJobs(payload) // Get the updated content chunks - const updatedContentChunks = await chunkRichText(updatedContent, payload) + const updatedContentChunks = await chunkRichText(updatedContent, payload.config) const updatedEmbeddings = await payload.find({ collection: embeddingsCollection, @@ -308,28 +231,6 @@ describe('Plugin integration tests', () => { expect.arrayContaining([expect.objectContaining({ chunkText })]), ) } - - const expectedEmbeddings = await embedFn( - updatedEmbeddings.docs.map((doc) => doc.chunkText 
as string), - ) - await Promise.all( - updatedEmbeddings.docs.map(async (doc, index) => { - const id = String(doc.id) - expect(doc.chunkText).toBeDefined() - const expectedEmbedding = expectedEmbeddings[index] - - // now check the DB vector column directly - const row = await getSQLRow((payload as any).db, id) - expect(row).toBeDefined() - expect(row.t).toBe('vector') - expect(row.embedding).toBeDefined() - const received = JSON.parse(row.embedding) - for (let i = 0; i < expectedEmbedding.length; i++) { - // We have to use 5 decimal places because float4 is used in pgvector - expect(received[i]).toBeCloseTo(expectedEmbedding[i], 5) - } - }), - ) }) test('deletes embeddings on delete', async () => { diff --git a/dev/specs/queueName.spec.ts b/dev/specs/queueName.spec.ts index 8721d74..57615aa 100644 --- a/dev/specs/queueName.spec.ts +++ b/dev/specs/queueName.spec.ts @@ -1,11 +1,13 @@ import type { Payload, SanitizedConfig } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { chunkText, chunkRichText } from 'helpers/chunkers.js' import type { SerializedEditorState } from '@payloadcms/richtext-lexical/lexical' import { postgresAdapter } from '@payloadcms/db-postgres' -import { buildDummyConfig, getInitialMarkdownContent, integration, plugin } from './constants.js' -import { createTestDb } from './utils.js' +import { buildDummyConfig, getInitialMarkdownContent } from './constants.js' +import { createTestDb, destroyPayload } from './utils.js' import { getPayload } from 'payload' +import payloadcmsVectorize from 'payloadcms-vectorize' +import { createMockAdapter } from 'helpers/mockAdapter.js' describe('Queue tests', () => { let config: SanitizedConfig @@ -13,6 +15,7 @@ describe('Queue tests', () => { let markdownContent: SerializedEditorState const expectedQueueName = 'queueName' const dbName = 'queue_test' + const adapter = createMockAdapter() beforeAll(async () => { 
await createTestDb({ dbName }) @@ -27,14 +30,13 @@ describe('Queue tests', () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ - plugin({ + payloadcmsVectorize({ + dbAdapter: adapter, realtimeQueueName: expectedQueueName, knowledgePools: { default: { @@ -49,7 +51,7 @@ describe('Queue tests', () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -74,6 +76,11 @@ describe('Queue tests', () => { }) markdownContent = await getInitialMarkdownContent(config) }) + + afterAll(async () => { + await destroyPayload(payload) + }) + test('vectorization jobs are queued using the queueName', async () => { // There is no autoRun so previous jobs are queued and never removed between tests const prevJobs = await payload.find({ diff --git a/dev/specs/shouldEmbedFn.spec.ts b/dev/specs/shouldEmbedFn.spec.ts new file mode 100644 index 0000000..27ed84e --- /dev/null +++ b/dev/specs/shouldEmbedFn.spec.ts @@ -0,0 +1,104 @@ +import type { Payload, SanitizedConfig } from 'payload' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { DIMS } from './constants.js' +import { createTestDb, destroyPayload, waitForVectorizationJobs } from './utils.js' +import { getPayload } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { buildConfig } from 'payload' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import payloadcmsVectorize from 'payloadcms-vectorize' +import { createMockAdapter } from 'helpers/mockAdapter.js' + +const embeddingsCollection = 'default' + 
+describe('shouldEmbedFn - real-time', () => { + let payload: Payload + let config: SanitizedConfig + const dbName = `should_embed_fn_rt_${Date.now()}` + const adapter = createMockAdapter() + + beforeAll(async () => { + await createTestDb({ dbName }) + + config = await buildConfig({ + secret: process.env.PAYLOAD_SECRET || 'test-secret', + editor: lexicalEditor(), + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: adapter, + knowledgePools: { + default: { + collections: { + posts: { + shouldEmbedFn: async (doc) => !doc.title?.startsWith('SKIP'), + toKnowledgePool: async (doc) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + payload = await getPayload({ + config, + key: `should-embed-fn-rt-${Date.now()}`, + cron: true, + }) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + test('shouldEmbedFn filters documents on real-time create', async () => { + const skippedPost = await payload.create({ + collection: 'posts', + data: { title: 'SKIP this post' } as any, + }) + const allowedPost = await payload.create({ + collection: 'posts', + data: { title: 'Embed this post' } as any, + }) + + await waitForVectorizationJobs(payload) + + const skippedEmbeddings = await payload.find({ + collection: embeddingsCollection, + where: { docId: { equals: String(skippedPost.id) } }, + }) + expect(skippedEmbeddings.docs.length).toBe(0) + + const allowedEmbeddings = await payload.find({ + collection: embeddingsCollection, + where: { docId: { equals: String(allowedPost.id) } }, + }) + 
expect(allowedEmbeddings.docs.length).toBeGreaterThan(0) + }) +}) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index 62b7de2..8739b10 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -2,28 +2,30 @@ import type { Payload, SanitizedConfig } from 'payload' import { buildConfig, getPayload } from 'payload' import { Client } from 'pg' -import { mkdirSync, rmSync } from 'fs' -import { join } from 'path' import { postgresAdapter } from '@payloadcms/db-postgres' import { lexicalEditor } from '@payloadcms/richtext-lexical' -import { createVectorizeIntegration } from 'payloadcms-vectorize' +import payloadcmsVectorize, { + TASK_SLUG_VECTORIZE, + TASK_SLUG_PREPARE_BULK_EMBEDDING, + TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, +} from 'payloadcms-vectorize' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../src/collections/bulkEmbeddingInputMetadata.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../src/collections/bulkEmbeddingsBatches.js' import { makeDummyEmbedDocs } from '../helpers/embed.js' -import { script as vectorizeMigrateScript } from '../../src/bin/vectorize-migrate.js' import type { BulkEmbeddingsFns, BulkEmbeddingInput, BulkEmbeddingRunStatus, -} from '../../src/types.js' + PayloadcmsVectorizeConfig, +} from 'payloadcms-vectorize' export const createTestDb = async ({ dbName }: { dbName: string }) => { const adminUri = process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' // connect to 'postgres' const client = new Client({ connectionString: adminUri }) await client.connect() - + /* // Drop and recreate the database to ensure a clean state // First, terminate any existing connections to the database @@ -33,7 +35,7 @@ export const createTestDb = async ({ dbName }: { dbName: string }) => { WHERE pg_stat_activity.datname = $1 AND pid <> pg_backend_pid() `, [dbName])*/ - + const exists = await client.query('SELECT 1 
FROM pg_database WHERE datname = $1', [dbName]) if (exists.rowCount === 0) { await client.query(`CREATE DATABASE ${dbName}`) @@ -43,74 +45,6 @@ export const createTestDb = async ({ dbName }: { dbName: string }) => { await client.end() } -/** - * Initialize Payload with migrations applied. - * This handles the full migration setup: - * 1. Get payload instance - * 2. Create initial migration - * 3. Run vectorize:migrate to patch with IVFFLAT index - * 4. Apply migrations - * - * NOTE: This function is only used by migration-specific tests (e.g., migrationCli.spec.ts). - * All other tests should use getPayload() directly without migrations. - * - * @param config - A pre-built SanitizedConfig (must have migrationDir and push: false in db config) - * @param key - Unique key for getPayload caching (prevents instance collisions in tests) - * @param cron - Whether to enable cron jobs (default: true) - */ -export async function initializePayloadWithMigrations({ - config, - key, - cron = true, - skipMigrations = false, -}: { - config: SanitizedConfig - key?: string - cron?: boolean - skipMigrations?: boolean -}): Promise { - if (skipMigrations) { - return await getPayload({ config, key, cron }) - } - - const migrationKey = `${key ?? 'payload'}-migrations-${Date.now()}` - const payloadForMigrations = await getPayload({ config, key: migrationKey, cron: false }) - - // Create initial migration (Payload's schema) - await payloadForMigrations.db.createMigration({ migrationName: 'initial', payload: payloadForMigrations }) - - // Run vectorize:migrate to patch with IVFFLAT index - await vectorizeMigrateScript(config) - - // Apply migrations (forceAcceptWarning bypasses the dev mode prompt) - await (payloadForMigrations.db as any).migrate({ forceAcceptWarning: true }) - - if (!cron) { - return payloadForMigrations - } - - return await getPayload({ config, key, cron: true }) -} - -/** - * Create a unique migration directory for a test. - * Returns the path and a cleanup function. 
- */ -export function createTestMigrationsDir(dbName: string): { - migrationsDir: string - cleanup: () => void -} { - const migrationsDir = join(process.cwd(), 'dev', `test-migrations-${dbName}`) - // Clean up any existing migration directory - rmSync(migrationsDir, { recursive: true, force: true }) - mkdirSync(migrationsDir, { recursive: true }) - - return { - migrationsDir, - cleanup: () => rmSync(migrationsDir, { recursive: true, force: true }), - } -} - async function waitForTasks( payload: Payload, taskSlugs: string[], @@ -138,18 +72,42 @@ async function waitForTasks( } export async function waitForVectorizationJobs(payload: Payload, maxWaitMs = 10000) { - await waitForTasks(payload, ['payloadcms-vectorize:vectorize'], maxWaitMs) + await waitForTasks(payload, [TASK_SLUG_VECTORIZE], maxWaitMs) } -export async function waitForBulkJobs(payload: Payload, maxWaitMs = 10000) { - await waitForTasks( - payload, - [ - 'payloadcms-vectorize:prepare-bulk-embedding', - 'payloadcms-vectorize:poll-or-complete-bulk-embedding', - ], - maxWaitMs, +export async function waitForBulkJobs(payload: Payload, maxWaitMs = 10000, intervalMs = 250) { + const hasJobsCollection = (payload as any)?.config?.collections?.some( + (c: any) => c.slug === 'payload-jobs', ) + if (!hasJobsCollection) return + + const taskSlugs = [TASK_SLUG_PREPARE_BULK_EMBEDDING, TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING] + const startTime = Date.now() + + while (Date.now() - startTime < maxWaitMs) { + const pending = await payload.find({ + collection: 'payload-jobs', + where: { + and: [{ taskSlug: { in: taskSlugs } }, { completedAt: { exists: false } }], + }, + }) + + if (pending.totalDocs === 0) { + // No pending jobs — but with coordinator/worker fan-out, new jobs may + // appear between the coordinator completing and the worker being queued. + // Double-check: if any bulk run is still non-terminal, keep waiting. 
+ const activeRuns = await payload.find({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + where: { status: { in: ['queued', 'running'] } }, + limit: 1, + }) + if (activeRuns.totalDocs === 0) return + } + + await new Promise((resolve) => setTimeout(resolve, intervalMs)) + } + // One last grace wait + await new Promise((resolve) => setTimeout(resolve, 500)) } export const DEFAULT_DIMS = 8 @@ -265,7 +223,7 @@ export function createMockBulkEmbeddings( export type BuildPayloadArgs = { dbName: string - pluginOpts: any + pluginOpts: PayloadcmsVectorizeConfig key?: string skipMigrations?: boolean } @@ -274,15 +232,7 @@ export async function buildPayloadWithIntegration({ dbName, pluginOpts, key, - skipMigrations, }: BuildPayloadArgs): Promise<{ payload: Payload; config: SanitizedConfig }> { - const integration = createVectorizeIntegration({ - default: { - dims: DEFAULT_DIMS, - ivfflatLists: 1, - }, - }) - const config = await buildConfig({ secret: 'test-secret', editor: lexicalEditor(), @@ -293,13 +243,11 @@ export async function buildPayloadWithIntegration({ }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), - plugins: [integration.payloadcmsVectorize(pluginOpts)], + plugins: [payloadcmsVectorize(pluginOpts)], jobs: { tasks: [], autoRun: [ @@ -356,6 +304,11 @@ export const clearAllCollections = async (pl: Payload) => { await safeDelete('payload-jobs') } +/** Safely destroy a Payload instance (stops crons, closes DB pool). 
*/ +export async function destroyPayload(payload: Payload | null | undefined) { + if (payload) await payload.destroy() +} + export async function createSucceededBaselineRun( payload: Payload, { diff --git a/dev/specs/vectorSearch.spec.ts b/dev/specs/vectorSearch.spec.ts index 510f4eb..091d49b 100644 --- a/dev/specs/vectorSearch.spec.ts +++ b/dev/specs/vectorSearch.spec.ts @@ -1,6 +1,6 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { type SerializedEditorState } from '@payloadcms/richtext-lexical/lexical' import { buildDummyConfig, DIMS, getInitialMarkdownContent } from './constants.js' @@ -8,24 +8,30 @@ import { BULK_QUEUE_NAMES, createMockBulkEmbeddings, createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' import { getPayload } from 'payload' import { postgresAdapter } from '@payloadcms/db-postgres' import { chunkRichText, chunkText } from 'helpers/chunkers.js' import { createVectorSearchHandlers } from '../../src/endpoints/vectorSearch.js' -import { createVectorizeIntegration, type KnowledgePoolDynamicConfig } from 'payloadcms-vectorize' +import payloadcmsVectorize, { + DbAdapter, + type KnowledgePoolDynamicConfig, +} from 'payloadcms-vectorize' import { expectValidVectorSearchResults, - expectResultsOrderedBySimilarity, + expectResultsOrderedByScore, expectResultsRespectWhere, } from './helpers/vectorSearchExpectations.js' +import { createMockAdapter } from 'helpers/mockAdapter.js' const embedFn = makeDummyEmbedQuery(DIMS) // Helper function to perform vector search directly async function performVectorSearch( payload: Payload, + adapter: DbAdapter, query: any, knowledgePool: string = 'default', where?: any, @@ -41,7 +47,7 @@ async function performVectorSearch( }, }, } - const searchHandler = 
createVectorSearchHandlers(knowledgePools).requestHandler + const searchHandler = createVectorSearchHandlers(knowledgePools, adapter).requestHandler // Create a mock request object const mockRequest = { @@ -57,30 +63,16 @@ async function performVectorSearch( return await searchHandler(mockRequest) } -const integration = createVectorizeIntegration({ - default: { - dims: DIMS, - ivfflatLists: 1, - }, - nonSnakeCasePost: { - dims: DIMS, - ivfflatLists: 1, - }, - 'test-non-snake-case-post': { - dims: DIMS, - ivfflatLists: 1, - }, -}) -const plugin = integration.payloadcmsVectorize - describe('Search endpoint integration tests', () => { let payload: Payload + let adapter: DbAdapter let markdownContent: SerializedEditorState const titleAndQuery = 'My query is a title' const dbName = 'endpoint_test' beforeAll(async () => { await createTestDb({ dbName }) + adapter = createMockAdapter() const config = await buildDummyConfig({ jobs: { @@ -102,14 +94,13 @@ describe('Search endpoint integration tests', () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ - plugin({ + payloadcmsVectorize({ + dbAdapter: adapter, knowledgePools: { default: { collections: { @@ -123,7 +114,7 @@ describe('Search endpoint integration tests', () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -148,7 +139,7 @@ describe('Search endpoint integration tests', () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -173,7 +164,7 @@ 
describe('Search endpoint integration tests', () => { } // Process content if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -200,6 +191,10 @@ describe('Search endpoint integration tests', () => { markdownContent = await getInitialMarkdownContent(config) }) + afterAll(async () => { + await destroyPayload(payload) + }) + test('querying a title should return the title', async () => { // This should create multiple embeddings for the title and content const post = await payload.create({ @@ -212,7 +207,7 @@ describe('Search endpoint integration tests', () => { // Wait for vectorization jobs to complete await waitForVectorizationJobs(payload) - const response = await performVectorSearch(payload, titleAndQuery) + const response = await performVectorSearch(payload, adapter, titleAndQuery) const json = await response.json() expect(json).toHaveProperty('results') @@ -226,15 +221,15 @@ describe('Search endpoint integration tests', () => { }) }) - test('search results are ordered by similarity (highest first)', async () => { - const response = await performVectorSearch(payload, titleAndQuery) + test('search results are ordered by score (highest first)', async () => { + const response = await performVectorSearch(payload, adapter, titleAndQuery) const json = await response.json() - expectResultsOrderedBySimilarity(json.results) + expectResultsOrderedByScore(json.results) }) test('search handles empty query gracefully', async () => { - const response = await performVectorSearch(payload, '') + const response = await performVectorSearch(payload, adapter, '') expect(response.status).toBe(400) const error = await response.json() @@ -243,7 +238,7 @@ describe('Search endpoint integration tests', () => { }) test('search handles missing query parameter', async () => { - const response = await 
performVectorSearch(payload, undefined) + const response = await performVectorSearch(payload, adapter, undefined) expect(response.status).toBe(400) const error = await response.json() @@ -252,7 +247,7 @@ describe('Search endpoint integration tests', () => { }) test('search handles non-string query', async () => { - const response = await performVectorSearch(payload, 123) + const response = await performVectorSearch(payload, adapter, 123) expect(response.status).toBe(400) const error = await response.json() @@ -285,13 +280,13 @@ describe('Search endpoint integration tests', () => { await waitForVectorizationJobs(payload) // Search without WHERE - should return both - const responseAll = await performVectorSearch(payload, sharedText) + const responseAll = await performVectorSearch(payload, adapter, sharedText) const jsonAll = await responseAll.json() expect(jsonAll.results.length).toBeGreaterThanOrEqual(2) // Search with WHERE clause filtering by docId - should return only one - const responseFiltered = await performVectorSearch(payload, sharedText, 'default', { + const responseFiltered = await performVectorSearch(payload, adapter, sharedText, 'default', { docId: { equals: String(post1.id) }, }) const jsonFiltered = await responseFiltered.json() diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 28cbf82..65c40ba 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -1,39 +1,34 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' import { getVectorizedPayload, VectorizedPayload } from '../../src/types.js' import { buildDummyConfig, DIMS, getInitialMarkdownContent } from './constants.js' import { createTestDb, + destroyPayload, waitForVectorizationJobs, } from './utils.js' import { getPayload } from 'payload' import { postgresAdapter } from '@payloadcms/db-postgres' 
import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { chunkRichText, chunkText } from 'helpers/chunkers.js' -import { createVectorizeIntegration } from 'payloadcms-vectorize' +import payloadcmsVectorize from 'payloadcms-vectorize' import { type SerializedEditorState } from '@payloadcms/richtext-lexical/lexical' import { expectValidVectorSearchResults, - expectResultsOrderedBySimilarity, + expectResultsOrderedByScore, expectResultsRespectLimit, expectResultsRespectWhere, expectResultsContainTitle, } from './helpers/vectorSearchExpectations.js' - -const integration = createVectorizeIntegration({ - default: { - dims: DIMS, - ivfflatLists: 1, - }, -}) -const plugin = integration.payloadcmsVectorize +import { createMockAdapter } from 'helpers/mockAdapter.js' describe('VectorizedPayload', () => { let payload: Payload let markdownContent: SerializedEditorState const titleAndQuery = 'VectorizedPayload Test Title' const dbName = 'vectorized_payload_test' + const adapter = createMockAdapter() beforeAll(async () => { await createTestDb({ dbName }) @@ -58,14 +53,13 @@ describe('VectorizedPayload', () => { }, ], db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ - plugin({ + payloadcmsVectorize({ + dbAdapter: adapter, knowledgePools: { default: { collections: { @@ -77,7 +71,7 @@ describe('VectorizedPayload', () => { chunks.push(...titleChunks.map((chunk) => ({ chunk }))) } if (doc.content) { - const contentChunks = await chunkRichText(doc.content, payload) + const contentChunks = await chunkRichText(doc.content, payload.config) chunks.push(...contentChunks.map((chunk) => ({ chunk }))) } return chunks @@ -103,6 +97,10 @@ describe('VectorizedPayload', () => { markdownContent = await getInitialMarkdownContent(config) }) + afterAll(async () => { + await destroyPayload(payload) + 
}) + describe('getVectorizedPayload', () => { test('returns vectorized payload object for a payload instance with vectorize extensions', () => { const vectorizedPayload = getVectorizedPayload(payload) @@ -133,13 +131,13 @@ describe('VectorizedPayload', () => { }) test('payload has search method', () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload) + const vectorizedPayload = getVectorizedPayload(payload) expect(vectorizedPayload).not.toBeNull() expect(typeof vectorizedPayload!.search).toBe('function') }) test('search returns an array of VectorSearchResult', async () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload)! + const vectorizedPayload = getVectorizedPayload(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -150,8 +148,8 @@ describe('VectorizedPayload', () => { expectValidVectorSearchResults(results, { checkShape: true }) }) - test('search results are ordered by similarity (highest first)', async () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload)! + test('search results are ordered by score (highest first)', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -159,11 +157,11 @@ describe('VectorizedPayload', () => { limit: 10, }) - expectResultsOrderedBySimilarity(results) + expectResultsOrderedByScore(results) }) test('search respects limit parameter', async () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload)! + const vectorizedPayload = getVectorizedPayload(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -175,7 +173,7 @@ describe('VectorizedPayload', () => { }) test('search respects where clause', async () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload)! + const vectorizedPayload = getVectorizedPayload(payload)! 
const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -190,7 +188,7 @@ describe('VectorizedPayload', () => { }) test('querying a title should return the title as top result', async () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload)! + const vectorizedPayload = getVectorizedPayload(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -253,7 +251,7 @@ describe('VectorizedPayload', () => { }) test('bulkEmbed throws error when bulk embedding not configured', async () => { - const vectorizedPayload = getVectorizedPayload<'default'>(payload)! + const vectorizedPayload = getVectorizedPayload(payload)! // This pool doesn't have bulkEmbeddingsFns configured await expect(vectorizedPayload.bulkEmbed({ knowledgePool: 'default' })).rejects.toThrow( @@ -280,4 +278,17 @@ describe('VectorizedPayload', () => { } }) }) + describe('getAdapterCustom method', () => { + test('payload has getAdapterCustom method', () => { + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() + expect(typeof vectorizedPayload!.getDbAdapterCustom).toBe('function') + }) + + test('getAdapterCustom returns the adapter custom', () => { + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() + expect(vectorizedPayload!.getDbAdapterCustom()).toBeDefined() + }) + }) }) diff --git a/dev/tsconfig.json b/dev/tsconfig.json index 2a74386..e4918f5 100644 --- a/dev/tsconfig.json +++ b/dev/tsconfig.json @@ -16,20 +16,13 @@ "compilerOptions": { "baseUrl": "./", "paths": { - "@payload-config": [ - "./payload.config.ts" - ], - "payloadcms-vectorize": [ - "../src/index.ts" - ], - "payloadcms-vectorize/client": [ - "../src/exports/client.ts" - ], - "payloadcms-vectorize/rsc": [ - "../src/exports/rsc.ts" - ] + "@payload-config": ["./payload.config.ts"], + "payloadcms-vectorize": ["../src/index.ts"], + "payloadcms-vectorize/client": ["../src/exports/client.ts"], 
+ "payloadcms-vectorize/rsc": ["../src/exports/rsc.ts"], + "@payloadcms-vectorize/pg": ["../adapters/pg/src/index.ts"] }, "noEmit": true, - "emitDeclarationOnly": false, + "emitDeclarationOnly": false } } diff --git a/package.json b/package.json index b217ac0..7641980 100644 --- a/package.json +++ b/package.json @@ -1,31 +1,35 @@ { "name": "payloadcms-vectorize", - "version": "0.5.3", + "version": "0.6.0-beta.5", "description": "A plugin to vectorize collections for RAG in Payload 3.0", "license": "MIT", "type": "module", "exports": { ".": { - "import": "./src/index.ts", - "types": "./src/index.ts", - "default": "./src/index.ts" + "import": "./dist/index.js", + "types": "./dist/index.d.ts", + "default": "./dist/index.js" }, "./client": { - "import": "./src/exports/client.ts", - "types": "./src/exports/client.ts", - "default": "./src/exports/client.ts" + "import": "./dist/exports/client.js", + "types": "./dist/exports/client.d.ts", + "default": "./dist/exports/client.js" } }, - "main": "./src/index.ts", - "types": "./src/index.ts", + "main": "./dist/index.js", + "types": "./dist/index.d.ts", "files": [ "dist" ], "scripts": { - "build": "pnpm copyfiles && pnpm build:types && pnpm build:swc", + "build": "pnpm copyfiles && pnpm build:types && pnpm build:swc && pnpm build:adapters", "build:swc": "swc ./src -d ./dist --config-file .swcrc --strip-leading-paths", - "build:types": "tsc --outDir dist --rootDir ./src", - "clean": "rimraf {dist,*.tsbuildinfo}", + "build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf", + "build:adapters:pg": "cd ./adapters/pg && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", + "build:adapters:cf": "cd ./adapters/cf && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", + "build:types": "tsc -p tsconfig.build.json --outDir dist --rootDir ./src", + "build:types:all": "pnpm build:types && tsc --noEmit", + "clean": "rimraf {dist,*.tsbuildinfo,adapters/pg/dist,adapters/cf/dist}", "copyfiles": 
"copyfiles -u 1 \"src/**/*.{html,css,scss,ttf,woff,woff2,eot,svg,jpg,png,json}\" dist/", "dev": "cross-env DOTENV_CONFIG_PATH=dev/.env.development NODE_OPTIONS=--require=dotenv/config next dev dev --turbo", "dev:generate-importmap": "pnpm dev:payload generate:importmap", @@ -35,14 +39,21 @@ "generate:types": "pnpm dev:generate-types", "lint": "eslint", "lint:fix": "eslint ./src --fix", + "changeset": "changeset", + "changeset:version": "changeset version", + "release": "pnpm clean && pnpm build && changeset publish", "prepublishOnly": "pnpm clean && pnpm build", "test:setup": "docker-compose -f dev/docker-compose.test.yml up -d", "test:teardown": "docker-compose -f dev/docker-compose.test.yml down", "test": "pnpm test:int && pnpm test:e2e", "test:e2e": "playwright test", - "test:int": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx' vitest" + "test:int": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest", + "test:adapters:pg": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/pg/vitest.config.js", + "test:adapters:cf": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/cf/vitest.config.js" }, "devDependencies": { + "@changesets/changelog-github": "^0.5.2", + "@changesets/cli": "^2.29.8", "@eslint/eslintrc": "^3.2.0", "@payloadcms/db-postgres": "3.69.0", "@payloadcms/db-sqlite": "3.69.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9c865c8..cc55b9e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -12,6 +12,12 @@ importers: specifier: 1.0.0 version: 1.0.0 devDependencies: + '@changesets/changelog-github': + specifier: ^0.5.2 + version: 0.5.2 + '@changesets/cli': + specifier: ^2.29.8 + version: 2.29.8(@types/node@22.19.3) '@eslint/eslintrc': 
specifier: ^3.2.0 version: 3.3.3 @@ -148,6 +154,32 @@ importers: specifier: ^0.0.8 version: 0.0.8 + adapters/cf: + dependencies: + payload: + specifier: '>=3.0.0 <4.0.0' + version: 3.69.0(graphql@16.12.0)(typescript@5.7.3) + devDependencies: + payloadcms-vectorize: + specifier: workspace:* + version: link:../.. + + adapters/pg: + dependencies: + '@payloadcms/db-postgres': + specifier: '>=3.0.0 <4.0.0' + version: 3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3)) + payload: + specifier: '>=3.0.0 <4.0.0' + version: 3.69.0(graphql@16.12.0)(typescript@5.7.3) + to-snake-case: + specifier: 1.0.0 + version: 1.0.0 + devDependencies: + payloadcms-vectorize: + specifier: workspace:* + version: link:../.. + packages: '@ai-sdk/gateway@2.0.24': @@ -349,6 +381,67 @@ packages: '@borewit/text-codec@0.1.1': resolution: {integrity: sha512-5L/uBxmjaCIX5h8Z+uu+kA9BQLkc/Wl06UGR5ajNRxu+/XjonB5i8JpgFMrPj3LXTCPA0pv8yxUvbUi+QthGGA==} + '@changesets/apply-release-plan@7.0.14': + resolution: {integrity: sha512-ddBvf9PHdy2YY0OUiEl3TV78mH9sckndJR14QAt87KLEbIov81XO0q0QAmvooBxXlqRRP8I9B7XOzZwQG7JkWA==} + + '@changesets/assemble-release-plan@6.0.9': + resolution: {integrity: sha512-tPgeeqCHIwNo8sypKlS3gOPmsS3wP0zHt67JDuL20P4QcXiw/O4Hl7oXiuLnP9yg+rXLQ2sScdV1Kkzde61iSQ==} + + '@changesets/changelog-git@0.2.1': + resolution: {integrity: sha512-x/xEleCFLH28c3bQeQIyeZf8lFXyDFVn1SgcBiR2Tw/r4IAWlk1fzxCEZ6NxQAjF2Nwtczoen3OA2qR+UawQ8Q==} + + '@changesets/changelog-github@0.5.2': + resolution: {integrity: sha512-HeGeDl8HaIGj9fQHo/tv5XKQ2SNEi9+9yl1Bss1jttPqeiASRXhfi0A2wv8yFKCp07kR1gpOI5ge6+CWNm1jPw==} + + '@changesets/cli@2.29.8': + resolution: {integrity: sha512-1weuGZpP63YWUYjay/E84qqwcnt5yJMM0tep10Up7Q5cS/DGe2IZ0Uj3HNMxGhCINZuR7aO9WBMdKnPit5ZDPA==} + hasBin: true + + '@changesets/config@3.1.2': + resolution: {integrity: sha512-CYiRhA4bWKemdYi/uwImjPxqWNpqGPNbEBdX1BdONALFIDK7MCUj6FPkzD+z9gJcvDFUQJn9aDVf4UG7OT6Kog==} + + '@changesets/errors@0.2.0': + 
resolution: {integrity: sha512-6BLOQUscTpZeGljvyQXlWOItQyU71kCdGz7Pi8H8zdw6BI0g3m43iL4xKUVPWtG+qrrL9DTjpdn8eYuCQSRpow==} + + '@changesets/get-dependents-graph@2.1.3': + resolution: {integrity: sha512-gphr+v0mv2I3Oxt19VdWRRUxq3sseyUpX9DaHpTUmLj92Y10AGy+XOtV+kbM6L/fDcpx7/ISDFK6T8A/P3lOdQ==} + + '@changesets/get-github-info@0.7.0': + resolution: {integrity: sha512-+i67Bmhfj9V4KfDeS1+Tz3iF32btKZB2AAx+cYMqDSRFP7r3/ZdGbjCo+c6qkyViN9ygDuBjzageuPGJtKGe5A==} + + '@changesets/get-release-plan@4.0.14': + resolution: {integrity: sha512-yjZMHpUHgl4Xl5gRlolVuxDkm4HgSJqT93Ri1Uz8kGrQb+5iJ8dkXJ20M2j/Y4iV5QzS2c5SeTxVSKX+2eMI0g==} + + '@changesets/get-version-range-type@0.4.0': + resolution: {integrity: sha512-hwawtob9DryoGTpixy1D3ZXbGgJu1Rhr+ySH2PvTLHvkZuQ7sRT4oQwMh0hbqZH1weAooedEjRsbrWcGLCeyVQ==} + + '@changesets/git@3.0.4': + resolution: {integrity: sha512-BXANzRFkX+XcC1q/d27NKvlJ1yf7PSAgi8JG6dt8EfbHFHi4neau7mufcSca5zRhwOL8j9s6EqsxmT+s+/E6Sw==} + + '@changesets/logger@0.1.1': + resolution: {integrity: sha512-OQtR36ZlnuTxKqoW4Sv6x5YIhOmClRd5pWsjZsddYxpWs517R0HkyiefQPIytCVh4ZcC5x9XaG8KTdd5iRQUfg==} + + '@changesets/parse@0.4.2': + resolution: {integrity: sha512-Uo5MC5mfg4OM0jU3up66fmSn6/NE9INK+8/Vn/7sMVcdWg46zfbvvUSjD9EMonVqPi9fbrJH9SXHn48Tr1f2yA==} + + '@changesets/pre@2.0.2': + resolution: {integrity: sha512-HaL/gEyFVvkf9KFg6484wR9s0qjAXlZ8qWPDkTyKF6+zqjBe/I2mygg3MbpZ++hdi0ToqNUF8cjj7fBy0dg8Ug==} + + '@changesets/read@0.6.6': + resolution: {integrity: sha512-P5QaN9hJSQQKJShzzpBT13FzOSPyHbqdoIBUd2DJdgvnECCyO6LmAOWSV+O8se2TaZJVwSXjL+v9yhb+a9JeJg==} + + '@changesets/should-skip-package@0.1.2': + resolution: {integrity: sha512-qAK/WrqWLNCP22UDdBTMPH5f41elVDlsNyat180A33dWxuUDyNpg6fPi/FyTZwRriVjg0L8gnjJn2F9XAoF0qw==} + + '@changesets/types@4.1.0': + resolution: {integrity: sha512-LDQvVDv5Kb50ny2s25Fhm3d9QSZimsoUGBsUioj6MC3qbMUCuC8GPIvk/M6IvXx3lYhAs0lwWUQLb+VIEUCECw==} + + '@changesets/types@6.1.0': + resolution: {integrity: 
sha512-rKQcJ+o1nKNgeoYRHKOS07tAMNd3YSN0uHaJOZYjBAgxfV7TUE7JE+z4BzZdQwb5hKaYbayKN5KrYV7ODb2rAA==} + + '@changesets/write@0.4.0': + resolution: {integrity: sha512-CdTLvIOPiCNuH71pyDu3rA+Q0n65cmAbXnwWH84rKGiFumFzkmHNT8KHTMEchcxN+Kl8I54xGUhJ7l3E7X396Q==} + '@date-fns/tz@1.2.0': resolution: {integrity: sha512-LBrd7MiJZ9McsOgxqWX7AaxrDjcFVjWH/tIKJd7pnR7McaslGYOP1QmmiBXdJH/H/yLCT+rcQ7FaPBUxRGUtrg==} @@ -1297,6 +1390,15 @@ packages: cpu: [x64] os: [win32] + '@inquirer/external-editor@1.0.3': + resolution: {integrity: sha512-RWbSrDiYmO4LbejWY7ttpxczuwQyZLBUyygsA9Nsv95hpzUWwnNTVQmAq3xuh7vNwCp07UTmE5i11XAEExx4RA==} + engines: {node: '>=18'} + peerDependencies: + '@types/node': '>=18' + peerDependenciesMeta: + '@types/node': + optional: true + '@isaacs/cliui@8.0.2': resolution: {integrity: sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==} engines: {node: '>=12'} @@ -1538,6 +1640,12 @@ packages: cpu: [x64] os: [win32] + '@manypkg/find-root@1.1.0': + resolution: {integrity: sha512-mki5uBvhHzO8kYYix/WRy2WX8S3B5wdVSc9D6KcU5lQNglP2yt58/VfLuAK49glRXChosY8ap2oJ1qgma3GUVA==} + + '@manypkg/get-packages@1.1.3': + resolution: {integrity: sha512-fo+QhuU3qE/2TQMQmbVMqaQ6EWbMhi4ABWP+O4AM1NqPBuy0OrApV5LO6BrrgnhtAHS2NH6RrVk9OL181tTi8A==} + '@monaco-editor/loader@1.7.0': resolution: {integrity: sha512-gIwR1HrJrrx+vfyOhYmCZ0/JcWqG5kbfG7+d3f/C1LXk2EvzAbHSg3MQ5lO2sMlo9izoAZ04shohfKLVT6crVA==} @@ -2314,6 +2422,9 @@ packages: '@types/ms@2.1.0': resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==} + '@types/node@12.20.55': + resolution: {integrity: sha512-J8xLz7q2OFulZ2cyGTLE1TbbZcjpno7FaN6zdJNrgAdrJ+DZzh/uFR6YrTb4C+nXakvud8Q4+rbhoIWlYQbUFQ==} + '@types/node@18.19.130': resolution: {integrity: sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg==} @@ -2696,6 +2807,10 @@ packages: ajv@8.17.1: resolution: {integrity: 
sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==} + ansi-colors@4.1.3: + resolution: {integrity: sha512-/6w/C21Pm1A7aZitlI5Ni/2J6FFQN8i1Cvz3kHABAAbw93v/NlvKdVOqz7CCWz/3iv/JplRSEEZ83XION15ovw==} + engines: {node: '>=6'} + ansi-escapes@4.3.2: resolution: {integrity: sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==} engines: {node: '>=8'} @@ -2753,6 +2868,10 @@ packages: resolution: {integrity: sha512-FmeCCAenzH0KH381SPT5FZmiA/TmpndpcaShhfgEN9eCVjnFBqq3l1xrI42y8+PPLI6hypzou4GXw00WHmPBLQ==} engines: {node: '>= 0.4'} + array-union@2.1.0: + resolution: {integrity: sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==} + engines: {node: '>=8'} + array.prototype.findlast@1.2.5: resolution: {integrity: sha512-CVvd6FHg1Z3POpBLxO6E6zr+rSKEQ9L6rZHAaY7lLfhKsWYUBBOuMs0e9o24oopj6H+geRCX0YJ+TJLBK2eHyQ==} engines: {node: '>= 0.4'} @@ -2911,6 +3030,10 @@ packages: bcrypt-pbkdf@1.0.2: resolution: {integrity: sha512-qeFIXtP4MSoi6NLqO12WfqARWWuCKi2Rn/9hJLEmtB5yTNr9DqFWkJRCf2qShWzPeAMRnOgCrq0sg/KLv5ES9w==} + better-path-resolve@1.0.0: + resolution: {integrity: sha512-pbnl5XzGBdrFU/wT4jqmJVPn2B6UHPBOhzMQkY/SPUPB6QtUXtmBHBIwCbXJol93mOpGMnQyP/+BB19q04xj7g==} + engines: {node: '>=4'} + bin-version-check@5.1.0: resolution: {integrity: sha512-bYsvMqJ8yNGILLz1KP9zKLzQ6YpljV3ln1gqhuLkUtyfGi3qXKGuK2p+U4NAvjVFzDFiBBtOpCOSFNuYYEGZ5g==} engines: {node: '>=12'} @@ -3051,6 +3174,9 @@ packages: character-reference-invalid@2.0.1: resolution: {integrity: sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==} + chardet@2.1.1: + resolution: {integrity: sha512-PsezH1rqdV9VvyNhxxOW32/d75r01NY7TQCmOqomRo15ZSOKbpTFVsfjghxo6JloQUCGnH4k1LGu0R4yCLlWQQ==} + charenc@0.0.2: resolution: {integrity: sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==} @@ -3065,6 +3191,10 @@ packages: chownr@1.1.4: 
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} + ci-info@3.9.0: + resolution: {integrity: sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ==} + engines: {node: '>=8'} + ci-info@4.3.1: resolution: {integrity: sha512-Wdy2Igu8OcBpI2pZePZ5oWjPC38tmDVx5WKUXKwlLYkA0ozo85sLsLvkBbBn/sZaSCMFOGZJ14fvW9t5/d7kdA==} engines: {node: '>=8'} @@ -3217,6 +3347,9 @@ packages: resolution: {integrity: sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==} engines: {node: '>= 0.4'} + dataloader@1.4.0: + resolution: {integrity: sha512-68s5jYdlvasItOJnCuI2Q9s4q98g0pCyL3HrcKJu8KNugUl8ahgmZYg38ysLTgQjjXX3H8CJLkAvWrclWfcalw==} + dataloader@2.2.3: resolution: {integrity: sha512-y2krtASINtPFS1rSDjacrFgn1dcUuoREVabwlOGOe4SdxenREqwjwjElAdwvbGM7kgZz9a3KVicWR7vcz8rnzA==} @@ -3308,6 +3441,10 @@ packages: resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} engines: {node: '>=6'} + detect-indent@6.1.0: + resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} + engines: {node: '>=8'} + detect-indent@7.0.2: resolution: {integrity: sha512-y+8xyqdGLL+6sh0tVeHcfP/QDd8gUgbasolJJpY7NgeQGSZ739bDtSiaiDgtoicy+mtYB81dKLxO9xRhCyIB3A==} engines: {node: '>=12.20'} @@ -3331,6 +3468,10 @@ packages: devlop@1.1.0: resolution: {integrity: sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==} + dir-glob@3.0.1: + resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==} + engines: {node: '>=8'} + docker-compose@1.3.0: resolution: {integrity: sha512-7Gevk/5eGD50+eMD+XDnFnOrruFkL0kSd7jEG4cjmqweDSUhB7i0g8is/nBdVpl+Bx338SqIB2GLKm32M+Vs6g==} engines: {node: '>= 6.0.0'} @@ -3361,6 +3502,10 @@ packages: resolution: {integrity: 
sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} engines: {node: '>=12'} + dotenv@8.6.0: + resolution: {integrity: sha512-IrPdXQsk2BbzvCBGBOTmmSH5SodmqZNt4ERAZDmW4CT+tL8VtvinqywuANaFu4bOMWki16nqf0e4oC0QIaDr/g==} + engines: {node: '>=10'} + drizzle-kit@0.31.7: resolution: {integrity: sha512-hOzRGSdyKIU4FcTSFYGKdXEjFsncVwHZ43gY3WU5Bz9j5Iadp6Rh6hxLSQ1IWXpKLBKt/d5y1cpSPcV+FcoQ1A==} hasBin: true @@ -3484,6 +3629,10 @@ packages: resolution: {integrity: sha512-LgQMM4WXU3QI+SYgEc2liRgznaD5ojbmY3sb8LxyguVkIg5FxdpTkvk72te2R38/TGKxH634oLxXRGY6d7AP+Q==} engines: {node: '>=10.13.0'} + enquirer@2.4.1: + resolution: {integrity: sha512-rRqJg/6gd538VHvR3PSrdRBb/1Vy2YfzHqzvbhGIQpDRKIa4FgV/54b5Q1xYSxOOwKvjXweS26E0Q+nAMwp2pQ==} + engines: {node: '>=8.6'} + error-ex@1.3.4: resolution: {integrity: sha512-sqQamAnR14VgCr1A618A3sGrygcpK+HEbenA/HiEAkkUwcZIIB/tgWqHFxWgOyDh4nB4JCRimh79dR5Ywc9MDQ==} @@ -3862,6 +4011,9 @@ packages: resolution: {integrity: sha512-yblEwXAbGv1VQDmow7s38W77hzAgJAO50ztBLMcUyUBfxv1HC+LGwtiEN+Co6LtlqT/5uwVOxsD4TNIilWhwdQ==} engines: {node: '>=4'} + extendable-error@0.1.7: + resolution: {integrity: sha512-UOiS2in6/Q0FK0R0q6UY9vYpQ21mr/Qn1KOnte7vsACuNJf514WvCCUHSRCPcgjPT2bAhNIJdlE6bVap1GKmeg==} + fast-copy@3.0.2: resolution: {integrity: sha512-dl0O9Vhju8IrcLndv2eU4ldt1ftXMqqfgN4H1cpmGV7P6jeB9FwpN9a2c8DPGE1Ys88rNUJVYDHq73CGAGOPfQ==} @@ -3992,6 +4144,14 @@ packages: fs-constants@1.0.0: resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} + fs-extra@7.0.1: + resolution: {integrity: sha512-YJDaCJZEnBmcbw13fvdAM9AwNOJwOzrE4pqMqBq5nFiEqXUqHwlK4B+3pUw6JNvfSPtX05xFHtYy/1ni01eGCw==} + engines: {node: '>=6 <7 || >=8'} + + fs-extra@8.1.0: + resolution: {integrity: sha512-yhlQgA6mnOJUKOsRUFsgJdQCvkKhcz8tlZG5HBQfReYZy46OwLcY+Zia0mtdHsOo9y/hP+CxMN0TU9QxoOtG4g==} + engines: {node: '>=6 <7 || >=8'} + fs.realpath@1.0.0: resolution: {integrity: 
sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -4092,6 +4252,10 @@ packages: resolution: {integrity: sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==} engines: {node: '>= 0.4'} + globby@11.1.0: + resolution: {integrity: sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==} + engines: {node: '>=10'} + globrex@0.1.2: resolution: {integrity: sha512-uHJgbwAMwNFf5mLst7IWLNg14x1CkeqglJb/K3doi4dw6q2IvAAmM/Y81kevy83wP+Sst+nutFTYOGg3d1lsxg==} @@ -4175,10 +4339,18 @@ packages: resolution: {integrity: sha512-V5nVw1PAOgfI3Lmeaj2Exmeg7fenjhRUgz1lPSezy1CuhPYbgQtbQj4jZfEAEMlaL+vupsvhjqCyjzob0yxsmQ==} engines: {node: '>=10.19.0'} + human-id@4.1.3: + resolution: {integrity: sha512-tsYlhAYpjCKa//8rXZ9DqKEawhPoSytweBC2eNvcaDK+57RZLHGqNs3PZTQO6yekLFSuvA6AlnAfrw1uBvtb+Q==} + hasBin: true + human-signals@2.1.0: resolution: {integrity: sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==} engines: {node: '>=10.17.0'} + iconv-lite@0.7.2: + resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==} + engines: {node: '>=0.10.0'} + ieee754@1.2.1: resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} @@ -4373,6 +4545,10 @@ packages: resolution: {integrity: sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==} engines: {node: '>= 0.4'} + is-subdir@1.2.0: + resolution: {integrity: sha512-2AT6j+gXe/1ueqbW6fLZJiIw3F8iXGJtt0yDrZaBhAZEG1raiTxKWU+IPqMCzQAXOUCKdA4UDMgacKH25XG2Cw==} + engines: {node: '>=4'} + is-symbol@1.1.1: resolution: {integrity: sha512-9gGx6GTtCQM73BgmHQXfDmLtfjjTUDSyoxTCbp5WtoixAhfgsDirWIcVQ/IHpvI5Vgd5i/J5F7B9cN/WlVbC/w==} engines: {node: '>= 0.4'} @@ -4393,6 +4569,10 @@ packages: resolution: {integrity: 
sha512-mfcwb6IzQyOKTs84CQMrOwW4gQcaTOAWJ0zzJCl2WSPDrWk/OzDaImWFH3djXhb24g4eudZfLRozAvPGw4d9hQ==} engines: {node: '>= 0.4'} + is-windows@1.0.2: + resolution: {integrity: sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==} + engines: {node: '>=0.10.0'} + is-wsl@3.1.0: resolution: {integrity: sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==} engines: {node: '>=16'} @@ -4639,6 +4819,9 @@ packages: engines: {node: '>=6'} hasBin: true + jsonfile@4.0.0: + resolution: {integrity: sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg==} + jsox@1.2.121: resolution: {integrity: sha512-9Ag50tKhpTwS6r5wh3MJSAvpSof0UBr39Pto8OnzFT32Z/pAbxAsKHzyvsyMEHVslELvHyO/4/jaQELHk8wDcw==} hasBin: true @@ -4777,6 +4960,9 @@ packages: lodash.merge@4.6.2: resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==} + lodash.startcase@4.4.0: + resolution: {integrity: sha512-+WKqsK294HMSc2jEbNgpHpd0JfIBhp7rEV4aqXWqFr6AlXov+SlcgB1Fv01y2kGe3Gc8nMW7VA0SrGuSkRfIEg==} + lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} @@ -4979,6 +5165,10 @@ packages: monaco-editor@0.55.1: resolution: {integrity: sha512-jz4x+TJNFHwHtwuV9vA9rMujcZRb0CEilTEwG2rRSpe/A7Jdkuj8xPKttCgOh+v/lkHy7HsZ64oj+q3xoAFl9A==} + mri@1.2.0: + resolution: {integrity: sha512-tzzskb3bG8LvYGFF/mDTpq3jpI6Q9wc3LEmBaghu+DdCssd1FakN7Bc0hVNmEyGq1bq3RgfkCb3cmQLpNPOroA==} + engines: {node: '>=4'} + ms@2.1.3: resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} @@ -5119,6 +5309,9 @@ packages: resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} engines: {node: '>= 0.8.0'} + outdent@0.5.0: + resolution: {integrity: 
sha512-/jHxFIzoMXdqPzTaCpFzAAWhpkSjZPF4Vsn6jAfNpmbH/ymsmd7Qc6VE9BGn0L6YMj6uwpQLxCECpus4ukKS9Q==} + own-keys@1.0.1: resolution: {integrity: sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==} engines: {node: '>= 0.4'} @@ -5130,6 +5323,10 @@ packages: resolution: {integrity: sha512-mlVgR3PGuzlo0MmTdk4cXqXWlwQDLnONTAg6sm62XkMJEiRxN3GL3SffkYvqwonbkJBcrI7Uvv5Zh9yjvn2iUw==} engines: {node: '>=12.20'} + p-filter@2.1.0: + resolution: {integrity: sha512-ZBxxZ5sL2HghephhpGAQdoskxplTwr7ICaehZwLIlfL6acuVgZPm8yBNuRAFBGEqtD/hmUeq9eqLg2ys9Xr/yw==} + engines: {node: '>=8'} + p-limit@2.3.0: resolution: {integrity: sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==} engines: {node: '>=6'} @@ -5146,6 +5343,10 @@ packages: resolution: {integrity: sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==} engines: {node: '>=10'} + p-map@2.1.0: + resolution: {integrity: sha512-y3b8Kpd8OAN444hxfBbFfj1FY/RjtTd8tzYwhUqNYXx0fXx2iX4maP4Qr6qhIKbQXI02wTLAda4fYUbDagTUFw==} + engines: {node: '>=6'} + p-try@2.2.0: resolution: {integrity: sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==} engines: {node: '>=6'} @@ -5153,6 +5354,9 @@ packages: package-json-from-dist@1.0.1: resolution: {integrity: sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==} + package-manager-detector@0.2.11: + resolution: {integrity: sha512-BEnLolu+yuz22S56CU1SUKq3XC3PkwD5wv4ikR4MfGvnRVcmzXR9DwSlW2fEamyTPyXHomBJRzgapeuBvRNzJQ==} + parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} @@ -5264,6 +5468,10 @@ packages: resolution: {integrity: sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==} engines: {node: '>=12'} + pify@4.0.1: + resolution: {integrity: 
sha512-uB80kBFb/tfd68bVleG9T5GGsGPjJrLAUpR5PZIrhBnIaRTQRjqdJSsIKkOP6OAIFbj7GOrcudc5pNjZ+geV2g==} + engines: {node: '>=6'} + pino-abstract-transport@2.0.0: resolution: {integrity: sha512-F63x5tizV6WCh4R6RHyi2Ml+M70DNRXt/+HANowMflpgGFMAym/VKm6G7ZOQRjqN7XbGxK1Lg9t6ZrtzOaivMw==} @@ -5357,6 +5565,11 @@ packages: resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==} engines: {node: '>= 0.8.0'} + prettier@2.8.8: + resolution: {integrity: sha512-tdN8qQGvNjw4CHbY+XXk0JgCXn9QiF21a55rBe5LJAU+kDyC4WQn4+awm2Xfk2lQMk5fKup9XgzTZtGkjBdP9Q==} + engines: {node: '>=10.13.0'} + hasBin: true + prettier@3.7.4: resolution: {integrity: sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA==} engines: {node: '>=14'} @@ -5419,6 +5632,9 @@ packages: resolution: {integrity: sha512-tDNIz22aBzCDxLtVH++VnTfzxlfeK5CbqohpSqpJgj1Wg/cQbStNAz3NuqCs5vV+pjBsK4x4pN9HlVh7rcYRiA==} engines: {node: '>=0.6'} + quansync@0.2.11: + resolution: {integrity: sha512-AifT7QEbW9Nri4tAwR5M/uzpBuqfZf+zwaEM/QkzEjj7NBuFD2rBuy0K3dE+8wltbezDV7JMA0WfnCPYRSYbXA==} + queue-microtask@1.2.3: resolution: {integrity: sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==} @@ -5482,6 +5698,10 @@ packages: resolution: {integrity: sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==} engines: {node: '>=0.10.0'} + read-yaml-file@1.1.0: + resolution: {integrity: sha512-VIMnQi/Z4HT2Fxuwg5KrY174U1VdUIASQVWXXyqtNRtxSr9IYkn1rsI6Tb6HsrHCmB7gVpNwX6JxPTHcH6IoTA==} + engines: {node: '>=6'} + readable-stream@1.0.34: resolution: {integrity: sha512-ok1qVCJuRkNmvebYikljxJA/UEsKwLl2nI1OmaqAu4/UE+h0wKCHok4XkL/gvi39OacXvw59RJUOFUkDib2rHg==} @@ -5779,6 +5999,9 @@ packages: resolution: {integrity: sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ==} engines: {node: '>= 12'} + spawndamnit@3.0.1: + resolution: {integrity: 
sha512-MmnduQUuHCoFckZoWnXsTg7JaiLBJrKFj9UI2MbRPGaJeVpsLcVBu6P/IGZovziM/YBsellCmsprgNA+w0CzVg==} + split-ca@1.0.1: resolution: {integrity: sha512-Q5thBSxp5t8WPTTJQS59LrGqOZqOsrhDGDVm8azCqIBjSBd7nd9o2PM+mDulQQkh8h//4U6hFZnc/mul8t5pWQ==} @@ -5973,6 +6196,10 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} + term-size@2.2.1: + resolution: {integrity: sha512-wK0Ri4fOGjv/XPy8SBHZChl8CM7uMc5VML7SqiQ0zG7+J5Vr+RMQDoHa2CNT6KHUnTGIXH34UDMkPzAUyapBZg==} + engines: {node: '>=8'} + test-exclude@6.0.0: resolution: {integrity: sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==} engines: {node: '>=8'} @@ -6196,6 +6423,10 @@ packages: unist-util-visit@5.0.0: resolution: {integrity: sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==} + universalify@0.1.2: + resolution: {integrity: sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==} + engines: {node: '>= 4.0.0'} + unrs-resolver@1.11.1: resolution: {integrity: sha512-bSjt9pjaEBnNiGgc9rUiHGKv5l4/TGzDmYw3RhnkJGtLhbnnA/5qJj7x3dNDCRx/PJxu774LlH8lCOlB4hEfKg==} @@ -6703,6 +6934,165 @@ snapshots: '@borewit/text-codec@0.1.1': {} + '@changesets/apply-release-plan@7.0.14': + dependencies: + '@changesets/config': 3.1.2 + '@changesets/get-version-range-type': 0.4.0 + '@changesets/git': 3.0.4 + '@changesets/should-skip-package': 0.1.2 + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + detect-indent: 6.1.0 + fs-extra: 7.0.1 + lodash.startcase: 4.4.0 + outdent: 0.5.0 + prettier: 2.8.8 + resolve-from: 5.0.0 + semver: 7.7.3 + + '@changesets/assemble-release-plan@6.0.9': + dependencies: + '@changesets/errors': 0.2.0 + '@changesets/get-dependents-graph': 2.1.3 + '@changesets/should-skip-package': 0.1.2 + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + semver: 7.7.3 + + 
'@changesets/changelog-git@0.2.1': + dependencies: + '@changesets/types': 6.1.0 + + '@changesets/changelog-github@0.5.2': + dependencies: + '@changesets/get-github-info': 0.7.0 + '@changesets/types': 6.1.0 + dotenv: 8.6.0 + transitivePeerDependencies: + - encoding + + '@changesets/cli@2.29.8(@types/node@22.19.3)': + dependencies: + '@changesets/apply-release-plan': 7.0.14 + '@changesets/assemble-release-plan': 6.0.9 + '@changesets/changelog-git': 0.2.1 + '@changesets/config': 3.1.2 + '@changesets/errors': 0.2.0 + '@changesets/get-dependents-graph': 2.1.3 + '@changesets/get-release-plan': 4.0.14 + '@changesets/git': 3.0.4 + '@changesets/logger': 0.1.1 + '@changesets/pre': 2.0.2 + '@changesets/read': 0.6.6 + '@changesets/should-skip-package': 0.1.2 + '@changesets/types': 6.1.0 + '@changesets/write': 0.4.0 + '@inquirer/external-editor': 1.0.3(@types/node@22.19.3) + '@manypkg/get-packages': 1.1.3 + ansi-colors: 4.1.3 + ci-info: 3.9.0 + enquirer: 2.4.1 + fs-extra: 7.0.1 + mri: 1.2.0 + p-limit: 2.3.0 + package-manager-detector: 0.2.11 + picocolors: 1.1.1 + resolve-from: 5.0.0 + semver: 7.7.3 + spawndamnit: 3.0.1 + term-size: 2.2.1 + transitivePeerDependencies: + - '@types/node' + + '@changesets/config@3.1.2': + dependencies: + '@changesets/errors': 0.2.0 + '@changesets/get-dependents-graph': 2.1.3 + '@changesets/logger': 0.1.1 + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + fs-extra: 7.0.1 + micromatch: 4.0.8 + + '@changesets/errors@0.2.0': + dependencies: + extendable-error: 0.1.7 + + '@changesets/get-dependents-graph@2.1.3': + dependencies: + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + picocolors: 1.1.1 + semver: 7.7.3 + + '@changesets/get-github-info@0.7.0': + dependencies: + dataloader: 1.4.0 + node-fetch: 2.7.0 + transitivePeerDependencies: + - encoding + + '@changesets/get-release-plan@4.0.14': + dependencies: + '@changesets/assemble-release-plan': 6.0.9 + '@changesets/config': 3.1.2 + '@changesets/pre': 2.0.2 + '@changesets/read': 
0.6.6 + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + + '@changesets/get-version-range-type@0.4.0': {} + + '@changesets/git@3.0.4': + dependencies: + '@changesets/errors': 0.2.0 + '@manypkg/get-packages': 1.1.3 + is-subdir: 1.2.0 + micromatch: 4.0.8 + spawndamnit: 3.0.1 + + '@changesets/logger@0.1.1': + dependencies: + picocolors: 1.1.1 + + '@changesets/parse@0.4.2': + dependencies: + '@changesets/types': 6.1.0 + js-yaml: 4.1.1 + + '@changesets/pre@2.0.2': + dependencies: + '@changesets/errors': 0.2.0 + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + fs-extra: 7.0.1 + + '@changesets/read@0.6.6': + dependencies: + '@changesets/git': 3.0.4 + '@changesets/logger': 0.1.1 + '@changesets/parse': 0.4.2 + '@changesets/types': 6.1.0 + fs-extra: 7.0.1 + p-filter: 2.1.0 + picocolors: 1.1.1 + + '@changesets/should-skip-package@0.1.2': + dependencies: + '@changesets/types': 6.1.0 + '@manypkg/get-packages': 1.1.3 + + '@changesets/types@4.1.0': {} + + '@changesets/types@6.1.0': {} + + '@changesets/write@0.4.0': + dependencies: + '@changesets/types': 6.1.0 + fs-extra: 7.0.1 + human-id: 4.1.3 + prettier: 2.8.8 + '@date-fns/tz@1.2.0': {} '@dnd-kit/accessibility@3.1.1(react@19.1.0)': @@ -7481,6 +7871,13 @@ snapshots: '@img/sharp-win32-x64@0.34.5': optional: true + '@inquirer/external-editor@1.0.3(@types/node@22.19.3)': + dependencies: + chardet: 2.1.1 + iconv-lite: 0.7.2 + optionalDependencies: + '@types/node': 22.19.3 + '@isaacs/cliui@8.0.2': dependencies: string-width: 5.1.2 @@ -7908,6 +8305,22 @@ snapshots: '@libsql/win32-x64-msvc@0.4.7': optional: true + '@manypkg/find-root@1.1.0': + dependencies: + '@babel/runtime': 7.28.4 + '@types/node': 12.20.55 + find-up: 4.1.0 + fs-extra: 8.1.0 + + '@manypkg/get-packages@1.1.3': + dependencies: + '@babel/runtime': 7.28.4 + '@changesets/types': 4.1.0 + '@manypkg/find-root': 1.1.0 + fs-extra: 8.1.0 + globby: 11.1.0 + read-yaml-file: 1.1.0 + '@monaco-editor/loader@1.7.0': dependencies: state-local: 1.0.7 @@ 
-8841,6 +9254,8 @@ snapshots: '@types/ms@2.1.0': {} + '@types/node@12.20.55': {} + '@types/node@18.19.130': dependencies: undici-types: 5.26.5 @@ -9368,6 +9783,8 @@ snapshots: json-schema-traverse: 1.0.0 require-from-string: 2.0.2 + ansi-colors@4.1.3: {} + ansi-escapes@4.3.2: dependencies: type-fest: 0.21.3 @@ -9438,6 +9855,8 @@ snapshots: is-string: 1.1.1 math-intrinsics: 1.1.0 + array-union@2.1.0: {} + array.prototype.findlast@1.2.5: dependencies: call-bind: 1.0.8 @@ -9631,6 +10050,10 @@ snapshots: dependencies: tweetnacl: 0.14.5 + better-path-resolve@1.0.0: + dependencies: + is-windows: 1.0.2 + bin-version-check@5.1.0: dependencies: bin-version: 6.0.0 @@ -9774,6 +10197,8 @@ snapshots: character-reference-invalid@2.0.1: {} + chardet@2.1.1: {} + charenc@0.0.2: {} check-error@2.1.1: {} @@ -9792,6 +10217,8 @@ snapshots: chownr@1.1.4: {} + ci-info@3.9.0: {} + ci-info@4.3.1: {} cjs-module-lexer@2.1.1: {} @@ -9943,6 +10370,8 @@ snapshots: es-errors: 1.3.0 is-data-view: 1.0.2 + dataloader@1.4.0: {} + dataloader@2.2.3: {} date-fns@3.6.0: {} @@ -10006,6 +10435,8 @@ snapshots: dequal@2.0.3: {} + detect-indent@6.1.0: {} + detect-indent@7.0.2: {} detect-libc@2.0.2: {} @@ -10020,6 +10451,10 @@ snapshots: dependencies: dequal: 2.0.3 + dir-glob@3.0.1: + dependencies: + path-type: 4.0.0 + docker-compose@1.3.0: dependencies: yaml: 2.8.2 @@ -10064,6 +10499,8 @@ snapshots: dotenv@17.2.3: {} + dotenv@8.6.0: {} + drizzle-kit@0.31.7: dependencies: '@drizzle-team/brocli': 0.10.2 @@ -10112,6 +10549,11 @@ snapshots: graceful-fs: 4.2.11 tapable: 2.3.0 + enquirer@2.4.1: + dependencies: + ansi-colors: 4.1.3 + strip-ansi: 6.0.1 + error-ex@1.3.4: dependencies: is-arrayish: 0.2.1 @@ -10836,6 +11278,8 @@ snapshots: ext-list: 2.2.2 sort-keys-length: 1.0.1 + extendable-error@0.1.7: {} + fast-copy@3.0.2: {} fast-deep-equal@3.1.3: {} @@ -10970,6 +11414,18 @@ snapshots: fs-constants@1.0.0: {} + fs-extra@7.0.1: + dependencies: + graceful-fs: 4.2.11 + jsonfile: 4.0.0 + universalify: 0.1.2 + + 
fs-extra@8.1.0: + dependencies: + graceful-fs: 4.2.11 + jsonfile: 4.0.0 + universalify: 0.1.2 + fs.realpath@1.0.0: {} fsevents@2.3.2: @@ -11074,6 +11530,15 @@ snapshots: define-properties: 1.2.1 gopd: 1.2.0 + globby@11.1.0: + dependencies: + array-union: 2.1.0 + dir-glob: 3.0.1 + fast-glob: 3.3.3 + ignore: 5.3.2 + merge2: 1.4.1 + slash: 3.0.0 + globrex@0.1.2: {} gopd@1.2.0: {} @@ -11150,8 +11615,14 @@ snapshots: quick-lru: 5.1.1 resolve-alpn: 1.2.1 + human-id@4.1.3: {} + human-signals@2.1.0: {} + iconv-lite@0.7.2: + dependencies: + safer-buffer: 2.1.2 + ieee754@1.2.1: {} ignore@5.3.2: {} @@ -11331,6 +11802,10 @@ snapshots: call-bound: 1.0.4 has-tostringtag: 1.0.2 + is-subdir@1.2.0: + dependencies: + better-path-resolve: 1.0.0 + is-symbol@1.1.1: dependencies: call-bound: 1.0.4 @@ -11352,6 +11827,8 @@ snapshots: call-bound: 1.0.4 get-intrinsic: 1.3.0 + is-windows@1.0.2: {} + is-wsl@3.1.0: dependencies: is-inside-container: 1.0.0 @@ -11781,6 +12258,10 @@ snapshots: json5@2.2.3: {} + jsonfile@4.0.0: + optionalDependencies: + graceful-fs: 4.2.11 + jsox@1.2.121: {} jsx-ast-utils@3.3.5: @@ -11897,6 +12378,8 @@ snapshots: lodash.merge@4.6.2: {} + lodash.startcase@4.4.0: {} + lodash@4.17.21: {} long@5.3.2: {} @@ -12213,6 +12696,8 @@ snapshots: dompurify: 3.2.7 marked: 14.0.0 + mri@1.2.0: {} + ms@2.1.3: {} nan@2.24.0: @@ -12353,6 +12838,8 @@ snapshots: type-check: 0.4.0 word-wrap: 1.2.5 + outdent@0.5.0: {} + own-keys@1.0.1: dependencies: get-intrinsic: 1.3.0 @@ -12375,6 +12862,10 @@ snapshots: p-cancelable@3.0.0: {} + p-filter@2.1.0: + dependencies: + p-map: 2.1.0 + p-limit@2.3.0: dependencies: p-try: 2.2.0 @@ -12391,10 +12882,16 @@ snapshots: dependencies: p-limit: 3.1.0 + p-map@2.1.0: {} + p-try@2.2.0: {} package-json-from-dist@1.0.1: {} + package-manager-detector@0.2.11: + dependencies: + quansync: 0.2.11 + parent-module@1.0.1: dependencies: callsites: 3.1.0 @@ -12534,6 +13031,8 @@ snapshots: picomatch@4.0.3: {} + pify@4.0.1: {} + pino-abstract-transport@2.0.0: 
dependencies: split2: 4.2.0 @@ -12630,6 +13129,8 @@ snapshots: prelude-ls@1.2.1: {} + prettier@2.8.8: {} + prettier@3.7.4: {} pretty-format@30.2.0: @@ -12699,6 +13200,8 @@ snapshots: dependencies: side-channel: 1.1.0 + quansync@0.2.11: {} + queue-microtask@1.2.3: {} quick-format-unescaped@4.0.4: {} @@ -12766,6 +13269,13 @@ snapshots: react@19.1.0: {} + read-yaml-file@1.1.0: + dependencies: + graceful-fs: 4.2.11 + js-yaml: 3.14.2 + pify: 4.0.1 + strip-bom: 3.0.0 + readable-stream@1.0.34: dependencies: core-util-is: 1.0.3 @@ -13159,6 +13669,11 @@ snapshots: source-map@0.7.6: {} + spawndamnit@3.0.1: + dependencies: + cross-spawn: 7.0.6 + signal-exit: 4.1.0 + split-ca@1.0.1: {} split2@4.2.0: {} @@ -13394,6 +13909,8 @@ snapshots: - bare-abort-controller - react-native-b4a + term-size@2.2.1: {} + test-exclude@6.0.0: dependencies: '@istanbuljs/schema': 0.1.3 @@ -13647,6 +14164,8 @@ snapshots: unist-util-is: 6.0.1 unist-util-visit-parents: 6.0.2 + universalify@0.1.2: {} + unrs-resolver@1.11.1: dependencies: napi-postinstall: 0.3.4 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml new file mode 100644 index 0000000..740176f --- /dev/null +++ b/pnpm-workspace.yaml @@ -0,0 +1,3 @@ +packages: + - "." 
+ - "adapters/*" diff --git a/src/collections/bulkEmbeddingsBatches.ts b/src/collections/bulkEmbeddingsBatches.ts index e47c18a..43aa02a 100644 --- a/src/collections/bulkEmbeddingsBatches.ts +++ b/src/collections/bulkEmbeddingsBatches.ts @@ -115,6 +115,14 @@ export const createBulkEmbeddingsBatchesCollection = (): CollectionConfig => ({ description: 'Error message if the batch failed', }, }, + { + name: 'failedChunkData', + type: 'json', + admin: { + description: + 'Collection, documentId and chunkIndex for each chunk that failed in this batch', + }, + }, { name: 'retriedBatch', type: 'relationship', diff --git a/src/constants.ts b/src/constants.ts new file mode 100644 index 0000000..aabe244 --- /dev/null +++ b/src/constants.ts @@ -0,0 +1,5 @@ +export const TASK_SLUG_VECTORIZE = 'payloadcms-vectorize:vectorize' as const +export const TASK_SLUG_PREPARE_BULK_EMBEDDING = + 'payloadcms-vectorize:prepare-bulk-embedding' as const +export const TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING = + 'payloadcms-vectorize:poll-or-complete-single-batch' as const diff --git a/src/endpoints/bulkEmbed.ts b/src/endpoints/bulkEmbed.ts index d3d5503..d554543 100644 --- a/src/endpoints/bulkEmbed.ts +++ b/src/endpoints/bulkEmbed.ts @@ -1,6 +1,7 @@ import type { Payload, PayloadHandler } from 'payload' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' import type { BulkEmbedResult, KnowledgePoolDynamicConfig, KnowledgePoolName } from '../types.js' +import { TASK_SLUG_PREPARE_BULK_EMBEDDING } from '../constants.js' /** * Core logic for starting a bulk embed run. @@ -54,8 +55,8 @@ export async function startBulkEmbed< }, }) - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', + await payload.jobs.queue({ + task: TASK_SLUG_PREPARE_BULK_EMBEDDING, input: { runId: String(run.id) }, ...(queueName ? 
{ queue: queueName } : {}), }) diff --git a/src/endpoints/retryFailedBatch.ts b/src/endpoints/retryFailedBatch.ts index 8d86e50..a5a2975 100644 --- a/src/endpoints/retryFailedBatch.ts +++ b/src/endpoints/retryFailedBatch.ts @@ -8,6 +8,7 @@ import type { RetryFailedBatchResult, BulkEmbeddingInput, } from '../types.js' +import { TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING } from '../constants.js' /** * Core logic for retrying a failed batch. @@ -209,10 +210,10 @@ export async function retryBatch({ - task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding', - input: { runId: String(runId) }, + // Queue a per-batch task for the retried batch + await payload.jobs.queue({ + task: TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, + input: { runId: String(runId), batchId: String(newBatch.id) }, ...(queueName ? { queue: queueName } : {}), }) diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts index 2ac5dae..b39109f 100644 --- a/src/endpoints/vectorSearch.ts +++ b/src/endpoints/vectorSearch.ts @@ -1,38 +1,20 @@ import type { BasePayload, PayloadHandler, Where } from 'payload' -import { - sql, - cosineDistance, - inArray, - eq, - and, - or, - not, - like, - gt, - gte, - lt, - lte, - ne, - isNull, - isNotNull, -} from '@payloadcms/db-postgres/drizzle' -import toSnakeCase from 'to-snake-case' import type { - VectorSearchResult, KnowledgePoolName, KnowledgePoolDynamicConfig, VectorSearchQuery, + DbAdapter, } from '../types.js' -import { getEmbeddingsTable } from '../drizzle/tables.js' -export const createVectorSearchHandlers = ( - knowledgePools: Record, +export const createVectorSearchHandlers = ( + knowledgePools: Record, + adapter: DbAdapter, ) => { const vectorSearch = async ( payload: BasePayload, query: string, - knowledgePool: TPoolNames, + knowledgePool: KnowledgePoolName, limit?: number, where?: Where, ) => { @@ -44,19 +26,14 @@ export const createVectorSearchHandlers = })() // Perform cosine similarity search using Drizzle - return await 
performCosineSearch(payload, queryEmbedding, knowledgePool, limit, where) + return await adapter.search(payload, queryEmbedding, knowledgePool, limit, where) } const requestHandler: PayloadHandler = async (req) => { if (!req || !req.json) { return Response.json({ error: 'Request is required' }, { status: 400 }) } try { - const { - query, - knowledgePool, - where, - limit = 10, - }: VectorSearchQuery = await req.json() + const { query, knowledgePool, where, limit = 10 }: VectorSearchQuery = await req.json() if (!query || typeof query !== 'string') { return Response.json({ error: 'Query is required and must be a string' }, { status: 400 }) } @@ -86,299 +63,3 @@ export const createVectorSearchHandlers = } return { vectorSearch, requestHandler } } - -async function performCosineSearch( - payload: BasePayload, - queryEmbedding: number[], - poolName: KnowledgePoolName, - limit: number = 10, - whereClause?: Where, -): Promise> { - const isPostgres = payload.db?.pool?.query || payload.db?.drizzle - - if (!isPostgres) { - throw new Error('Only works with Postgres') - } - - // In PayloadCMS, payload.db IS the adapter, and drizzle is at payload.db.drizzle - const adapter = payload.db - if (!adapter) { - throw new Error('Drizzle adapter not found') - } - - // Get drizzle instance - const drizzle = adapter.drizzle - if (!drizzle) { - throw new Error('Drizzle instance not found in adapter') - } - - // Get collection config and table name - const collectionConfig = payload.collections[poolName]?.config - if (!collectionConfig) { - throw new Error(`Collection ${poolName} not found`) - } - - const table = getEmbeddingsTable(poolName) - if (!table) { - throw new Error( - `[payloadcms-vectorize] Embeddings table for knowledge pool "${poolName}" not registered. 
Ensure the plugin's afterSchemaInit hook ran and the pool exists.`, - ) - } - - // Use Drizzle's query builder with cosineDistance function - // cosineDistance returns distance, so we calculate similarity as 1 - distance - // The table from fullSchema should have columns as direct properties - const embeddingColumn = table.embedding - if (!embeddingColumn) { - throw new Error( - `Embedding column not found in table for pool "${poolName}". Available properties: ${Object.keys(table).join(', ')}`, - ) - } - - // Convert WHERE clause to Drizzle conditions - let drizzleWhere: any = undefined - if (whereClause) { - drizzleWhere = convertWhereToDrizzle(whereClause, table, collectionConfig.flattenedFields) - if (drizzleWhere === null) { - // WHERE clause resulted in an empty condition (e.g., empty 'and' or 'or' array) - // This semantically means "match nothing", so return empty results - throw new Error( - `[payloadcms-vectorize] WHERE clause resulted in no valid conditions. This typically occurs when using empty 'and' or 'or' arrays, or when all field conditions reference non-existent columns.`, - ) - } - if (drizzleWhere === undefined) { - // WHERE clause could not be converted (invalid structure or unsupported operators) - throw new Error( - `[payloadcms-vectorize] WHERE clause could not be converted to Drizzle conditions. Please check that all field names exist and operators are supported.`, - ) - } - } - - // Build query using Drizzle's query builder - // Column names in the table are camelCase (docId, chunkText, etc.) - // but their database names are snake_case (doc_id, chunk_text, etc.) 
- // The table from fullSchema should have columns as direct properties - // Calculate similarity: 1 - cosineDistance (distance) - // Need to cast 1 to numeric to avoid "integer - vector" error - const distanceExpr = cosineDistance(embeddingColumn, queryEmbedding) - - // Build select object with similarity - const selectObj: Record = { - id: table.id, // ensure we select id explicitly - similarity: sql`1 - (${distanceExpr})`, - } - - // Add reserved + extension fields from collection config - for (const field of collectionConfig.fields ?? []) { - if (typeof field === 'object' && 'name' in field) { - const name = field.name as string - if (name in table) { - selectObj[name] = table[name] - } else if (toSnakeCase(name) in table) { - selectObj[name] = table[toSnakeCase(name)] - } - } - } - - let query: any = drizzle.select(selectObj).from(table) - - // Add WHERE clause if provided - if (drizzleWhere) { - query = query.where(drizzleWhere) - } - - // Order by cosine distance (ascending = most similar first) and limit - // Reuse the same distance expression for ordering - query = query.orderBy(distanceExpr).limit(limit) - - // Execute the query - const result = await query - - return mapRowsToResults(result, collectionConfig) -} - -/** - * Convert Payload WHERE clause to Drizzle conditions - * Simplified version inspired by Payload's buildQuery - */ -function convertWhereToDrizzle(where: Where, table: any, fields: any[]): any { - if (!where || typeof where !== 'object') { - return undefined - } - - // Handle 'and' operator - if ('and' in where && Array.isArray(where.and)) { - const conditions = where.and - .map((condition) => convertWhereToDrizzle(condition, table, fields)) - .filter((c) => c !== undefined && c !== null) - if (conditions.length === 0) return null - if (conditions.length === 1) return conditions[0] - return and(...conditions) - } - - // Handle 'or' operator - if ('or' in where && Array.isArray(where.or)) { - const conditions = where.or - .map((condition) 
=> convertWhereToDrizzle(condition, table, fields)) - .filter((c) => c !== undefined && c !== null) - if (conditions.length === 0) return null - if (conditions.length === 1) return conditions[0] - return or(...conditions) - } - - // Handle field conditions - collect all field conditions and combine with AND - const fieldConditions: any[] = [] - for (const [fieldName, condition] of Object.entries(where)) { - if (fieldName === 'and' || fieldName === 'or') continue - - // Get the column from the table - // Drizzle tables have columns as direct properties - // Try camelCase first, then snake_case as fallback - // Use 'in' operator to check existence, then access the property - let column: any = undefined - if (fieldName in table) { - column = table[fieldName] - } else if (toSnakeCase(fieldName) in table) { - column = table[toSnakeCase(fieldName)] - } else if (table.columns) { - // Fallback to table.columns if it exists - if (fieldName in table.columns) { - column = table.columns[fieldName] - } else if (toSnakeCase(fieldName) in table.columns) { - column = table.columns[toSnakeCase(fieldName)] - } - } - - if (!column) { - // Field not found, skip (could be a nested field we don't support) - continue - } - - if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) { - continue - } - - const cond = condition as Record - - // Handle equals - if ('equals' in cond) { - fieldConditions.push(eq(column, cond.equals)) - continue - } - - // Handle not_equals / notEquals - if ('not_equals' in cond || 'notEquals' in cond) { - fieldConditions.push(ne(column, cond.not_equals ?? cond.notEquals)) - continue - } - - // Handle in - if ('in' in cond && Array.isArray(cond.in)) { - fieldConditions.push(inArray(column, cond.in)) - continue - } - - // Handle not_in / notIn - if ('not_in' in cond || 'notIn' in cond) { - const values = cond.not_in ?? 
cond.notIn - if (Array.isArray(values)) { - fieldConditions.push(not(inArray(column, values))) - } - continue - } - - // Handle like - if ('like' in cond && typeof cond.like === 'string') { - fieldConditions.push(like(column, cond.like)) - continue - } - - // Handle contains - if ('contains' in cond && typeof cond.contains === 'string') { - fieldConditions.push(like(column, `%${cond.contains}%`)) - continue - } - - // Handle greater_than / greaterThan - if ('greater_than' in cond || 'greaterThan' in cond) { - fieldConditions.push(gt(column, cond.greater_than ?? cond.greaterThan)) - continue - } - - // Handle greater_than_equal / greaterThanEqual - if ('greater_than_equal' in cond || 'greaterThanEqual' in cond) { - fieldConditions.push(gte(column, cond.greater_than_equal ?? cond.greaterThanEqual)) - continue - } - - // Handle less_than / lessThan - if ('less_than' in cond || 'lessThan' in cond) { - fieldConditions.push(lt(column, cond.less_than ?? cond.lessThan)) - continue - } - - // Handle less_than_equal / lessThanEqual - if ('less_than_equal' in cond || 'lessThanEqual' in cond) { - fieldConditions.push(lte(column, cond.less_than_equal ?? cond.lessThanEqual)) - continue - } - - // Handle exists (null check) - if ('exists' in cond && typeof cond.exists === 'boolean') { - fieldConditions.push(cond.exists ? 
isNotNull(column) : isNull(column)) - continue - } - } - - // Combine all field conditions with AND - if (fieldConditions.length === 0) { - return undefined - } - if (fieldConditions.length === 1) { - return fieldConditions[0] - } - return and(...fieldConditions) -} - -function mapRowsToResults(rows: any[], collectionConfig: any): Array { - // Collect names of fields that are typed as number on the collection - const numberFields = new Set() - if (collectionConfig?.fields) { - for (const field of collectionConfig.fields) { - if (typeof field === 'object' && 'name' in field && field.type === 'number') { - numberFields.add(field.name as string) - } - } - } - - return rows.map((row: any) => { - // Drizzle returns columns with the names we selected (camelCase) - // Handle both camelCase and snake_case for robustness - const rawDocId = row.docId ?? row.doc_id - const rawChunkIndex = row.chunkIndex ?? row.chunk_index - const rawSimilarity = row.similarity - - const result: any = { - ...row, - id: String(row.id), - docId: String(rawDocId), - similarity: - typeof rawSimilarity === 'number' ? rawSimilarity : parseFloat(String(rawSimilarity)), - chunkIndex: - typeof rawChunkIndex === 'number' ? 
rawChunkIndex : parseInt(String(rawChunkIndex), 10), - } - - // Ensure any number fields from the schema are numbers in the result - for (const fieldName of numberFields) { - const value = result[fieldName] - if (value != null && typeof value !== 'number') { - const parsed = parseFloat(String(value)) - if (!Number.isNaN(parsed)) { - result[fieldName] = parsed - } - } - } - - return result - }) -} diff --git a/src/index.ts b/src/index.ts index 2f4a6c9..44aca25 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,26 +1,20 @@ -import type { Config, Payload, PayloadRequest } from 'payload' -import { customType, index } from '@payloadcms/db-postgres/drizzle/pg-core' -import toSnakeCase from 'to-snake-case' -import { fileURLToPath } from 'url' -import { dirname, resolve } from 'path' +import type { CollectionSlug, Config, Payload, PayloadRequest } from 'payload' import { createEmbeddingsCollection } from './collections/embeddings.js' import type { PayloadcmsVectorizeConfig, - PostgresPayload, KnowledgePoolName, - KnowledgePoolStaticConfig, KnowledgePoolDynamicConfig, VectorizedPayload, VectorSearchQuery, BulkEmbedResult, RetryFailedBatchResult, + DbAdapter, } from './types.js' -import { isPostgresPayload } from './types.js' -import type { PostgresAdapterArgs } from '@payloadcms/db-postgres' import { createVectorizeTask } from './tasks/vectorize.js' +import { TASK_SLUG_VECTORIZE } from './constants.js' +import { deleteDocumentEmbeddings } from './utils/deleteDocumentEmbeddings.js' import { createVectorSearchHandlers } from './endpoints/vectorSearch.js' -import { clearEmbeddingsTables, registerEmbeddingsTable } from './drizzle/tables.js' import { createBulkEmbeddingsRunsCollection, BULK_EMBEDDINGS_RUNS_SLUG, @@ -35,13 +29,12 @@ import { } from './collections/bulkEmbeddingsBatches.js' import { createPrepareBulkEmbeddingTask, - createPollOrCompleteBulkEmbeddingTask, + createPollOrCompleteSingleBatchTask, } from './tasks/bulkEmbedAll.js' import { createBulkEmbedHandler, 
startBulkEmbed } from './endpoints/bulkEmbed.js' import { createRetryFailedBatchHandler, retryBatch } from './endpoints/retryFailedBatch.js' export type { - KnowledgePoolStaticConfig, PayloadcmsVectorizeConfig, // PayloadcmsVectorizeConfig @@ -53,6 +46,7 @@ export type { EmbeddingConfig, // CollectionVectorizeOption + ShouldEmbedFn, ToKnowledgePoolFn, // EmbeddingConfig @@ -74,408 +68,353 @@ export type { // PollBulkEmbeddingsResult BulkEmbeddingRunStatus, VectorizedPayload, + DbAdapter, + + // For adapters + VectorSearchResult, } from './types.js' export { getVectorizedPayload } from './types.js' +export { + TASK_SLUG_VECTORIZE, + TASK_SLUG_PREPARE_BULK_EMBEDDING, + TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, +} from './constants.js' +export { validateChunkData } from './utils/validateChunkData.js' +export { deleteDocumentEmbeddings } from './utils/deleteDocumentEmbeddings.js' + // ================== // Plugin entry point // ================== -export const createVectorizeIntegration = ( - staticConfigs: Record, -): { - afterSchemaInitHook: Required['afterSchemaInit'][number] - payloadcmsVectorize: ( - pluginOptions: PayloadcmsVectorizeConfig, - ) => (config: Config) => Config -} => { - // Augment the generated schema so push/migrations are aware of our custom columns - const afterSchemaInitHook: Required['afterSchemaInit'][number] = async ({ - schema, - extendTable, - }) => { - // Ensure registry reflects the latest schema - clearEmbeddingsTables() - - // Extend schema for each knowledge pool - for (const poolName in staticConfigs) { - const staticConfig = staticConfigs[poolName] - const dims = staticConfig.dims - - const vectorType = customType({ - dataType() { - return `vector(${dims})` - }, - }) - - // Drizzle converts camelCase collection slugs to snake_case table names - const tableName = toSnakeCase(poolName) - const table = schema?.tables?.[tableName] - if (!table) { - throw new Error( - `[payloadcms-vectorize] Embeddings table "${poolName}" (table: 
"${tableName}") not found during schema initialization. Ensure the collection has been registered.`, - ) - } - - if (typeof extendTable === 'function') { - extendTable({ - table, - columns: { - embedding: vectorType('embedding'), - }, - extraConfig: (cols) => ({ - embeddingIvfflatIndex: index(`${tableName}_embedding_ivfflat`) - .using('ivfflat', cols.embedding.op('vector_cosine_ops')) - .with({ lists: staticConfig.ivfflatLists }), - }), - }) - } +export default (pluginOptions: PayloadcmsVectorizeConfig) => + (config: Config): Config => { + // Ensure collections array exists + config.collections = [...(config.collections || [])] - registerEmbeddingsTable(poolName as KnowledgePoolName, table) + // Ensure bulk runs collection exists once + const bulkRunsCollection = createBulkEmbeddingsRunsCollection() + if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_RUNS_SLUG)) { + config.collections.push(bulkRunsCollection) + } + // Ensure bulk input metadata collection exists once + const bulkInputMetadataCollection = createBulkEmbeddingInputMetadataCollection() + if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_INPUT_METADATA_SLUG)) { + config.collections.push(bulkInputMetadataCollection) + } + // Ensure bulk batches collection exists once + const bulkBatchesCollection = createBulkEmbeddingsBatchesCollection() + if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_BATCHES_SLUG)) { + config.collections.push(bulkBatchesCollection) } - return schema - } - const payloadcmsVectorize = - (pluginOptions: PayloadcmsVectorizeConfig) => - (config: Config): Config => { - // Ensure collections array exists - config.collections = [...(config.collections || [])] - - // Ensure bulk runs collection exists once - const bulkRunsCollection = createBulkEmbeddingsRunsCollection() - if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_RUNS_SLUG)) { - config.collections.push(bulkRunsCollection) - } - // Ensure bulk input metadata collection exists once - 
const bulkInputMetadataCollection = createBulkEmbeddingInputMetadataCollection() - if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_INPUT_METADATA_SLUG)) { - config.collections.push(bulkInputMetadataCollection) - } - // Ensure bulk batches collection exists once - const bulkBatchesCollection = createBulkEmbeddingsBatchesCollection() - if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_BATCHES_SLUG)) { - config.collections.push(bulkBatchesCollection) + // Build reverse mapping: collectionSlug -> KnowledgePoolName[] + const collectionToPools = new Map< + string, + Array<{ + pool: KnowledgePoolName + dynamic: KnowledgePoolDynamicConfig + }> + >() + + // Process each knowledge pool + for (const poolName in pluginOptions.knowledgePools) { + const dynamicConfig = pluginOptions.knowledgePools[poolName] + + // Add the embeddings collection for this knowledge pool with extensionFields + const embeddingsCollection = createEmbeddingsCollection( + poolName, + dynamicConfig.extensionFields, + ) + if (!config.collections.find((c) => c.slug === poolName)) { + config.collections.push(embeddingsCollection) } - // Validate static/dynamic configs share the same pool names - for (const poolName in pluginOptions.knowledgePools) { - if (!staticConfigs[poolName]) { - throw new Error( - `[payloadcms-vectorize] Knowledge pool "${poolName}" not found in static configs`, - ) + // Build reverse mapping for hooks + const collectionSlugs = Object.keys(dynamicConfig.collections) + for (const collectionSlug of collectionSlugs) { + if (!collectionToPools.has(collectionSlug)) { + collectionToPools.set(collectionSlug, []) } + collectionToPools.get(collectionSlug)!.push({ pool: poolName, dynamic: dynamicConfig }) } + } - const unusedStaticPools: TPoolNames[] = [] - for (const poolName in staticConfigs) { - if (!pluginOptions.knowledgePools[poolName]) { - unusedStaticPools.push(poolName) - } + // Validate bulk queue requirements + let bulkIngestEnabled = false + for (const 
poolName in pluginOptions.knowledgePools) { + const dynamicConfig = pluginOptions.knowledgePools[poolName] + if (dynamicConfig.embeddingConfig.bulkEmbeddingsFns) { + bulkIngestEnabled = true + break } - if (unusedStaticPools.length > 0) { - throw new Error( - `[payloadcms-vectorize] Static knowledge pool(s) ${unusedStaticPools.join(', ')} lack dynamic configuration`, - ) - } - - // Build reverse mapping: collectionSlug -> KnowledgePoolName[] - const collectionToPools = new Map< - string, - Array<{ - pool: KnowledgePoolName - dynamic: KnowledgePoolDynamicConfig - }> - >() - - // Process each knowledge pool - for (const poolName in pluginOptions.knowledgePools) { - const dynamicConfig = pluginOptions.knowledgePools[poolName] - - // Add the embeddings collection for this knowledge pool with extensionFields - const embeddingsCollection = createEmbeddingsCollection( - poolName, - dynamicConfig.extensionFields, - ) - if (!config.collections.find((c) => c.slug === poolName)) { - config.collections.push(embeddingsCollection) - } + } + if (bulkIngestEnabled && !pluginOptions.bulkQueueNames) { + throw new Error( + '[payloadcms-vectorize] bulkQueueNames is required when any knowledge pool has bulk embedding configured (embeddingConfig.bulkEmbeddingsFns).', + ) + } - // Build reverse mapping for hooks - const collectionSlugs = Object.keys(dynamicConfig.collections) - for (const collectionSlug of collectionSlugs) { - if (!collectionToPools.has(collectionSlug)) { - collectionToPools.set(collectionSlug, []) - } - collectionToPools.get(collectionSlug)!.push({ pool: poolName, dynamic: dynamicConfig }) - } - } + // Exit early if disabled, but keep embeddings collections present for migrations + if (pluginOptions.disabled) { + return config + } - // Validate bulk queue requirements - let bulkIngestEnabled = false - for (const poolName in pluginOptions.knowledgePools) { - const dynamicConfig = pluginOptions.knowledgePools[poolName] - if 
(dynamicConfig.embeddingConfig.bulkEmbeddingsFns) { - bulkIngestEnabled = true - break - } - } - if (bulkIngestEnabled && !pluginOptions.bulkQueueNames) { - throw new Error( - '[payloadcms-vectorize] bulkQueueNames is required when any knowledge pool has bulk embedding configured (embeddingConfig.bulkEmbeddingsFns).', - ) - } + // Register tasks using Payload Jobs + const incomingJobs = config.jobs || { tasks: [] } + const tasks = [...(config.jobs?.tasks || [])] + + const vectorizeTask = createVectorizeTask({ + knowledgePools: pluginOptions.knowledgePools, + adapter: pluginOptions.dbAdapter, + }) + tasks.push(vectorizeTask) + + const prepareBulkEmbedTask = createPrepareBulkEmbeddingTask({ + knowledgePools: pluginOptions.knowledgePools, + pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + }) + tasks.push(prepareBulkEmbedTask) + + const pollOrCompleteBulkEmbedTask = createPollOrCompleteSingleBatchTask({ + knowledgePools: pluginOptions.knowledgePools, + pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + adapter: pluginOptions.dbAdapter, + }) + tasks.push(pollOrCompleteBulkEmbedTask) + + config.jobs = { + ...incomingJobs, + tasks, + } - // Exit early if disabled, but keep embeddings collections present for migrations - if (pluginOptions.disabled) { - return config - } + const collectionToEmbedQueue = new Map< + string, + (doc: Record, payload: Payload, req?: PayloadRequest) => Promise + >() - // Register tasks using Payload Jobs - const incomingJobs = config.jobs || { tasks: [] } - const tasks = [...(config.jobs?.tasks || [])] - - const vectorizeTask = createVectorizeTask({ - knowledgePools: pluginOptions.knowledgePools, - }) - tasks.push(vectorizeTask) - - const prepareBulkEmbedTask = createPrepareBulkEmbeddingTask({ - knowledgePools: pluginOptions.knowledgePools, - pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, - }) - tasks.push(prepareBulkEmbedTask) - - const 
pollOrCompleteBulkEmbedTask = createPollOrCompleteBulkEmbeddingTask({ - knowledgePools: pluginOptions.knowledgePools, - pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, - }) - tasks.push(pollOrCompleteBulkEmbedTask) - - config.jobs = { - ...incomingJobs, - tasks, + // Extend configured collections with hooks + for (const [collectionSlug, pools] of collectionToPools.entries()) { + const collection = config.collections.find((c) => c.slug === collectionSlug) + if (!collection) { + throw new Error(`[payloadcms-vectorize] Collection ${collectionSlug} not found`) } - const collectionToEmbedQueue = new Map< - string, - (doc: any, payload: Payload, req?: PayloadRequest) => Promise - >() - - // Extend configured collections with hooks - for (const [collectionSlug, pools] of collectionToPools.entries()) { - const collection = config.collections.find((c) => c.slug === collectionSlug) - if (!collection) { - throw new Error(`[payloadcms-vectorize] Collection ${collectionSlug} not found`) - } - - const embedQueue = async (doc: any, payload: Payload, req?: PayloadRequest) => { - // Queue vectorization jobs for ALL knowledge pools containing this collection - for (const { pool, dynamic } of pools) { - const collectionConfig = dynamic.collections[collectionSlug] - if (!collectionConfig) continue - - // Only queue real-time vectorization if realTimeIngestionFn is provided - if (!dynamic.embeddingConfig.realTimeIngestionFn) continue - // If no realTimeIngestionFn, nothing happens on doc change - // User must trigger bulk embedding manually - - await payload.jobs.queue<'payloadcms-vectorize:vectorize'>({ - task: 'payloadcms-vectorize:vectorize', - input: { - doc, - collection: collectionSlug, - knowledgePool: pool, - }, - req: req, - ...(pluginOptions.realtimeQueueName - ? 
{ queue: pluginOptions.realtimeQueueName } - : {}), - }) + const embedQueue = async (doc: Record, payload: Payload, req?: PayloadRequest) => { + // Queue vectorization jobs for ALL knowledge pools containing this collection + for (const { pool, dynamic } of pools) { + const collectionConfig = dynamic.collections[collectionSlug] + if (!collectionConfig) continue + + // Only queue real-time vectorization if realTimeIngestionFn is provided + if (!dynamic.embeddingConfig.realTimeIngestionFn) continue + // If no realTimeIngestionFn, nothing happens on doc change + // User must trigger bulk embedding manually + + // Check if document should be embedded + if (collectionConfig.shouldEmbedFn) { + const shouldEmbed = await collectionConfig.shouldEmbedFn(doc, payload) + if (!shouldEmbed) continue } + + await payload.jobs.queue({ + task: TASK_SLUG_VECTORIZE, + input: { + doc, + collection: collectionSlug, + knowledgePool: pool, + }, + req: req, + ...(pluginOptions.realtimeQueueName ? { queue: pluginOptions.realtimeQueueName } : {}), + }) } + } - collectionToEmbedQueue.set(collectionSlug, embedQueue) + collectionToEmbedQueue.set(collectionSlug, embedQueue) - collection.hooks = { - ...(collection.hooks || {}), - afterChange: [ - ...((collection.hooks?.afterChange as any[]) || []), - async (args) => { - const { doc, req } = args - const payload = req.payload - return embedQueue(doc, payload, req) - }, - ], - afterDelete: [ - ...((collection.hooks?.afterDelete as any[]) || []), - async ({ id, payload: pld, req }: any) => { - const payload = (pld as any) || (req as any)?.payload - - // Delete from ALL knowledge pools containing this collection - for (const { pool } of pools) { - try { - await payload.delete({ - collection: pool, - where: { - and: [ - { sourceCollection: { equals: collectionSlug } }, - { docId: { equals: String(id) } }, - ], - }, - }) - } catch (e) { - payload?.logger?.warn?.( - `[payloadcms-vectorize] Failed to delete from knowledge pool ${pool}`, - e as Error, - ) 
- } - } + const adapter = pluginOptions.dbAdapter - // Also clean up any pending bulk embedding metadata for this document - // This prevents embedding a document that was deleted during a bulk run + collection.hooks = { + ...(collection.hooks || {}), + afterChange: [ + ...(collection.hooks?.afterChange || []), + async (args) => { + const { doc, req } = args + const payload = req.payload + return embedQueue(doc, payload, req) + }, + ], + afterDelete: [ + ...(collection.hooks?.afterDelete || []), + async ({ id, req }) => { + const payload = req.payload + + // Delete from ALL knowledge pools containing this collection + for (const { pool } of pools) { try { - await payload.delete({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - where: { - and: [ - { sourceCollection: { equals: collectionSlug } }, - { docId: { equals: String(id) } }, - ], - }, + await deleteDocumentEmbeddings({ + payload, + poolName: pool, + collection: collectionSlug, + docId: String(id), + adapter, }) } catch (e) { payload?.logger?.warn?.( - `[payloadcms-vectorize] Failed to delete bulk embedding metadata for ${collectionSlug}:${id}`, - e as Error, + `[payloadcms-vectorize] Failed to delete from knowledge pool ${pool}: ${e instanceof Error ? 
e.message : String(e)}`, ) } - }, - ], - } - } - - const vectorSearchHandlers = createVectorSearchHandlers(pluginOptions.knowledgePools) + } - // Create vectorized payload object factory that creates methods bound to a payload instance - const createVectorizedPayloadObject = (payload: Payload): VectorizedPayload => { - return { - _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { - const poolConfig = pluginOptions.knowledgePools[knowledgePool] - return !!poolConfig?.embeddingConfig?.bulkEmbeddingsFns - }, - _staticConfigs: staticConfigs, - search: (params: VectorSearchQuery) => - vectorSearchHandlers.vectorSearch( - payload, - params.query, - params.knowledgePool, - params.limit, - params.where, - ), - queueEmbed: async ( - params: - | { - collection: string - docId: string - } - | { - collection: string - doc: Record + // Also clean up any pending bulk embedding metadata for this document + // This prevents embedding a document that was deleted during a bulk run + try { + await payload.delete({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { + and: [ + { sourceCollection: { equals: collectionSlug } }, + { docId: { equals: String(id) } }, + ], }, - ) => { - const collection = params.collection - let doc: Record - if ('docId' in params && params.docId) { - doc = await payload.findByID({ - collection: collection as any, - id: params.docId, }) - } else if ('doc' in params && params.doc) { - doc = params.doc - } else { - throw new Error( - `[payloadcms-vectorize] queueEmbed requires either docId or doc parameter`, + } catch (e) { + payload?.logger?.warn?.( + `[payloadcms-vectorize] Failed to delete bulk embedding metadata for ${collectionSlug}:${id}: ${e instanceof Error ? 
e.message : String(e)}`, ) } - const embedQueue = collectionToEmbedQueue.get(collection) - if (!embedQueue) { - throw new Error( - `[payloadcms-vectorize] Collection "${collection}" is not configured for vectorization`, - ) - } - return embedQueue(doc, payload) }, - bulkEmbed: (params: { knowledgePool: TPoolNames }): Promise => - startBulkEmbed({ - payload, - knowledgePool: params.knowledgePool, - knowledgePools: pluginOptions.knowledgePools, - queueName: pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, - }), - retryFailedBatch: (params: { batchId: string }): Promise => - retryBatch({ - payload, - batchId: params.batchId, - knowledgePools: pluginOptions.knowledgePools, - queueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, - }), - } as VectorizedPayload + ], } + } - // Store factory in config.custom - config.custom = { - ...(config.custom || {}), - createVectorizedPayloadObject, - } + const vectorSearchHandlers = createVectorSearchHandlers( + pluginOptions.knowledgePools, + pluginOptions.dbAdapter, + ) - // Register bin script for migration helper - const __filename = fileURLToPath(import.meta.url) - const __dirname = dirname(__filename) - const binScriptPath = resolve(__dirname, 'bin/vectorize-migrate.js') - config.bin = [ - ...(config.bin || []), + if (pluginOptions.endpointOverrides?.enabled !== false) { + const path = pluginOptions.endpointOverrides?.path || '/vector-search' + const inputEndpoints = config.endpoints || [] + const endpoints = [ + ...inputEndpoints, + { + path, + method: 'post' as const, + handler: vectorSearchHandlers.requestHandler, + }, + { + path: '/vector-bulk-embed', + method: 'post' as const, + handler: createBulkEmbedHandler( + pluginOptions.knowledgePools, + pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, + ), + }, { - key: 'vectorize:migrate', - scriptPath: binScriptPath, + path: '/vector-retry-failed-batch', + method: 'post' as const, + handler: createRetryFailedBatchHandler( + 
pluginOptions.knowledgePools, + pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + ), }, ] + config.endpoints = endpoints + } - if (pluginOptions.endpointOverrides?.enabled !== false) { - const path = pluginOptions.endpointOverrides?.path || '/vector-search' - const inputEndpoints = config.endpoints || [] - const endpoints = [ - ...inputEndpoints, - { - path, - method: 'post' as const, - handler: vectorSearchHandlers.requestHandler, - }, - { - path: '/vector-bulk-embed', - method: 'post' as const, - handler: createBulkEmbedHandler( - pluginOptions.knowledgePools, - pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, - ), - }, - { - path: '/vector-retry-failed-batch', - method: 'post' as const, - handler: createRetryFailedBatchHandler( - pluginOptions.knowledgePools, - pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, - ), - }, - ] - config.endpoints = endpoints - } + const configExtension = pluginOptions.dbAdapter.getConfigExtension(config) - return config + // Create vectorized payload object factory that creates methods bound to a payload instance + const createVectorizedPayloadObject = (payload: Payload): VectorizedPayload => { + return { + _isBulkEmbedEnabled: (knowledgePool: KnowledgePoolName): boolean => { + const poolConfig = pluginOptions.knowledgePools[knowledgePool] + return !!poolConfig?.embeddingConfig?.bulkEmbeddingsFns + }, + getDbAdapterCustom: () => configExtension?.custom, + search: (params: VectorSearchQuery) => + vectorSearchHandlers.vectorSearch( + payload, + params.query, + params.knowledgePool, + params.limit, + params.where, + ), + queueEmbed: async ( + params: + | { + collection: string + docId: string + } + | { + collection: string + doc: Record + }, + ) => { + const collection = params.collection + let doc: Record + if ('docId' in params && params.docId) { + doc = await payload.findByID({ + collection: collection as CollectionSlug, + id: params.docId, + }) + } else if ('doc' in params && params.doc) { + doc = params.doc + } 
else { + throw new Error( + `[payloadcms-vectorize] queueEmbed requires either docId or doc parameter`, + ) + } + const embedQueue = collectionToEmbedQueue.get(collection) + if (!embedQueue) { + throw new Error( + `[payloadcms-vectorize] Collection "${collection}" is not configured for vectorization`, + ) + } + return embedQueue(doc, payload) + }, + bulkEmbed: (params: { knowledgePool: KnowledgePoolName }): Promise => + startBulkEmbed({ + payload, + knowledgePool: params.knowledgePool, + knowledgePools: pluginOptions.knowledgePools, + queueName: pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, + }), + retryFailedBatch: (params: { batchId: string }): Promise => + retryBatch({ + payload, + batchId: params.batchId, + knowledgePools: pluginOptions.knowledgePools, + queueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + }), + } as VectorizedPayload } - return { - afterSchemaInitHook, - payloadcmsVectorize, + + // Store factory and db adapter custom in config.custom + config.custom = { + ...(config.custom || {}), + createVectorizedPayloadObject, + payloadCmsVectorizeDbAdapterCustom: configExtension?.custom, + } + + if (configExtension?.bins) { + config.bin = [...(config.bin || []), ...configExtension.bins] + } + + // Register adapter-provided collections + if (configExtension?.collections) { + for (const [_slug, collectionConfig] of Object.entries(configExtension.collections)) { + if (!config.collections!.find((c) => c.slug === collectionConfig.slug)) { + config.collections!.push(collectionConfig) + } + } + } + + return config } + +export const getDbAdapterCustom = (config: Config): Record | undefined => { + return config.custom?.payloadCmsVectorizeDbAdapterCustom } diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index 973371c..4f77883 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -1,6 +1,6 @@ import { + CollectionSlug, JsonObject, - PaginatedDocs, Payload, TaskConfig, TaskHandlerResult, @@ -9,23 +9,33 
@@ import { import { BatchSubmission, BulkEmbeddingOutput, + BulkEmbeddingRunDoc, + BulkEmbeddingBatchDoc, + BulkEmbeddingInputMetadataDoc, CollectedEmbeddingInput, + CollectionVectorizeOption, KnowledgePoolDynamicConfig, KnowledgePoolName, + BulkEmbeddingInput, + DbAdapter, + FailedChunkData, } from '../types.js' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../collections/bulkEmbeddingInputMetadata.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../collections/bulkEmbeddingsBatches.js' import { - isPostgresPayload, - PostgresPayload, - BulkEmbeddingInput, - FailedChunkData, -} from '../types.js' -import toSnakeCase from 'to-snake-case' + TASK_SLUG_PREPARE_BULK_EMBEDDING, + TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, +} from '../constants.js' +import { validateChunkData } from '../utils/validateChunkData.js' +import { deleteDocumentEmbeddings } from '../utils/deleteDocumentEmbeddings.js' type PrepareBulkEmbeddingTaskInput = { runId: string + /** If set, this is a per-collection worker job */ + collectionSlug?: string + /** Page within the collection (default: 1) */ + page?: number } type PrepareBulkEmbeddingTaskOutput = { @@ -39,18 +49,20 @@ type PrepareBulkEmbeddingTaskInputOutput = { output: PrepareBulkEmbeddingTaskOutput } -type PollOrCompleteBulkEmbeddingTaskInput = { +type PollOrCompleteSingleBatchTaskInput = { runId: string + batchId: string } -type PollOrCompleteBulkEmbeddingTaskOutput = { +type PollOrCompleteSingleBatchTaskOutput = { runId: string + batchId: string status: string } -type PollOrCompleteBulkEmbeddingTaskInputOutput = { - input: PollOrCompleteBulkEmbeddingTaskInput - output: PollOrCompleteBulkEmbeddingTaskOutput +type PollOrCompleteSingleBatchTaskInputOutput = { + input: PollOrCompleteSingleBatchTaskInput + output: PollOrCompleteSingleBatchTaskOutput } const TERMINAL_STATUSES = new Set(['succeeded', 'failed', 'canceled', 'retried']) @@ -65,11 +77,11 @@ 
async function loadRunAndConfig({ runId: string knowledgePools: Record }) { - const run = await payload.findByID({ + const run = (await payload.findByID({ collection: BULK_EMBEDDINGS_RUNS_SLUG, id: runId, - }) - const poolName = (run as any)?.pool as KnowledgePoolName + })) as BulkEmbeddingRunDoc + const poolName = run.pool as KnowledgePoolName if (!poolName) { throw new Error(`[payloadcms-vectorize] bulk embed run ${runId} missing pool`) } @@ -87,15 +99,159 @@ async function loadRunAndConfig({ return { run, poolName, dynamicConfig } } +/** + * Check if all batches for a run are terminal, and if so finalize the run. + * This function is idempotent - safe to call concurrently from multiple per-batch tasks. + */ +async function finalizeRunIfComplete(args: { + payload: Payload + runId: string + poolName: KnowledgePoolName + callbacks: { + onError?: (args: { + providerBatchIds: string[] + error: Error + failedChunkData?: FailedChunkData[] + failedChunkCount?: number + }) => Promise + } +}): Promise<{ finalized: boolean; status?: string }> { + const { payload, runId, poolName, callbacks } = args + + // Check if run is already terminal (prevents double-finalization race) + const currentRun = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: runId, + }) + if (TERMINAL_STATUSES.has((currentRun as any).status)) { + return { finalized: true, status: (currentRun as any).status } + } + + // Stream through batches page-by-page, aggregating without storing them all in memory + const runIdNum = parseInt(runId, 10) + const PAGE_SIZE = 100 + let page = 1 + let totalBatchCount = 0 + let allTerminal = true + let hasAnySucceeded = false + let allCanceled = true + let totalSucceeded = 0 + let totalFailed = 0 + const allFailedChunkData: FailedChunkData[] = [] + const succeededBatchIds: number[] = [] + const providerBatchIds: string[] = [] + + while (true) { + const result = await payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: 
runIdNum } }, + limit: PAGE_SIZE, + page, + sort: 'batchIndex', + }) + const docs = (result as any)?.docs || [] + + for (const batch of docs) { + totalBatchCount++ + const status = batch.status as string + providerBatchIds.push(batch.providerBatchId as string) + + if (!TERMINAL_STATUSES.has(status)) allTerminal = false + if (status === 'succeeded') hasAnySucceeded = true + if (status !== 'canceled') allCanceled = false + + if (status === 'succeeded') { + totalSucceeded += batch.succeededCount || 0 + totalFailed += batch.failedCount || 0 + succeededBatchIds.push(parseInt(String(batch.id), 10)) + if (Array.isArray(batch.failedChunkData)) { + allFailedChunkData.push(...batch.failedChunkData) + } + } + } + + const totalPages = (result as any)?.totalPages ?? page + if (page >= totalPages || docs.length === 0) break + page++ + } + + if (totalBatchCount === 0) { + await payload.update({ + id: runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: 'succeeded', + inputs: 0, + succeeded: 0, + failed: 0, + completedAt: new Date().toISOString(), + }, + }) + return { finalized: true, status: 'succeeded' } + } + + if (!allTerminal) { + return { finalized: false } + } + + // All batches are terminal — finalize the run + if (allCanceled) { + await payload.update({ + id: runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { status: 'canceled', completedAt: new Date().toISOString() }, + }) + return { finalized: true, status: 'canceled' } + } + + const runStatus = hasAnySucceeded ? 'succeeded' : 'failed' + + await payload.update({ + id: runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: runStatus, + succeeded: totalSucceeded, + failed: totalFailed, + failedChunkData: allFailedChunkData.length > 0 ? 
allFailedChunkData : undefined, + completedAt: new Date().toISOString(), + }, + }) + + // Cleanup metadata for succeeded batches only + if (succeededBatchIds.length > 0) { + await payload.delete({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { batch: { in: succeededBatchIds } }, + }) + } + + // Call onError if there were any failures + if (callbacks.onError && (totalFailed > 0 || !hasAnySucceeded)) { + await callbacks.onError({ + providerBatchIds, + error: new Error( + totalFailed > 0 ? `${totalFailed} chunk(s) failed during completion` : 'All batches failed', + ), + failedChunkData: allFailedChunkData.length > 0 ? allFailedChunkData : undefined, + failedChunkCount: totalFailed > 0 ? totalFailed : undefined, + }) + } + + return { finalized: true, status: runStatus } +} + export const createPrepareBulkEmbeddingTask = ({ knowledgePools, pollOrCompleteQueueName, + prepareBulkEmbedQueueName, }: { knowledgePools: Record pollOrCompleteQueueName?: string + prepareBulkEmbedQueueName?: string }): TaskConfig => { const task: TaskConfig = { - slug: 'payloadcms-vectorize:prepare-bulk-embedding', + slug: TASK_SLUG_PREPARE_BULK_EMBEDDING, handler: async ({ input, req, @@ -104,7 +260,7 @@ export const createPrepareBulkEmbeddingTask = ({ throw new Error('[payloadcms-vectorize] bulk embed runId is required') } const payload = req.payload - const { poolName, dynamicConfig } = await loadRunAndConfig({ + const { run, poolName, dynamicConfig } = await loadRunAndConfig({ payload, runId: input.runId, knowledgePools, @@ -113,7 +269,76 @@ export const createPrepareBulkEmbeddingTask = ({ const callbacks = dynamicConfig.embeddingConfig.bulkEmbeddingsFns! 
const embeddingVersion = dynamicConfig.embeddingConfig.version - // Find baseline run information + // ============================================= + // COORDINATOR MODE: no collectionSlug in input + // ============================================= + if (!input.collectionSlug) { + // Queue one worker per collection + const collectionSlugs = Object.keys(dynamicConfig.collections) + if (collectionSlugs.length === 0) { + // No collections configured - mark run as succeeded + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: 'succeeded', + totalBatches: 0, + inputs: 0, + succeeded: 0, + failed: 0, + completedAt: new Date().toISOString(), + }, + }) + return { output: { runId: input.runId, status: 'succeeded', batchCount: 0 } } + } + + for (const collectionSlug of collectionSlugs) { + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: input.runId, collectionSlug, page: 1 }, + req, + ...(prepareBulkEmbedQueueName ? 
{ queue: prepareBulkEmbedQueueName } : {}), + }) + } + + // Update run status + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: 'running', + submittedAt: new Date().toISOString(), + }, + }) + + return { output: { runId: input.runId, status: 'coordinated' } } + } + + // ============================================= + // WORKER MODE: collectionSlug is set + // ============================================= + + // Early exit if run is already terminal + if (TERMINAL_STATUSES.has((run as any).status)) { + return { output: { runId: input.runId, status: (run as any).status } } + } + + const collectionSlug = input.collectionSlug + const collectionConfig = dynamicConfig.collections[collectionSlug] + if (!collectionConfig) { + throw new Error( + `[payloadcms-vectorize] collection "${collectionSlug}" not found in pool "${poolName}"`, + ) + } + + const DEFAULT_BATCH_LIMIT = 1000 + const batchLimit = + collectionConfig.batchLimit && collectionConfig.batchLimit > 0 + ? collectionConfig.batchLimit + : DEFAULT_BATCH_LIMIT + const page = input.page ?? 1 + + // Compute baseline/version for filtering const latestSucceededRun = await payload.find({ collection: BULK_EMBEDDINGS_RUNS_SLUG, where: { @@ -127,27 +352,56 @@ export const createPrepareBulkEmbeddingTask = ({ sort: '-completedAt', }) - const baselineRun = (latestSucceededRun as any)?.docs?.[0] + const baselineRun = latestSucceededRun.docs?.[0] as BulkEmbeddingRunDoc | undefined const baselineVersion: string | undefined = baselineRun?.embeddingVersion const lastBulkCompletedAt: string | undefined = baselineRun?.completedAt const versionMismatch = baselineVersion !== undefined && baselineVersion !== embeddingVersion + const includeAll = versionMismatch || !baselineRun + const lastCompletedAtDate = lastBulkCompletedAt ? new Date(lastBulkCompletedAt) : undefined + + // Build where clause for this collection + const where = includeAll + ? undefined + : lastCompletedAtDate + ? 
{ updatedAt: { greater_than: lastCompletedAtDate.toISOString() } } + : undefined + + // STEP 1: Query the page + const queryResult = await payload.find({ + collection: collectionSlug, + where, + limit: batchLimit, + page, + sort: 'id', + }) + + // STEP 2: If there's a next page, queue continuation BEFORE processing + if (queryResult.nextPage) { + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: input.runId, collectionSlug, page: queryResult.nextPage }, + req, + ...(prepareBulkEmbedQueueName ? { queue: prepareBulkEmbedQueueName } : {}), + }) + } - // Stream missing embeddings and create batches - let result + // STEP 3: Process this page's docs + let totalResult: { batchCount: number; totalInputs: number; batchIds: (string | number)[] } try { - result = await streamAndBatchMissingEmbeddings({ + totalResult = await streamAndBatchDocs({ payload, runId: input.runId, poolName, - dynamicConfig, + collectionSlug, + collectionConfig, + docs: (queryResult.docs || []) as Array, embeddingVersion, - lastBulkCompletedAt, - versionMismatch, - hasBaseline: Boolean(baselineRun), + includeAll, + lastCompletedAtDate, addChunk: callbacks.addChunk, }) } catch (error) { - // Ingestion failed (e.g., validation error) - mark run as failed + // Ingestion failed - mark run as failed const errorMessage = (error as Error).message || String(error) await payload.update({ id: input.runId, @@ -158,49 +412,47 @@ export const createPrepareBulkEmbeddingTask = ({ completedAt: new Date().toISOString(), }, }) - // Re-throw so Payload's job system marks the job as failed throw error } - if (result.totalInputs === 0) { - // No inputs to process - mark run as succeeded + // STEP 4: Accumulate counts on run record + if (totalResult.totalInputs > 0) { + const currentRun = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: input.runId, + }) + const existingInputs = (currentRun as 
any).inputs ?? 0 + const existingBatches = (currentRun as any).totalBatches ?? 0 await payload.update({ id: input.runId, collection: BULK_EMBEDDINGS_RUNS_SLUG, data: { - status: 'succeeded', - totalBatches: 0, - inputs: 0, - succeeded: 0, - failed: 0, - completedAt: new Date().toISOString(), + totalBatches: existingBatches + totalResult.batchCount, + inputs: existingInputs + totalResult.totalInputs, }, }) - return { output: { runId: input.runId, status: 'succeeded', batchCount: 0 } } } - // Update run with batch count and total inputs - await payload.update({ - id: input.runId, - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { - status: 'running', - totalBatches: result.batchCount, - inputs: result.totalInputs, - submittedAt: new Date().toISOString(), - }, - }) + // STEP 5: Queue per-batch polling tasks + for (const batchId of totalResult.batchIds) { + await payload.jobs.queue({ + task: TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, + input: { runId: input.runId, batchId: String(batchId) }, + req, + ...(pollOrCompleteQueueName ? { queue: pollOrCompleteQueueName } : {}), + }) + } - // Queue the poll task to monitor all batches - await payload.jobs.queue<'payloadcms-vectorize:poll-or-complete-bulk-embedding'>({ - task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding', - input: { runId: input.runId }, - req, - ...(pollOrCompleteQueueName ? { queue: pollOrCompleteQueueName } : {}), - }) + // If this worker produced 0 batches and has no continuation, try to finalize. + // finalizeRunIfComplete is idempotent: if other workers created batches that + // aren't terminal yet, it returns { finalized: false } and the polling tasks + // will handle finalization later. 
+ if (totalResult.batchCount === 0 && !queryResult.nextPage) { + await finalizeRunIfComplete({ payload, runId: input.runId, poolName, callbacks }) + } return { - output: { runId: input.runId, status: 'prepared', batchCount: result.batchCount }, + output: { runId: input.runId, status: 'prepared', batchCount: totalResult.batchCount }, } }, } @@ -208,277 +460,116 @@ export const createPrepareBulkEmbeddingTask = ({ return task } -export const createPollOrCompleteBulkEmbeddingTask = ({ +export const createPollOrCompleteSingleBatchTask = ({ knowledgePools, pollOrCompleteQueueName, + adapter, }: { knowledgePools: Record pollOrCompleteQueueName?: string -}): TaskConfig => { - const task: TaskConfig = { - slug: 'payloadcms-vectorize:poll-or-complete-bulk-embedding', + adapter: DbAdapter +}): TaskConfig => { + const task: TaskConfig = { + slug: TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, handler: async ({ input, req, - }): Promise> => { - if (!input?.runId) { - throw new Error('[payloadcms-vectorize] bulk embed runId is required') + }): Promise> => { + if (!input?.runId || !input?.batchId) { + throw new Error('[payloadcms-vectorize] single batch task requires runId and batchId') } + const { runId, batchId } = input const payload = req.payload const { run, poolName, dynamicConfig } = await loadRunAndConfig({ payload, - runId: input.runId, + runId, knowledgePools, }) const callbacks = dynamicConfig.embeddingConfig.bulkEmbeddingsFns! 
- // Check if run is already terminal - const currentStatus = (run as any).status - if (TERMINAL_STATUSES.has(currentStatus)) { - return { output: { runId: input.runId, status: currentStatus } } + // Early exit if run is already terminal + if (TERMINAL_STATUSES.has(run.status)) { + return { output: { runId, batchId, status: run.status } } } - // Load all batches for this run with pagination to handle >1000 batches - // Convert runId to number for postgres relationship queries - const runIdNum = parseInt(input.runId, 10) - const batches: any[] = [] - let batchPage = 1 - const batchLimit = 100 // Smaller pages for better memory management - - while (true) { - const batchesResult = await payload.find({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - where: { run: { equals: runIdNum } }, - limit: batchLimit, - page: batchPage, - sort: 'batchIndex', - }) - const pageDocs = (batchesResult as any)?.docs || [] - batches.push(...pageDocs) + // Load this specific batch + const batch = (await payload.findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: batchId, + })) as BulkEmbeddingBatchDoc - const totalPages = (batchesResult as any)?.totalPages ?? 
batchPage - if (batchPage >= totalPages || pageDocs.length === 0) break - batchPage++ + // If batch is already terminal, just check if run can be finalized + if (TERMINAL_STATUSES.has((batch as any).status)) { + await finalizeRunIfComplete({ payload, runId, poolName, callbacks }) + return { output: { runId, batchId, status: (batch as any).status } } } - if (batches.length === 0) { - // No batches found - this shouldn't happen but handle gracefully - await payload.update({ - id: input.runId, - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { - status: 'failed', - error: 'No batches found for run', - completedAt: new Date().toISOString(), - }, + // Poll and complete this single batch + try { + const completionResult = await pollAndCompleteSingleBatch({ + payload, + runId, + poolName, + batch, + callbacks, + adapter, }) - return { output: { runId: input.runId, status: 'failed' } } - } - - // Poll each non-terminal batch and complete succeeded ones incrementally - let anyRunning = false - let totalSucceeded = 0 - let totalFailed = 0 - const allFailedChunkData: FailedChunkData[] = [] - const batchStatuses = new Map() // Track batch statuses as we process - - // Initialize with current statuses - for (const batch of batches) { - batchStatuses.set(String(batch.id), batch.status as string) - // Accumulate counts from already completed batches - if (TERMINAL_STATUSES.has(batch.status as string)) { - if (batch.status === 'succeeded') { - totalSucceeded += batch.succeededCount || 0 - totalFailed += batch.failedCount || 0 - } - } - } - - for (const batch of batches) { - const batchStatus = batchStatuses.get(String(batch.id)) as string - - // Skip batches that are already completed - if (TERMINAL_STATUSES.has(batchStatus)) { - continue - } - - // Poll batch and complete if succeeded (streams embeddings via onChunk callback) - try { - const completionResult = await pollAndCompleteSingleBatch({ - payload, - runId: input.runId, - poolName, - batch, - callbacks, - }) - - // Update 
batch status and counts - await payload.update({ - id: batch.id, - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - data: { - status: completionResult.status, - error: completionResult.error, - ...(TERMINAL_STATUSES.has(completionResult.status) - ? { completedAt: new Date().toISOString() } - : {}), - ...(completionResult.status === 'succeeded' - ? { - succeededCount: completionResult.succeededCount, - failedCount: completionResult.failedCount, - } - : {}), - }, - }) - - // Track the new status - batchStatuses.set(String(batch.id), completionResult.status) - - // Accumulate counts from newly succeeded batches - if (completionResult.status === 'succeeded') { - totalSucceeded += completionResult.succeededCount - totalFailed += completionResult.failedCount - allFailedChunkData.push(...completionResult.failedChunkData) - } - - // Track if still running (queued or running) - if (completionResult.status === 'queued' || completionResult.status === 'running') { - anyRunning = true - } - // Failed/canceled batches - leave them, can be re-run later - } catch (error) { - // Completion failed - mark batch as failed - const errorMessage = (error as Error).message || String(error) - await payload.update({ - id: batch.id, - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - data: { - status: 'failed', - error: `Completion failed: ${errorMessage}`, - completedAt: new Date().toISOString(), - }, - }) - batchStatuses.set(String(batch.id), 'failed') - } - } - - // Check if all batches are complete - const allBatchesComplete = Array.from(batchStatuses.values()).every((status) => - TERMINAL_STATUSES.has(status), - ) - - if (allBatchesComplete) { - // All batches are done - finalize the run - const hasAnySucceeded = Array.from(batchStatuses.values()).some( - (status) => status === 'succeeded', - ) - - // Check if any batches are failed (not just canceled) - we keep metadata for potential retries - const hasFailedBatches = Array.from(batchStatuses.values()).some( - (status) => status === 'failed', - ) 
+ // Update batch status and counts await payload.update({ - id: input.runId, - collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: batchId, + collection: BULK_EMBEDDINGS_BATCHES_SLUG, data: { - status: hasAnySucceeded ? 'succeeded' : 'failed', - succeeded: totalSucceeded, - failed: totalFailed, - failedChunkData: allFailedChunkData.length > 0 ? allFailedChunkData : undefined, - completedAt: new Date().toISOString(), + status: completionResult.status, + error: completionResult.error, + ...(TERMINAL_STATUSES.has(completionResult.status) + ? { completedAt: new Date().toISOString() } + : {}), + ...(completionResult.status === 'succeeded' + ? { + succeededCount: completionResult.succeededCount, + failedCount: completionResult.failedCount, + failedChunkData: + completionResult.failedChunkData.length > 0 + ? completionResult.failedChunkData + : undefined, + } + : {}), }, }) - // Cleanup metadata for succeeded batches only - // Keep metadata for failed batches to allow retry functionality - const succeededBatchIds = Array.from(batchStatuses.entries()) - .filter(([_, status]) => status === 'succeeded') - .map(([id, _]) => parseInt(id, 10)) - - if (succeededBatchIds.length > 0) { - await payload.delete({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - where: { batch: { in: succeededBatchIds } }, - }) - } - - // Call onError if there were any failures - if (callbacks.onError && (totalFailed > 0 || !hasAnySucceeded)) { - const providerBatchIds = batches.map((b: any) => b.providerBatchId as string) - await callbacks.onError({ - providerBatchIds, - error: new Error( - totalFailed > 0 - ? `${totalFailed} chunk(s) failed during completion` - : 'All batches failed', - ), - failedChunkData: allFailedChunkData.length > 0 ? allFailedChunkData : undefined, - failedChunkCount: totalFailed > 0 ? totalFailed : undefined, - }) - } - - return { - output: { - runId: input.runId, - status: hasAnySucceeded ? 
'succeeded' : 'failed', - }, + // If batch is now terminal, check if run should be finalized + if (TERMINAL_STATUSES.has(completionResult.status)) { + await finalizeRunIfComplete({ payload, runId, poolName, callbacks }) + return { output: { runId, batchId, status: completionResult.status } } } - } - // If still running, requeue this task - if (anyRunning) { - await payload.jobs.queue<'payloadcms-vectorize:poll-or-complete-bulk-embedding'>({ - task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding', - input: { runId: input.runId }, + // Still running - re-queue self with polling delay + await payload.jobs.queue({ + task: TASK_SLUG_POLL_OR_COMPLETE_BULK_EMBEDDING, + input: { runId, batchId }, req, ...(pollOrCompleteQueueName ? { queue: pollOrCompleteQueueName } : {}), }) - return { output: { runId: input.runId, status: 'polling' } } - } - // Edge case: allBatchesComplete is false but anyRunning is false - // This happens when all batches are in 'canceled' or 'failed' status but we didn't detect it above - // Check if all batches are canceled - const allCanceled = Array.from(batchStatuses.values()).every( - (status) => status === 'canceled', - ) - - if (allCanceled) { + return { output: { runId, batchId, status: completionResult.status } } + } catch (error) { + // Batch processing failed - mark batch as failed + const errorMessage = (error as Error).message || String(error) await payload.update({ - id: input.runId, - collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: batchId, + collection: BULK_EMBEDDINGS_BATCHES_SLUG, data: { - status: 'canceled', + status: 'failed', + error: `Completion failed: ${errorMessage}`, completedAt: new Date().toISOString(), }, }) - return { output: { runId: input.runId, status: 'canceled' } } + // Check if this was the last batch to complete + await finalizeRunIfComplete({ payload, runId, poolName, callbacks }) + return { output: { runId, batchId, status: 'failed' } } } - - // Fallback: mark as failed with diagnostic info - const 
statusCounts = Array.from(batchStatuses.values()).reduce( - (acc, status) => { - acc[status] = (acc[status] || 0) + 1 - return acc - }, - {} as Record, - ) - payload.logger.warn( - `[payloadcms-vectorize] Run ${input.runId} reached unexpected state. Batch statuses: ${JSON.stringify(statusCounts)}`, - ) - - await payload.update({ - id: input.runId, - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { - status: 'failed', - error: `Run reached unexpected state. Batch statuses: ${JSON.stringify(statusCounts)}`, - completedAt: new Date().toISOString(), - }, - }) - return { output: { runId: input.runId, status: 'failed' } } }, } @@ -486,144 +577,105 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ } /** - * Stream through missing embeddings, calling addChunk for each. + * Process pre-fetched docs from a single collection, calling addChunk for each chunk. * User controls batching via addChunk return value. * * Single-pass approach using async generator to yield chunks sequentially. * This avoids the need for a pre-counting pass while correctly determining isLastChunk. 
*/ -async function streamAndBatchMissingEmbeddings(args: { +async function streamAndBatchDocs(args: { payload: Payload runId: string poolName: KnowledgePoolName - dynamicConfig: KnowledgePoolDynamicConfig + collectionSlug: string + collectionConfig: CollectionVectorizeOption + docs: Array embeddingVersion: string - lastBulkCompletedAt?: string - versionMismatch: boolean - hasBaseline: boolean + includeAll: boolean + lastCompletedAtDate?: Date addChunk: (args: { chunk: BulkEmbeddingInput isLastChunk: boolean }) => Promise -}): Promise<{ batchCount: number; totalInputs: number }> { +}): Promise<{ batchCount: number; totalInputs: number; batchIds: (string | number)[] }> { const { payload, runId, poolName, - dynamicConfig, + collectionSlug, + collectionConfig, + docs, embeddingVersion, - lastBulkCompletedAt, - versionMismatch, - hasBaseline, + includeAll, + lastCompletedAtDate, addChunk, } = args - const includeAll = versionMismatch || !hasBaseline - const lastCompletedAtDate = lastBulkCompletedAt ? 
new Date(lastBulkCompletedAt) : undefined - const collectionSlugs = Object.keys(dynamicConfig.collections) - - // Async generator that yields chunks one at a time + // Async generator that yields chunks one at a time from pre-fetched docs async function* generateChunks(): AsyncGenerator { - for (const collectionSlug of collectionSlugs) { - const collectionConfig = dynamicConfig.collections[collectionSlug] - if (!collectionConfig) continue + const toKnowledgePool = collectionConfig.toKnowledgePool - const toKnowledgePool = collectionConfig.toKnowledgePool - const limit = 50 + for (const doc of docs) { + // If !includeAll, we still need to check if document has current embedding + // (can't filter this in the where clause since it's a cross-collection check) + if (!includeAll && !lastCompletedAtDate) { + const hasCurrentEmbedding = await docHasEmbeddingVersion({ + payload, + poolName, + sourceCollection: collectionSlug, + docId: String(doc.id), + embeddingVersion, + }) + if (hasCurrentEmbedding) continue + } - // Build where clause: filter by updatedAt if we have lastBulkCompletedAt and !includeAll - const where = includeAll - ? undefined - : lastCompletedAtDate - ? 
{ - updatedAt: { - greater_than: lastCompletedAtDate.toISOString(), - }, - } - : undefined + // Check if document should be embedded + if (collectionConfig.shouldEmbedFn) { + const shouldEmbed = await collectionConfig.shouldEmbedFn(doc, payload) + if (!shouldEmbed) continue + } - let res: PaginatedDocs | undefined = await payload.find({ - collection: collectionSlug, - where, - limit, - }) - do { - const docs = res?.docs || [] - if (!docs.length) break - - for (const doc of docs) { - // If !includeAll, we still need to check if document has current embedding - // (can't filter this in the where clause since it's a cross-collection check) - if (!includeAll && !lastCompletedAtDate) { - const hasCurrentEmbedding = await docHasEmbeddingVersion({ - payload, - poolName, - sourceCollection: collectionSlug, - docId: String(doc.id), - embeddingVersion, - }) - if (hasCurrentEmbedding) continue - } - - const chunkData = await toKnowledgePool(doc, payload) - - // Validate chunks (same validation as real-time ingestion) - const invalidEntries = chunkData - .map((entry, idx) => { - if (!entry || typeof entry !== 'object') return idx - if (typeof entry.chunk !== 'string') return idx - return null - }) - .filter((idx): idx is number => idx !== null) - - if (invalidEntries.length > 0) { - throw new Error( - `[payloadcms-vectorize] toKnowledgePool returned ${invalidEntries.length} invalid entr${ - invalidEntries.length === 1 ? 'y' : 'ies' - } for document ${doc.id} in collection "${collectionSlug}". Each entry must be an object with a "chunk" string. 
Invalid indices: ${invalidEntries.join( - ', ', - )}`, - ) - } - - // Yield valid chunks - for (let idx = 0; idx < chunkData.length; idx++) { - const chunkEntry = chunkData[idx] - const { chunk, ...extensionFields } = chunkEntry - - yield { - id: `${collectionSlug}:${doc.id}:${idx}`, - text: chunk, - metadata: { - sourceCollection: collectionSlug, - docId: String(doc.id), - chunkIndex: idx, - embeddingVersion, - extensionFields, - }, - } - } + const chunkData = await toKnowledgePool(doc, payload) + + validateChunkData(chunkData, String(doc.id), collectionSlug) + + // Yield valid chunks + for (let idx = 0; idx < chunkData.length; idx++) { + const chunkEntry = chunkData[idx] + const { chunk, ...extensionFields } = chunkEntry + + yield { + id: `${collectionSlug}:${doc.id}:${idx}`, + text: chunk, + metadata: { + sourceCollection: collectionSlug, + docId: String(doc.id), + chunkIndex: idx, + embeddingVersion, + extensionFields, + }, } - } while ( - (res = res.nextPage - ? await payload.find({ - collection: collectionSlug, - where, - limit, - page: res.nextPage, - }) - : undefined) - ) + } } } + // Determine starting batchIndex from existing batches for this run + const runIdNum = parseInt(runId, 10) + const maxBatchResult = await payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: runIdNum } }, + sort: '-batchIndex', + limit: 1, + }) + let batchIndex = + maxBatchResult.docs.length > 0 ? ((maxBatchResult.docs[0] as any).batchIndex ?? 
0) + 1 : 0 + // Process chunks from generator - let batchIndex = 0 let totalInputs = 0 const pendingChunks: CollectedEmbeddingInput[] = [] const chunkIterator = generateChunks() - const runIdNum = parseInt(runId, 10) let currentBatchId: number | undefined = undefined + const batchIds: (string | number)[] = [] async function processChunk( chunk: CollectedEmbeddingInput, @@ -646,7 +698,7 @@ async function streamAndBatchMissingEmbeddings(args: { submittedAt: new Date().toISOString(), }, }) - currentBatchId = (placeholderBatch as any).id + currentBatchId = placeholderBatch.id as number } if (!currentBatchId) { @@ -693,12 +745,13 @@ async function streamAndBatchMissingEmbeddings(args: { }) totalInputs += inputCount + batchIds.push(currentBatchId) batchIndex++ currentBatchId = undefined // Reset for next batch } } - // Process chunks from generator + // Process chunks from generator using look-ahead for isLastChunk let prevChunk: CollectedEmbeddingInput | undefined = undefined for await (const currentChunk of chunkIterator) { if (prevChunk) { @@ -710,7 +763,7 @@ async function streamAndBatchMissingEmbeddings(args: { await processChunk(prevChunk, true) } - return { batchCount: batchIndex, totalInputs } + return { batchCount: batchIds.length, totalInputs, batchIds } } /** @@ -724,7 +777,7 @@ async function documentExists(args: { const { payload, collection, docId } = args try { await payload.findByID({ - collection: collection as any, + collection: collection as CollectionSlug, id: docId, }) return true @@ -743,13 +796,14 @@ async function pollAndCompleteSingleBatch(args: { payload: Payload runId: string poolName: KnowledgePoolName - batch: any + batch: BulkEmbeddingBatchDoc callbacks: { pollOrCompleteBatch: (args: { providerBatchId: string onChunk: (chunk: BulkEmbeddingOutput) => Promise }) => Promise<{ status: string; error?: string }> } + adapter: DbAdapter }): Promise<{ status: string error?: string @@ -757,7 +811,7 @@ async function pollAndCompleteSingleBatch(args: { 
failedCount: number failedChunkData: FailedChunkData[] }> { - const { payload, runId, poolName, batch, callbacks } = args + const { payload, runId, poolName, batch, callbacks, adapter } = args let succeededCount = 0 let failedCount = 0 @@ -835,15 +889,12 @@ async function pollAndCompleteSingleBatch(args: { // Only delete if no embeddings exist for this version (they're from an old version) if (!hasCurrentEmbedding) { - // Delete existing embeddings for this document (from old version) - await payload.delete({ - collection: poolName, - where: { - and: [ - { sourceCollection: { equals: meta.sourceCollection } }, - { docId: { equals: String(meta.docId) } }, - ], - }, + await deleteDocumentEmbeddings({ + payload, + poolName, + collection: meta.sourceCollection, + docId: String(meta.docId), + adapter, }) } } @@ -854,7 +905,7 @@ async function pollAndCompleteSingleBatch(args: { : Array.from(output.embedding) const created = await payload.create({ - collection: poolName, + collection: poolName as CollectionSlug, data: { sourceCollection: meta.sourceCollection, docId: String(meta.docId), @@ -863,15 +914,17 @@ async function pollAndCompleteSingleBatch(args: { embeddingVersion: meta.embeddingVersion, ...(meta.extensionFields || {}), embedding: embeddingArray, - } as any, + }, }) - await persistVectorColumn({ + await adapter.storeEmbedding( payload, - poolName: toSnakeCase(poolName), - vector: embeddingArray, - id: String((created as any)?.id ?? 
''), - }) + poolName, + meta.sourceCollection, + String(meta.docId), + String(created.id), + embeddingArray, + ) succeededCount++ }, @@ -886,34 +939,6 @@ async function pollAndCompleteSingleBatch(args: { } } -async function persistVectorColumn(args: { - payload: Payload - poolName: KnowledgePoolName - vector: number[] | Float32Array - id: string -}) { - const { payload, poolName, vector, id } = args - if (!isPostgresPayload(payload)) { - throw new Error('[payloadcms-vectorize] Bulk embeddings require the Postgres adapter') - } - const postgresPayload = payload as PostgresPayload - const schemaName = postgresPayload.db.schemaName || 'public' - const literal = `[${Array.from(vector).join(',')}]` - const sql = `UPDATE "${schemaName}"."${toSnakeCase(poolName)}" SET embedding = $1 WHERE id = $2` - const runSQL = async (statement: string, params?: any[]) => { - if (postgresPayload.db.pool?.query) return postgresPayload.db.pool.query(statement, params) - if (postgresPayload.db.drizzle?.execute) return postgresPayload.db.drizzle.execute(statement) - throw new Error('[payloadcms-vectorize] Failed to persist vector column') - } - try { - await runSQL(sql, [literal, id]) - } catch (e) { - const errorMessage = (e as Error).message || (e as any).toString() - payload.logger.error(`[payloadcms-vectorize] Failed to persist vector column: ${errorMessage}`) - throw e - } -} - async function docHasEmbeddingVersion(args: { payload: Payload poolName: KnowledgePoolName @@ -923,7 +948,7 @@ async function docHasEmbeddingVersion(args: { }): Promise { const { payload, poolName, sourceCollection, docId, embeddingVersion } = args const existing = await payload.find({ - collection: poolName, + collection: poolName as CollectionSlug, where: { and: [ { sourceCollection: { equals: sourceCollection } }, @@ -933,7 +958,7 @@ async function docHasEmbeddingVersion(args: { }, limit: 1, }) - return (existing as any)?.totalDocs > 0 + return existing.totalDocs > 0 } /** @@ -951,7 +976,7 @@ async function 
getMetadataByInputId(args: { docId: string chunkIndex: number embeddingVersion: string - extensionFields?: Record + extensionFields?: Record } | null> { const { payload, runId, inputId } = args const runIdNum = parseInt(runId, 10) @@ -964,7 +989,7 @@ async function getMetadataByInputId(args: { limit: 1, }) - const doc = (result as any)?.docs?.[0] + const doc = result.docs?.[0] as BulkEmbeddingInputMetadataDoc | undefined if (!doc) return null return { diff --git a/src/tasks/vectorize.ts b/src/tasks/vectorize.ts index 5dc191c..802c362 100644 --- a/src/tasks/vectorize.ts +++ b/src/tasks/vectorize.ts @@ -1,16 +1,18 @@ -import { Payload, TaskConfig, TaskHandlerResult } from 'payload' -import { - isPostgresPayload, - PostgresPayload, - KnowledgePoolName, +import type { Payload, TaskConfig, TaskHandlerResult } from 'payload' + +import type { + DbAdapter, KnowledgePoolDynamicConfig, + KnowledgePoolName, ToKnowledgePoolFn, } from '../types.js' -import toSnakeCase from 'to-snake-case' +import { TASK_SLUG_VECTORIZE } from '../constants.js' +import { validateChunkData } from '../utils/validateChunkData.js' +import { deleteDocumentEmbeddings } from '../utils/deleteDocumentEmbeddings.js' type VectorizeTaskInput = { - doc: Record collection: string + doc: Record knowledgePool: KnowledgePoolName } type VectorizeTaskOutput = { @@ -22,8 +24,10 @@ type VectorizeTaskInputOutput = { } export const createVectorizeTask = ({ + adapter, knowledgePools, }: { + adapter: DbAdapter knowledgePools: Record }) => { /** @@ -31,10 +35,14 @@ export const createVectorizeTask = ({ * @description Scheduled task that vectorizes on data change. 
*/ const processVectorizationTask: TaskConfig = { - slug: 'payloadcms-vectorize:vectorize', + slug: TASK_SLUG_VECTORIZE, handler: async ({ input, req }): Promise> => { - if (!input.collection) throw new Error('[payloadcms-vectorize] collection is required') - if (!input.knowledgePool) throw new Error('[payloadcms-vectorize] knowledgePool is required') + if (!input.collection) { + throw new Error('[payloadcms-vectorize] collection is required') + } + if (!input.knowledgePool) { + throw new Error('[payloadcms-vectorize] knowledgePool is required') + } const dynamicConfig = knowledgePools[input.knowledgePool] if (!dynamicConfig) { @@ -44,13 +52,14 @@ export const createVectorizeTask = ({ } await runVectorizeTask({ - payload: req.payload, - poolName: input.knowledgePool, + adapter, dynamicConfig, job: { - doc: input.doc, collection: input.collection, + doc: input.doc, }, + payload: req.payload, + poolName: input.knowledgePool, }) return { output: { @@ -63,15 +72,16 @@ export const createVectorizeTask = ({ } async function runVectorizeTask(args: { - payload: Payload - poolName: KnowledgePoolName + adapter: DbAdapter dynamicConfig: KnowledgePoolDynamicConfig job: { - doc: Record collection: string + doc: Record } + payload: Payload + poolName: KnowledgePoolName }) { - const { payload, poolName, dynamicConfig, job } = args + const { adapter, dynamicConfig, job, payload, poolName } = args const embeddingVersion = dynamicConfig.embeddingConfig.version const sourceDoc = job.doc const collection = job.collection @@ -83,56 +93,21 @@ async function runVectorizeTask(args: { } const toKnowledgePoolFn: ToKnowledgePoolFn = collectionConfig.toKnowledgePool - const isPostgres = isPostgresPayload(payload) - if (!isPostgres) { - throw new Error('[payloadcms-vectorize] Only works with Postgres') - } - const runSQL = async (sql: string, params?: any[]) => { - const postgresPayload = payload as PostgresPayload - if (postgresPayload.db.pool?.query) return postgresPayload.db.pool.query(sql, 
params) - if (postgresPayload.db.drizzle?.execute) return postgresPayload.db.drizzle.execute(sql) - throw new Error('[payloadcms-vectorize] Failed to persist vector column') - } - // Delete all existing embeddings for this document before creating new ones // This ensures we replace old embeddings (potentially with a different embeddingVersion) // and prevents duplicates when a document is updated - await payload.delete({ - collection: poolName, - where: { - and: [ - { sourceCollection: { equals: collection } }, - { docId: { equals: String(sourceDoc.id) } }, - ], - }, + await deleteDocumentEmbeddings({ + payload, + poolName, + collection, + docId: String(sourceDoc.id), + adapter, }) // Get chunks from toKnowledgePoolFn const chunkData = await toKnowledgePoolFn(sourceDoc, payload) - if (!Array.isArray(chunkData)) { - throw new Error( - `[payloadcms-vectorize] toKnowledgePool for collection "${collection}" must return an array of entries with a required "chunk" string`, - ) - } - - const invalidEntries = chunkData - .map((entry, idx) => { - if (!entry || typeof entry !== 'object') return idx - if (typeof entry.chunk !== 'string') return idx - return null - }) - .filter((idx): idx is number => idx !== null) - - if (invalidEntries.length > 0) { - throw new Error( - `[payloadcms-vectorize] toKnowledgePool returned ${invalidEntries.length} invalid entr${ - invalidEntries.length === 1 ? 'y' : 'ies' - } for document ${sourceDoc.id} in collection "${collection}". Each entry must be an object with a "chunk" string. 
Invalid indices: ${invalidEntries.join( - ', ', - )}`, - ) - } + validateChunkData(chunkData, String(sourceDoc.id), collection) // Extract chunk texts for embedding const chunkTexts = chunkData.map((item) => item.chunk) @@ -145,32 +120,19 @@ async function runVectorizeTask(args: { const created = await payload.create({ collection: poolName, data: { - sourceCollection: collection, - docId: String(sourceDoc.id), chunkIndex: index, chunkText: chunk, + docId: String(sourceDoc.id), embeddingVersion, + sourceCollection: collection, ...extensionFields, embedding: Array.isArray(vector) ? vector : Array.from(vector), }, }) const id = String(created.id) - const literal = `[${Array.from(vector).join(',')}]` - const postgresPayload = payload as PostgresPayload - const schemaName = postgresPayload.db.schemaName || 'public' - // Drizzle converts camelCase collection slugs to snake_case table names - const sql = - `UPDATE "${schemaName}"."${toSnakeCase(poolName)}" SET embedding = $1 WHERE id = $2` as string - try { - await runSQL(sql, [literal, id]) - } catch (e) { - const errorMessage = (e as Error).message || (e as any).toString() - payload.logger.error( - `[payloadcms-vectorize] Failed to persist vector column: ${errorMessage}`, - ) - throw new Error(`[payloadcms-vectorize] Failed to persist vector column: ${e}`) - } + + await adapter.storeEmbedding(payload, poolName, collection, String(sourceDoc.id), id, vector) }), ) } diff --git a/src/types.ts b/src/types.ts index 9c48a29..0e41c98 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,4 +1,13 @@ -import type { CollectionSlug, Payload, Field, Where } from 'payload' +import type { + CollectionConfig, + CollectionSlug, + Payload, + Field, + Where, + Config, + BasePayload, + TypeWithID, +} from 'payload' /** Result from bulkEmbed method */ export type BulkEmbedResult = @@ -43,12 +52,11 @@ export type RetryFailedBatchResult = /** * Extended Payload type with vectorize plugin methods */ -export type VectorizedPayload = { +export type 
VectorizedPayload = { /** Check if bulk embedding is enabled for a knowledge pool */ - _isBulkEmbedEnabled: (knowledgePool: TPoolNames) => boolean - /** Static configs for migration helper access */ - _staticConfigs: Record - search: (params: VectorSearchQuery) => Promise> + _isBulkEmbedEnabled: (knowledgePool: KnowledgePoolName) => boolean + getDbAdapterCustom: () => Record | undefined + search: (params: VectorSearchQuery) => Promise> queueEmbed: ( params: | { @@ -61,7 +69,7 @@ export type VectorizedPayload Promise /** Start a bulk embedding run for a knowledge pool */ - bulkEmbed: (params: { knowledgePool: TPoolNames }) => Promise + bulkEmbed: (params: { knowledgePool: KnowledgePoolName }) => Promise /** Retry a failed batch */ retryFailedBatch: (params: { batchId: string }) => Promise } @@ -70,13 +78,11 @@ export type VectorizedPayload( - payload: Payload, -): VectorizedPayload | null { - const custom = (payload.config as any)?.custom +export function getVectorizedPayload(payload: Payload): VectorizedPayload | null { + const custom = payload.config?.custom const vectorizedPayloadFactory = custom?.createVectorizedPayloadObject if (vectorizedPayloadFactory && typeof vectorizedPayloadFactory === 'function') { - return vectorizedPayloadFactory(payload) as VectorizedPayload + return vectorizedPayloadFactory(payload) as VectorizedPayload } return null } @@ -89,23 +95,27 @@ export type ToKnowledgePoolFn = ( payload: Payload, ) => Promise> +export type ShouldEmbedFn = ( + doc: Record, + payload: Payload, +) => Promise | boolean + export type CollectionVectorizeOption = { + /** Optional filter: return false to skip embedding this document. + * For bulk embeddings, runs before job is queued. + * If undefined, defaults to embedding all documents. 
*/ + shouldEmbedFn?: ShouldEmbedFn /** Function that converts a document to an array of chunks with optional extension field values */ toKnowledgePool: ToKnowledgePoolFn + /** Max documents to fetch from this collection per prepare job. + * Each page of results becomes a separate job linked to the next. + * Defaults to 1000 if not set. */ + batchLimit?: number } /** Knowledge pool name identifier */ export type KnowledgePoolName = string -/** Static configuration for a knowledge pool */ -/** Note current limitation: needs a migration in order to change after initial creation */ -export type KnowledgePoolStaticConfig = { - /** Vector dimensions for pgvector column */ - dims: number - /** IVFFLAT lists parameter used when creating the index */ - ivfflatLists: number -} - /** Dynamic configuration for a knowledge pool */ /** Does not need a migration in order to change after initial creation */ export type KnowledgePoolDynamicConfig = { @@ -177,7 +187,8 @@ export type PollBulkEmbeddingsResult = { export type AddChunkArgs = { /** The chunk to add */ chunk: BulkEmbeddingInput - /** True if this is the last chunk in the run */ + /** True if this is the last chunk in this job (forces flush). + * Note: may not be the last chunk in the entire run if batchLimit continuations are used. */ isLastChunk: boolean } @@ -233,8 +244,9 @@ export type BulkEmbeddingsFns = { * of them were submitted when you return a BatchSubmission. 
* * **About `isLastChunk`:** - * - `isLastChunk=true` indicates this is the final chunk in the run - * - Use this to flush any remaining accumulated chunks before the run completes + * - `isLastChunk=true` indicates this is the final chunk in this job + * - Use this to flush any remaining accumulated chunks before the job completes + * - When `batchLimit` is set, each job's last chunk gets `isLastChunk=true` (not just the run's last chunk) * - The plugin uses this only to know when to stop iterating, not to determine which chunks were submitted * * **Example flow when chunk would exceed limit:** @@ -258,9 +270,11 @@ export type BulkEmbeddingsFns = { onError?: (args: OnBulkErrorArgs) => Promise } -export type PayloadcmsVectorizeConfig = { +export type PayloadcmsVectorizeConfig = { + /** DbAdapter instance to use for the plugin */ + dbAdapter: DbAdapter /** Knowledge pools and their dynamic configurations */ - knowledgePools: Record + knowledgePools: Record /** Queue name for realtime vectorization jobs. * Default is Payload's default queue (undefined). */ realtimeQueueName?: string @@ -281,40 +295,10 @@ export type PayloadcmsVectorizeConfig Promise } - drizzle?: { execute: (sql: string) => Promise } - } -} { - return ( - typeof payload?.db?.pool?.query === 'function' || - typeof payload?.db?.drizzle?.execute === 'function' - ) -} - -// Type for Payload with Postgres database -export type PostgresPayload = any & { - db: { - pool?: { query: (sql: string, params?: any[]) => Promise } - drizzle?: { execute: (sql: string) => Promise } - } -} - -// Job task argument types -export type VectorizeTaskArgs = { - payload: any - pluginOptions: PayloadcmsVectorizeConfig - doc: Record - collection: string - knowledgePool: KnowledgePoolName - toKnowledgePoolFn: ToKnowledgePoolFn -} - export interface VectorSearchResult { id: string - similarity: number + /** Relevance score (higher = more relevant). Range depends on adapter implementation. 
*/ + score: number sourceCollection: string // The collection that this embedding belongs to docId: string // The ID of the source document chunkIndex: number // The index of this chunk @@ -323,13 +307,9 @@ export interface VectorSearchResult { [key: string]: any // Extension fields and other dynamic fields } -export interface VectorSearchResponse { - results: VectorSearchResult[] -} - -export interface VectorSearchQuery { +export interface VectorSearchQuery { /** The knowledge pool to search in */ - knowledgePool: TPoolNames + knowledgePool: KnowledgePoolName /** The search query string */ query: string /** Optional Payload where clause to filter results. Can rely on embeddings collection fields or extension fields. */ @@ -338,9 +318,89 @@ export interface VectorSearchQuery + createdAt: string + updatedAt: string +} + +export type DbAdapter = { + getConfigExtension: (payloadCmsConfig: Config) => { + bins?: { key: string; scriptPath: string }[] + custom?: Record + collections?: Record + } + search: ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit?: number, + where?: Where, + ) => Promise> + storeEmbedding: ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + sourceDocId: string, + embeddingId: string, + embedding: number[] | Float32Array, + ) => Promise + /** + * Delete embeddings for a source document + * Called when a document is deleted or re-indexed + * The adapter should delete all vectors associated with this document + */ + deleteEmbeddings?: ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + ) => Promise } diff --git a/src/utils/deleteDocumentEmbeddings.ts b/src/utils/deleteDocumentEmbeddings.ts new file mode 100644 index 0000000..413879b --- /dev/null +++ b/src/utils/deleteDocumentEmbeddings.ts @@ -0,0 +1,30 @@ +import type { CollectionSlug, Payload } from 'payload' +import type { DbAdapter, KnowledgePoolName } from '../types.js' + 
+/** + * Two-step deletion: removes embeddings from the Payload collection + * and then from the adapter's storage (for adapters that store vectors separately). + */ +export async function deleteDocumentEmbeddings(args: { + payload: Payload + poolName: KnowledgePoolName + collection: string + docId: string + adapter: DbAdapter +}): Promise { + const { payload, poolName, collection, docId, adapter } = args + + await payload.delete({ + collection: poolName as CollectionSlug, + where: { + and: [ + { sourceCollection: { equals: collection } }, + { docId: { equals: String(docId) } }, + ], + }, + }) + + if (adapter.deleteEmbeddings) { + await adapter.deleteEmbeddings(payload, poolName, collection, String(docId)) + } +} diff --git a/src/utils/validateChunkData.ts b/src/utils/validateChunkData.ts new file mode 100644 index 0000000..c3bb863 --- /dev/null +++ b/src/utils/validateChunkData.ts @@ -0,0 +1,33 @@ +/** + * Validates that each entry in chunkData is an object with a `chunk` string property. + * Throws a descriptive error listing invalid indices if any entries are malformed. + */ +export function validateChunkData( + chunkData: unknown[], + docId: string, + collection: string, +): void { + if (!Array.isArray(chunkData)) { + throw new Error( + `[payloadcms-vectorize] toKnowledgePool for collection "${collection}" must return an array of entries with a required "chunk" string`, + ) + } + + const invalidEntries = chunkData + .map((entry, idx) => { + if (!entry || typeof entry !== 'object') return idx + if (typeof (entry as Record).chunk !== 'string') return idx + return null + }) + .filter((idx): idx is number => idx !== null) + + if (invalidEntries.length > 0) { + throw new Error( + `[payloadcms-vectorize] toKnowledgePool returned ${invalidEntries.length} invalid entr${ + invalidEntries.length === 1 ? 'y' : 'ies' + } for document ${docId} in collection "${collection}". Each entry must be an object with a "chunk" string. 
Invalid indices: ${invalidEntries.join( + ', ', + )}`, + ) + } +} diff --git a/tsconfig.build.json b/tsconfig.build.json new file mode 100644 index 0000000..52a4b44 --- /dev/null +++ b/tsconfig.build.json @@ -0,0 +1,4 @@ +{ + "extends": "./tsconfig.json", + "include": ["./src/**/*.ts", "./src/**/*.tsx", "./dev/next-env.d.ts"] +} diff --git a/tsconfig.json b/tsconfig.json index 2622313..8c2fdbc 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -22,5 +22,5 @@ } ] }, - "include": ["./src/**/*.ts", "./src/**/*.tsx", "./dev/next-env.d.ts"] + "include": ["./src/**/*.ts", "./src/**/*.tsx", "./adapters/*/src/**/*.ts", "./dev/next-env.d.ts"] } diff --git a/vitest.config.js b/vitest.config.js index 9d7b479..ad613db 100644 --- a/vitest.config.js +++ b/vitest.config.js @@ -20,13 +20,13 @@ export default defineConfig(() => { environment: 'node', hookTimeout: 30_000, testTimeout: 30_000, + include: ['dev/specs/**/*.spec.ts'], exclude: ['**/e2e.spec.{ts,js}', '**/node_modules/**'], - // Run test files sequentially to avoid global state interference - // (embeddingsTables map and Payload instance caching) + // Each test file gets its own forked process so memory is fully + // reclaimed between files (prevents OOM on CI). + pool: 'forks', + // Run test files sequentially to avoid DB / global-state interference. fileParallelism: false, - // Disable parallel test execution within files as well - //threads: false, - //maxConcurrency: 1, }, } })