diff --git a/.claude/commands/validate-connector.md b/.claude/commands/validate-connector.md new file mode 100644 index 0000000000..adcbf61b12 --- /dev/null +++ b/.claude/commands/validate-connector.md @@ -0,0 +1,316 @@ +--- +description: Validate an existing knowledge base connector against its service's API docs +argument-hint: [api-docs-url] +--- + +# Validate Connector Skill + +You are an expert auditor for Sim knowledge base connectors. Your job is to thoroughly validate that an existing connector is correct, complete, and follows all conventions. + +## Your Task + +When the user asks you to validate a connector: +1. Read the service's API documentation (via Context7 or WebFetch) +2. Read the connector implementation, OAuth config, and registry entries +3. Cross-reference everything against the API docs and Sim conventions +4. Report all issues found, grouped by severity (critical, warning, suggestion) +5. Fix all issues after reporting them + +## Step 1: Gather All Files + +Read **every** file for the connector — do not skip any: + +``` +apps/sim/connectors/{service}/{service}.ts # Connector implementation +apps/sim/connectors/{service}/index.ts # Barrel export +apps/sim/connectors/registry.ts # Connector registry entry +apps/sim/connectors/types.ts # ConnectorConfig interface, ExternalDocument, etc. +apps/sim/connectors/utils.ts # Shared utilities (computeContentHash, htmlToPlainText, etc.) +apps/sim/lib/oauth/oauth.ts # OAUTH_PROVIDERS — single source of truth for scopes +apps/sim/lib/oauth/utils.ts # getCanonicalScopesForProvider, getScopesForService, SCOPE_DESCRIPTIONS +apps/sim/lib/oauth/types.ts # OAuthService union type +apps/sim/components/icons.tsx # Icon definition for the service +``` + +If the connector uses selectors, also read: +``` +apps/sim/hooks/selectors/registry.ts # Selector key definitions +apps/sim/hooks/selectors/types.ts # SelectorKey union type +apps/sim/lib/workflows/subblocks/context.ts # SELECTOR_CONTEXT_FIELDS +``` + +## Step 2: Pull API Documentation + +Fetch the official API docs for the service. This is the **source of truth** for: +- Endpoint URLs, HTTP methods, and auth headers +- Required vs optional parameters +- Parameter types and allowed values +- Response shapes and field names +- Pagination patterns (cursor, offset, next token) +- Rate limits and error formats +- OAuth scopes and their meanings + +Use Context7 (resolve-library-id → query-docs) or WebFetch to retrieve documentation. If both fail, note which claims are based on training knowledge vs verified docs. + +## Step 3: Validate API Endpoints + +For **every** API call in the connector (`listDocuments`, `getDocument`, `validateConfig`, and any helper functions), verify against the API docs: + +### URLs and Methods +- [ ] Base URL is correct for the service's API version +- [ ] Endpoint paths match the API docs exactly +- [ ] HTTP method is correct (GET, POST, PUT, PATCH, DELETE) +- [ ] Path parameters are correctly interpolated and URI-encoded where needed +- [ ] Query parameters use correct names and formats per the API docs + +### Headers +- [ ] Authorization header uses the correct format: + - OAuth: `Authorization: Bearer ${accessToken}` + - API Key: correct header name per the service's docs +- [ ] `Content-Type` is set for POST/PUT/PATCH requests +- [ ] Any service-specific headers are present (e.g., `Notion-Version`, `Dropbox-API-Arg`) +- [ ] No headers are sent that the API doesn't support or silently ignores + +### Request Bodies +- [ ] POST/PUT body fields match API parameter names exactly +- [ ] Required fields are always sent +- [ ] Optional fields are conditionally included (not sent as `null` or empty unless the API expects that) +- [ ] Field value types match API expectations (string vs number vs boolean) + +### Input Sanitization +- [ ] User-controlled values interpolated into query strings are properly escaped: + - OData `$filter`: single quotes escaped with `''` (e.g., `externalId.replace(/'/g, "''")`) + - SOQL: single quotes escaped with `\'` + - GraphQL variables: passed as variables, not interpolated into query strings + - URL path segments: `encodeURIComponent()` applied +- [ ] URL-type config fields (e.g., `siteUrl`, `instanceUrl`) are normalized: + - Strip `https://` / `http://` prefix if the API expects bare domains + - Strip trailing `/` + - Apply `.trim()` before validation + +### Response Parsing +- [ ] Response structure is correctly traversed (e.g., `data.results` vs `data.items` vs `data`) +- [ ] Field names extracted match what the API actually returns +- [ ] Nullable fields are handled with `?? null` or `|| undefined` +- [ ] Error responses are checked before accessing data fields + +## Step 4: Validate OAuth Scopes (if OAuth connector) + +Scopes must be correctly declared and sufficient for all API calls the connector makes. + +### Connector requiredScopes +- [ ] `requiredScopes` in the connector's `auth` config lists all scopes needed by the connector +- [ ] Each scope in `requiredScopes` is a real, valid scope recognized by the service's API +- [ ] No invalid, deprecated, or made-up scopes are listed +- [ ] No unnecessary excess scopes beyond what the connector actually needs + +### Scope Subset Validation (CRITICAL) +- [ ] Every scope in `requiredScopes` exists in the OAuth provider's `scopes` array in `lib/oauth/oauth.ts` +- [ ] Find the provider in `OAUTH_PROVIDERS[providerGroup].services[serviceId].scopes` +- [ ] Verify: `requiredScopes` ⊆ `OAUTH_PROVIDERS scopes` (every required scope is present in the provider config) +- [ ] If a required scope is NOT in the provider config, flag as **critical** — the connector will fail at runtime + +### Scope Sufficiency +For each API endpoint the connector calls: +- [ ] Identify which scopes are required per the API docs +- [ ] Verify those scopes are included in the connector's `requiredScopes` +- [ ] If the connector calls endpoints requiring scopes not in `requiredScopes`, flag as **warning** + +### Token Refresh Config +- [ ] Check the `getOAuthTokenRefreshConfig` function in `lib/oauth/oauth.ts` for this provider +- [ ] `useBasicAuth` matches the service's token exchange requirements +- [ ] `supportsRefreshTokenRotation` matches whether the service issues rotating refresh tokens +- [ ] Token endpoint URL is correct + +## Step 5: Validate Pagination + +### listDocuments Pagination +- [ ] Cursor/pagination parameter name matches the API docs +- [ ] Response pagination field is correctly extracted (e.g., `next_cursor`, `nextPageToken`, `@odata.nextLink`, `offset`) +- [ ] `hasMore` is correctly determined from the response +- [ ] `nextCursor` is correctly passed back for the next page +- [ ] `maxItems` / `maxRecords` cap is correctly applied across pages using `syncContext.totalDocsFetched` +- [ ] Page size is within the API's allowed range (not exceeding max page size) +- [ ] Last page precision: when a `maxItems` cap exists, the final page request uses `Math.min(PAGE_SIZE, remaining)` to avoid fetching more records than needed +- [ ] No off-by-one errors in pagination tracking +- [ ] The connector does NOT hit known API pagination limits silently (e.g., HubSpot search 10k cap) + +### Pagination State Across Pages +- [ ] `syncContext` is used to cache state across pages (user names, field maps, instance URLs, portal IDs, etc.) +- [ ] Cached state in `syncContext` is correctly initialized on first page and reused on subsequent pages + +## Step 6: Validate Data Transformation + +### ExternalDocument Construction +- [ ] `externalId` is a stable, unique identifier from the source API +- [ ] `title` is extracted from the correct field and has a sensible fallback (e.g., `'Untitled'`) +- [ ] `content` is plain text — HTML content is stripped using `htmlToPlainText` from `@/connectors/utils` +- [ ] `mimeType` is `'text/plain'` +- [ ] `contentHash` is computed using `computeContentHash` from `@/connectors/utils` +- [ ] `sourceUrl` is a valid, complete URL back to the original resource (not relative) +- [ ] `metadata` contains all fields referenced by `mapTags` and `tagDefinitions` + +### Content Extraction +- [ ] Rich text / HTML fields are converted to plain text before indexing +- [ ] Important content is not silently dropped (e.g., nested blocks, table cells, code blocks) +- [ ] Content is not silently truncated without logging a warning +- [ ] Empty/blank documents are properly filtered out +- [ ] Size checks use `Buffer.byteLength(text, 'utf8')` not `text.length` when comparing against byte-based limits (e.g., `MAX_FILE_SIZE` in bytes) + +## Step 7: Validate Tag Definitions and mapTags + +### tagDefinitions +- [ ] Each `tagDefinition` has an `id`, `displayName`, and `fieldType` +- [ ] `fieldType` matches the actual data type: `'text'` for strings, `'number'` for numbers, `'date'` for dates, `'boolean'` for booleans +- [ ] Every `id` in `tagDefinitions` is returned by `mapTags` +- [ ] No `tagDefinition` references a field that `mapTags` never produces + +### mapTags +- [ ] Return keys match `tagDefinition` `id` values exactly +- [ ] Date values are properly parsed using `parseTagDate` from `@/connectors/utils` +- [ ] Array values are properly joined using `joinTagArray` from `@/connectors/utils` +- [ ] Number values are validated (not `NaN`) +- [ ] Metadata field names accessed in `mapTags` match what `listDocuments`/`getDocument` store in `metadata` + +## Step 8: Validate Config Fields and Validation + +### configFields +- [ ] Every field has `id`, `title`, `type` +- [ ] `required` is set explicitly (not omitted) +- [ ] Dropdown fields have `options` with `label` and `id` for each option +- [ ] Selector fields follow the canonical pair pattern: + - A `type: 'selector'` field with `selectorKey`, `canonicalParamId`, `mode: 'basic'` + - A `type: 'short-input'` field with the same `canonicalParamId`, `mode: 'advanced'` + - `required` is identical on both fields in the pair +- [ ] `selectorKey` values exist in the selector registry +- [ ] `dependsOn` references selector field `id` values, not `canonicalParamId` + +### validateConfig +- [ ] Validates all required fields are present before making API calls +- [ ] Validates optional numeric fields (checks `Number.isNaN`, positive values) +- [ ] Makes a lightweight API call to verify access (e.g., fetch 1 record, get profile) +- [ ] Uses `VALIDATE_RETRY_OPTIONS` for retry budget +- [ ] Returns `{ valid: true }` on success +- [ ] Returns `{ valid: false, error: 'descriptive message' }` on failure +- [ ] Catches exceptions and returns user-friendly error messages +- [ ] Does NOT make expensive calls (full data listing, large queries) + +## Step 9: Validate getDocument + +- [ ] Fetches a single document by `externalId` +- [ ] Returns `null` for 404 / not found (does not throw) +- [ ] Returns the same `ExternalDocument` shape as `listDocuments` +- [ ] Handles all content types that `listDocuments` can produce (e.g., if `listDocuments` returns both pages and blogposts, `getDocument` must handle both — not hardcode one endpoint) +- [ ] Forwards `syncContext` if it needs cached state (user names, field maps, etc.) +- [ ] Error handling is graceful (catches, logs, returns null or throws with context) +- [ ] Does not redundantly re-fetch data already included in the initial API response (e.g., if comments come back with the post, don't fetch them again separately) + +## Step 10: Validate General Quality + +### fetchWithRetry Usage +- [ ] All external API calls use `fetchWithRetry` from `@/lib/knowledge/documents/utils` +- [ ] No raw `fetch()` calls to external APIs +- [ ] `VALIDATE_RETRY_OPTIONS` used in `validateConfig` +- [ ] If `validateConfig` calls a shared helper (e.g., `linearGraphQL`, `resolveId`), that helper must accept and forward `retryOptions` to `fetchWithRetry` +- [ ] Default retry options used in `listDocuments`/`getDocument` + +### API Efficiency +- [ ] APIs that support field selection (e.g., `$select`, `sysparm_fields`, `fields`) should request only the fields the connector needs — in both `listDocuments` AND `getDocument` +- [ ] No redundant API calls: if a helper already fetches data (e.g., site metadata), callers should reuse the result instead of making a second call for the same information +- [ ] Sequential per-item API calls (fetching details for each document in a loop) should be batched with `Promise.all` and a concurrency limit of 3-5 + +### Error Handling +- [ ] Individual document failures are caught and logged without aborting the sync +- [ ] API error responses include status codes in error messages +- [ ] No unhandled promise rejections in concurrent operations + +### Concurrency +- [ ] Concurrent API calls use reasonable batch sizes (3-5 is typical) +- [ ] No unbounded `Promise.all` over large arrays + +### Logging +- [ ] Uses `createLogger` from `@sim/logger` (not `console.log`) +- [ ] Logs sync progress at `info` level +- [ ] Logs errors at `warn` or `error` level with context + +### Registry +- [ ] Connector is exported from `connectors/{service}/index.ts` +- [ ] Connector is registered in `connectors/registry.ts` +- [ ] Registry key matches the connector's `id` field + +## Step 11: Report and Fix + +### Report Format + +Group findings by severity: + +**Critical** (will cause runtime errors, data loss, or auth failures): +- Wrong API endpoint URL or HTTP method +- Invalid or missing OAuth scopes (not in provider config) +- Incorrect response field mapping (accessing wrong path) +- SOQL/query fields that don't exist on the target object +- Pagination that silently hits undocumented API limits +- Missing error handling that would crash the sync +- `requiredScopes` not a subset of OAuth provider scopes +- Query/filter injection: user-controlled values interpolated into OData `$filter`, SOQL, or query strings without escaping + +**Warning** (incorrect behavior, data quality issues, or convention violations): +- HTML content not stripped via `htmlToPlainText` +- `getDocument` not forwarding `syncContext` +- `getDocument` hardcoded to one content type when `listDocuments` returns multiple (e.g., only pages but not blogposts) +- Missing `tagDefinition` for metadata fields returned by `mapTags` +- Incorrect `useBasicAuth` or `supportsRefreshTokenRotation` in token refresh config +- Invalid scope names that the API doesn't recognize (even if silently ignored) +- Private resources excluded from name-based lookup despite scopes being available +- Silent data truncation without logging +- Size checks using `text.length` (character count) instead of `Buffer.byteLength` (byte count) for byte-based limits +- URL-type config fields not normalized (protocol prefix, trailing slashes cause API failures) +- `VALIDATE_RETRY_OPTIONS` not threaded through helper functions called by `validateConfig` + +**Suggestion** (minor improvements): +- Missing incremental sync support despite API supporting it +- Overly broad scopes that could be narrowed (not wrong, but could be tighter) +- Source URL format could be more specific +- Missing `orderBy` for deterministic pagination +- Redundant API calls that could be cached in `syncContext` +- Sequential per-item API calls that could be batched with `Promise.all` (concurrency 3-5) +- API supports field selection but connector fetches all fields (e.g., missing `$select`, `sysparm_fields`, `fields`) +- `getDocument` re-fetches data already included in the initial API response (e.g., comments returned with post) +- Last page of pagination requests full `PAGE_SIZE` when fewer records remain (`Math.min(PAGE_SIZE, remaining)`) + +### Fix All Issues + +After reporting, fix every **critical** and **warning** issue. Apply **suggestions** where they don't add unnecessary complexity. + +### Validation Output + +After fixing, confirm: +1. `bun run lint` passes +2. TypeScript compiles clean +3. Re-read all modified files to verify fixes are correct + +## Checklist Summary + +- [ ] Read connector implementation, types, utils, registry, and OAuth config +- [ ] Pulled and read official API documentation for the service +- [ ] Validated every API endpoint URL, method, headers, and body against API docs +- [ ] Validated input sanitization: no query/filter injection, URL fields normalized +- [ ] Validated OAuth scopes: `requiredScopes` ⊆ OAuth provider `scopes` in `oauth.ts` +- [ ] Validated each scope is real and recognized by the service's API +- [ ] Validated scopes are sufficient for all API endpoints the connector calls +- [ ] Validated token refresh config (`useBasicAuth`, `supportsRefreshTokenRotation`) +- [ ] Validated pagination: cursor names, page sizes, hasMore logic, no silent caps +- [ ] Validated data transformation: plain text extraction, HTML stripping, content hashing +- [ ] Validated tag definitions match mapTags output, correct fieldTypes +- [ ] Validated config fields: canonical pairs, selector keys, required flags +- [ ] Validated validateConfig: lightweight check, error messages, retry options +- [ ] Validated getDocument: null on 404, all content types handled, no redundant re-fetches, syncContext forwarding +- [ ] Validated fetchWithRetry used for all external calls (no raw fetch), VALIDATE_RETRY_OPTIONS threaded through helpers +- [ ] Validated API efficiency: field selection used, no redundant calls, sequential fetches batched +- [ ] Validated error handling: graceful failures, no unhandled rejections +- [ ] Validated logging: createLogger, no console.log +- [ ] Validated registry: correct export, correct key +- [ ] Reported all issues grouped by severity +- [ ] Fixed all critical and warning issues +- [ ] Ran `bun run lint` after fixes +- [ ] Verified TypeScript compiles clean diff --git a/apps/sim/connectors/confluence/confluence.ts b/apps/sim/connectors/confluence/confluence.ts index 6ae872409e..319f665448 100644 --- a/apps/sim/connectors/confluence/confluence.ts +++ b/apps/sim/connectors/confluence/confluence.ts @@ -1,6 +1,6 @@ import { createLogger } from '@sim/logger' import { ConfluenceIcon } from '@/components/icons' -import { fetchWithRetry } from '@/lib/knowledge/documents/utils' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' import { computeContentHash, htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils' import { getConfluenceCloudId } from '@/tools/confluence/utils' @@ -243,23 +243,31 @@ export const confluenceConnector: ConnectorConfig = { const domain = sourceConfig.domain as string const cloudId = await getConfluenceCloudId(domain, accessToken) - const url = `https://api.atlassian.com/ex/confluence/${cloudId}/wiki/api/v2/pages/${externalId}?body-format=storage` - - const response = await fetchWithRetry(url, { - method: 'GET', - headers: { - Accept: 'application/json', - Authorization: `Bearer ${accessToken}`, - }, - }) + // Try pages first, fall back to blogposts if not found + let page: Record | null = null + for (const endpoint of ['pages', 'blogposts']) { + const url = `https://api.atlassian.com/ex/confluence/${cloudId}/wiki/api/v2/${endpoint}/${externalId}?body-format=storage` + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { + Accept: 'application/json', + Authorization: `Bearer ${accessToken}`, + }, + }) - if (!response.ok) { - if (response.status === 404) return null - throw new Error(`Failed to get Confluence page: ${response.status}`) + if (response.ok) { + page = await response.json() + break + } + if (response.status !== 404) { + throw new Error(`Failed to get Confluence content: ${response.status}`) + } } - const page = await response.json() - const rawContent = page.body?.storage?.value || '' + if (!page) return null + const body = page.body as Record | undefined + const storage = body?.storage as Record | undefined + const rawContent = (storage?.value as string) || '' const plainText = htmlToPlainText(rawContent) const contentHash = await computeContentHash(plainText) @@ -267,19 +275,22 @@ export const confluenceConnector: ConnectorConfig = { const labelMap = await fetchLabelsForPages(cloudId, accessToken, [String(page.id)]) const labels = labelMap.get(String(page.id)) ?? [] + const links = page._links as Record | undefined + const version = page.version as Record | undefined + return { externalId: String(page.id), - title: page.title || 'Untitled', + title: (page.title as string) || 'Untitled', content: plainText, mimeType: 'text/plain', - sourceUrl: page._links?.webui ? `https://${domain}/wiki${page._links.webui}` : undefined, + sourceUrl: links?.webui ? `https://${domain}/wiki${links.webui}` : undefined, contentHash, metadata: { spaceId: page.spaceId, status: page.status, - version: page.version?.number, + version: version?.number, labels, - lastModified: page.version?.createdAt, + lastModified: version?.createdAt, }, } }, @@ -302,7 +313,25 @@ export const confluenceConnector: ConnectorConfig = { try { const cloudId = await getConfluenceCloudId(domain, accessToken) - await resolveSpaceId(cloudId, accessToken, spaceKey) + const spaceUrl = `https://api.atlassian.com/ex/confluence/${cloudId}/wiki/api/v2/spaces?keys=${encodeURIComponent(spaceKey)}&limit=1` + const response = await fetchWithRetry( + spaceUrl, + { + method: 'GET', + headers: { + Accept: 'application/json', + Authorization: `Bearer ${accessToken}`, + }, + }, + VALIDATE_RETRY_OPTIONS + ) + if (!response.ok) { + return { valid: false, error: `Failed to validate space: ${response.status}` } + } + const data = await response.json() + if (!data.results?.length) { + return { valid: false, error: `Space "${spaceKey}" not found` } + } return { valid: true } } catch (error) { const message = error instanceof Error ? error.message : 'Failed to validate configuration' diff --git a/apps/sim/connectors/github/github.ts b/apps/sim/connectors/github/github.ts index 80e9ca3242..065f6e0835 100644 --- a/apps/sim/connectors/github/github.ts +++ b/apps/sim/connectors/github/github.ts @@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger' import { GithubIcon } from '@/components/icons' import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' -import { computeContentHash } from '@/connectors/utils' +import { computeContentHash, parseTagDate } from '@/connectors/utils' const logger = createLogger('GitHubConnector') @@ -82,7 +82,7 @@ async function fetchTree( const data = await response.json() if (data.truncated) { - logger.error('GitHub tree was truncated — some files may be missing', { owner, repo, branch }) + logger.warn('GitHub tree was truncated — some files may be missing', { owner, repo, branch }) } return (data.tree || []).filter((item: TreeItem) => item.type === 'blob') @@ -139,7 +139,7 @@ async function treeItemToDocument( title: item.path.split('/').pop() || item.path, content, mimeType: 'text/plain', - sourceUrl: `https://github.com/${owner}/${repo}/blob/${branch}/${item.path}`, + sourceUrl: `https://github.com/${owner}/${repo}/blob/${encodeURIComponent(branch)}/${item.path.split('/').map(encodeURIComponent).join('/')}`, contentHash, metadata: { path: item.path, @@ -302,6 +302,7 @@ export const githubConnector: ConnectorConfig = { throw new Error(`Failed to fetch file ${path}: ${response.status}`) } + const lastModifiedHeader = response.headers.get('last-modified') || undefined const data = await response.json() const content = data.encoding === 'base64' @@ -314,7 +315,7 @@ export const githubConnector: ConnectorConfig = { title: path.split('/').pop() || path, content, mimeType: 'text/plain', - sourceUrl: `https://github.com/${owner}/${repo}/blob/${branch}/${path}`, + sourceUrl: `https://github.com/${owner}/${repo}/blob/${encodeURIComponent(branch)}/${path.split('/').map(encodeURIComponent).join('/')}`, contentHash, metadata: { path, @@ -322,6 +323,7 @@ export const githubConnector: ConnectorConfig = { size: data.size as number, branch, repository: `${owner}/${repo}`, + lastModified: lastModifiedHeader, }, } } catch (error) { @@ -400,6 +402,7 @@ export const githubConnector: ConnectorConfig = { { id: 'repository', displayName: 'Repository', fieldType: 'text' }, { id: 'branch', displayName: 'Branch', fieldType: 'text' }, { id: 'size', displayName: 'File Size', fieldType: 'number' }, + { id: 'lastModified', displayName: 'Last Modified', fieldType: 'date' }, ], mapTags: (metadata: Record): Record => { @@ -414,6 +417,9 @@ export const githubConnector: ConnectorConfig = { if (!Number.isNaN(num)) result.size = num } + const lastModified = parseTagDate(metadata.lastModified) + if (lastModified) result.lastModified = lastModified + return result }, } diff --git a/apps/sim/connectors/google-calendar/google-calendar.ts b/apps/sim/connectors/google-calendar/google-calendar.ts index 29d97fc7d2..1fbda8d1a3 100644 --- a/apps/sim/connectors/google-calendar/google-calendar.ts +++ b/apps/sim/connectors/google-calendar/google-calendar.ts @@ -439,6 +439,8 @@ export const googleCalendarConnector: ConnectorConfig = { { id: 'attendeeCount', displayName: 'Attendee Count', fieldType: 'number' }, { id: 'location', displayName: 'Location', fieldType: 'text' }, { id: 'eventDate', displayName: 'Event Date', fieldType: 'date' }, + { id: 'lastModified', displayName: 'Last Modified', fieldType: 'date' }, + { id: 'createdAt', displayName: 'Created', fieldType: 'date' }, ], mapTags: (metadata: Record): Record => { @@ -459,6 +461,12 @@ export const googleCalendarConnector: ConnectorConfig = { const eventDate = parseTagDate(metadata.eventDate) if (eventDate) result.eventDate = eventDate + const lastModified = parseTagDate(metadata.updatedTime) + if (lastModified) result.lastModified = lastModified + + const createdAt = parseTagDate(metadata.createdTime) + if (createdAt) result.createdAt = createdAt + return result }, } diff --git a/apps/sim/connectors/google-docs/google-docs.ts b/apps/sim/connectors/google-docs/google-docs.ts index 2c7276a94c..2c80dc23e1 100644 --- a/apps/sim/connectors/google-docs/google-docs.ts +++ b/apps/sim/connectors/google-docs/google-docs.ts @@ -162,7 +162,7 @@ function buildQuery(sourceConfig: Record): string { const folderId = sourceConfig.folderId as string | undefined if (folderId?.trim()) { - parts.push(`'${folderId.trim()}' in parents`) + parts.push(`'${folderId.trim().replace(/'/g, "\\'")}' in parents`) } return parts.join(' and ') diff --git a/apps/sim/connectors/google-drive/google-drive.ts b/apps/sim/connectors/google-drive/google-drive.ts index 7025638e2a..4d60d6f787 100644 --- a/apps/sim/connectors/google-drive/google-drive.ts +++ b/apps/sim/connectors/google-drive/google-drive.ts @@ -112,7 +112,7 @@ function buildQuery(sourceConfig: Record): string { const folderId = sourceConfig.folderId as string | undefined if (folderId?.trim()) { - parts.push(`'${folderId.trim()}' in parents`) + parts.push(`'${folderId.trim().replace(/'/g, "\\'")}' in parents`) } const fileType = (sourceConfig.fileType as string) || 'all' diff --git a/apps/sim/connectors/google-sheets/google-sheets.ts b/apps/sim/connectors/google-sheets/google-sheets.ts index 3a094bc965..9b70ea9a94 100644 --- a/apps/sim/connectors/google-sheets/google-sheets.ts +++ b/apps/sim/connectors/google-sheets/google-sheets.ts @@ -2,11 +2,12 @@ import { createLogger } from '@sim/logger' import { GoogleSheetsIcon } from '@/components/icons' import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' -import { computeContentHash } from '@/connectors/utils' +import { computeContentHash, parseTagDate } from '@/connectors/utils' const logger = createLogger('GoogleSheetsConnector') const SHEETS_API_BASE = 'https://sheets.googleapis.com/v4/spreadsheets' +const DRIVE_API_BASE = 'https://www.googleapis.com/drive/v3/files' const MAX_ROWS = 10000 const CONCURRENCY = 3 @@ -102,6 +103,38 @@ async function fetchSpreadsheetMetadata( return (await response.json()) as SpreadsheetMetadata } +/** + * Fetches the spreadsheet's modifiedTime from the Drive API. + */ +async function fetchSpreadsheetModifiedTime( + accessToken: string, + spreadsheetId: string +): Promise { + try { + const url = `${DRIVE_API_BASE}/${encodeURIComponent(spreadsheetId)}?fields=modifiedTime&supportsAllDrives=true` + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }) + + if (!response.ok) { + logger.warn('Failed to fetch modifiedTime from Drive API', { status: response.status }) + return undefined + } + + const data = (await response.json()) as { modifiedTime?: string } + return data.modifiedTime + } catch (error) { + logger.warn('Error fetching modifiedTime from Drive API', { + error: error instanceof Error ? error.message : String(error), + }) + return undefined + } +} + /** * Converts a single sheet tab into an ExternalDocument. */ @@ -109,7 +142,8 @@ async function sheetToDocument( accessToken: string, spreadsheetId: string, spreadsheetTitle: string, - sheet: SheetProperties + sheet: SheetProperties, + modifiedTime?: string ): Promise { try { const values = await fetchSheetValues(accessToken, spreadsheetId, sheet.title) @@ -151,6 +185,7 @@ async function sheetToDocument( sheetId: sheet.sheetId, rowCount, columnCount: headers.length, + ...(modifiedTime ? { modifiedTime } : {}), }, } } catch (error) { @@ -208,7 +243,10 @@ export const googleSheetsConnector: ConnectorConfig = { logger.info('Fetching spreadsheet metadata', { spreadsheetId }) - const metadata = await fetchSpreadsheetMetadata(accessToken, spreadsheetId) + const [metadata, modifiedTime] = await Promise.all([ + fetchSpreadsheetMetadata(accessToken, spreadsheetId), + fetchSpreadsheetModifiedTime(accessToken, spreadsheetId), + ]) const sheetFilter = (sourceConfig.sheetFilter as string) || 'all' let sheets = metadata.sheets.map((s) => s.properties) @@ -226,7 +264,13 @@ export const googleSheetsConnector: ConnectorConfig = { const batch = sheets.slice(i, i + CONCURRENCY) const results = await Promise.all( batch.map((sheet) => - sheetToDocument(accessToken, spreadsheetId, metadata.properties.title, sheet) + sheetToDocument( + accessToken, + spreadsheetId, + metadata.properties.title, + sheet, + modifiedTime + ) ) ) documents.push(...(results.filter(Boolean) as ExternalDocument[])) @@ -257,7 +301,22 @@ export const googleSheetsConnector: ConnectorConfig = { return null } - const metadata = await fetchSpreadsheetMetadata(accessToken, spreadsheetId) + let metadata: SpreadsheetMetadata + let modifiedTime: string | undefined + try { + ;[metadata, modifiedTime] = await Promise.all([ + fetchSpreadsheetMetadata(accessToken, spreadsheetId), + fetchSpreadsheetModifiedTime(accessToken, spreadsheetId), + ]) + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + if (message.includes('404')) { + logger.info('Spreadsheet not found (possibly deleted)', { spreadsheetId }) + return null + } + throw error + } + const sheetEntry = metadata.sheets.find((s) => s.properties.sheetId === sheetId) if (!sheetEntry) { @@ -269,7 +328,8 @@ export const googleSheetsConnector: ConnectorConfig = { accessToken, spreadsheetId, metadata.properties.title, - sheetEntry.properties + sheetEntry.properties, + modifiedTime ) }, @@ -325,6 +385,7 @@ export const googleSheetsConnector: ConnectorConfig = { { id: 'sheetTitle', displayName: 'Sheet Name', fieldType: 'text' }, { id: 'rowCount', displayName: 'Row Count', fieldType: 'number' }, { id: 'columnCount', displayName: 'Column Count', fieldType: 'number' }, + { id: 'lastModified', displayName: 'Last Modified', fieldType: 'date' }, ], mapTags: (metadata: Record): Record => { @@ -342,6 +403,11 @@ export const googleSheetsConnector: ConnectorConfig = { result.columnCount = metadata.columnCount } + const lastModified = parseTagDate(metadata.modifiedTime) + if (lastModified) { + result.lastModified = lastModified + } + return result }, } diff --git a/apps/sim/connectors/hubspot/hubspot.ts b/apps/sim/connectors/hubspot/hubspot.ts index 27812ecfe6..2461ec8ac8 100644 --- a/apps/sim/connectors/hubspot/hubspot.ts +++ b/apps/sim/connectors/hubspot/hubspot.ts @@ -223,20 +223,15 @@ export const hubspotConnector: ConnectorConfig = { const portalId = await getPortalId(accessToken, syncContext) - const body: Record = { - filterGroups: [], - sorts: [ - { - propertyName: objectType === 'contacts' ? 'lastmodifieddate' : 'hs_lastmodifieddate', - direction: 'DESCENDING', - }, - ], - properties: [...properties], + const sortProperty = objectType === 'contacts' ? 'lastmodifieddate' : 'hs_lastmodifieddate' + + const searchBody: Record = { + properties, + sorts: [{ propertyName: sortProperty, direction: 'DESCENDING' }], limit: PAGE_SIZE, } - if (cursor) { - body.after = cursor + searchBody.after = cursor } logger.info(`Listing HubSpot ${objectType}`, { cursor }) @@ -248,16 +243,16 @@ export const hubspotConnector: ConnectorConfig = { Accept: 'application/json', Authorization: `Bearer ${accessToken}`, }, - body: JSON.stringify(body), + body: JSON.stringify(searchBody), }) if (!response.ok) { const errorText = await response.text() - logger.error(`Failed to search HubSpot ${objectType}`, { + logger.error(`Failed to list HubSpot ${objectType}`, { status: response.status, error: errorText, }) - throw new Error(`Failed to search HubSpot ${objectType}: ${response.status}`) + throw new Error(`Failed to list HubSpot ${objectType}: ${response.status}`) } const data = await response.json() @@ -294,12 +289,17 @@ export const hubspotConnector: ConnectorConfig = { getDocument: async ( accessToken: string, sourceConfig: Record, - externalId: string + externalId: string, + syncContext?: Record ): Promise => { const objectType = sourceConfig.objectType as string const properties = OBJECT_PROPERTIES[objectType] || [] - const portalId = await getPortalId(accessToken) + let portalId = syncContext?.portalId as string | undefined + if (!portalId) { + portalId = await getPortalId(accessToken) + if (syncContext) syncContext.portalId = portalId + } const params = new URLSearchParams() for (const prop of properties) { @@ -346,19 +346,13 @@ export const hubspotConnector: ConnectorConfig = { try { const response = await fetchWithRetry( - `${BASE_URL}/crm/v3/objects/${objectType}/search`, + `${BASE_URL}/crm/v3/objects/${objectType}?limit=1`, { - method: 'POST', + method: 'GET', headers: { - 'Content-Type': 'application/json', Accept: 'application/json', Authorization: `Bearer ${accessToken}`, }, - body: JSON.stringify({ - filterGroups: [], - limit: 1, - properties: ['hs_object_id'], - }), }, VALIDATE_RETRY_OPTIONS ) diff --git a/apps/sim/connectors/jira/jira.ts b/apps/sim/connectors/jira/jira.ts index 2cbd084cf6..aaa1218a58 100644 --- a/apps/sim/connectors/jira/jira.ts +++ b/apps/sim/connectors/jira/jira.ts @@ -155,7 +155,11 @@ export const jiraConnector: ConnectorConfig = { const params = new URLSearchParams() params.append('jql', jql) params.append('startAt', String(startAt)) - params.append('maxResults', String(PAGE_SIZE)) + const remaining = maxIssues > 0 ? Math.max(0, maxIssues - startAt) : PAGE_SIZE + if (remaining === 0) { + return { documents: [], hasMore: false } + } + params.append('maxResults', String(Math.min(PAGE_SIZE, remaining))) params.append( 'fields', 'summary,description,comment,issuetype,status,priority,assignee,reporter,project,labels,created,updated' @@ -203,10 +207,15 @@ export const jiraConnector: ConnectorConfig = { getDocument: async ( accessToken: string, sourceConfig: Record, - externalId: string + externalId: string, + syncContext?: Record ): Promise => { const domain = sourceConfig.domain as string - const cloudId = await getJiraCloudId(domain, accessToken) + let cloudId = syncContext?.cloudId as string | undefined + if (!cloudId) { + cloudId = await getJiraCloudId(domain, accessToken) + if (syncContext) syncContext.cloudId = cloudId + } const params = new URLSearchParams() params.append( diff --git a/apps/sim/connectors/linear/linear.ts b/apps/sim/connectors/linear/linear.ts index 72333d1c86..cd779b3385 100644 --- a/apps/sim/connectors/linear/linear.ts +++ b/apps/sim/connectors/linear/linear.ts @@ -1,6 +1,7 @@ import { createLogger } from '@sim/logger' import { LinearIcon } from '@/components/icons' -import { fetchWithRetry } from '@/lib/knowledge/documents/utils' +import type { RetryOptions } from '@/lib/knowledge/documents/utils' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils' @@ -35,16 +36,21 @@ function markdownToPlainText(md: string): string { async function linearGraphQL( accessToken: string, query: string, - variables?: Record + variables?: Record, + retryOptions?: RetryOptions ): Promise> { - const response = await fetchWithRetry(LINEAR_API, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${accessToken}`, + const response = await fetchWithRetry( + LINEAR_API, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${accessToken}`, + }, + body: JSON.stringify({ query, variables }), }, - body: JSON.stringify({ query, variables }), - }) + retryOptions + ) if (!response.ok) { const errorText = await response.text() @@ -374,7 +380,7 @@ export const linearConnector: ConnectorConfig = { try { // Verify the token works by fetching teams - const data = await linearGraphQL(accessToken, TEAMS_QUERY) + const data = await linearGraphQL(accessToken, TEAMS_QUERY, undefined, VALIDATE_RETRY_OPTIONS) const teamsConn = data.teams as Record const teams = (teamsConn.nodes || []) as Record[] diff --git a/apps/sim/connectors/microsoft-teams/microsoft-teams.ts b/apps/sim/connectors/microsoft-teams/microsoft-teams.ts index f59add9693..d47f49bc43 100644 --- a/apps/sim/connectors/microsoft-teams/microsoft-teams.ts +++ b/apps/sim/connectors/microsoft-teams/microsoft-teams.ts @@ -66,7 +66,6 @@ async function graphApiGet( headers: { Authorization: `Bearer ${accessToken}`, Accept: 'application/json', - Prefer: 'outlook.body-content-type="text"', }, }, retryOptions @@ -191,7 +190,7 @@ export const microsoftTeamsConnector: ConnectorConfig = { auth: { mode: 'oauth', provider: 'microsoft-teams', - requiredScopes: ['ChannelMessage.Read.All'], + requiredScopes: ['ChannelMessage.Read.All', 'Channel.ReadBasic.All'], }, configFields: [ diff --git a/apps/sim/connectors/notion/notion.ts b/apps/sim/connectors/notion/notion.ts index 6062177102..254fc51159 100644 --- a/apps/sim/connectors/notion/notion.ts +++ b/apps/sim/connectors/notion/notion.ts @@ -538,10 +538,8 @@ async function listFromParentPage( const data = await response.json() const blocks = (data.results || []) as Record[] - // Filter to child_page and child_database blocks - const childPageIds = blocks - .filter((b) => b.type === 'child_page' || b.type === 'child_database') - .map((b) => b.id as string) + // Filter to child_page blocks only (child_database blocks cannot be fetched via the Pages API) + const childPageIds = blocks.filter((b) => b.type === 'child_page').map((b) => b.id as string) // Also include the root page itself on the first call (no cursor) const pageIdsToFetch = !cursor ? [rootPageId, ...childPageIds] : childPageIds diff --git a/apps/sim/connectors/obsidian/obsidian.ts b/apps/sim/connectors/obsidian/obsidian.ts index d7a554eb5b..1ad77a6da9 100644 --- a/apps/sim/connectors/obsidian/obsidian.ts +++ b/apps/sim/connectors/obsidian/obsidian.ts @@ -200,33 +200,47 @@ export const obsidianConnector: ConnectorConfig = { const documents: ExternalDocument[] = [] - for (const filePath of pageFiles) { - try { - const note = await fetchNote(baseUrl, accessToken, filePath) - const content = note.content || '' - const contentHash = await computeContentHash(content) - - documents.push({ - externalId: filePath, - title: titleFromPath(filePath), - content, - mimeType: 'text/plain', - sourceUrl: `${baseUrl}/vault/${filePath.split('/').map(encodeURIComponent).join('/')}`, - contentHash, - metadata: { - tags: note.tags, - frontmatter: note.frontmatter, - createdAt: note.stat?.ctime ? new Date(note.stat.ctime).toISOString() : undefined, - modifiedAt: note.stat?.mtime ? new Date(note.stat.mtime).toISOString() : undefined, - size: note.stat?.size, - folder: filePath.includes('/') ? filePath.substring(0, filePath.lastIndexOf('/')) : '', - }, - }) - } catch (error) { - logger.warn('Failed to fetch note', { - filePath, - error: error instanceof Error ? error.message : String(error), + const BATCH_SIZE = 5 + for (let i = 0; i < pageFiles.length; i += BATCH_SIZE) { + const batch = pageFiles.slice(i, i + BATCH_SIZE) + const results = await Promise.all( + batch.map(async (filePath) => { + try { + const note = await fetchNote(baseUrl, accessToken, filePath) + const content = note.content || '' + const contentHash = await computeContentHash(content) + + return { + externalId: filePath, + title: titleFromPath(filePath), + content, + mimeType: 'text/plain' as const, + sourceUrl: `${baseUrl}/vault/${filePath.split('/').map(encodeURIComponent).join('/')}`, + contentHash, + metadata: { + tags: note.tags, + frontmatter: note.frontmatter, + createdAt: note.stat?.ctime ? new Date(note.stat.ctime).toISOString() : undefined, + modifiedAt: note.stat?.mtime ? new Date(note.stat.mtime).toISOString() : undefined, + size: note.stat?.size, + folder: filePath.includes('/') + ? filePath.substring(0, filePath.lastIndexOf('/')) + : '', + }, + } + } catch (error) { + logger.warn('Failed to fetch note', { + filePath, + error: error instanceof Error ? error.message : String(error), + }) + return null + } }) + ) + for (const doc of results) { + if (doc) { + documents.push(doc) + } } } diff --git a/apps/sim/connectors/onedrive/onedrive.ts b/apps/sim/connectors/onedrive/onedrive.ts index 49c7a3fcfc..3557767878 100644 --- a/apps/sim/connectors/onedrive/onedrive.ts +++ b/apps/sim/connectors/onedrive/onedrive.ts @@ -71,8 +71,8 @@ async function downloadFileContent(accessToken: string, fileId: string): Promise } const text = await response.text() - if (text.length > MAX_FILE_SIZE) { - return text.slice(0, MAX_FILE_SIZE) + if (Buffer.byteLength(text, 'utf8') > MAX_FILE_SIZE) { + return Buffer.from(text, 'utf8').subarray(0, MAX_FILE_SIZE).toString('utf8') } return text } diff --git a/apps/sim/connectors/outlook/outlook.ts b/apps/sim/connectors/outlook/outlook.ts index 34db1b1874..1c800701ca 100644 --- a/apps/sim/connectors/outlook/outlook.ts +++ b/apps/sim/connectors/outlook/outlook.ts @@ -464,7 +464,7 @@ export const outlookConnector: ConnectorConfig = { try { // Fetch messages for this conversation const params = new URLSearchParams({ - $filter: `conversationId eq '${externalId}'`, + $filter: `conversationId eq '${externalId.replace(/'/g, "''")}'`, $select: MESSAGE_FIELDS, $top: '50', }) @@ -557,7 +557,12 @@ export const outlookConnector: ConnectorConfig = { // If a search query is specified, verify it's valid with a dry run const searchQuery = sourceConfig.query as string | undefined if (searchQuery?.trim()) { - const searchUrl = `${GRAPH_API_BASE}/messages?$search="${encodeURIComponent(searchQuery.trim())}"&$top=1&$select=id` + const searchParams = new URLSearchParams({ + $search: `"${searchQuery.trim()}"`, + $top: '1', + $select: 'id', + }) + const searchUrl = `${GRAPH_API_BASE}/messages?${searchParams.toString()}` const searchResponse = await fetchWithRetry( searchUrl, { diff --git a/apps/sim/connectors/reddit/reddit.ts b/apps/sim/connectors/reddit/reddit.ts index f4449b5442..b74a8d4b0b 100644 --- a/apps/sim/connectors/reddit/reddit.ts +++ b/apps/sim/connectors/reddit/reddit.ts @@ -103,18 +103,7 @@ async function fetchPostComments( if (!Array.isArray(data) || data.length < 2) return [] - const commentListing = data[1] - const comments: string[] = [] - - for (const child of commentListing.data.children) { - if (child.kind !== 't1') continue - const comment = child as RedditComment - if (!comment.data.body || comment.data.author === 'AutoModerator') continue - comments.push(`[${comment.data.author} | score: ${comment.data.score}]: ${comment.data.body}`) - if (comments.length >= maxComments) break - } - - return comments + return extractComments(data[1], maxComments) } catch (error) { logger.warn('Failed to fetch comments for post', { postId, @@ -124,13 +113,32 @@ async function fetchPostComments( } } +/** + * Extracts formatted comment strings from a Reddit comment listing. + */ +function extractComments(commentListing: RedditListing, maxComments: number): string[] { + const comments: string[] = [] + + for (const child of commentListing.data.children) { + if (child.kind !== 't1') continue + const comment = child as RedditComment + if (!comment.data.body || comment.data.author === 'AutoModerator') continue + comments.push(`[${comment.data.author} | score: ${comment.data.score}]: ${comment.data.body}`) + if (comments.length >= maxComments) break + } + + return comments +} + /** * Formats a Reddit post with its comments into a document content string. + * When `prefetchedComments` is provided, uses those directly instead of fetching. */ async function formatPostContent( accessToken: string, post: RedditPost['data'], - maxComments: number + maxComments: number, + prefetchedComments?: string[] ): Promise { const lines: string[] = [] @@ -153,7 +161,9 @@ async function formatPostContent( } if (maxComments > 0) { - const comments = await fetchPostComments(accessToken, post.subreddit, post.id, maxComments) + const comments = + prefetchedComments ?? + (await fetchPostComments(accessToken, post.subreddit, post.id, maxComments)) if (comments.length > 0) { lines.push('---') lines.push(`Top Comments (${comments.length}):`) @@ -384,7 +394,9 @@ export const redditConnector: ConnectorConfig = { if (postChildren.length === 0) return null const post = postChildren[0].data - const content = await formatPostContent(accessToken, post, COMMENTS_PER_POST) + const comments = + data.length >= 2 ? extractComments(data[1] as RedditListing, COMMENTS_PER_POST) : [] + const content = await formatPostContent(accessToken, post, COMMENTS_PER_POST, comments) const contentHash = await computeContentHash(content) return { diff --git a/apps/sim/connectors/salesforce/salesforce.ts b/apps/sim/connectors/salesforce/salesforce.ts index 6379e69ef4..c829187c69 100644 --- a/apps/sim/connectors/salesforce/salesforce.ts +++ b/apps/sim/connectors/salesforce/salesforce.ts @@ -2,24 +2,17 @@ import { createLogger } from '@sim/logger' import { SalesforceIcon } from '@/components/icons' import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' -import { computeContentHash, parseTagDate } from '@/connectors/utils' +import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils' const logger = createLogger('SalesforceConnector') const USERINFO_URL = 'https://login.salesforce.com/services/oauth2/userinfo' -const API_VERSION = 'v59.0' +const API_VERSION = 'v62.0' const PAGE_SIZE = 200 /** SOQL field lists per object type. */ const OBJECT_FIELDS: Record = { - KnowledgeArticleVersion: [ - 'Id', - 'Title', - 'Summary', - 'ArticleBody', - 'LastModifiedDate', - 'ArticleNumber', - ], + KnowledgeArticleVersion: ['Id', 'Title', 'Summary', 'LastModifiedDate', 'ArticleNumber'], Case: ['Id', 'Subject', 'Description', 'Status', 'LastModifiedDate', 'CaseNumber'], Account: ['Id', 'Name', 'Description', 'Industry', 'LastModifiedDate'], Opportunity: [ @@ -98,6 +91,9 @@ function buildRecordTitle(objectType: string, record: Record): } } +/** Fields that may contain HTML content and should be stripped to plain text. */ +const HTML_FIELDS = new Set(['Description', 'Summary']) + /** * Builds plain-text content from a Salesforce record for indexing. */ @@ -112,7 +108,9 @@ function buildRecordContent(objectType: string, record: Record) const value = record[field] if (value != null && value !== '') { const label = field.replace(/([A-Z])/g, ' $1').trim() - parts.push(`${label}: ${String(value)}`) + const text = + HTML_FIELDS.has(field) && typeof value === 'string' ? htmlToPlainText(value) : String(value) + parts.push(`${label}: ${text}`) } } @@ -288,7 +286,8 @@ export const salesforceConnector: ConnectorConfig = { getDocument: async ( accessToken: string, sourceConfig: Record, - externalId: string + externalId: string, + syncContext?: Record ): Promise => { const objectType = sourceConfig.objectType as string const fields = OBJECT_FIELDS[objectType] @@ -297,7 +296,11 @@ export const salesforceConnector: ConnectorConfig = { throw new Error(`Unsupported Salesforce object type: ${objectType}`) } - const instanceUrl = await resolveInstanceUrl(accessToken) + let instanceUrl = syncContext?.instanceUrl as string | undefined + if (!instanceUrl) { + instanceUrl = await resolveInstanceUrl(accessToken) + if (syncContext) syncContext.instanceUrl = instanceUrl + } const url = `${instanceUrl}sobjects/${objectType}/${externalId}?fields=${fields.join(',')}` diff --git a/apps/sim/connectors/servicenow/servicenow.ts b/apps/sim/connectors/servicenow/servicenow.ts index 26f3a53a85..3aefb66fbd 100644 --- a/apps/sim/connectors/servicenow/servicenow.ts +++ b/apps/sim/connectors/servicenow/servicenow.ts @@ -513,11 +513,16 @@ export const servicenowConnector: ConnectorConfig = { const isKB = contentType === 'kb_knowledge' const tableName = isKB ? 'kb_knowledge' : 'incident' + const fields = isKB + ? 'sys_id,short_description,text,wiki,workflow_state,kb_category,kb_knowledge_base,number,author,sys_created_by,sys_updated_by,sys_updated_on,sys_created_on' + : 'sys_id,number,short_description,description,state,priority,category,assigned_to,opened_by,close_notes,resolution_notes,sys_created_by,sys_updated_by,sys_updated_on,sys_created_on' + try { const { result } = await serviceNowApiGet(instanceUrl, tableName, authHeader, { sysparm_query: `sys_id=${externalId}`, sysparm_limit: '1', sysparm_offset: '0', + sysparm_fields: fields, sysparm_display_value: 'all', }) diff --git a/apps/sim/connectors/sharepoint/sharepoint.ts b/apps/sim/connectors/sharepoint/sharepoint.ts index 08ab142f5f..ec3d08655f 100644 --- a/apps/sim/connectors/sharepoint/sharepoint.ts +++ b/apps/sim/connectors/sharepoint/sharepoint.ts @@ -61,7 +61,7 @@ async function resolveSiteId( accessToken: string, siteUrl: string, retryOptions?: Parameters[2] -): Promise { +): Promise<{ id: string; displayName: string }> { // Normalise: strip protocol, trailing slashes const cleaned = siteUrl.replace(/^https?:\/\//, '').replace(/\/+$/, '') @@ -108,7 +108,7 @@ async function resolveSiteId( siteId: site.id, displayName: site.displayName, }) - return site.id + return { id: site.id, displayName: site.displayName ?? '' } } /** @@ -338,17 +338,9 @@ export const sharepointConnector: ConnectorConfig = { siteId = syncContext.siteId as string siteName = (syncContext.siteName as string) ?? '' } else { - siteId = await resolveSiteId(accessToken, siteUrl) - - // Fetch site display name - const siteResponse = await fetchWithRetry(`${GRAPH_BASE}/sites/${siteId}`, { - method: 'GET', - headers: { Authorization: `Bearer ${accessToken}`, Accept: 'application/json' }, - }) - const siteData = siteResponse.ok - ? ((await siteResponse.json()) as { displayName?: string }) - : {} - siteName = siteData.displayName ?? siteUrl + const site = await resolveSiteId(accessToken, siteUrl) + siteId = site.id + siteName = site.displayName || siteUrl if (syncContext) { syncContext.siteId = siteId @@ -463,10 +455,22 @@ export const sharepointConnector: ConnectorConfig = { getDocument: async ( accessToken: string, sourceConfig: Record, - externalId: string + externalId: string, + syncContext?: Record ): Promise => { const siteUrl = sourceConfig.siteUrl as string - const siteId = await resolveSiteId(accessToken, siteUrl) + + let siteId = syncContext?.siteId as string | undefined + let siteName = syncContext?.siteName as string | undefined + if (!siteId) { + const site = await resolveSiteId(accessToken, siteUrl) + siteId = site.id + siteName = site.displayName ?? siteUrl + if (syncContext) { + syncContext.siteId = siteId + syncContext.siteName = siteName + } + } const url = `${GRAPH_BASE}/sites/${siteId}/drive/items/${externalId}` const response = await fetchWithRetry(url, { @@ -484,22 +488,11 @@ export const sharepointConnector: ConnectorConfig = { const item = (await response.json()) as DriveItem - // Verify it is a supported text file if (!item.file || !isSupportedTextFile(item.name)) { return null } - // Fetch site display name for metadata - const siteResponse = await fetchWithRetry(`${GRAPH_BASE}/sites/${siteId}`, { - method: 'GET', - headers: { Authorization: `Bearer ${accessToken}`, Accept: 'application/json' }, - }) - const siteData = siteResponse.ok - ? ((await siteResponse.json()) as { displayName?: string }) - : {} - const siteName = siteData.displayName ?? siteUrl - - return itemToDocument(accessToken, siteId, item, siteName) + return itemToDocument(accessToken, siteId, item, siteName ?? siteUrl) }, validateConfig: async ( @@ -517,7 +510,8 @@ export const sharepointConnector: ConnectorConfig = { } try { - const siteId = await resolveSiteId(accessToken, siteUrl, VALIDATE_RETRY_OPTIONS) + const site = await resolveSiteId(accessToken, siteUrl, VALIDATE_RETRY_OPTIONS) + const siteId = site.id // If a folder path is configured, verify it exists const folderPath = (sourceConfig.folderPath as string)?.trim() diff --git a/apps/sim/connectors/slack/slack.ts b/apps/sim/connectors/slack/slack.ts index 79e3d34b26..d0a3e0233f 100644 --- a/apps/sim/connectors/slack/slack.ts +++ b/apps/sim/connectors/slack/slack.ts @@ -213,11 +213,11 @@ async function resolveChannel( } } - // Search by name through conversations.list + // Search by name through conversations.list (include private channels the bot is in) let cursor: string | undefined do { const params: Record = { - types: 'public_channel', + types: 'public_channel,private_channel', limit: '200', exclude_archived: 'true', } @@ -248,7 +248,13 @@ export const slackConnector: ConnectorConfig = { auth: { mode: 'oauth', provider: 'slack', - requiredScopes: ['channels:read', 'channels:history', 'users:read'], + requiredScopes: [ + 'channels:read', + 'channels:history', + 'groups:read', + 'groups:history', + 'users:read', + ], }, configFields: [ @@ -356,7 +362,8 @@ export const slackConnector: ConnectorConfig = { getDocument: async ( accessToken: string, sourceConfig: Record, - externalId: string + externalId: string, + syncContext?: Record ): Promise => { const maxMessages = sourceConfig.maxMessages ? Number(sourceConfig.maxMessages) @@ -372,7 +379,7 @@ export const slackConnector: ConnectorConfig = { maxMessages ) - const content = await formatMessages(accessToken, messages) + const content = await formatMessages(accessToken, messages, syncContext) if (!content.trim()) return null const contentHash = await computeContentHash(content) @@ -441,11 +448,11 @@ export const slackConnector: ConnectorConfig = { return { valid: true } } - // Otherwise search by name + // Otherwise search by name (include private channels the bot is in) let cursor: string | undefined do { const params: Record = { - types: 'public_channel', + types: 'public_channel,private_channel', limit: '200', exclude_archived: 'true', } diff --git a/apps/sim/connectors/webflow/webflow.ts b/apps/sim/connectors/webflow/webflow.ts index ff0b64c47d..c398ed815f 100644 --- a/apps/sim/connectors/webflow/webflow.ts +++ b/apps/sim/connectors/webflow/webflow.ts @@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger' import { WebflowIcon } from '@/components/icons' import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' -import { computeContentHash, parseTagDate } from '@/connectors/utils' +import { computeContentHash, htmlToPlainText, parseTagDate } from '@/connectors/utils' const logger = createLogger('WebflowConnector') @@ -60,6 +60,8 @@ function itemToPlainText(item: WebflowItem, collectionName: string): string { lines.push(`${key}: ${items.join(', ')}`) } else if (typeof value === 'object') { lines.push(`${key}: ${JSON.stringify(value)}`) + } else if (typeof value === 'string' && /<[a-z][^>]*>/i.test(value)) { + lines.push(`${key}: ${htmlToPlainText(value)}`) } else { lines.push(`${key}: ${String(value)}`) } diff --git a/apps/sim/connectors/wordpress/wordpress.ts b/apps/sim/connectors/wordpress/wordpress.ts index 3fc219786b..9f835e52e0 100644 --- a/apps/sim/connectors/wordpress/wordpress.ts +++ b/apps/sim/connectors/wordpress/wordpress.ts @@ -7,6 +7,15 @@ import { computeContentHash, htmlToPlainText, joinTagArray, parseTagDate } from const logger = createLogger('WordPressConnector') const WP_API_BASE = 'https://public-api.wordpress.com/rest/v1.1/sites' + +/** + * Strips protocol prefix and trailing slashes from a site URL so the + * WordPress.com API receives a bare domain (e.g. "mysite.wordpress.com"). + */ +function normalizeSiteUrl(raw: string): string { + return raw.replace(/^https?:\/\//, '').replace(/\/+$/, '') +} + const POSTS_PER_PAGE = 20 const DEFAULT_MAX_POSTS = 100 @@ -135,10 +144,11 @@ export const wordpressConnector: ConnectorConfig = { cursor?: string, syncContext?: Record ): Promise => { - const siteUrl = (sourceConfig.siteUrl as string)?.trim() - if (!siteUrl) { + const rawSiteUrl = (sourceConfig.siteUrl as string)?.trim() + if (!rawSiteUrl) { throw new Error('Site URL is required') } + const siteUrl = normalizeSiteUrl(rawSiteUrl) const maxPosts = sourceConfig.maxPosts ? Number(sourceConfig.maxPosts) : DEFAULT_MAX_POSTS const type = resolvePostType(sourceConfig.postType as string | undefined) @@ -193,10 +203,11 @@ export const wordpressConnector: ConnectorConfig = { sourceConfig: Record, externalId: string ): Promise => { - const siteUrl = (sourceConfig.siteUrl as string)?.trim() - if (!siteUrl) { + const rawSiteUrl = (sourceConfig.siteUrl as string)?.trim() + if (!rawSiteUrl) { throw new Error('Site URL is required') } + const siteUrl = normalizeSiteUrl(rawSiteUrl) const url = `${WP_API_BASE}/${encodeURIComponent(siteUrl)}/posts/${externalId}` @@ -229,12 +240,13 @@ export const wordpressConnector: ConnectorConfig = { accessToken: string, sourceConfig: Record ): Promise<{ valid: boolean; error?: string }> => { - const siteUrl = (sourceConfig.siteUrl as string)?.trim() + const rawSiteUrl = (sourceConfig.siteUrl as string)?.trim() const maxPosts = sourceConfig.maxPosts as string | undefined - if (!siteUrl) { + if (!rawSiteUrl) { return { valid: false, error: 'Site URL is required' } } + const siteUrl = normalizeSiteUrl(rawSiteUrl) if (maxPosts && (Number.isNaN(Number(maxPosts)) || Number(maxPosts) <= 0)) { return { valid: false, error: 'Max posts must be a positive number' } diff --git a/apps/sim/connectors/zendesk/zendesk.ts b/apps/sim/connectors/zendesk/zendesk.ts index dac6b21648..d42d016366 100644 --- a/apps/sim/connectors/zendesk/zendesk.ts +++ b/apps/sim/connectors/zendesk/zendesk.ts @@ -391,10 +391,21 @@ export const zendeskConnector: ConnectorConfig = { ) logger.info(`Fetched ${tickets.length} tickets from Zendesk`) - for (const ticket of tickets) { - const comments = await fetchTicketComments(subdomain, accessToken, sourceConfig, ticket.id) - const doc = await ticketToDocument(ticket, comments, subdomain) - documents.push(doc) + const BATCH_SIZE = 5 + for (let i = 0; i < tickets.length; i += BATCH_SIZE) { + const batch = tickets.slice(i, i + BATCH_SIZE) + const batchResults = await Promise.all( + batch.map(async (ticket) => { + const comments = await fetchTicketComments( + subdomain, + accessToken, + sourceConfig, + ticket.id + ) + return ticketToDocument(ticket, comments, subdomain) + }) + ) + documents.push(...batchResults) } } diff --git a/apps/sim/lib/oauth/oauth.test.ts b/apps/sim/lib/oauth/oauth.test.ts index 363ed09ae3..1b1338892a 100644 --- a/apps/sim/lib/oauth/oauth.test.ts +++ b/apps/sim/lib/oauth/oauth.test.ts @@ -180,7 +180,6 @@ describe('OAuth Token Refresh', () => { providerId: 'outlook', endpoint: 'https://login.microsoftonline.com/common/oauth2/v2.0/token', }, - { name: 'Notion', providerId: 'notion', endpoint: 'https://api.notion.com/v1/oauth/token' }, { name: 'Slack', providerId: 'slack', endpoint: 'https://slack.com/api/oauth.v2.access' }, { name: 'Dropbox', @@ -274,6 +273,44 @@ describe('OAuth Token Refresh', () => { ) }) + it.concurrent('should send Notion request with Basic Auth header and JSON body', async () => { + const mockFetch = createMockFetch(defaultOAuthResponse) + const refreshToken = 'test_refresh_token' + + await withMockFetch(mockFetch, () => refreshOAuthToken('notion', refreshToken)) + + expect(mockFetch).toHaveBeenCalledWith( + 'https://api.notion.com/v1/oauth/token', + expect.objectContaining({ + method: 'POST', + headers: expect.objectContaining({ + 'Content-Type': 'application/json', + Authorization: expect.stringMatching(/^Basic /), + }), + body: expect.any(String), + }) + ) + + const [, requestOptions] = mockFetch.mock.calls[0] as [ + string, + { headers: Record; body: string }, + ] + + const authHeader = requestOptions.headers.Authorization + const base64Credentials = authHeader.replace('Basic ', '') + const credentials = Buffer.from(base64Credentials, 'base64').toString('utf-8') + const [clientId, clientSecret] = credentials.split(':') + + expect(clientId).toBe('notion_client_id') + expect(clientSecret).toBe('notion_client_secret') + + const bodyParams = JSON.parse(requestOptions.body) + expect(bodyParams).toEqual({ + grant_type: 'refresh_token', + refresh_token: refreshToken, + }) + }) + it.concurrent('should include User-Agent header for Reddit requests', async () => { const mockFetch = createMockFetch(defaultOAuthResponse) const refreshToken = 'test_refresh_token' diff --git a/apps/sim/lib/oauth/oauth.ts b/apps/sim/lib/oauth/oauth.ts index 73a71a3e32..38c6472cfd 100644 --- a/apps/sim/lib/oauth/oauth.ts +++ b/apps/sim/lib/oauth/oauth.ts @@ -866,7 +866,7 @@ export const OAUTH_PROVIDERS: Record = { providerId: 'salesforce', icon: SalesforceIcon, baseProviderIcon: SalesforceIcon, - scopes: ['api', 'refresh_token', 'openid', 'offline_access'], + scopes: ['api', 'refresh_token', 'openid'], }, }, defaultService: 'salesforce', @@ -960,6 +960,11 @@ interface ProviderAuthConfig { * instead of in the request body. Used by Cal.com. */ refreshTokenInAuthHeader?: boolean + /** + * If true, the token endpoint expects a JSON body with Content-Type: application/json + * instead of the default application/x-www-form-urlencoded. Used by Notion. + */ + useJsonBody?: boolean } /** @@ -1056,8 +1061,9 @@ function getProviderAuthConfig(provider: string): ProviderAuthConfig { tokenEndpoint: 'https://api.notion.com/v1/oauth/token', clientId, clientSecret, - useBasicAuth: false, + useBasicAuth: true, supportsRefreshTokenRotation: true, + useJsonBody: true, } } case 'microsoft': @@ -1295,9 +1301,9 @@ function getProviderAuthConfig(provider: string): ProviderAuthConfig { function buildAuthRequest( config: ProviderAuthConfig, refreshToken: string -): { headers: Record; bodyParams: Record } { +): { headers: Record; bodyParams: Record; useJsonBody?: boolean } { const headers: Record = { - 'Content-Type': 'application/x-www-form-urlencoded', + 'Content-Type': config.useJsonBody ? 'application/json' : 'application/x-www-form-urlencoded', ...config.additionalHeaders, } @@ -1326,7 +1332,7 @@ function buildAuthRequest( } } - return { headers, bodyParams } + return { headers, bodyParams, useJsonBody: config.useJsonBody } } /** @@ -1361,12 +1367,12 @@ export async function refreshOAuthToken( const config = getProviderAuthConfig(provider) - const { headers, bodyParams } = buildAuthRequest(config, refreshToken) + const { headers, bodyParams, useJsonBody } = buildAuthRequest(config, refreshToken) const response = await fetch(config.tokenEndpoint, { method: 'POST', headers, - body: new URLSearchParams(bodyParams).toString(), + body: useJsonBody ? JSON.stringify(bodyParams) : new URLSearchParams(bodyParams).toString(), }) if (!response.ok) { diff --git a/apps/sim/tools/jira/utils.ts b/apps/sim/tools/jira/utils.ts index 620eadd40d..1891eba245 100644 --- a/apps/sim/tools/jira/utils.ts +++ b/apps/sim/tools/jira/utils.ts @@ -1,4 +1,5 @@ import { createLogger } from '@sim/logger' +import { fetchWithRetry } from '@/lib/knowledge/documents/utils' const logger = createLogger('JiraUtils') @@ -67,7 +68,7 @@ export async function downloadJiraAttachments( continue } try { - const response = await fetch(att.content, { + const response = await fetchWithRetry(att.content, { headers: { Authorization: `Bearer ${accessToken}`, Accept: '*/*', @@ -97,13 +98,21 @@ export async function downloadJiraAttachments( } export async function getJiraCloudId(domain: string, accessToken: string): Promise { - const response = await fetch('https://api.atlassian.com/oauth/token/accessible-resources', { - method: 'GET', - headers: { - Authorization: `Bearer ${accessToken}`, - Accept: 'application/json', - }, - }) + const response = await fetchWithRetry( + 'https://api.atlassian.com/oauth/token/accessible-resources', + { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + } + ) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Failed to fetch Jira accessible resources: ${response.status} - ${errorText}`) + } const resources = await response.json()