diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 4f55d92..53841c7 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -7,14 +7,14 @@ }, "metadata": { "description": "Official Parallel Web Systems plugin for Claude Code - web search, content extraction, deep research, and data enrichment capabilities.", - "version": "0.2.0", + "version": "0.3.0", "pluginRoot": "./" }, "plugins": [ { "name": "parallel", "description": "Parallel CLI integration for web search, URL content extraction, deep research tasks, and bulk data enrichment powered by Parallel's AI-native web infrastructure.", - "version": "0.2.0", + "version": "0.3.0", "source": "./", "author": { "name": "Parallel Web Systems", diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index b01d4c7..92b6e5c 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "parallel", - "version": "0.2.0", + "version": "0.3.0", "description": "Parallel Web Search MCP and Task API integration for Claude Code. 
Provides web search, content extraction, deep research tasks, and data enrichment capabilities.", "author": { "name": "Parallel Web Systems", diff --git a/README.md b/README.md index 575bbfa..15d0d24 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ Skills follow the [Agent Skills](https://agentskills.io/specification) specifica | **parallel-web-extract** | Extract content from URLs, articles, PDFs | | **parallel-deep-research** | Comprehensive research and analysis | | **parallel-data-enrichment** | Enrich lists of companies, people, products | +| **parallel-develop** | Bootstrap a Parallel API integration in your codebase (Python / TypeScript / cURL / MCP) | | **setup** | Install CLI and authenticate | | **status** | Check running research task status | | **result** | Get completed research task result | @@ -93,6 +94,7 @@ Skills follow the [Agent Skills](https://agentskills.io/specification) specifica /parallel:parallel-web-extract https://docs.parallel.ai /parallel:parallel-deep-research competitive landscape of AI code assistants /parallel:parallel-data-enrichment Apple, Microsoft, Google - get CEO names +/parallel:parallel-develop add a research agent to my Next.js app in typescript /parallel:setup ``` diff --git a/skills/parallel-develop/SKILL.md b/skills/parallel-develop/SKILL.md new file mode 100644 index 0000000..74fe8b9 --- /dev/null +++ b/skills/parallel-develop/SKILL.md @@ -0,0 +1,84 @@ +--- +name: parallel-develop +description: "Bootstrap a Parallel API integration in the user's codebase. Use when the user says 'integrate Parallel', 'add Parallel API', 'build with Parallel', 'use Parallel for web search / research / enrichment / monitoring / extraction / lead discovery', or asks for starter code that talks to api.parallel.ai. Produces install + env + working example code tailored to the user's language (Python / TypeScript / cURL / MCP) and the API that fits their use case." 
+user-invocable: true +argument-hint: +compatibility: Works in any agent/IDE. The generated code targets Python (`parallel-web`), TypeScript (`parallel-web`), raw cURL, or MCP clients (Cursor, Claude Desktop, VS Code, Claude Code). +allowed-tools: Read Write Edit Bash(ls:*) Bash(cat:*) Bash(mkdir:*) Bash(pip:*) Bash(npm:*) Bash(pnpm:*) Bash(uv:*) WebFetch +metadata: + author: parallel +--- + +# Parallel API Bootstrap + +Build a Parallel integration for: $ARGUMENTS + +## Step 1 — identify the client + use case + +Parallel has five main APIs. Pick the one that fits the user's goal: + +| Use case | API | When | +|----------|-----|------| +| Single web lookup, RAG retrieval | **Search** | Fast, LLM-optimized excerpts for a given objective + keywords | +| Deep research / multi-hop question | **Task** (processor `pro`/`ultra`) | Complex queries that need several retrieval hops and structured output | +| Enrich a known list (people, companies, products) | **Task** (processor `core`) | You have the entities; fill in columns | +| Discover NEW entities matching criteria | **FindAll** (beta) | Building a list from scratch | +| Track web changes on a schedule | **Monitor** (alpha) | News tracking, change alerts with webhooks | +| Convert a specific URL to LLM-friendly markdown | **Extract** | Single-URL ingestion, PDFs, JS-heavy pages | + +Likewise, pick the client the user is working in: + +- **Python** — official `parallel-web` SDK (top-level `client.search`, `client.extract`, `client.task_run`, `client.beta.findall`) +- **TypeScript** — official `parallel-web` SDK (top-level `client.search`, `client.extract`, `client.taskRun`; FindAll / Monitor via generic `client.post`) +- **cURL** — any language; just HTTP + `x-api-key` header +- **MCP** — if the user wants an MCP client (Cursor / Claude Desktop / VS Code / Claude Code) to call Parallel's hosted Search + Task MCPs + +If the user's message doesn't make the client + use case obvious, **ask them once** (AskUserQuestion or a 
plain clarifying question). Don't guess. + +## Step 2 — emit the integration + +Read the matching recipe file and follow its instructions verbatim. Each recipe covers install, env setup, the minimal working code, and the top best practices. + +- Python → [references/python.md](references/python.md) +- TypeScript → [references/typescript.md](references/typescript.md) +- cURL → [references/curl.md](references/curl.md) +- MCP → [references/mcp.md](references/mcp.md) + +After reading the recipe, write the example into the user's repo (a sensible path like `scripts/parallel_<use_case>.py` or `examples/parallel-<use-case>.ts`), update the nearest `requirements.txt` / `package.json` if needed, and walk the user through getting an API key at [platform.parallel.ai](https://platform.parallel.ai). + +## Step 3 — point at the canonical docs for follow-ups + +Parallel publishes an agent-friendly docs index at **https://docs.parallel.ai/llms.txt** — a single markdown file with the full API surface. If the user asks for anything beyond the recipe (webhooks, source policies, task groups, advanced schemas), fetch that URL first and pull in the relevant section rather than guessing. + +For deep-dive API questions the `/llms.txt` index is authoritative; do not invent parameter names.
+ +### Canonical best-practice pages (link, don't paraphrase) + +When the user asks "how do I do X correctly?", send them to the dedicated page instead of making things up: + +- **Search** — [Best Practices](https://docs.parallel.ai/search/best-practices) · [Advanced Settings](https://docs.parallel.ai/search/advanced-search-settings) · [Modes](https://docs.parallel.ai/search/modes) +- **Extract** — [Best Practices](https://docs.parallel.ai/extract/best-practices) · [Advanced Settings](https://docs.parallel.ai/extract/advanced-extract-settings) +- **Task** — [Specify a Task](https://docs.parallel.ai/task-api/guides/specify-a-task) · [Choose a Processor](https://docs.parallel.ai/task-api/guides/choose-a-processor) · [Task Run Lifecycle](https://docs.parallel.ai/task-api/guides/execute-task-run) · [Webhooks](https://docs.parallel.ai/task-api/webhooks) +- **FindAll** — [Generators & Pricing](https://docs.parallel.ai/findall-api/core-concepts/findall-generator-pricing) +- **MCP** — [Quickstart](https://docs.parallel.ai/integrations/mcp/quickstart) · [Programmatic Use](https://docs.parallel.ai/integrations/mcp/programmatic-use) + +## Guardrails + +- **Snake_case everywhere.** Both the Python and TypeScript SDKs use **snake_case** body keys (`task_spec`, `output_schema`, `json_schema`, `run_id`, `event_types`). Do NOT camelCase these in TypeScript — it will silently be rejected by the server or produce a type error. +- **Never** invent an endpoint version. Current versions: Search/Extract/Task at `/v1`, FindAll at `/v1beta`, Monitor at `/v1alpha`. +- **For runs > 30 s** (Task `pro`/`ultra`, FindAll `core`/`pro`, any Monitor event), prefer a **webhook** over polling. Pass `webhook={"url": "...", "event_types": [...]}` at creation. +- **Task output schemas are strict.** Every property must appear in `required`; for optional fields use a union like `{"type": ["string", "null"]}` instead of omitting. Always set `additionalProperties: false`. 
The root must be `{"type": "object"}` — arrays cannot be the root. Prefer flat schemas. +- **Always** give each Task/Enrichment field a *specific* description with exact format (e.g. `"MM-YYYY"`, `"USD"`, `"ISO 3166-1 alpha-2"`) and an explicit missing-value behavior (`"Return 'Not Available' if no source confirms"`). +- **Start FindAll with `generator="preview"`** to iterate on the objective and match_conditions before scaling up — it's cheap and fast. **If you get 0 matches, upgrade the generator before rewriting the query** — the issue is usually pool size, not query quality. +- **session_id** groups related Search + Extract calls into one logical task. Generate a fresh UUID per task and reuse it across calls. **client_model** tells the server which LLM will consume the excerpts so it can tune formatting. +- **When exposing Search/Extract as an LLM tool**, expose ONLY `objective` and `search_queries`. Exposing `advanced_settings` tempts the model to over-narrow and hurts recall. +- **Don't** use `parallel-cli` for these recipes — this skill is about writing integration code, not about invoking the CLI. For CLI workflows, use the `parallel-web-search`, `parallel-web-extract`, `parallel-deep-research`, or `parallel-data-enrichment` skills instead. + +## When to choose a different skill + +- User just wants to **run a web search right now** → use `parallel-web-search`. +- User just wants **one research task** run → use `parallel-deep-research`. +- User wants to **enrich a CSV right now** → use `parallel-data-enrichment`. +- User wants to **fetch a specific URL** → use `parallel-web-extract`. + +This skill is for the *build-and-ship* case — when the user is adding Parallel to their own codebase. 
diff --git a/skills/parallel-develop/references/curl.md b/skills/parallel-develop/references/curl.md new file mode 100644 index 0000000..0fbef0d --- /dev/null +++ b/skills/parallel-develop/references/curl.md @@ -0,0 +1,173 @@ +# cURL recipe — raw HTTP + +All calls use header `x-api-key: $PARALLEL_API_KEY`. Get a key at [platform.parallel.ai](https://platform.parallel.ai). + +```bash +export PARALLEL_API_KEY="your-api-key" +``` + +## Snippets by use case + +### Web Search — `POST /v1/search` + +```bash +# Best practices: +# - Provide both "objective" and "search_queries" (2-3 short keyword queries). +# - Modes: "basic" = lowest latency, "advanced" = higher recall. +# - advanced_settings.source_policy restricts/blocks domains; +# advanced_settings.fetch_policy.max_age_seconds controls freshness; +# advanced_settings.location (ISO 3166 alpha-2) for geo-targeting. +curl -X POST https://api.parallel.ai/v1/search \ + -H "Content-Type: application/json" \ + -H "x-api-key: $PARALLEL_API_KEY" \ + -d '{ + "objective": "Find the latest information about ", + "search_queries": ["", ""], + "mode": "advanced", + "max_chars_total": 27000, + "advanced_settings": { + "max_results": 10, + "excerpt_settings": {"max_chars_per_result": 10000} + } + }' +``` + +### Research / structured task — `POST /v1/tasks/runs` + +```bash +# Processors: lite/base (simple), core (~10 output fields), pro/ultra (deep). +# Append "-fast" for lower latency (e.g. "core-fast", "pro-fast"). +# For pro/ultra tasks prefer a webhook over polling. + +# 1. Create a task run and capture the run_id +RUN_ID=$(curl -s -X POST https://api.parallel.ai/v1/tasks/runs \ + -H "Content-Type: application/json" \ + -H "x-api-key: $PARALLEL_API_KEY" \ + -d '{ + "input": "", + "task_spec": { + "output_schema": "A summary, key findings, and sources" + }, + "processor": "core" + }' | jq -r '.run_id') + +# 2. Get the result (?timeout=N in seconds; max 600. Re-poll on timeout.) 
+curl -X GET "https://api.parallel.ai/v1/tasks/runs/$RUN_ID/result?timeout=600" \ + -H "x-api-key: $PARALLEL_API_KEY" +``` + +### Data enrichment — same endpoint, structured schema + +```bash +# Processors: lite/base (1-2 fields), core (up to ~10 fields), pro (complex). + +RUN_ID=$(curl -s -X POST https://api.parallel.ai/v1/tasks/runs \ + -H "Content-Type: application/json" \ + -H "x-api-key: $PARALLEL_API_KEY" \ + -d '{ + "input": "<company name>", + "task_spec": { + "output_schema": { + "type": "json", + "json_schema": { + "type": "object", + "properties": { + "founding_date": { + "type": "string", + "description": "Founding date in MM-YYYY format. Return \"Not Available\" if unknown." + }, + "employee_count": { + "type": "string", + "description": "Estimated employee count as a range, e.g. \"500-1000\"." + }, + "funding_sources": { + "type": "string", + "description": "Funding sources and total raised in USD." + } + }, + "required": ["founding_date", "employee_count", "funding_sources"] + } + } + }, + "processor": "core" + }' | jq -r '.run_id') + +curl -X GET "https://api.parallel.ai/v1/tasks/runs/$RUN_ID/result?timeout=600" \ + -H "x-api-key: $PARALLEL_API_KEY" +``` + +### Lead discovery — `POST /v1beta/findall/runs` + +```bash +# Start with "preview" generator to validate your query (~10 candidates, fast, cheap). +# Generators: preview (test), base (broad), core (specific), pro (rare matches). + +FINDALL_ID=$(curl -s -X POST https://api.parallel.ai/v1beta/findall/runs \ + -H "Content-Type: application/json" \ + -H "x-api-key: $PARALLEL_API_KEY" \ + -d '{ + "objective": "<describe the entities to find, e.g. AI startups that raised a Series A in 2024>", + "entity_type": "companies", + "match_conditions": [ + { + "name": "series_a_2024", + "description": "Company raised a Series A in 2024, confirmed by TechCrunch/Crunchbase/PR." + }, + { + "name": "ai_focused", + "description": "Primary product must be AI-focused (LLMs, ML infra, or AI apps)."
+ } + ], + "generator": "core", + "match_limit": 20 + }' | jq -r '.findall_id') + +# May take several minutes for core/pro. Use a webhook for large runs. +curl -X GET "https://api.parallel.ai/v1beta/findall/runs/$FINDALL_ID/result" \ + -H "x-api-key: $PARALLEL_API_KEY" +``` + +### Web Monitoring — `POST /v1alpha/monitors` + +```bash +# Natural-language query focused on intent, not keywords. +# Cadence: "hourly" (fast-moving), "daily" (most news), "weekly" (slow-changing). +# Don't include dates — Monitor tracks new updates automatically. +# "simulate_event": true forces an immediate test event during development. +curl -X POST https://api.parallel.ai/v1alpha/monitors \ + -H "Content-Type: application/json" \ + -H "x-api-key: $PARALLEL_API_KEY" \ + -d '{ + "query": "", + "cadence": "daily", + "webhook": { + "url": "https://your-app.com/webhook", + "event_types": ["monitor.event.detected"] + } + }' +``` + +Response shape: `{ "monitor_id": "...", "status": "...", "cadence": "...", "query": "...", ... }`. + +### Content extraction — `POST /v1/extract` + +```bash +# Always provide an "objective" — extraction is LLM-focused, not a raw scrape. +# Up to 20 URLs per call. PDFs and JS-heavy pages are handled. +# "full_content": true only when you need the whole page as markdown. +# "fetch_policy": {"max_age_seconds": 3600} for cache-vs-live control. +curl -X POST https://api.parallel.ai/v1/extract \ + -H "Content-Type: application/json" \ + -H "x-api-key: $PARALLEL_API_KEY" \ + -d '{ + "urls": ["https://example.com/article"], + "objective": "", + "excerpt_settings": {"max_chars_per_result": 5000} + }' +``` + +## Error handling + +- Non-2xx responses return `{ "error": { "message": "...", "type": "..." } }`. Check HTTP status first, then inspect the body. +- For 429 (rate limit), back off with exponential delay. +- For 5xx, retry with jitter. Runs that have been *created* are durable — refetch with the `run_id` / `findall_id` rather than recreating. 
diff --git a/skills/parallel-develop/references/mcp.md b/skills/parallel-develop/references/mcp.md new file mode 100644 index 0000000..e24aaf2 --- /dev/null +++ b/skills/parallel-develop/references/mcp.md @@ -0,0 +1,128 @@ +# MCP recipe — Parallel's hosted Search + Task MCPs + +Parallel ships two remote streamable-HTTP MCP servers. Almost every client just needs the URL: + +- **Search MCP** — `https://search.parallel.ai/mcp` (web search + extract) +- **Task MCP** — `https://task-mcp.parallel.ai/mcp` (deep research tasks) + +OAuth handles auth on first use in most clients. No `mcp-remote`, no API key in config, no npm package to install. + +Canonical install docs per client: [docs.parallel.ai/integrations/mcp/search-mcp](https://docs.parallel.ai/integrations/mcp/search-mcp). + +--- + +## Quick install by client + +### Claude Code + +```bash +claude mcp add --transport http "Parallel-Search-MCP" https://search.parallel.ai/mcp +claude mcp add --transport http "Parallel-Task-MCP" https://task-mcp.parallel.ai/mcp +``` + +Run `/mcp` and complete the browser OAuth flow. + +### Claude Desktop + +Settings → Connectors → Add Custom Connector, once per server: + +- **Parallel Search MCP** → `https://search.parallel.ai/mcp` +- **Parallel Task MCP** → `https://task-mcp.parallel.ai/mcp` + +(Older Claude Desktop builds without the Connectors UI — see "Stdio fallback" below.) + +### Cursor — `~/.cursor/mcp.json` (or `.cursor/mcp.json` per-project) + +```json +{ + "mcpServers": { + "Parallel Search MCP": { "url": "https://search.parallel.ai/mcp" }, + "Parallel Task MCP": { "url": "https://task-mcp.parallel.ai/mcp" } + } +} +``` + +Restart Cursor after editing. OAuth handles auth. 
+ +### VS Code — `settings.json` + +```json +{ + "mcp": { + "servers": { + "Parallel Search MCP": { "type": "http", "url": "https://search.parallel.ai/mcp" }, + "Parallel Task MCP": { "type": "http", "url": "https://task-mcp.parallel.ai/mcp" } + } + } +} +``` + +### Windsurf — `~/.codeium/windsurf/mcp_config.json` + +```json +{ + "mcpServers": { + "Parallel Search MCP": { "serverUrl": "https://search.parallel.ai/mcp" }, + "Parallel Task MCP": { "serverUrl": "https://task-mcp.parallel.ai/mcp" } + } +} +``` + +### Cline — MCP Servers → Remote Servers → Edit Configuration + +```json +{ + "mcpServers": { + "Parallel Search MCP": { "url": "https://search.parallel.ai/mcp", "type": "streamableHttp" }, + "Parallel Task MCP": { "url": "https://task-mcp.parallel.ai/mcp", "type": "streamableHttp" } + } +} +``` + +### Other clients + +Gemini CLI, ChatGPT, Codex, Amp, Kiro, Antigravity — all covered in the [Search MCP install guide](https://docs.parallel.ai/integrations/mcp/search-mcp). + +--- + +## Stdio fallback (older clients only) + +If a client can't speak remote HTTP MCP, bridge via `mcp-remote`: + +```json +{ + "mcpServers": { + "Parallel Search MCP": { + "command": "npx", + "args": [ + "-y", "mcp-remote", + "https://search.parallel.ai/mcp", + "--header", "Authorization: Bearer YOUR-PARALLEL-API-KEY" + ] + } + } +} +``` + +Paste your key from [platform.parallel.ai](https://platform.parallel.ai) into the header. **Don't** rely on `${PARALLEL_API_KEY}` expansion from a shell `export` — many GUI clients don't inherit shell env. + +--- + +## Programmatic access with Bearer auth + +For scripts or agents that aren't MCP-aware, hit the endpoints as plain HTTP: + +```bash +curl https://search.parallel.ai/mcp \ + -H "Authorization: Bearer $PARALLEL_API_KEY" +``` + +See [Programmatic Use](https://docs.parallel.ai/integrations/mcp/programmatic-use). 
+ +--- + +## When to choose MCP vs direct SDK + +**MCP** when the user lives in an MCP-native IDE and wants web tools inside that agent — sensible defaults, no custom request shaping needed. + +**Direct SDK** (Python / TypeScript) when they're building a backend / agent / worker and need full control: custom output schemas, webhooks, batch workflows, structured task outputs. diff --git a/skills/parallel-develop/references/python.md b/skills/parallel-develop/references/python.md new file mode 100644 index 0000000..b42d611 --- /dev/null +++ b/skills/parallel-develop/references/python.md @@ -0,0 +1,347 @@ +# Python recipe — `parallel-web` + +## Install + +```bash +pip install 'parallel-web>=0.5,<1.0' # or: uv add 'parallel-web>=0.5,<1.0' +export PARALLEL_API_KEY="your-api-key" # get one at https://platform.parallel.ai +``` + +The current minor is `0.5.x` — pin against `>=0.5,<1.0` until v1.0 ships. The SDK reads `PARALLEL_API_KEY` from the environment automatically; use `Parallel(api_key=...)` to override. + +### Package / import / types + +| Item | Value | +|------|-------| +| PyPI package | `parallel-web` | +| Import | `from parallel import Parallel` | +| Exceptions | `from parallel import APIError, RateLimitError, AuthenticationError, BadRequestError, InternalServerError` | +| Typed params (strict mode) | `from parallel.types import TaskSpecParam, JsonSchemaParam, TextSchemaParam` | + +Typed params are optional — plain `dict` bodies work fine at runtime. Reach for the TypedDicts when your project has pyright/mypy in strict mode and the plain-dict `task_spec` argument gets flagged. 
+ +### Result shape cheat-sheet + +- `client.search(...)` → `.results[i].{title, url, publish_date, excerpts: list[str]}` +- `client.extract(...)` → same shape as search results, plus optional `.results[i].full_content` +- `client.task_run.create(...)` → `.run_id` (you pass this to `.result()`) +- `client.task_run.result(run_id, api_timeout=N)` → `.output.content` is a dict matching your `json_schema` (or a string if you passed a text/bare-string schema) +- `client.beta.findall.create(...)` → `.findall_id` +- `client.beta.findall.result(findall_id=...)` → `.candidates[i].{name, url, description, ...}` + +## Snippets by use case + +Pick the block that matches the user's goal, drop it into a script in their repo, and wire in their topic/entity/URL. + +### Web Search — `client.search` + +Deep dive: [Search Best Practices](https://docs.parallel.ai/search/best-practices) · [Advanced Settings](https://docs.parallel.ai/search/advanced-search-settings) · [Modes](https://docs.parallel.ai/search/modes). + +```python +from parallel import Parallel + +client = Parallel() + +# Best practices: +# - Provide BOTH objective (what/why) and search_queries. Either alone works; +# together is best. Use 2-3 queries of 3-6 words each (max 5, ≤200 chars). +# - Modes: "basic" = lowest latency, "advanced" = higher recall + reranking. +# - Put freshness / source preferences in the objective ("official docs", +# "post-2024 only") rather than as extra keywords. +# - advanced_settings.source_policy restricts/blocks domains. +# - advanced_settings.fetch_policy.max_age_seconds for freshness control. +# - advanced_settings.location (ISO 3166 alpha-2) for geo-targeting. +# - session_id: pass the same UUID across related Search + Extract calls +# in one task, so Parallel treats them as one logical workflow. +# - client_model: declare your consuming LLM for server-side optimization. +# - If you expose Search as a tool to an agent, expose ONLY objective and +# search_queries. 
Exposing advanced_settings tempts the model to +# over-narrow the search and hurts recall. +search = client.search( + objective="Find the latest information about ", + search_queries=["", ""], + mode="advanced", + max_chars_total=27000, + # session_id="", + # client_model="claude-opus-4-7", + advanced_settings={ + "max_results": 10, + "excerpt_settings": {"max_chars_per_result": 10000}, + }, +) + +for result in search.results: + print(f"{result.title}: {result.url}") + for excerpt in result.excerpts: + print(excerpt[:200]) +``` + +### Research / structured task — `client.task_run` + +Deep dive: [Specify a Task](https://docs.parallel.ai/task-api/guides/specify-a-task) · [Choose a Processor](https://docs.parallel.ai/task-api/guides/choose-a-processor) · [Task Run Lifecycle](https://docs.parallel.ai/task-api/guides/execute-task-run) · [Webhooks](https://docs.parallel.ai/task-api/webhooks). + +```python +from parallel import Parallel + +client = Parallel() + +# Processors (pick based on complexity + latency budget): +# lite / base — simple lookups, single-hop facts (seconds) +# core — up to ~10 output fields, light research (30 s – 2 min) +# pro / ultra — deep, multi-hop research (2 – 25 min; use webhooks!) +# append "-fast" for lower-latency variants (e.g. "core-fast", "pro-fast") +# +# Schema rules (canonical, enforced by the API): +# - Root MUST be {"type": "object"} with "properties". Arrays cannot be root. +# - EVERY property must appear in "required". Optional fields use a union +# like {"type": ["string", "null"]} instead of being omitted. +# - Set "additionalProperties": false for strict validation. +# - Prefer flat schemas — top-level properties beat nesting for output quality. +# - The field "description" is your primary knob: specify format, sources, +# and missing-value behavior per field. 
+task_run = client.task_run.create( + input="", + task_spec={ + "output_schema": { + "type": "json", + "json_schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "summary": {"type": "string"}, + "key_findings": {"type": "array", "items": {"type": "string"}}, + "sources": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["summary", "key_findings", "sources"], + }, + } + }, + processor="core", + # webhook={"url": "https://your-app.com/webhook", "event_types": ["task_run.status"]}, +) + +# Block until the task completes (api_timeout is in seconds; bump for pro/ultra) +result = client.task_run.result(task_run.run_id, api_timeout=600) +print(result.output) +``` + +**Shortcut:** `client.task_run.execute(input=..., processor=..., output=)` creates, waits, and parses into your Python type in one call. Great for throwaway scripts. + +### Data enrichment — `client.task_run` with a structured schema + +```python +from parallel import Parallel + +client = Parallel() + +# Enrichment best practices (see specify-a-task): +# - Name fields specifically (ceo_name, not name; headquarters_address, +# not address). +# - Put EXACT format requirements in each description ("MM-YYYY", "USD", +# "ISO 3166-1 alpha-2"). The model honors descriptions tightly. +# - Tell the model what to do when data is missing ("Return 'Not Available' +# if no source confirms"). This prevents hallucination. +# - Every property MUST appear in "required"; for optional fields use a +# union like {"type": ["string", "null"]} instead of omitting. +# - Set "additionalProperties": false. +# - Keep schemas flat. +task_run = client.task_run.create( + input="", + task_spec={ + "output_schema": { + "type": "json", + "json_schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "founding_date": { + "type": "string", + "description": "Founding date in MM-YYYY format. 
Return 'Not Available' if unknown.", + }, + "employee_count": { + "type": "string", + "description": "Estimated employee count as a range, e.g. '500-1000'.", + }, + "funding_sources": { + "type": "string", + "description": "Funding sources and total raised in USD.", + }, + }, + "required": ["founding_date", "employee_count", "funding_sources"], + }, + } + }, + processor="core", +) + +result = client.task_run.result(task_run.run_id, api_timeout=600) +print(result.output) # result.output.content is a dict matching your json_schema +``` + +### Batch enrichment with Task Groups + +For 50+ rows, don't call `task_run.create + result` per row — it's wasteful. Use a **Task Group**, which accepts a batch of inputs, runs them concurrently under a shared spec, and streams progress events: + +```python +from parallel import Parallel + +client = Parallel() + +group = client.beta.task_group.create( + default_task_spec={ + "output_schema": { + "type": "json", + "json_schema": { + "type": "object", + "properties": { + "ceo_name": { + "type": "string", + "description": "Current CEO full name. Return 'Not Available' if unknown.", + }, + "revenue_2024": { + "type": "string", + "description": "2024 annual revenue in USD (e.g. '$5.2B'). Return 'Not Available' if unknown.", + }, + }, + "required": ["ceo_name", "revenue_2024"], + }, + } + }, +) + +inputs = [{"input": name, "processor": "core"} for name in ["Stripe", "OpenAI", "Anthropic"]] +client.beta.task_group.add_runs(group.taskgroup_id, runs=inputs) + +# Stream events until all runs finish +for event in client.beta.task_group.events(group.taskgroup_id): + print(event) +``` + +### Lead discovery — `client.beta.findall` + +Deep dive: [Generators & Pricing](https://docs.parallel.ai/findall-api/core-concepts/findall-generator-pricing). + +```python +from parallel import Parallel + +client = Parallel() + +# Best practices: +# - ALWAYS start with generator="preview" — ~10 candidates in seconds, low +# cost. 
Use it to validate your approach before committing to a big run. +# - Match generator to expected volume: +# base — broad criteria, many matches from common fields +# core — moderate specificity, ~20-50 results +# pro — rare / highly specific, thoroughness > cost +# - 0 MATCHES? Try UPGRADING THE GENERATOR before rewriting the query. +# Usually the issue is candidate pool size, not query quality. +# - Write DETAILED match_conditions. Each {name, description} is run +# against every candidate for verification. Detailed = higher precision. +# - For large runs, pass a webhook instead of polling .result(). +# - Enrichments multiply costs across matches — validate counts in preview +# before adding enrichments. Extend runs are cheaper than fresh ones. +findall_run = client.beta.findall.create( + objective="", + entity_type="companies", + match_conditions=[ + { + "name": "series_a_2024", + "description": ( + "Company must have raised a Series A funding round in 2024, " + "confirmed by a reputable source (TechCrunch, Crunchbase, " + "company press release, etc.)." + ), + }, + { + "name": "ai_focused", + "description": "Primary product must be AI-focused (LLMs, ML infra, AI apps).", + }, + ], + generator="core", + match_limit=20, +) + +# Blocks until complete. For core/pro this can take several minutes. +result = client.beta.findall.result(findall_id=findall_run.findall_id) + +for candidate in result.candidates: + print(f"{candidate.name}: {candidate.url}") + print(f" {candidate.description}") +``` + +### Web Monitoring — Monitor API (alpha) + +Monitor is an alpha API and doesn't yet have a typed SDK resource, so call it via `httpx` directly: + +```python +import os +import httpx + +API_KEY = os.environ["PARALLEL_API_KEY"] + +# Write queries in natural language focused on intent, not keywords. +# Cadence: "hourly" (fast-moving), "daily" (most news), "weekly" (slow-changing). +# Don't include dates — Monitor tracks new updates automatically. 
+# Tip: "simulate_event": True forces an immediate test event during development. +res = httpx.post( + "https://api.parallel.ai/v1alpha/monitors", + headers={"x-api-key": API_KEY, "Content-Type": "application/json"}, + json={ + "query": "", + "cadence": "daily", + "webhook": { + "url": "https://your-app.com/webhook", + "event_types": ["monitor.event.detected"], + }, + }, + timeout=30, +).raise_for_status().json() + +print(f"Monitor created: {res['monitor_id']} (status: {res['status']})") +``` + +Response shape: `{ "monitor_id": str, "status": str, "cadence": str, "query": str, ... }`. + +### Content extraction — `client.extract` + +Deep dive: [Extract Best Practices](https://docs.parallel.ai/extract/best-practices) · [Advanced Settings](https://docs.parallel.ai/extract/advanced-extract-settings). + +```python +from parallel import Parallel + +client = Parallel() + +# Best practices: +# - Always provide an "objective" — extraction ranks excerpts by relevance. +# - Add 2-3 search_queries (3-6 words each) for focus. +# - Up to 20 URLs per call. PDFs and JS-heavy pages are handled. +# - Set full_content=True only when you need the whole page as markdown — +# excerpts are usually enough and much cheaper. WARNING: full_content +# without an objective is redundant (excerpts duplicate the full page). +# - fetch_policy.max_age_seconds controls cache-vs-live freshness. +# - session_id: pass the SAME UUID across related Search + Extract calls +# in one task so Parallel treats them as one logical workflow. +# - client_model: declare your consuming LLM for server-side optimization. 
+extract = client.extract( + urls=["https://example.com/article"], + objective="", + excerpt_settings={"max_chars_per_result": 5000}, + # session_id="", + # client_model="claude-opus-4-7", + # fetch_policy={"max_age_seconds": 3600}, # uncomment for fresh content +) + +for result in extract.results: + print(f"{result.title}: {result.url}") + for excerpt in result.excerpts: + print(excerpt[:200]) +``` + +## Error handling + +- The SDK raises `parallel.APIError` subclasses for 4xx/5xx responses. Wrap calls in `try/except parallel.APIError as e:` and inspect `e.status_code` for retry decisions. +- `client.task_run.result(...)` can time out if the run hasn't finished within `api_timeout`. Re-call it, or better: pass a webhook and skip polling. + +## Typed outputs (optional) + +If the user wants typed results rather than the generic dict, use Pydantic or a TypedDict with `client.task_run.execute(..., output=MyOutput)` and it will validate + parse the output for you. diff --git a/skills/parallel-develop/references/typescript.md b/skills/parallel-develop/references/typescript.md new file mode 100644 index 0000000..c7845c2 --- /dev/null +++ b/skills/parallel-develop/references/typescript.md @@ -0,0 +1,301 @@ +# TypeScript recipe — `parallel-web` + +## Install + +```bash +npm install 'parallel-web@^0.4' # or pnpm add / yarn add +export PARALLEL_API_KEY="your-api-key" # get one at https://platform.parallel.ai +``` + +The current minor is `0.4.x`; pin with `^0.4` until v1.0 ships. `npm install parallel-web` (unpinned) also works — but a pin prevents silent major-version drift. The SDK reads `PARALLEL_API_KEY` from the environment automatically; use `new Parallel({ apiKey })` to override. + +### Response shape cheat-sheet + +- `client.search(...)` → `{ results: [{ title, url, publish_date, excerpts: string[] }] }` +- `client.extract(...)` → same, plus `results[i].full_content?` +- `client.taskRun.create(...)` → `{ run_id: string, ... 
}`
- `client.taskRun.result(runId, { timeout })` → `{ output: { content: <matches your json_schema>, basis?: ... } }`
- `client.post('/v1beta/findall/runs', ...)` → `{ findall_id: string }`
- `client.get('/v1beta/findall/runs/{id}/result')` → `{ candidates: [{ name, url, description, ... }] }`

**`tsconfig.json` note:** `parallel-web` uses modern class features (private `#fields`). If you type-check with a bare `tsc file.ts`, set `"target": "ES2022"` and `"skipLibCheck": true` — or just use a runner (tsx, bun, ts-node) that already does this.

## Critical: snake_case, not camelCase

The TypeScript SDK mirrors the REST body verbatim, so **request fields stay in snake_case**:

- `task_spec`, not `taskSpec`
- `output_schema`, not `outputSchema`
- `json_schema`, not `jsonSchema`
- `search_queries`, not `searchQueries`
- `run_id`, not `runId` on the response

The `taskRun.result(runId, opts)` second argument uses `{ timeout: number }` (in seconds), **not** `apiTimeout`.

## Snippets by use case

### Web Search — `client.search`

Deep dive: [Search Best Practices](https://docs.parallel.ai/search/best-practices) · [Advanced Settings](https://docs.parallel.ai/search/advanced-search-settings) · [Modes](https://docs.parallel.ai/search/modes).

```ts
import Parallel from "parallel-web";

const client = new Parallel();

// Best practices:
// - Provide BOTH objective and search_queries. Use 2-3 queries of 3-6 words
// each (max 5, ≤200 chars).
// - Put freshness / source preferences in the objective ("official docs",
// "post-2024 only"), not as a separate keyword.
// - Modes: "basic" = lowest latency, "advanced" = higher recall + reranking.
// - session_id: pass the same UUID across related Search + Extract calls.
// - client_model: declare your consuming LLM for server-side optimization.
// - If exposing Search as an LLM tool, expose ONLY objective and
// search_queries — advanced_settings tempts the model to over-narrow.
+const search = await client.search({ + objective: "Find the latest information about ", + search_queries: ["", ""], + mode: "advanced", + max_chars_total: 27000, + // session_id: "", + // client_model: "claude-opus-4-7", + advanced_settings: { + max_results: 10, + excerpt_settings: { max_chars_per_result: 10000 }, + // source_policy: { include_domains: [], after_date: "2024-01-01" }, + // fetch_policy: { max_age_seconds: 3600 }, + // location: "US", // ISO 3166 alpha-2 + }, +}); + +for (const result of search.results) { + console.log(`${result.title}: ${result.url}`); + for (const excerpt of result.excerpts) { + console.log(excerpt.slice(0, 200)); + } +} +``` + +### Research / structured task — `client.taskRun` + +Deep dive: [Specify a Task](https://docs.parallel.ai/task-api/guides/specify-a-task) · [Choose a Processor](https://docs.parallel.ai/task-api/guides/choose-a-processor) · [Webhooks](https://docs.parallel.ai/task-api/webhooks). + +```ts +import Parallel from "parallel-web"; + +const client = new Parallel(); + +// Processors: +// lite / base — simple lookups, single-hop facts (seconds) +// core — up to ~10 output fields, light research (30 s – 2 min) +// pro / ultra — deep, multi-hop research (2 – 25 min; use webhooks!) +// append "-fast" for lower-latency variants. +// +// Schema rules (enforced by the API): +// - Root MUST be { type: "object", properties: ... }. Arrays cannot be root. +// - EVERY property must appear in "required". Optional fields use a union +// like { type: ["string", "null"] } instead of being omitted. +// - Set "additionalProperties": false for strict validation. +// - Prefer flat schemas — top-level properties beat nesting for quality. +// - "description" is your primary knob: specify format, sources, and +// missing-value behavior per field. 
+const taskRun = await client.taskRun.create({ + input: "", + task_spec: { + output_schema: { + type: "json", + json_schema: { + type: "object", + additionalProperties: false, + properties: { + summary: { type: "string" }, + key_findings: { type: "array", items: { type: "string" } }, + sources: { type: "array", items: { type: "string" } }, + }, + required: ["summary", "key_findings", "sources"], + }, + }, + }, + processor: "core", + // webhook: { url: "https://your-app.com/webhook", event_types: ["task_run.status"] }, +}); + +// Block until complete (timeout in seconds) +const result = await client.taskRun.result(taskRun.run_id, { timeout: 600 }); +console.log(result.output); +``` + +### Data enrichment — `client.taskRun` with a structured schema + +```ts +import Parallel from "parallel-web"; + +const client = new Parallel(); + +// Enrichment best practices: specific field names (ceo_name, not name), +// exact formats in descriptions ("MM-YYYY", "USD"), explicit "Not Available" +// on missing data, additionalProperties: false, all fields in required. +const taskRun = await client.taskRun.create({ + input: "", + task_spec: { + output_schema: { + type: "json", + json_schema: { + type: "object", + additionalProperties: false, + properties: { + founding_date: { + type: "string", + description: "Founding date in MM-YYYY format. Return 'Not Available' if unknown.", + }, + employee_count: { + type: "string", + description: "Estimated employee count as a range (e.g. '500-1000').", + }, + funding_sources: { + type: "string", + description: "Description of funding sources and total raised in USD.", + }, + }, + required: ["founding_date", "employee_count", "funding_sources"], + }, + }, + }, + processor: "core", +}); + +const result = await client.taskRun.result(taskRun.run_id, { timeout: 600 }); +console.log(result.output); +``` + +### Lead discovery — FindAll API via `client.post` + +FindAll is not yet a typed resource on the TS SDK. 
Use the generic `client.post`/`client.get` helpers. Deep dive: [Generators & Pricing](https://docs.parallel.ai/findall-api/core-concepts/findall-generator-pricing).

**Key tip:** if you get **0 matches**, upgrade the generator (`preview → base → core → pro`) before rewriting the query — the issue is usually pool size, not query quality.

```ts
import Parallel from "parallel-web";

const client = new Parallel();

// Tip: start with generator: "preview" to test your query (~10 candidates, low cost).
// Generators: preview (test), base (broad/common), core (specific), pro (rare/thorough).
const run = await client.post<{ findall_id: string }>(
  "/v1beta/findall/runs",
  {
    body: {
      objective: "",
      entity_type: "companies",
      match_conditions: [
        {
          name: "series_a_2024",
          description:
            "Company must have raised a Series A funding round in 2024, confirmed by a reputable source.",
        },
        {
          name: "ai_focused",
          description: "Primary product must be AI-focused (LLMs, ML infra, AI apps).",
        },
      ],
      generator: "core",
      match_limit: 20,
    },
  }
);

// Blocks until complete — may take several minutes for "core" / "pro".
type FindAllResult = {
  candidates: Array<{ name: string; url: string; description: string }>;
};
const result = await client.get<FindAllResult>(
  `/v1beta/findall/runs/${run.findall_id}/result`
);

for (const candidate of result.candidates) {
  console.log(`${candidate.name}: ${candidate.url}`);
  console.log(`  ${candidate.description}`);
}
```

### Web Monitoring — Monitor API via `client.post`

Monitor is alpha; no typed resource yet. Use `client.post` with a response type generic:

```ts
import Parallel from "parallel-web";

const client = new Parallel();

// Write queries in natural language focused on intent, not keywords.
// Cadence: "hourly" (fast-moving), "daily" (most news), "weekly" (slow-changing).
// Don't include dates — Monitor tracks new updates automatically.
+// Tip: simulate_event=true forces an immediate test event during dev.
type MonitorResponse = { monitor_id: string; status: string; cadence: string };
const monitor = await client.post<MonitorResponse>("/v1alpha/monitors", {
  body: {
    query: "",
    cadence: "daily",
    webhook: {
      url: "https://your-app.com/webhook",
      event_types: ["monitor.event.detected"],
    },
  },
});

console.log("Monitor created:", monitor.monitor_id, "status:", monitor.status);
```

### Content extraction — `client.extract`

Deep dive: [Extract Best Practices](https://docs.parallel.ai/extract/best-practices) · [Advanced Settings](https://docs.parallel.ai/extract/advanced-extract-settings).

```ts
import Parallel from "parallel-web";

const client = new Parallel();

// Best practices:
// - Always provide objective; add 2-3 search_queries (3-6 words each).
// - Up to 20 URLs per call. PDFs and JS-heavy pages are handled.
// - full_content=true only when you need the whole page as markdown.
// WARNING: full_content without an objective is redundant.
// - fetch_policy.max_age_seconds for freshness control.
// - session_id: pass the SAME UUID across related Search + Extract calls.
// - client_model: declare your consuming LLM for server-side optimization.
const extract = await client.extract({
  urls: ["https://example.com/article"],
  objective: "",
  excerpt_settings: { max_chars_per_result: 5000 },
  // session_id: "",
  // client_model: "claude-opus-4-7",
  // fetch_policy: { max_age_seconds: 3600 }, // uncomment for fresh content
});

for (const result of extract.results) {
  console.log(`${result.title}: ${result.url}`);
  for (const excerpt of result.excerpts) {
    console.log(excerpt.slice(0, 200));
  }
}
```

## Error handling

The SDK throws `Parallel.APIError` subclasses (`BadRequestError`, `AuthenticationError`, `RateLimitError`, `InternalServerError`, …). Catch and inspect `err.status` for retry logic.

```ts
import Parallel from "parallel-web";

const client = new Parallel();

try {
  await client.search({ /* ... */ });
} catch (err) {
  if (err instanceof Parallel.RateLimitError) {
    // back off and retry
  } else if (err instanceof Parallel.APIError) {
    console.error(err.status, err.message);
  } else {
    throw err;
  }
}
```