diff --git a/README.md b/README.md index 60801ec6..08227f26 100644 --- a/README.md +++ b/README.md @@ -51,9 +51,13 @@ Assuming you have an MCP-compatible client (OpenClaw, Cursor, Codex, LM Studio, The Tool workflow (tools-only, not MCP tasks protocol) 1. `prompt_examples` -2. `task_create` -3. `task_status` (poll every 5 minutes until done) -4. download the result via `task_download` or via `task_file_info` +2. `model_profiles` (optional, helps choose `model_profile`) +3. non-tool step: draft/approve prompt +4. `task_create` +5. `task_status` (poll every 5 minutes until done) +6. download the result via `task_download` or via `task_file_info` + +Concurrency note: each `task_create` call returns a new `task_id`; server-side global per-client concurrency is not capped, so clients should track their own parallel tasks. ### Option A: Remote MCP (fastest path) diff --git a/database_api/model_taskitem.py b/database_api/model_taskitem.py index 28ac5e4a..190bc2df 100644 --- a/database_api/model_taskitem.py +++ b/database_api/model_taskitem.py @@ -5,6 +5,31 @@ from sqlalchemy_utils import UUIDType from sqlalchemy import JSON from sqlalchemy.orm import column_property +from sqlalchemy import event + + +def _sanitize_utf8_text(value): + """Normalize values into valid UTF-8-safe text for persistence.""" + if value is None: + return None + + if isinstance(value, str): + text = value + elif isinstance(value, (bytes, bytearray, memoryview)): + text = bytes(value).decode("utf-8", errors="replace") + else: + text = str(value) + + # Postgres text does not support embedded NULL bytes. + if "\x00" in text: + text = text.replace("\x00", "") + + # Replace unpaired surrogates or other non-encodable code points. 
+ try: + text.encode("utf-8", errors="strict") + except UnicodeEncodeError: + text = text.encode("utf-8", errors="replace").decode("utf-8") + return text class TaskState(enum.Enum): pending = 1 @@ -113,3 +138,10 @@ def demo_items(cls) -> list['TaskItem']: } ) return [task1, task2, task3] + + +@event.listens_for(TaskItem, "before_insert") +@event.listens_for(TaskItem, "before_update") +def _sanitize_taskitem_fields(_mapper, _connection, target): + # Enforce valid UTF-8-safe prompt text regardless of writer path. + target.prompt = _sanitize_utf8_text(target.prompt) diff --git a/database_api/tests/test_taskitem_model.py b/database_api/tests/test_taskitem_model.py index 248b829e..5ad311a8 100644 --- a/database_api/tests/test_taskitem_model.py +++ b/database_api/tests/test_taskitem_model.py @@ -39,3 +39,36 @@ def test_stop_request_fields_default(self): self.assertTrue(hasattr(fetched, "run_activity_overview_json")) self.assertTrue(hasattr(fetched, "run_artifact_layout_version")) self.assertFalse(bool(fetched.stop_requested)) + + def test_prompt_invalid_bytes_are_sanitized(self): + with self.app.app_context(): + bad_bytes = b"Hello \xe2\x80 world" + task = TaskItem( + state=TaskState.pending, + prompt=bad_bytes, + user_id="test_user", + ) + db.session.add(task) + db.session.commit() + + fetched = db.session.get(TaskItem, task.id) + self.assertIsInstance(fetched.prompt, str) + # Must be encodable after sanitization. 
+ fetched.prompt.encode("utf-8") + self.assertIn("Hello", fetched.prompt) + self.assertIn("world", fetched.prompt) + + def test_prompt_surrogates_are_sanitized(self): + with self.app.app_context(): + task = TaskItem( + state=TaskState.pending, + prompt="prefix \ud800 suffix", + user_id="test_user", + ) + db.session.add(task) + db.session.commit() + + fetched = db.session.get(TaskItem, task.id) + self.assertIsInstance(fetched.prompt, str) + fetched.prompt.encode("utf-8") + self.assertFalse(any(0xD800 <= ord(ch) <= 0xDFFF for ch in fetched.prompt)) diff --git a/docker-compose.yml b/docker-compose.yml index 23588ae1..2bd9d77e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -242,6 +242,8 @@ services: PLANEXE_WORKER_PLAN_URL: ${PLANEXE_WORKER_PLAN_URL:-http://worker_plan:8000} ports: - "${PLANEXE_MCP_HTTP_PORT:-8001}:8001" + volumes: + - ./llm_config:/app/llm_config:ro restart: unless-stopped healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8001/healthcheck').read()"] diff --git a/docs/mcp/antigravity.md b/docs/mcp/antigravity.md index 40e1dca6..eaeca9b7 100644 --- a/docs/mcp/antigravity.md +++ b/docs/mcp/antigravity.md @@ -18,15 +18,13 @@ My interaction history: 4. I didn't meant outbreak, I meant vulcanic 5. your prompt is a bit shorter than the example prompts 6. go ahead create the plan -7. stop that plan you are creating. -8. now create the plan again, this time with ALL details. Last time you had FAST selected that would leave out most details. -9. check status +7. check status +8. status +9. status 10. status -11. status -12. status -13. download the report -14. summarize the report -15. does it correspond to your expectations? +11. download the report +12. summarize the report +13. does it correspond to your expectations? I had to manually ask about `check status` to get details how the plan creation was going. It's not something that Antigravity can do. 
diff --git a/docs/mcp/cursor.md b/docs/mcp/cursor.md index 7365904b..ee75da6d 100644 --- a/docs/mcp/cursor.md +++ b/docs/mcp/cursor.md @@ -51,7 +51,7 @@ My interaction with Cursor for creating a plan is like this: 2. I want you to come up with a good prompt 3. I want something ala winter olympics in Italy 2026 4. Slightly different idea. I want Denmark to switch from DKK to EUR. Use the persona of a person representing Denmark's ministers. -5. go ahead create plan with all details +5. go ahead create the plan 6. *wait for 18 minutes until the plan has been created* 7. download the plan diff --git a/docs/mcp/inspector.md b/docs/mcp/inspector.md index e013ff85..aff07940 100644 --- a/docs/mcp/inspector.md +++ b/docs/mcp/inspector.md @@ -68,18 +68,23 @@ When connected follow these steps: Now there should be a list with tool names and descriptions: ``` prompt_examples +model_profiles task_create task_status task_stop task_file_info ``` +When you inspect `task_create`, the visible input schema includes `prompt` and optional `model_profile`. +The `speed_vs_detail` parameter is intentionally hidden and only set via tool-specific metadata, since it confuses AI agents. + Follow these steps: ![screenshot of mcp inspector invoke tool](inspector_step5_mcp_planexe_org.webp) 1. In the `Tools` panel; Click on the `prompt_examples` tool. -2. In the `prompt_examples` right sidepanel; Click on `Run Tool`. -3. The MCP server should respond with a list of list of example prompts. +2. In the `prompt_examples` right sidepanel; Click on `Run Tool`. +3. The MCP server should respond with a list of example prompts. +4. Optionally run `model_profiles` to inspect available `model_profile` choices before `task_create`. ## Approach 2. 
MCP server inside docker diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 6efdc8d6..8b28a72c 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -10,12 +10,13 @@ This document lists the MCP tools exposed by PlanExe and example prompts for age - The primary MCP server runs in the cloud (see `mcp_cloud`). - The local MCP proxy (`mcp_local`) forwards calls to the server and adds a local download helper. - Tool responses return JSON in both `content.text` and `structuredContent`. +- Workflow note: drafting and user approval of the prompt is a non-tool step between setup tools and `task_create`. ## Tool Catalog, `mcp_cloud` ### prompt_examples -Returns around five example prompts that show what good prompts look like. Each sample is typically 300–800 words: detailed context, requirements, and success criteria. Usually the AI does the heavy lifting: the user has a vague idea, the agent calls `prompt_examples`, then expands that idea into a high-quality prompt (300–800 words). The prompt is shown to the user, who can ask for further changes or confirm it’s good to go. When the user confirms, the agent then calls `task_create`. Shorter or vaguer prompts produce lower-quality plans. +Returns around five example prompts that show what good prompts look like. Each sample is typically 300-800 words. Usually the AI does the heavy lifting: the user has a vague idea, the agent calls `prompt_examples`, then expands that idea into a high-quality prompt (300-800 words). A compact prompt shape works best: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. The prompt is shown to the user, who can ask for further changes or confirm it’s good to go. When the user confirms, the agent then calls `task_create`. Shorter or vaguer prompts produce lower-quality plans. 
Example prompt: ``` @@ -27,7 +28,33 @@ Example call: {} ``` -Response includes `samples` (array of prompt strings, each 300–800 words) and `message`. +Response includes `samples` (array of prompt strings, each ~300-800 words) and `message`. + +### model_profiles + +Returns profile guidance and model availability for `task_create.model_profile`. +This helps agents pick a profile without knowing internal `llm_config/*.json` details. +Profiles with zero models are omitted from the `profiles` list. +If no models are available in any profile, `model_profiles` returns `isError=true` with `error.code = MODEL_PROFILES_UNAVAILABLE`. + +Example prompt: +``` +List available model profiles and models. +``` + +Example call: +```json +{} +``` + +Response includes: +- `default_profile` +- `profiles[]` with: + - `profile` + - `title` + - `summary` + - `model_count` + - `models[]` (`key`, `provider_class`, `model`, `priority`) ### task_create @@ -41,11 +68,71 @@ Example call: {"prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes..."} ``` -Optional argument: +Optional visible argument: +```text +model_profile: "baseline" | "premium" | "frontier" | "custom" ``` + +Developer-only hidden metadata (not part of visible tool schema shown to agents): +```text speed_vs_detail: "ping" | "fast" | "all" ``` +Example with visible `model_profile`: +```json +{"prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", "model_profile": "premium"} +``` + +Example with hidden metadata override. The `ping` only checks if the LLMs are connected and doesn't trigger a full plan to be created: +```json +{ + "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", + "metadata": { + "task_create": { + "speed_vs_detail": "ping" + } + } +} +``` + +Example with hidden metadata override. 
The `fast` triggers a plan to be created, where the entire Luigi pipeline gets exercised, while skipping as much detail as possible: +```json +{ + "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", + "metadata": { + "task_create": { + "speed_vs_detail": "fast" + } + } +} +``` + +Example with hidden metadata override. The `all` is the default setting. Creates a plan with **ALL** details: +```json +{ + "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", + "metadata": { + "task_create": { + "speed_vs_detail": "all" + } + } +} +``` + +Counterexamples (do NOT use PlanExe for these): + +- "Give me a 5-point checklist for X." +- "Summarize this paragraph in 6 bullets." +- "Rewrite this email." +- "Identify the risks of this project." +- "Make a SWOT for this document." + +What to do instead: + +- For one-shot outputs, use a normal LLM response directly. +- For PlanExe, send a substantial multi-phase project prompt with scope, constraints, timeline, budget, stakeholders, and success criteria. +- PlanExe always runs a fixed end-to-end pipeline; it does not support selecting only internal pipeline subsets. + ### task_status Fetch status/progress and recent files for a task. @@ -60,6 +147,13 @@ Example call: {"task_id": "2d57a448-1b09-45aa-ad37-e69891ff6ec7"} ``` +State contract: + +- `pending`: queued and waiting for a worker, keep polling. +- `processing`: picked up by a worker, keep polling. +- `completed`: terminal success, proceed to download. +- `failed`: terminal error. + ### task_stop Request an active task to stop. @@ -135,11 +229,51 @@ Example call: {"task_id": "2d57a448-1b09-45aa-ad37-e69891ff6ec7", "artifact": "report"} ``` +`PLANEXE_PATH` behavior for `task_download`: +- Save directory is `PLANEXE_PATH`, or current working directory if unset. +- Non-existing directories are created automatically. +- If `PLANEXE_PATH` points to a file, download fails. 
+- Filename is prefixed with task id (for example `<task_id>-030-report.html`). +- Response includes `saved_path` with the exact local file location. + +## Minimal error-handling contract + +Error payload shape: +```json +{"error": {"code": "SOME_CODE", "message": "Human readable message", "details": {}}} +``` + +Common cloud/core error codes: +- `TASK_NOT_FOUND` +- `INVALID_USER_API_KEY` +- `USER_API_KEY_REQUIRED` +- `INSUFFICIENT_CREDITS` +- `INTERNAL_ERROR` +- `MODEL_PROFILES_UNAVAILABLE` +- `generation_failed` +- `content_unavailable` + +Common local proxy error codes: +- `REMOTE_ERROR` +- `DOWNLOAD_FAILED` + +Special case: +- `task_file_info` may return `{}` while the artifact is not ready yet (not an error). + +## Concurrency semantics (practical) + +- Each `task_create` call creates a new task with a new `task_id`. +- The server does not enforce a global “one active task per client” cap. +- Parallelism is a client orchestration concern: + - start with 1 task + - scale to 2 in parallel if needed + - avoid more than 4 unless you have strong task-tracking UX + ## Typical Flow ### 1. Get example prompts -The user often starts with a vague idea. The AI calls `prompt_examples` first to see what good prompts look like (around five samples, 300–800 words each), then expands the user’s idea into a high-quality prompt and shows it to the user. +The user often starts with a vague idea. The AI calls `prompt_examples` first to see what good prompts look like (around five samples, typically 300-800 words each), then expands the user’s idea into a high-quality prompt using this compact shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. Prompt: ``` @@ -151,7 +285,23 @@ Tool call: {} ``` -### 2. Create a plan +### 2. Inspect model profiles (optional but recommended) + +Prompt: +``` +Show model profile options and available models. +``` + +Tool call: +```json +{} +``` + +### 3. 
Draft and approve the prompt (non-tool step) + +At this step, the agent writes a high-quality prompt draft (typically 300-800 words, with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria), shows it to the user, and waits for approval. + +### 4. Create a plan The user reviews the prompt and either asks for further changes or confirms it’s good to go. When the user confirms, the agent calls `task_create` with that prompt. @@ -160,7 +310,7 @@ Tool call: {"prompt": "..."} ``` -### 3. Get status +### 5. Get status Prompt: ``` @@ -172,7 +322,7 @@ Tool call: {"task_id": ""} ``` -### 4. Download the report +### 6. Download the report Prompt: ``` diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index 933d25ef..f5cd57f3 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -11,19 +11,31 @@ This is the shortest path to a working PlanExe MCP integration. ## 1. Understand the flow 1. Ask for prompt examples. -2. Expand the user idea into a high‑quality prompt. -3. Create the plan task. -4. Poll for status. -5. Download the report (HTML or zip). +2. Inspect `model_profile` options and available models. +3. Expand the user idea into a high-quality prompt (typically ~300-800 words) and get user approval. + Use this compact shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. +4. Create the plan task. +5. Poll for status (about every 5 minutes). +6. Download artifacts via `task_file_info` (cloud) or `task_download` (mcp_local helper). --- ## 2. Minimal tool usage 1. `prompt_examples` -2. `task_create` -3. `task_status` -4. `task_download` +2. `model_profiles` +3. `task_create` +4. `task_status` +5. `task_file_info` + +Optional local helper: +- `task_download` (provided by `mcp_local`, not `mcp_cloud`) + +For `task_create`: + +- Visible arguments: `prompt` (required), `model_profile` (optional). +- Hidden developer metadata: `speed_vs_detail` (`ping` | `fast` | `all`). 
+- Reference: [PlanExe MCP interface](planexe_mcp_interface.md#62-task_create) --- @@ -31,7 +43,9 @@ This is the shortest path to a working PlanExe MCP integration. - You can fetch example prompts. - You can create a plan task. -- You can download the report artifact. +- You can fetch artifact metadata/URLs with `task_file_info` (and optionally save locally via `task_download` when using `mcp_local`). +- Your client can parse `error.code` and `error.message` and handle `{}` from `task_file_info` as "not ready yet". +- If running parallel work, your client tracks multiple `task_id`s explicitly (server-side global cap is not enforced). --- diff --git a/docs/mcp/mcp_troubleshooting.md b/docs/mcp/mcp_troubleshooting.md index 1ecce90a..a2f90dcc 100644 --- a/docs/mcp/mcp_troubleshooting.md +++ b/docs/mcp/mcp_troubleshooting.md @@ -10,7 +10,7 @@ Common MCP integration issues and fixes. ## Cannot create a plan -- Ensure your prompt is detailed (300–800 words). +- Ensure your prompt is detailed (typically ~300-800 words) and includes objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. - Some topics may be refused by the model (harmful, unethical, or dangerous requests). - Try a smaller model or a more reliable paid model. - Confirm the MCP server is reachable from your client. diff --git a/docs/mcp/mcp_welcome.md b/docs/mcp/mcp_welcome.md index 997d1e6f..c181fca4 100644 --- a/docs/mcp/mcp_welcome.md +++ b/docs/mcp/mcp_welcome.md @@ -4,7 +4,7 @@ title: Welcome to PlanExe MCP # Welcome to PlanExe MCP -PlanExe MCP lets [AI agents](https://en.wikipedia.org/wiki/AI_agent) (and the tools you build) create [strategic plans](https://en.wikipedia.org/wiki/Strategic_planning) from a plain-English prompt. You send a goal; PlanExe produces a draft plan. The MCP user then chooses whether to download the **HTML report** or a **zip** of intermediary files (JSON, MD, CSV) used to build that report. 
+PlanExe MCP lets [AI agents](https://en.wikipedia.org/wiki/AI_agent) (and the tools you build) create [strategic project-plan drafts](https://en.wikipedia.org/wiki/Strategic_planning) from a plain-English prompt. You send a goal; PlanExe produces a draft plan with 20+ sections — including adversarial analysis that stress-tests whether the plan holds up. The MCP user then chooses whether to download the **HTML report** or a **zip** of intermediary files (JSON, MD, CSV) used to build that report. No MCP experience is required to get started. @@ -19,19 +19,21 @@ No MCP experience is required to get started. ## What you can do -- **Get example prompts** — See what good prompts look like (detailed, typically 300–800 words). It is the **caller’s responsibility** to take inspiration from these examples and ensure the prompt sent to PlanExe is of similar or better quality. The agent can refine a vague idea into a high-quality prompt and show it to the user for approval before creating the plan. -- **Create a plan** — Send a prompt; PlanExe starts creating the plan (takes about 15–20 minutes). If the input prompt is of low quality, the output plan will be crap too. +- **Get example prompts** — See what good prompts look like (detailed, typically ~300-800 words). It is the **caller’s responsibility** to take inspiration from these examples and ensure the prompt sent to PlanExe is of similar or better quality. A compact prompt shape works best: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. The agent can refine a vague idea into a high-quality prompt and show it to the user for approval before creating the plan. +- **Create a plan** — Send a prompt; PlanExe starts creating the plan (typically takes 10–20 minutes on baseline profile). If the input prompt is of low quality, the output plan will be crap too. Visible `task_create` options include `model_profile`. 
- **Check progress** — Ask for status and see how far the plan has gotten. - **Download the report** — When the plan is ready, the user specifies whether to download the HTML report or the zip of intermediary files (JSON, MD, CSV). +Developer note: `speed_vs_detail` is intentionally hidden from the visible `task_create` interface and is provided via tool-specific metadata when needed. + --- ## What you get The MCP user chooses which artifact to download: -- **HTML report** (around 40 pages) — executive summary, Gantt chart, risks, next steps, and more. Opens in a browser. -- **Zip** — intermediary files (JSON, MD, CSV) used to build the HTML report, for deeper inspection. +- **HTML report** (~700KB, self-contained) — 20+ sections including executive summary, interactive Gantt charts, investor pitch, SWOT, governance, team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections (premortem, self-audit, premise attacks). Opens in a browser with collapsible sections and interactive charts. +- **Zip** — intermediary pipeline files (JSON, MD, CSV) that fed the HTML report, for deeper inspection. --- diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 11eb7876..3bc711e1 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -4,19 +4,19 @@ ### 1.1 What is PlanExe -PlanExe is a service that generates **rough-draft project plans** from a natural-language prompt. You describe a large goal (e.g. open a clinic, launch a product, build a moon base)—the kind of project that in reality takes months or years. PlanExe produces a structured draft: steps, documents, and deliverables. The plan is not executable in its current form; it is a draft to refine and act on. Creating a plan is a long-running task (100+ LLM inference calls): create a task with a prompt, poll status, then download the HTML report and zip when done. 
+PlanExe is a service that generates **strategic project-plan drafts** from a natural-language prompt. You describe a large goal (e.g. open a clinic, launch a product, build a moon base)—the kind of project that in reality takes months or years. PlanExe produces a structured draft with 20+ sections: steps, documents, and deliverables. The plan is not executable in its current form; it is a draft to refine and act on. Creating a plan is a long-running task (100+ LLM inference calls): create a task with a prompt, poll status, then download the HTML report and zip when done. ### 1.2 What kind of plan does it create -The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifacts including a Gantt chart, risk analysis, and other project management deliverables. The main output is a large HTML file (approx 700KB) containing many sections. There is also a zip file containing all intermediary files (md, json, csv). Plan quality depends on prompt quality; use the prompt_examples tool to see the baseline before calling task_create. +The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifacts including a Gantt chart, risk analysis, and other project management deliverables. The main output is a self-contained interactive HTML report (~700KB) with collapsible sections, interactive Gantt charts, and embedded JavaScript. The report contains 20+ sections including executive summary, investor pitch, project plan with SMART criteria, strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, plan review, Q&A, premortem with failure scenarios, self-audit checklist, and adversarial premise attacks. There is also a zip file containing all intermediary pipeline files (md, json, csv) that fed the report. 
Plan quality depends on prompt quality; use the prompt_examples tool to see the baseline before calling task_create. #### 1.2.1 Agent-facing summary (for server instructions / tool descriptions) Implementors should expose the following to agents so they understand what PlanExe does: -- **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. -- **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). +- **What:** PlanExe turns a plain-English goal into a strategic project-plan draft (20+ sections) in ~10–20 min. Sections include executive summary, interactive Gantt charts, investor pitch, SWOT, governance, team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections (premortem, self-audit, premise attacks) that stress-test the plan. The output is a draft to refine, not an executable or final document — but it surfaces hard questions the prompter may not have considered. +- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. 
Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` (mcp_local helper) or `task_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. +- **Output:** Self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. The zip contains the intermediary pipeline files (md, json, csv) that fed the report. ### 1.3 Scope of this document @@ -70,10 +70,10 @@ The interface is designed to support: The MCP specification defines two different mechanisms: -- **MCP tools** (e.g. task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_download. This document specifies those tools. +- **MCP tools** (e.g. task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_file_info (and optionally task_download via mcp_local). This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. 
The intended flow is: Step 1 — call prompt_examples; Step 2 — formulate a good prompt (user approval); Step 3 — call task_create; then poll task_status and call task_download when complete. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: call `prompt_examples`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `task_create`; poll `task_status`; then call `task_file_info` (or `task_download` via mcp_local). --- @@ -92,13 +92,13 @@ A long-lived container for a PlanExe project run. - config: immutable run configuration (models, runtime limits, Luigi params) - created_at, updated_at -#### Run +#### Execution -A single execution attempt inside a task (e.g., after a resume). +A single execution attempt inside a task. **Key properties** -- state: running | stopped | completed | failed +- state: pending | processing | completed | failed - progress_percentage: computed progress percentage (float) - started_at, ended_at @@ -128,32 +128,25 @@ A typed message emitted during execution for UI/agent consumption. ## 5. State Machine -### 5.1 Task states +### 5.1 TaskItem.state values -Tasks may exist independent of active runs. 
+The public MCP `state` field is aligned with `TaskItem.state`: -- created: task initialized, no run started -- active: at least one run exists, may be running or stopped -- archived: optional; immutable, no new runs allowed - -### 5.2 Run states - -- running -- stopping (optional transitional state) -- stopped (user stopped, resumable) +- pending (queued, waiting for a worker) +- processing (picked up by a worker) - completed -- failed (resumable depending on failure type) +- failed -### 5.3 Allowed transitions +### 5.2 Allowed transitions -- running → stopped via task_stop -- running → completed via normal success -- running → failed via error +- pending → processing when picked up by a worker +- processing → completed via normal success +- processing → failed via error -**Invalid** +### 5.3 Invalid transitions -- completed → running (new run must be triggered by creating a new task) -- running → running (no concurrent runs in v1) +- completed → processing (new run must be triggered by creating a new task) +- processing → processing is not a state transition on the same task; create separate tasks for parallel work. --- @@ -163,7 +156,9 @@ All tool names below are normative. ### 6.1 prompt_examples -**Step 1 — Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. Correct flow: Step 1 — call this tool to fetch examples. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. If you call task_create before formulating and approving a prompt, the resulting plan will be lower quality than it could be. +**Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. 
Correct flow: call this tool; optionally call `model_profiles`; then complete a non-tool step (draft and approve a detailed prompt, typically ~300-800 words); only then call `task_create`. If you call `task_create` before formulating and approving a prompt, the resulting plan will be lower quality than it could be. + +Write the prompt as flowing prose, not structured markdown with headers or bullet lists. Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance structure inline. Typical length: 300–800 words. The examples demonstrate this prose style — match their tone and density. **Request:** no parameters (empty object). @@ -178,9 +173,46 @@ All tool names below are normative. --- +### 6.1.1 model_profiles + +Optional helper tool to discover valid `model_profile` choices and currently available models without relying on internal config knowledge. +Profiles with zero available models are omitted from the returned `profiles` array. +If no models are available in any profile, the tool returns `isError=true` with `error.code = MODEL_PROFILES_UNAVAILABLE`. + +**Request:** no parameters (empty object). + +**Response (shape)** + +```json +{ + "default_profile": "baseline", + "profiles": [ + { + "profile": "baseline", + "title": "Baseline", + "summary": "Cheap and fast; recommended default when creating a plan.", + "model_count": 5, + "models": [ + { + "key": "openrouter-gpt-oss-20b", + "provider_class": "OpenRouter", + "model": "openai/gpt-oss-20b", + "priority": 0 + } + ] + } + ], + "message": "..." +} +``` + +Use the returned `profile` values directly in `task_create.model_profile`. + +--- + ### 6.2 task_create -**Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2).** Start creating a new plan with the approved prompt. speed_vs_detail modes: 'all' runs the full pipeline with all details (slower, higher token usage/cost). 
'fast' runs the full pipeline with minimal work per step (faster, fewer details), useful to verify the pipeline is working. 'ping' runs the pipeline entrypoint and makes a single LLM call to verify the worker_plan_database is processing tasks and can reach the LLM. +**Call only after prompt_examples and after the non-tool drafting/approval step.** Start creating a new plan with the approved prompt. **Request** @@ -191,11 +223,12 @@ All tool names below are normative. "type": "object", "properties": { "prompt": { "type": "string" }, - "speed_vs_detail": { + "model_profile": { "type": "string", - "enum": ["ping", "fast", "all"], - "default": "ping" - } + "enum": ["baseline", "premium", "frontier", "custom"], + "default": "baseline" + }, + "user_api_key": { "type": "string" } }, "required": ["prompt"] } @@ -206,24 +239,70 @@ All tool names below are normative. ```json { "prompt": "string", - "speed_vs_detail": "ping", + "model_profile": "baseline", "user_api_key": "pex_..." } ``` +**Tool-specific metadata (developer-only, hidden from model-visible schema)** + +Use tool-specific metadata when you need runtime overrides that should not be visible in the tool interface shown to AI agents. + +`speed_vs_detail` is read from metadata, not from the visible input schema. + +- `speed_vs_detail` accepted values: + - `ping`: single LLM call to verify the pipeline/LLM path. + - `fast`: reduced-detail run through the full pipeline. + - `all`: full-detail run through the full pipeline. + +**Metadata example** + +```json +{ + "prompt": "string", + "metadata": { + "task_create": { + "speed_vs_detail": "ping" + } + } +} +``` + **Prompt quality** -The `prompt` parameter should be a detailed description of what the plan should cover. Good prompts are typically 300–800 words and include: +The `prompt` parameter should be a detailed description of what the plan should cover. 
Good prompts are typically 300-800 words and include: + +- Objective +- Scope +- Constraints +- Timeline +- Stakeholders +- Budget/resources +- Success criteria + +Write as flowing prose, not structured markdown. Include banned approaches, governance preferences, and phasing inline. Short one-liners (e.g., "Construct a bridge") tend to produce poor output because they lack context for the planning pipeline. Important details are location, budget, time frame. + +**Counterexamples: when NOT to use PlanExe** + +Use a normal single LLM response (not PlanExe) for one-shot micro-tasks. PlanExe runs a heavy multi-step planning pipeline and is best for substantial project planning. + +- Bad (do not send to task_create): "Give me a 5-point checklist for launching a coffee shop." +- Better non-PlanExe action: ask the LLM directly for a checklist. +- Better PlanExe prompt: "Create a 12-month strategic launch plan for a coffee shop in Austin with budget caps, lease milestones, hiring plan, permits, supply chain, marketing channels, risk register, governance, and success KPIs." + +- Bad (do not send to task_create): "Summarize this text in 6 bullets." +- Better non-PlanExe action: use direct summarization in the chat model. -- Clear context: background, constraints, and goals -- Specific requirements: budget, timeline, location, or technical constraints -- Success criteria: what "done" looks like -- Banned words or approaches (if any) +- Bad (invalid assumption): "Run only the risk-register part of PlanExe." +- Rule: PlanExe pipeline execution is fixed end-to-end. Callers cannot choose internal step subsets. +- Better PlanExe prompt: request a full plan where risk analysis is one required deliverable. -Short one-liners (e.g., "Construct a bridge") tend to produce poor output because they lack context for the planning pipeline. Important details are location, budget, time frame. +- Bad (do not send to task_create): "Rewrite this email to sound professional." 
+- Better non-PlanExe action: use direct rewriting in the chat model. **Optional** +- model_profile: LLM profile (`baseline` | `premium` | `frontier` | `custom`). If unsure, call `model_profiles` first. - user_api_key: user API key for credits and attribution (if your deployment requires it). Clients can call the MCP tool **prompt_examples** to retrieve example prompts. Use these as examples for task_create; they can also call task_create with any prompt—short prompts produce less detailed plans. @@ -243,18 +322,19 @@ For the full catalog file: **Important** -- task_id is a UUID returned by task_create. Use this exact UUID for task_status/task_stop/task_download. +- task_id is a UUID returned by task_create. Use this exact UUID for task_status/task_stop/task_file_info (and task_download when using mcp_local). **Behavior** - Must be idempotent only if client supplies an optional client_request_id (optional extension). - Task config is immutable after creation in v1. +- By default, repeated `task_create` calls produce new tasks (new `task_id`s). --- ### 6.3 task_status -Returns run status and progress. Used for progress bars and UI states. **Polling interval:** call at reasonable intervals only (e.g. every 5 minutes); plan generation takes 15–20+ minutes and frequent polling is unnecessary. +Returns task status and progress. Used for progress bars and UI states. **Polling interval:** call at reasonable intervals only (e.g. every 5 minutes); plan generation typically takes 10–20 minutes (baseline profile) and may take longer on higher-quality profiles. **Request** @@ -268,12 +348,23 @@ Returns run status and progress. Used for progress bars and UI states. **Polling - task_id: UUID returned by task_create. Use it to reference the plan being created. +**Caller contract (state meanings)** + +- `pending`: queued and waiting for a worker. Keep polling. +- `processing`: picked up by a worker and in progress. Keep polling. +- `completed`: terminal success. 
Download artifacts now. +- `failed`: terminal error. Do not keep polling for completion. + +**Terminal states** + +- `completed`, `failed` + **Response** ```json { "task_id": "5e2b2a7c-8b49-4d2f-9b8f-6a3c1f05b9a1", - "state": "running", + "state": "processing", "progress_percentage": 62.0, "timing": { "started_at": "2026-01-14T12:35:10Z", @@ -296,7 +387,7 @@ Returns run status and progress. Used for progress bars and UI states. **Polling ### 6.4 task_stop -Requests the plan generation to stop. Pass the **task_id** (the UUID returned by task_create). This is a normal MCP tool call: call task_stop with that task_id. +Requests the plan generation to stop. Pass the **task_id** (the UUID returned by task_create). Call `task_stop` with that task_id. **Request** @@ -308,13 +399,14 @@ Requests the plan generation to stop. Pass the **task_id** (the UUID returned by **Input** -- task_id: UUID returned by task_create. Use this same UUID when calling task_stop to request the run to stop. +- task_id: UUID returned by task_create. Use this same UUID when calling task_stop to request the task to stop. **Response** ```json { - "state": "stopped" + "state": "processing", + "stop_requested": true } ``` @@ -336,6 +428,19 @@ Requests the plan generation to stop. Pass the **task_id** (the UUID returned by - task_id: UUID returned by task_create. Use it to download the created plan. - artifact: "report" or "zip" (default "report"). +**task_download local path behavior (mcp_local)** + +- Save directory is `PLANEXE_PATH`. +- If `PLANEXE_PATH` is unset, save to current working directory. +- If `PLANEXE_PATH` points to a file (not a directory), return an error. +- Filenames are `-030-report.html` or `-run.zip`. +- If a filename already exists, append `-1`, `-2`, ... before extension. +- Successful responses include `saved_path`. + +**task_file_info URL behavior (mcp_cloud)** + +- `download_url` is an absolute URL where the requested artifact can be downloaded. + --- ## 7. 
Targets @@ -354,46 +459,88 @@ Targets map to Luigi "final tasks". ## 8. Concurrency & Locking -### 8.1 Single active run per task +### 8.1 Client-side concurrency guidance + +The server does not enforce a global limit on how many tasks a client can create. +Concurrency is a client-side coordination concern. -In v1, tasks MUST enforce: +Recommended practice for MCP clients: -- at most one run in running state. +- Start with 1 active task. +- If needed, increase to 2 tasks in parallel. +- Going beyond 4 parallel tasks is usually hard to track; avoid unless necessary. + +Additional semantics: + +- Every `task_create` call creates a new independent task with a new `task_id`. +- The server does not deduplicate “same prompt” requests into a single shared task. +- Keep your own task registry/client state if you run multiple tasks concurrently. --- ## 9. Error Model -Errors MUST return: +### 9.1 Error object shape + +Tool errors return: -- code: stable machine-readable -- message: human-readable -- details: optional +- `error.code`: stable machine-readable string +- `error.message`: human-readable message +- `error.details`: optional object -**Example:** +Example: ```json { "error": { - "code": "RUN_ALREADY_ACTIVE", - "message": "A run is currently active for this task.", - "details": { "run_id": "run_0001" } + "code": "TASK_NOT_FOUND", + "message": "Task not found: <task_id>" } } ``` -### 9.1 Required error codes - -- TASK_NOT_FOUND -- RUN_NOT_FOUND -- RUN_ALREADY_ACTIVE -- RUN_NOT_ACTIVE -- INVALID_TARGET -- INVALID_ARTIFACT_URI -- CONFLICT -- PERMISSION_DENIED -- RUNNING_READONLY -- INTERNAL_ERROR +### 9.2 isError behavior + +- `task_create`, `task_status`, `task_stop`: unknown/invalid requests return `isError=true` with `error`. +- `model_profiles`: returns `isError=true` with `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. +- `task_file_info`: uses mixed behavior: + - returns `{}` (not an error) while artifacts are not ready. 
+ - may return `{"error": ...}` with `isError=false` for terminal artifact-level problems. + - returns `isError=true` for unknown task id (`TASK_NOT_FOUND`). +- `mcp_local` may return proxy/transport failures as `REMOTE_ERROR` and local download write failures as `DOWNLOAD_FAILED`. + +### 9.3 Minimal code contract (current) + +Cloud/core tool codes: + +- `INVALID_TOOL`: unknown MCP tool name. +- `INTERNAL_ERROR`: uncaught server error. +- `TASK_NOT_FOUND`: task id not found. +- `INVALID_USER_API_KEY`: provided user_api_key is invalid. +- `USER_API_KEY_REQUIRED`: deployment requires user_api_key for task_create. +- `INSUFFICIENT_CREDITS`: caller account has no credits for task_create. +- `MODEL_PROFILES_UNAVAILABLE`: model_profiles found zero available models across all profiles. +- `generation_failed`: returned by task_file_info for the report artifact when the task ended in the failed state. +- `content_unavailable`: task_file_info cannot read requested artifact bytes. + +Local proxy specific codes: + +- `REMOTE_ERROR`: mcp_local could not call mcp_cloud (network/HTTP/protocol layer failure). +- `DOWNLOAD_FAILED`: mcp_local could not write/download artifact to local filesystem. + +### 9.4 Caller handling guidance + +- Retry with backoff: + - `INTERNAL_ERROR` + - `REMOTE_ERROR` + - `content_unavailable` (short retry window) +- Do not retry unchanged request: + - `INVALID_USER_API_KEY` + - `USER_API_KEY_REQUIRED` + - `INSUFFICIENT_CREDITS` + - `INVALID_TOOL` +- For `TASK_NOT_FOUND`: verify the task_id source and stop polling that id. +- For `generation_failed`: treat as a terminal failure and surface the task's progress_message to the user. 
--- diff --git a/frontend_multi_user/src/app.py b/frontend_multi_user/src/app.py index bfdccc34..baea60df 100644 --- a/frontend_multi_user/src/app.py +++ b/frontend_multi_user/src/app.py @@ -37,7 +37,7 @@ from worker_plan_api.filenames import FilenameEnum, ExtraFilenameEnum from worker_plan_api.prompt_catalog import PromptCatalog from sqlalchemy import text, inspect, func -from sqlalchemy.exc import OperationalError +from sqlalchemy.exc import OperationalError, DataError from database_api.model_taskitem import TaskItem, TaskState from database_api.model_event import EventType, EventItem from database_api.model_worker import WorkerItem @@ -1152,6 +1152,31 @@ def _format_relative_time(value: Any) -> str: n = seconds return f"{n} sec" if n == 1 else f"{n} secs" + def _load_prompt_preview_safe(self, task_id: Any, max_chars: int = 240) -> str: + """Load a prompt preview for one task, tolerating corrupted UTF-8 rows.""" + try: + preview = ( + self.db.session.query(func.substr(TaskItem.prompt, 1, max_chars)) + .filter(TaskItem.id == task_id) + .scalar() + ) + text = (preview or "").strip() + if text: + return text + except DataError: + self.db.session.rollback() + logger.warning( + "Detected invalid UTF-8 in task_item.prompt for task_id=%s; using placeholder preview.", + task_id, + exc_info=True, + ) + return "[Prompt unavailable due to encoding issue]" + except Exception: + self.db.session.rollback() + logger.debug("Unable to load prompt preview for task_id=%s", task_id, exc_info=True) + + return "[Prompt unavailable]" + def _get_current_user_account(self) -> Optional[UserAccount]: if not current_user.is_authenticated: return None @@ -2016,25 +2041,50 @@ def index(): if user_id: # Generate a nonce so the user can start a plan from the dashboard nonce = 'DASH_' + str(uuid.uuid4()) - recent_task_rows = ( - self.db.session.query( - TaskItem.id, - TaskItem.state, - func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + try: + recent_task_rows = ( + 
self.db.session.query( + TaskItem.id, + TaskItem.state, + func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + ) + .filter(TaskItem.user_id == str(user_id)) + .order_by(TaskItem.timestamp_created.desc()) + .limit(10) + .all() ) - .filter(TaskItem.user_id == str(user_id)) - .order_by(TaskItem.timestamp_created.desc()) - .limit(10) - .all() - ) - recent_tasks = [ - SimpleNamespace( - id=str(task.id), - state=task.state if isinstance(task.state, TaskState) else None, - prompt=(task.prompt_preview or "").strip(), + except DataError: + self.db.session.rollback() + logger.warning( + "Detected invalid UTF-8 in task_item.prompt for user_id=%s while loading dashboard; " + "falling back without prompt previews.", + user_id, + exc_info=True, + ) + recent_task_rows = ( + self.db.session.query( + TaskItem.id, + TaskItem.state, + ) + .filter(TaskItem.user_id == str(user_id)) + .order_by(TaskItem.timestamp_created.desc()) + .limit(10) + .all() + ) + recent_tasks = [] + for task in recent_task_rows: + prompt_preview = getattr(task, "prompt_preview", None) + if prompt_preview is None: + prompt_text = self._load_prompt_preview_safe(task.id) + else: + prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" + recent_tasks.append( + SimpleNamespace( + id=str(task.id), + state=task.state if isinstance(task.state, TaskState) else None, + prompt=prompt_text, + ) ) - for task in recent_task_rows - ] total_tasks_count = ( TaskItem.query .filter_by(user_id=str(user_id)) @@ -2897,27 +2947,51 @@ def plan(): if not run_id: user_id = str(current_user.id) - tasks = ( - self.db.session.query( - TaskItem.id, - TaskItem.timestamp_created, - TaskItem.state, - func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + try: + tasks = ( + self.db.session.query( + TaskItem.id, + TaskItem.timestamp_created, + TaskItem.state, + func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + ) + .filter(TaskItem.user_id == user_id) + 
.order_by(TaskItem.timestamp_created.desc()) + .all() + ) + except DataError: + self.db.session.rollback() + logger.warning( + "Detected invalid UTF-8 in task_item.prompt for user_id=%s while loading /plan; " + "falling back without prompt previews.", + user_id, + exc_info=True, + ) + tasks = ( + self.db.session.query( + TaskItem.id, + TaskItem.timestamp_created, + TaskItem.state, + ) + .filter(TaskItem.user_id == user_id) + .order_by(TaskItem.timestamp_created.desc()) + .all() ) - .filter(TaskItem.user_id == user_id) - .order_by(TaskItem.timestamp_created.desc()) - .all() - ) rows = [] for task in tasks: ts = task.timestamp_created created_compact = ts.strftime("%y%m%d-%H%M") if isinstance(ts, datetime) else "-" + prompt_preview = getattr(task, "prompt_preview", None) + if prompt_preview is None: + prompt_text = self._load_prompt_preview_safe(task.id) + else: + prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" rows.append({ "id": str(task.id), "created_compact": created_compact, "created_relative": self._format_relative_time(ts), "status": task.state.name if isinstance(task.state, TaskState) else "pending", - "prompt": (task.prompt_preview or "").strip(), + "prompt": prompt_text, }) return render_template("plan_list.html", plan_rows=rows) diff --git a/frontend_multi_user/src/planexe_modelviews.py b/frontend_multi_user/src/planexe_modelviews.py index e28b41e8..7f17581b 100644 --- a/frontend_multi_user/src/planexe_modelviews.py +++ b/frontend_multi_user/src/planexe_modelviews.py @@ -8,12 +8,14 @@ from datetime import datetime from decimal import Decimal from enum import Enum +from typing import Any from flask_admin.contrib.sqla import ModelView from flask_admin.actions import action from markupsafe import Markup from flask import url_for, abort, redirect, Response from flask_login import current_user from sqlalchemy.orm import defer +from wtforms import FileField, BooleanField class AdminOnlyModelView(ModelView): """Restrict admin views to 
authenticated admin users only.""" @@ -152,6 +154,19 @@ class TaskItemView(AdminOnlyModelView): f'Download' ) if m.has_run_track_activity_jsonl else '—', } + form_excluded_columns = [ + 'generated_report_html', + 'run_zip_snapshot', + 'run_track_activity_jsonl', + ] + form_extra_fields = { + 'generated_report_html_upload': FileField('Upload Report HTML'), + 'generated_report_html_clear': BooleanField('Clear existing report HTML'), + 'run_zip_snapshot_upload': FileField('Upload Run ZIP'), + 'run_zip_snapshot_clear': BooleanField('Clear existing run ZIP'), + 'run_track_activity_jsonl_upload': FileField('Upload Track Activity JSONL'), + 'run_track_activity_jsonl_clear': BooleanField('Clear existing track activity JSONL'), + } def get_query(self): return super().get_query().options( @@ -160,6 +175,68 @@ def get_query(self): defer(self.model.run_track_activity_jsonl), ) + def on_form_prefill(self, form: Any, id: Any) -> None: + model = self.get_one(id) + if model is None: + return + + if hasattr(form, "generated_report_html_upload"): + if model.has_generated_report_html: + href = url_for("download_task_report", task_id=str(model.id)) + form.generated_report_html_upload.description = Markup( + f'Current file: download report.html' + ) + else: + form.generated_report_html_upload.description = "Current file: none" + + if hasattr(form, "run_zip_snapshot_upload"): + if model.has_run_zip_snapshot: + href = url_for("download_task_run_zip", task_id=str(model.id)) + form.run_zip_snapshot_upload.description = Markup( + f'Current file: download run.zip' + ) + else: + form.run_zip_snapshot_upload.description = "Current file: none" + + if hasattr(form, "run_track_activity_jsonl_upload"): + if model.has_run_track_activity_jsonl: + href = url_for("download_task_track_activity", task_id=str(model.id)) + form.run_track_activity_jsonl_upload.description = Markup( + f'Current file: download track_activity.jsonl' + ) + else: + form.run_track_activity_jsonl_upload.description = "Current 
file: none" + + def on_model_change(self, form: Any, model: Any, is_created: bool) -> None: + def _read_upload(field_name: str): + field = getattr(form, field_name, None) + data = getattr(field, "data", None) if field is not None else None + filename = getattr(data, "filename", None) if data is not None else None + if not data or not filename: + return None + return data.read() + + uploaded_report = _read_upload("generated_report_html_upload") + uploaded_zip = _read_upload("run_zip_snapshot_upload") + uploaded_track = _read_upload("run_track_activity_jsonl_upload") + + if uploaded_report is not None: + model.generated_report_html = uploaded_report.decode("utf-8", errors="replace") + elif bool(getattr(form.generated_report_html_clear, "data", False)): + model.generated_report_html = None + + if uploaded_zip is not None: + model.run_zip_snapshot = uploaded_zip + elif bool(getattr(form.run_zip_snapshot_clear, "data", False)): + model.run_zip_snapshot = None + + if uploaded_track is not None: + model.run_track_activity_jsonl = uploaded_track.decode("utf-8", errors="replace") + elif bool(getattr(form.run_track_activity_jsonl_clear, "data", False)): + model.run_track_activity_jsonl = None + + return super().on_model_change(form, model, is_created) + class NonceItemView(AdminOnlyModelView): """Custom ModelView for NonceItem""" def __init__(self, model, *args, **kwargs): diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index ba6b0048..386f50e1 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -16,17 +16,44 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - Task management maps to `TaskItem` records (each task = one TaskItem). - Events are queried from `EventItem` database records. - Use the TaskItem UUID as the MCP `task_id`. +- Public task state contract: + - `task_status.state` must use exactly: `pending`, `processing`, `completed`, `failed`. + - These values correspond 1:1 with `database_api.model_taskitem.TaskState`. 
+ - Do not use legacy public names like `running`, `stopping`, or `stopped` for `task_status`. + - Do not expose internal symbol/class names (for example `TaskState.pending`, `TaskItem.state`) in model-facing tool descriptions; use plain public state strings. - Download contract: - `track_activity.jsonl` is internal-only (`TaskItem.run_track_activity_jsonl`). - Downloadable zip artifacts must never include `track_activity.jsonl`. - Serve new layout snapshots directly; sanitize only legacy/fallback zips. +- `task_stop` contract: + - `task_stop` does not create a separate lifecycle state. + - Return current public `state` plus `stop_requested` to acknowledge stop-flag request. - Forbidden imports: `worker_plan.app`, `worker_plan_internal`, `frontend_*`, `open_dir_server`. +## task_create contract +- Expose `model_profiles` as the discovery tool for profile selection. +- `model_profiles` must report profile guidance and currently available models after class whitelist filtering. +- Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. +- Keep concurrency wording explicit: each `task_create` call creates a new `task_id`; no global per-client concurrency cap is enforced server-side. +- Visible input schema is intentionally limited to: + - `prompt` + - `model_profile` (`baseline`, `premium`, `frontier`, `custom`) + - `user_api_key` (optional) +- Keep `speed_vs_detail` out of model-visible input schema. +- Runtime override for `speed_vs_detail` is metadata-only (tool-specific metadata), + read from hidden containers (`tool_metadata`, `metadata`, `_meta`) and nested + namespaces (`task_create`, `planexe_task_create`, `planexe`). +- Preserve compatibility aliases for metadata speed values: + - `ping` -> `ping_llm` + - `fast` -> `fast_but_skip_details` + - `all` -> `all_details_but_slow` + ## MCP Protocol - The server communicates over stdio (standard input/output) following the MCP protocol. 
- Tools are registered via `@mcp_cloud.list_tools()` and handled via `@mcp_cloud.call_tool()`. - All tool responses must be JSON-serializable and follow the error model in the spec. +- Keep tool error codes/docs aligned with actual runtime payloads (for example `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `generation_failed`, `content_unavailable`, `INTERNAL_ERROR`). - Event cursors use format `cursor_{event_id}` for incremental polling. - **Run as task**: We expose MCP **tools** only (task_create, task_status, task_stop, etc.), not the MCP **tasks** protocol (tasks/get, tasks/result, etc.). Do not advertise the tasks capability or add "Run as task" support; the spec and clients (e.g. Cursor) are aligned on tools-only. @@ -36,14 +63,28 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - OAuth is not supported for the MCP API. Do not document, imply, or advertise OAuth support. - In docs and user-facing error/help text, instruct clients to use `X-API-Key` custom headers. +## Download URL environment behavior +- `task_file_info.download_url` should be built from `PLANEXE_MCP_PUBLIC_BASE_URL` when set. +- If `PLANEXE_MCP_PUBLIC_BASE_URL` is unset in HTTP mode, use request host/scheme. +- If no public base URL is available, `download_url` may be absent; document this and guide operators to set `PLANEXE_MCP_PUBLIC_BASE_URL`. + ## mcp_local integration - `mcp_local` runs on the user's machine and forwards tool calls to this server over HTTP. - It targets either: - the HTTP wrapper endpoint (`/mcp/tools/call`), or - the streamable MCP JSON-RPC endpoint (`/mcp`). +- Tool-surface split must stay explicit: + - `mcp_cloud` exposes `task_file_info` (not `task_download`). + - `mcp_local` exposes `task_download` and implements it via cloud `task_file_info`. - `task_file_info` provides download metadata that `mcp_local` uses to download artifacts via `/download/{task_id}/...`. 
+## Troubleshooting guidance (caller-facing text) +- Keep guidance aligned across server instructions and tool descriptions: + - `pending` for longer than 5 minutes usually means queued but not picked up by worker. + - `processing` with no output-file changes for longer than 20 minutes usually means stalled/failed execution. + - In both cases, direct users to report issues at `https://github.com/PlanExeOrg/PlanExe/issues`. + ## MCP Registry metadata - Registry metadata for this server lives at `mcp_cloud/server.json`. - Keep `server.json` aligned with deployed behavior: @@ -52,6 +93,9 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - Publish with `mcp-publisher` from the `mcp_cloud/` directory so it picks up this file. ## Testing -- No automated tests currently. If you change MCP tool behavior or database mappings, - add a unit test close to the logic when feasible and run `python test.py` from - repo root. +- Automated tests exist under `mcp_cloud/tests/`. +- If you change MCP tool behavior, state mapping, or tool surface, update/add unit + tests close to the changed logic. 
+- Run focused tests from repo root, for example: + - `python -m unittest mcp_cloud.tests.test_tool_surface_consistency` + - `python -m unittest mcp_cloud.tests.test_task_status_tool` diff --git a/mcp_cloud/Dockerfile b/mcp_cloud/Dockerfile index 02efb72c..fd6837a2 100644 --- a/mcp_cloud/Dockerfile +++ b/mcp_cloud/Dockerfile @@ -14,6 +14,7 @@ WORKDIR /app COPY database_api /app/database_api COPY worker_plan/worker_plan_api /app/worker_plan_api COPY mcp_cloud /app/mcp_cloud +COPY llm_config /app/llm_config COPY public/llms.txt /app/public/llms.txt # Install dependencies diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index fcb0cc9b..4dd85075 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -14,7 +14,8 @@ mcp_cloud provides a standardized MCP interface for PlanExe's plan generation wo ## Run as task (MCP tasks protocol) -MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `task_create`, `task_status`, `task_stop`, `task_download`. The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. +MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `prompt_examples`, `model_profiles`, `task_create`, `task_status`, `task_stop`, `task_file_info` (or `task_download` via `mcp_local`). The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. 
+Workflow clarity: prompt drafting + user approval is a non-tool step between setup tools and `task_create`. ## Client Choice Guide @@ -90,7 +91,7 @@ Some MCP clients (e.g. OpenClaw/mcporter) connect by doing a **GET** to the serv **You do not need SSE for tools.** MCP over HTTP can use plain JSON: - **List tools:** `GET http://:8001/mcp/tools` → returns `{"tools": [...]}` (JSON). -- **Call a tool:** `POST http://:8001/mcp/tools/call` with body `{"tool": "task_create", "arguments": {"prompt": "…", "speed_vs_detail": "ping"}}` → returns JSON. +- **Call a tool:** `POST http://:8001/mcp/tools/call` with body `{"tool": "task_create", "arguments": {"prompt": "…"}, "metadata": {"task_create": {"speed_vs_detail": "ping"}}}` → returns JSON. If your client only supports Streamable HTTP and fails on `/mcp`, you have two options: @@ -105,7 +106,7 @@ If your client only supports Streamable HTTP and fails on `/mcp`, you have two o - `PLANEXE_MCP_API_KEY`: Optional shared secret for auth. When auth is enabled, clients can use this key instead of a UserApiKey. For production with user accounts, keys from home.planexe.org (UserApiKey) are validated against the database. - `PLANEXE_MCP_HTTP_HOST`: HTTP server host (default: `127.0.0.1`). Use `0.0.0.0` to bind all interfaces (containers/cloud). - `PLANEXE_MCP_HTTP_PORT`: HTTP server port (default: `8001`). Railway will override with `PORT` env var. -- `PLANEXE_MCP_PUBLIC_BASE_URL`: Public base URL for report/zip download links in `task_file_info` (e.g. `http://192.168.1.40:8001`). When unset, the HTTP server uses the request’s host (scheme + authority), so clients connecting at `http://192.168.1.40:8001/mcp/` get download URLs like `http://192.168.1.40:8001/download/...` instead of localhost. If clients still see localhost in download URLs (e.g. behind a proxy), uncomment and set this in the repo’s `.env.docker-example` or `.env.developer-example` (copy to `.env` and fill in your public URL). 
+- `PLANEXE_MCP_PUBLIC_BASE_URL`: Public base URL for report/zip download links in `task_file_info` (e.g. `http://192.168.1.40:8001`). When set, `download_url` is built from this value. When unset, the HTTP server uses the request’s host (scheme + authority), so clients connecting at `http://192.168.1.40:8001/mcp/` get download URLs like `http://192.168.1.40:8001/download/...` instead of localhost. If clients still see localhost in download URLs (e.g. behind a proxy), set this env var explicitly in `.env`. - `PORT`: Railway-provided port (takes precedence over `PLANEXE_MCP_HTTP_PORT`) - `PLANEXE_MCP_CORS_ORIGINS`: Comma-separated list of allowed origins. When unset, uses `*` (all origins) so browser-based tools like the MCP Inspector can connect. If you set it (e.g. for a specific frontend), include `http://localhost:6274` and `http://127.0.0.1:6274` for the Inspector. - `PLANEXE_MCP_MAX_BODY_BYTES`: Max request size for `POST /mcp/tools/call` (default: `1048576`). @@ -129,17 +130,34 @@ mcp_cloud uses the same database configuration as other PlanExe services: See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `prompt_examples` - Return example prompts. Use these as examples for task_create. +- `model_profiles` - List profile options and currently available models in each profile. - `task_create` - Create a new task (returns task_id as UUID; may require user_api_key for credits) - `task_status` - Get task status and progress - `task_stop` - Stop an active task - `task_file_info` - Get file metadata for report or zip +`task_status` caller contract: +- `pending` / `processing`: keep polling. +- `completed`: terminal success, download is ready. +- `failed`: terminal error. + +Concurrency semantics: +- Each `task_create` call creates a new `task_id`. +- Server does not enforce a global one-task-at-a-time cap per client. +- Client should track task ids explicitly when running tasks in parallel. 
+ +Minimal error contract: +- Tool errors use `{"error":{"code","message","details?"}}`. +- Common codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. +- `task_file_info` may return `{}` while output is not ready (not an error payload). + Note: `task_download` is a synthetic tool provided by `mcp_local`, not by this server. If your client exposes `task_download`, use it to save the report or zip locally; otherwise use `task_file_info` to get `download_url` and fetch the file yourself. -**Tip**: Call `prompt_examples` to get example prompts to use with task_create. The catalog is the same as in the frontends (`worker_plan.worker_plan_api.PromptCatalog`). When running with `PYTHONPATH` set to the repo root (e.g. stdio setup), the catalog is loaded automatically; otherwise built-in examples are returned. +**Tip**: Call `prompt_examples` to get example prompts to use with task_create, then call `model_profiles` to choose `model_profile` based on current runtime availability. The prompt catalog is the same as in the frontends (`worker_plan.worker_plan_api.PromptCatalog`). When running with `PYTHONPATH` set to the repo root (e.g. stdio setup), the catalog is loaded automatically; otherwise built-in examples are returned. Download flow: call `task_file_info` to obtain the `download_url`, then fetch the report via `GET /download/{task_id}/030-report.html` (API key required if configured). +If `download_url` is missing, configure `PLANEXE_MCP_PUBLIC_BASE_URL` so the server can emit a reachable absolute URL. 
## Debugging with the MCP Inspector diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index b78fb4f0..57191e12 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -15,7 +15,7 @@ import tempfile import uuid import zipfile import hashlib from dataclasses import dataclass from datetime import UTC, datetime from pathlib import Path @@ -29,7 +28,18 @@ from mcp.server.stdio import stdio_server from mcp.types import CallToolResult, Tool, TextContent from pydantic import BaseModel -from worker_plan_api.model_profile import normalize_model_profile +from worker_plan_api.model_profile import ( + ModelProfileEnum, + default_filename_for_profile, + normalize_model_profile, + resolve_model_profile_from_env, +) +from worker_plan_api.planexe_config import PlanExeConfig +from worker_plan_api.llm_class_filter import ( + ENV_PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES, + is_llm_class_allowed, + parse_llm_class_whitelist, +) from mcp_cloud.dotenv_utils import load_planexe_dotenv _dotenv_loaded, _dotenv_paths = load_planexe_dotenv(Path(__file__).parent) @@ -52,6 +62,8 @@ from database_api.model_user_api_key import UserApiKey from flask import Flask, has_app_context from mcp_cloud.tool_models import ( + ModelProfilesInput, + ModelProfilesOutput, PromptExamplesInput, PromptExamplesOutput, TaskCreateInput, @@ -111,12 +123,34 @@ def ensure_taskitem_stop_columns() -> None: # Shown in MCP initialize (e.g. Inspector) so clients know what PlanExe does. PLANEXE_SERVER_INSTRUCTIONS = ( - "PlanExe generates rough-draft project plans from a natural-language prompt. " - "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " - "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " - "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. 
" - "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." + "PlanExe generates strategic project-plan drafts from a natural-language prompt. " + "Output is a self-contained interactive HTML report (~700KB) with 20+ sections including " + "executive summary, interactive Gantt charts, risk analysis, SWOT, governance, investor pitch, " + "team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections " + "(premortem, self-audit checklist, premise attacks) that stress-test whether the plan holds up. " + "The output is a draft to refine, not final ground truth — but it surfaces hard questions the prompter may not have considered. " + "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " + "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " + "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " + "Required interaction order: call prompt_examples first. " + "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " + "Then perform a non-tool step: draft a strong prompt as flowing prose (not structured markdown with headers or bullets), " + "typically ~300-800 words, and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose — weave specs, constraints, and targets naturally into sentences. " + "Only after approval, call task_create. " + "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " + "Then poll task_status (about every 5 minutes); use task_file_info when complete. 
" + "To stop, call task_stop with the task_id from task_create; stopping is asynchronous and the task will eventually transition to failed. " + "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " + "Tool errors use {error:{code,message}}. task_file_info returns an empty object {} while the artifact is not ready; check readiness by testing whether download_url is present. " + "task_file_info download_url is the absolute URL where the requested artifact can be downloaded. " + "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " + "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " + "Main output: a self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. " + "The zip contains the intermediary pipeline files (md, json, csv) that fed the report." 
) mcp_cloud = Server("planexe-mcp-cloud", instructions=PLANEXE_SERVER_INSTRUCTIONS) @@ -132,23 +166,13 @@ def ensure_taskitem_stop_columns() -> None: ZIP_CONTENT_TYPE = "application/zip" ZIP_SNAPSHOT_MAX_BYTES = 100_000_000 -SPEED_VS_DETAIL_DEFAULT = "ping_llm" -SPEED_VS_DETAIL_DEFAULT_ALIAS = "ping" +SPEED_VS_DETAIL_DEFAULT = "all_details_but_slow" +SPEED_VS_DETAIL_DEFAULT_ALIAS = "all" SPEED_VS_DETAIL_VALUES = ( "ping_llm", "fast_but_skip_details", "all_details_but_slow", ) -SPEED_VS_DETAIL_INPUT_VALUES = ( - "ping", - "fast", - "all", -) -SpeedVsDetailInput = Literal[ - "ping", - "fast", - "all", -] ModelProfileInput = Literal[ "baseline", "premium", @@ -160,10 +184,21 @@ def ensure_taskitem_stop_columns() -> None: "fast": "fast_but_skip_details", "all": "all_details_but_slow", } +MODEL_PROFILE_TITLES = { + ModelProfileEnum.BASELINE.value: "Baseline", + ModelProfileEnum.PREMIUM.value: "Premium", + ModelProfileEnum.FRONTIER.value: "Frontier", + ModelProfileEnum.CUSTOM.value: "Custom", +} +MODEL_PROFILE_SUMMARIES = { + ModelProfileEnum.BASELINE.value: "Cheap and fast; recommended default when creating a plan.", + ModelProfileEnum.PREMIUM.value: "Higher-cost profile tuned for stronger output quality.", + ModelProfileEnum.FRONTIER.value: "Most capable models first; usually slowest/most expensive.", + ModelProfileEnum.CUSTOM.value: "User-managed profile file for custom model ordering.", +} class TaskCreateRequest(BaseModel): prompt: str - speed_vs_detail: Optional[SpeedVsDetailInput] = None model_profile: Optional[ModelProfileInput] = None user_api_key: Optional[str] = None @@ -177,6 +212,11 @@ class TaskFileInfoRequest(BaseModel): task_id: str artifact: Optional[str] = None + +class ModelProfilesRequest(BaseModel): + """No input parameters.""" + pass + # Helper functions def find_task_by_task_id(task_id: str) -> Optional[TaskItem]: """Find TaskItem by MCP task_id (UUID), with legacy fallback.""" @@ -265,7 +305,7 @@ def _create_task_sync( task = TaskItem( 
prompt=prompt, state=TaskState.pending, - user_id=metadata.get("user_id", "mcp_user") if metadata else "mcp_user", + user_id=metadata.get("user_id", "admin") if metadata else "admin", parameters=parameters, ) db.session.add(task) @@ -310,18 +350,23 @@ def _get_task_status_snapshot_sync(task_id: str) -> Optional[dict[str, Any]]: "timestamp_created": task.timestamp_created, } -def _request_task_stop_sync(task_id: str) -> bool: +def _request_task_stop_sync(task_id: str) -> Optional[dict[str, Any]]: with app.app_context(): task = find_task_by_task_id(task_id) if task is None: - return False + return None + stop_requested = False if task.state in (TaskState.pending, TaskState.processing): task.stop_requested = True task.stop_requested_timestamp = datetime.now(UTC) task.progress_message = "Stop requested by user." db.session.commit() logger.info("Stop requested for task %s; stop flag set on task %s.", task_id, task.id) - return True + stop_requested = True + return { + "state": get_task_state_mapping(task.state), + "stop_requested": stop_requested, + } def _get_task_for_report_sync(task_id: str) -> Optional[dict[str, Any]]: with app.app_context(): @@ -613,14 +658,14 @@ def compute_sha256(content: str | bytes) -> str: return hashlib.sha256(content).hexdigest() def get_task_state_mapping(task_state: TaskState) -> str: - """Map TaskState to MCP run state.""" + """Map TaskState to MCP task state.""" mapping = { - TaskState.pending: "stopped", - TaskState.processing: "running", + TaskState.pending: "pending", + TaskState.processing: "processing", TaskState.completed: "completed", TaskState.failed: "failed", } - return mapping.get(task_state, "stopped") + return mapping.get(task_state, "pending") def resolve_speed_vs_detail(config: Optional[dict[str, Any]]) -> str: value: Optional[str] = None @@ -634,6 +679,37 @@ def resolve_speed_vs_detail(config: Optional[dict[str, Any]]) -> str: return value return SPEED_VS_DETAIL_DEFAULT + +def 
_extract_task_create_metadata_overrides(arguments: dict[str, Any]) -> dict[str, Any]: + """Extract task_create runtime overrides from hidden metadata containers. + + Supported hidden containers: + - arguments.tool_metadata + - arguments.metadata + - arguments._meta + + If a container includes nested namespaces, these are checked first: + - task_create + - planexe_task_create + - planexe + """ + merged: dict[str, Any] = {} + metadata_candidates: list[dict[str, Any]] = [] + + for key in ("tool_metadata", "metadata", "_meta"): + candidate = arguments.get(key) + if isinstance(candidate, dict): + metadata_candidates.append(candidate) + + for candidate in metadata_candidates: + merged.update(candidate) + for nested_key in ("task_create", "planexe_task_create", "planexe"): + nested = candidate.get(nested_key) + if isinstance(nested, dict): + merged.update(nested) + + return merged + def _merge_task_create_config( config: Optional[dict[str, Any]], speed_vs_detail: Optional[str], @@ -650,6 +726,131 @@ def _merge_task_create_config( merged["model_profile"] = candidate_profile return merged or None + +def _sort_llm_config_entries(items: list[tuple[str, Any]]) -> list[tuple[str, Any]]: + def sort_key(item: tuple[str, Any]) -> tuple[int, str]: + key, model_data = item + priority = None + if isinstance(model_data, dict): + maybe_priority = model_data.get("priority") + if isinstance(maybe_priority, int): + priority = maybe_priority + if priority is None: + priority = 999999 + return priority, key + + return sorted(items, key=sort_key) + + +def _extract_model_profile_entries( + model_map: dict[str, Any], + whitelist: Optional[set[str]], +) -> list[dict[str, Any]]: + models: list[dict[str, Any]] = [] + + for model_key, model_data in _sort_llm_config_entries(list(model_map.items())): + class_name = model_data.get("class") if isinstance(model_data, dict) else None + if not is_llm_class_allowed(class_name, whitelist): + continue + + model_name = None + priority = None + if 
isinstance(model_data, dict): + arguments = model_data.get("arguments") + if isinstance(arguments, dict): + maybe_model = arguments.get("model") + if isinstance(maybe_model, str): + model_name = maybe_model + maybe_priority = model_data.get("priority") + if isinstance(maybe_priority, int): + priority = maybe_priority + elif isinstance(model_data.get("prio"), int): + priority = model_data["prio"] + + models.append( + { + "key": model_key, + "provider_class": class_name if isinstance(class_name, str) else None, + "model": model_name, + "priority": priority, + } + ) + + return models + + +def _profile_models_payload( + profile: ModelProfileEnum, + whitelist: Optional[set[str]], +) -> dict[str, Any]: + config_filename = default_filename_for_profile(profile) + planexe_config_path = PlanExeConfig.resolve_planexe_config_path() + config_path = PlanExeConfig.find_file_in_search_order(config_filename, planexe_config_path) + if config_path is None: + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "model_count": 0, + "models": [], + } + + try: + with config_path.open("r", encoding="utf-8") as fh: + model_map = json.load(fh) + except Exception as exc: + logger.warning( + "Unable to read profile config %s for model profile %s: %s", + config_filename, + profile.value, + exc, + ) + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "model_count": 0, + "models": [], + } + + if not isinstance(model_map, dict): + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "model_count": 0, + "models": [], + } + + models = _extract_model_profile_entries(model_map, whitelist) + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + 
"model_count": len(models), + "models": models, + } + + +def _get_model_profiles_sync() -> dict[str, Any]: + raw_whitelist = os.environ.get(ENV_PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES) + whitelist = parse_llm_class_whitelist(raw_whitelist) + default_profile = resolve_model_profile_from_env().value + profiles_all = [ + _profile_models_payload(profile, whitelist) + for profile in ModelProfileEnum + ] + profiles = [profile for profile in profiles_all if int(profile.get("model_count") or 0) > 0] + + return { + "default_profile": default_profile, + "profiles": profiles, + "message": ( + "Use one of these profile values in task_create.model_profile. " + "Model lists show what is currently available in each profile." + ), + } + # Context var set by HTTP server so download URLs use the request's host when # PLANEXE_MCP_PUBLIC_BASE_URL is not set (avoids localhost for remote clients). _download_base_url_ctx: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar( @@ -811,6 +1012,8 @@ def _builtin_mcp_example_prompts() -> list[str]: PROMPT_EXAMPLES_INPUT_SCHEMA = PromptExamplesInput.model_json_schema() PROMPT_EXAMPLES_OUTPUT_SCHEMA = PromptExamplesOutput.model_json_schema() +MODEL_PROFILES_INPUT_SCHEMA = ModelProfilesInput.model_json_schema() +MODEL_PROFILES_OUTPUT_SCHEMA = ModelProfilesOutput.model_json_schema() @dataclass(frozen=True) class ToolDefinition: @@ -823,25 +1026,46 @@ class ToolDefinition: ToolDefinition( name="prompt_examples", description=( - "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " - "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3)." + "Call this first. Returns example prompts that define what a good prompt looks like. " + "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. 
" + "Next is a non-tool step: formulate a detailed prompt (typically ~300-800 words; use examples as a baseline, similar structure) and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose, not structured markdown with headers or bullet lists. " + "Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance preferences inline. " + "The examples demonstrate this prose style — match their tone and density. " + "Then call task_create. " + "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, output_schema=PROMPT_EXAMPLES_OUTPUT_SCHEMA, ), + ToolDefinition( + name="model_profiles", + description=( + "Optional helper before task_create. Returns model_profile options with plain-language guidance " + "and currently available models in each profile. " + "If no models are available, returns error code MODEL_PROFILES_UNAVAILABLE." + ), + input_schema=MODEL_PROFILES_INPUT_SCHEMA, + output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, + ), ToolDefinition( name="task_create", description=( - "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " - "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " - "Returns task_id (UUID); use it for task_status, task_stop, and task_download. " + "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " + "PlanExe turns the approved prompt into a strategic project-plan draft (20+ sections) in ~10-20 min. 
" + "Sections include: executive summary, interactive Gantt charts, investor pitch, project plan with SMART criteria, " + "strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, " + "SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, " + "plan review (critical issues, KPIs, financial strategy, automation opportunities), Q&A, " + "premortem with failure scenarios, self-audit checklist, and adversarial premise attacks that argue against the project. " + "The adversarial sections (premortem, self-audit, premise attacks) surface risks and questions the prompter may not have considered. " + "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " + "Save task_id immediately: there is no task_list tool, so a lost task_id cannot be recovered. " + "Each task_create call creates a new task_id (no server-side dedup). " + "If you are unsure which model_profile to choose, call model_profiles first. " "If your deployment uses credits, include user_api_key to charge the correct account. " - "speed_vs_detail modes: " - "'all' runs the full pipeline with all details (slower, higher token usage/cost). " - "'fast' runs the full pipeline with minimal work per step (faster, fewer details), " - "useful to verify the pipeline is working. " - "'ping' runs the pipeline entrypoint and makes a single LLM call to verify the " - "worker_plan_database is processing tasks and can reach the LLM." + "Common error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, @@ -850,8 +1074,15 @@ class ToolDefinition: name="task_status", description=( "Returns status and progress of the plan currently being created. " - "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " - "and frequent polling is unnecessary." 
+ "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " + "(baseline profile) and may take longer on higher-quality profiles. " + "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "progress_percentage is 0-100 (integer-like float); 100 when completed. " + "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " + "Unknown task_id returns error code TASK_NOT_FOUND. " + "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " + "processing with no file-output changes for >20 minutes likely means failed/stalled. " + "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." ), input_schema=TASK_STATUS_INPUT_SCHEMA, output_schema=TASK_STATUS_OUTPUT_SCHEMA, @@ -860,7 +1091,10 @@ class ToolDefinition: name="task_stop", description=( "Request the plan generation to stop. Pass the task_id (the UUID returned by task_create). " - "This is a normal MCP tool call: call task_stop with that task_id." + "Stopping is asynchronous: the stop flag is set immediately but the task may continue briefly before halting. " + "A stopped task will eventually transition to the failed state. " + "If the task is already completed or failed, stop_requested returns false (the task already finished). " + "Unknown task_id returns error code TASK_NOT_FOUND." ), input_schema=TASK_STOP_INPUT_SCHEMA, output_schema=TASK_STOP_OUTPUT_SCHEMA, @@ -868,9 +1102,16 @@ class ToolDefinition: ToolDefinition( name="task_file_info", description=( - "Returns file metadata (content_type, download_url, download_size) for the report or zip. " - "If your client exposes task_download (e.g. mcp_local), use that to save the file locally; " - "otherwise use this tool to get download_url and fetch the file yourself." 
+ "Returns file metadata (content_type, download_url, download_size) for the report or zip artifact. " + "Use artifact='report' (default) for the interactive HTML report (~700KB, self-contained with embedded JS " + "for collapsible sections and interactive Gantt charts — open in a browser). " + "Use artifact='zip' for the full pipeline output bundle (md, json, csv intermediary files that fed the report). " + "While the task is still pending or processing, returns an empty object {} (no fields). " + "Check readiness by testing whether download_url is present in the response. " + "Once ready, present download_url to the user or fetch and save the file locally. " + "If your client exposes task_download (e.g. mcp_local), prefer that to save the file locally. " + "Terminal error codes: generation_failed (plan failed), content_unavailable (artifact missing). " + "Unknown task_id returns error code TASK_NOT_FOUND." ), input_schema=TASK_FILE_INFO_INPUT_SCHEMA, output_schema=TASK_FILE_INFO_OUTPUT_SCHEMA, @@ -917,12 +1158,12 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: Examples: - {"prompt": "Start a dental clinic in Copenhagen with 3 treatment rooms, targeting families and children. Budget 2.5M DKK. Open within 12 months."} → returns task_id (UUID) + created_at - - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "speed_vs_detail": "fast"} → faster run + - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "metadata": {"task_create": {"speed_vs_detail": "fast"}}} → faster run Args: - prompt: What the plan should cover (goal, context, constraints). - - speed_vs_detail: Optional mode ("ping" | "fast" | "all"). - - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). 
+ - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). Call model_profiles to inspect options. + - speed_vs_detail: Optional hidden runtime override via tool-specific metadata. Returns: - content: JSON string matching structuredContent. @@ -930,8 +1171,26 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: - isError: False on success. """ req = TaskCreateRequest(**arguments) - - merged_config = _merge_task_create_config(None, req.speed_vs_detail, req.model_profile) + metadata_overrides = _extract_task_create_metadata_overrides(arguments) + metadata_model_profile = metadata_overrides.get("model_profile") + model_profile = req.model_profile + if model_profile is None and isinstance(metadata_model_profile, str): + model_profile = metadata_model_profile + + speed_vs_detail = metadata_overrides.get("speed_vs_detail") + if not isinstance(speed_vs_detail, str): + speed_alias = metadata_overrides.get("speed") + if isinstance(speed_alias, str): + speed_vs_detail = speed_alias + else: + # Backward-compatible hidden override when callers still send legacy top-level args. 
+ legacy_speed = arguments.get("speed_vs_detail") + if isinstance(legacy_speed, str): + speed_vs_detail = legacy_speed + elif isinstance(arguments.get("speed"), str): + speed_vs_detail = arguments.get("speed") + + merged_config = _merge_task_create_config(None, speed_vs_detail, model_profile) require_user_key = os.environ.get("PLANEXE_MCP_REQUIRE_USER_KEY", "false").lower() in ("1", "true", "yes", "on") user_context = None if req.user_api_key: @@ -952,12 +1211,12 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: ) if user_context and float(user_context.get("credits_balance", 0.0)) <= 0.0: - response = {"error": {"code": "INSUFFICIENT_CREDITS", "message": "Not enough credits."}} - return CallToolResult( - content=[TextContent(type="text", text=json.dumps(response))], - structuredContent=response, - isError=True, - ) + response = {"error": {"code": "INSUFFICIENT_CREDITS", "message": "Not enough credits."}} + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(response))], + structuredContent=response, + isError=True, + ) response = await asyncio.to_thread( _create_task_sync, @@ -978,8 +1237,14 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: payload = { "samples": samples, "message": ( - "Step 1 done. Next: Step 2 — Formulate a good prompt using these as a baseline (similar structure). Get user approval. " - "Step 3 — Only then call task_create with the approved prompt. Do not call task_create yet." + "Next: complete the non-tool step by drafting a detailed prompt (typically ~300-800 words) using these as a baseline (similar structure), then get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose, not structured markdown with headers or bullet lists. " + "Weave technical specs, constraints, and targets naturally into sentences. 
Include banned words/approaches and governance preferences inline. " + "The examples demonstrate this prose style — match their tone and density. " + "Only after approval, call task_create. " + "Do not use PlanExe for tiny one-shot requests (e.g., rewrite this email, summarize this document). " + "PlanExe always runs the full fixed planning pipeline; callers cannot run only selected internal steps." ), } return CallToolResult( @@ -989,8 +1254,35 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: ) +async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: + """Return model profile options and currently available models in each profile.""" + _ = ModelProfilesRequest(**(arguments or {})) + payload = await asyncio.to_thread(_get_model_profiles_sync) + profiles = payload.get("profiles") + if not isinstance(profiles, list) or len(profiles) == 0: + response = { + "error": { + "code": "MODEL_PROFILES_UNAVAILABLE", + "message": ( + "No models are currently configured. " + "Inform the user that the server administrator needs to set up model profiles before plans can be created." + ), + } + } + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(response))], + structuredContent=response, + isError=True, + ) + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(payload))], + structuredContent=payload, + isError=False, + ) + + async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: - """Fetch the current run status, progress, and recent files for a task. + """Fetch the current task status, progress, and recent files for a task. 
Examples: - {"task_id": "uuid"} → state/progress/timing + recent files @@ -1024,8 +1316,6 @@ async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: task_state = task_snapshot["state"] state = get_task_state_mapping(task_state) - if task_state == TaskState.processing and task_snapshot["stop_requested"]: - state = "stopping" if task_state == TaskState.completed: progress_percentage = 100.0 @@ -1073,7 +1363,7 @@ async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: ) async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: - """Request the active run for a task to stop. + """Request an active task to stop. Examples: - {"task_id": "uuid"} → stop request accepted @@ -1083,14 +1373,14 @@ async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: Returns: - content: JSON string matching structuredContent. - - structuredContent: {"state": "stopped"} or error payload. + - structuredContent: {"state": "pending|processing|completed|failed", "stop_requested": bool} or error payload. - isError: True only when task_id is unknown. 
""" req = TaskStopRequest(**arguments) task_id = req.task_id - found = await asyncio.to_thread(_request_task_stop_sync, task_id) - if not found: + stop_result = await asyncio.to_thread(_request_task_stop_sync, task_id) + if stop_result is None: response = { "error": { "code": "TASK_NOT_FOUND", @@ -1103,9 +1393,7 @@ async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: isError=True, ) - response = { - "state": "stopped", - } + response = stop_result return CallToolResult( content=[TextContent(type="text", text=json.dumps(response))], @@ -1240,6 +1528,7 @@ async def handle_task_file_info(arguments: dict[str, Any]) -> CallToolResult: "task_stop": handle_task_stop, "task_file_info": handle_task_file_info, "prompt_examples": handle_prompt_examples, + "model_profiles": handle_model_profiles, } async def main(): diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index c62b9086..5087bb7d 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -25,6 +25,7 @@ from mcp_cloud.http_utils import strip_redundant_content from mcp_cloud.tool_models import ( + ModelProfilesOutput, TaskCreateOutput, TaskFileInfoOutput, TaskStatusOutput, @@ -46,6 +47,7 @@ ) from mcp_cloud.app import ( + PLANEXE_SERVER_INSTRUCTIONS, REPORT_CONTENT_TYPE, REPORT_FILENAME, TOOL_DEFINITIONS, @@ -55,6 +57,7 @@ fetch_artifact_from_worker_plan, fetch_user_downloadable_zip, handle_task_create, + handle_model_profiles, handle_task_status, handle_task_stop, handle_task_file_info, @@ -236,6 +239,7 @@ async def _enforce_body_size(request: Request) -> Optional[JSONResponse]: class MCPToolCallRequest(BaseModel): tool: str arguments: dict[str, Any] + metadata: Optional[dict[str, Any]] = None class MCPToolCallResponse(BaseModel): @@ -315,29 +319,21 @@ def _normalize_tool_result(result: Any) -> tuple[list[dict[str, Any]], Optional[ return content, error -SpeedVsDetailInput = Literal["ping", "fast", "all"] ModelProfileInput = Literal["baseline", "premium", "frontier", 
"custom"] ResultArtifactInput = Literal["report", "zip"] async def task_create( prompt: str, - speed_vs_detail: Annotated[ - SpeedVsDetailInput, - Field( - description="Defaults to ping (alias for ping_llm). Options: ping, fast, all.", - ), - ] = "ping", model_profile: Annotated[ ModelProfileInput, - Field(description="LLM profile: baseline, premium, frontier, custom."), + Field(description="Model profile: baseline, premium, frontier, custom. Call model_profiles to inspect options."), ] = "baseline", ) -> Annotated[CallToolResult, TaskCreateOutput]: """Create a new PlanExe task. Use prompt_examples first for example prompts.""" authenticated_user_api_key = _get_authenticated_user_api_key() arguments: dict[str, Any] = { "prompt": prompt, - "speed_vs_detail": speed_vs_detail, "model_profile": model_profile, } if authenticated_user_api_key: @@ -374,6 +370,11 @@ async def prompt_examples() -> CallToolResult: return await handle_prompt_examples({}) +async def model_profiles() -> Annotated[CallToolResult, ModelProfilesOutput]: + """Return model_profile options with currently available models.""" + return await handle_model_profiles({}) + + def _register_tools(server: FastMCP) -> None: handler_map = { "task_create": task_create, @@ -381,6 +382,7 @@ def _register_tools(server: FastMCP) -> None: "task_stop": task_stop, "task_file_info": task_file_info, "prompt_examples": prompt_examples, + "model_profiles": model_profiles, } for tool in TOOL_DEFINITIONS: handler = handler_map.get(tool.name) @@ -395,14 +397,7 @@ def _register_tools(server: FastMCP) -> None: fastmcp_server = FastMCP( name="planexe-mcp-server", - instructions=( - "PlanExe generates rough-draft project plans from a natural-language prompt. " - "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " - "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " - "Step 3 — Only then call task_create with the approved prompt. 
" - "Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. " - "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." - ), + instructions=PLANEXE_SERVER_INSTRUCTIONS, host=HTTP_HOST, port=HTTP_PORT, streamable_http_path="/", @@ -437,7 +432,7 @@ async def _lifespan(app: FastAPI): app = FastAPI( title="PlanExe – AI Project Planning", - description="MCP server that generates rough-draft project plans from a natural-language prompt", + description="MCP server that generates strategic project-plan drafts from a natural-language prompt", version="1.0.0", lifespan=_lifespan, ) @@ -553,13 +548,36 @@ async def call_tool( Call an MCP tool by name with arguments. This endpoint wraps the stdio-based MCP tool handlers for HTTP access. - Download URLs use the request host when PLANEXE_MCP_PUBLIC_BASE_URL is not set (set in middleware). """ arguments = dict(payload.arguments or {}) if payload.tool == "task_create": authenticated_user_api_key = _get_authenticated_user_api_key() if authenticated_user_api_key and not arguments.get("user_api_key"): arguments["user_api_key"] = authenticated_user_api_key + if isinstance(payload.metadata, dict): + arguments["metadata"] = dict(payload.metadata) + + # Backward compatibility: move legacy speed args into hidden metadata. 
+ legacy_speed_vs_detail = arguments.pop("speed_vs_detail", None) + legacy_speed = arguments.pop("speed", None) + if isinstance(legacy_speed_vs_detail, str) or isinstance(legacy_speed, str): + metadata = arguments.get("metadata") + if not isinstance(metadata, dict): + metadata = {} + arguments["metadata"] = metadata + task_create_metadata = metadata.get("task_create") + if not isinstance(task_create_metadata, dict): + task_create_metadata = {} + metadata["task_create"] = task_create_metadata + if isinstance(legacy_speed_vs_detail, str): + task_create_metadata.setdefault("speed_vs_detail", legacy_speed_vs_detail) + if isinstance(legacy_speed, str): + task_create_metadata.setdefault("speed", legacy_speed) + + result = await handle_task_create(arguments) + content, error = _normalize_tool_result(result) + return MCPToolCallResponse(content=content, error=error) + return await call_tool_via_registry(fastmcp_server, payload.tool, arguments) diff --git a/mcp_cloud/server.json b/mcp_cloud/server.json index e98a533f..a4e18fb2 100644 --- a/mcp_cloud/server.json +++ b/mcp_cloud/server.json @@ -2,7 +2,7 @@ "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json", "name": "io.github.PlanExeOrg/planexe", "title": "PlanExe", - "description": "MCP server for generating rough-draft project plans from natural-language prompts.", + "description": "MCP server for generating strategic project-plan drafts (20+ sections including adversarial analysis) from natural-language prompts.", "repository": { "url": "https://github.com/PlanExeOrg/PlanExe", "source": "github" diff --git a/mcp_cloud/tests/test_model_profiles_tool.py b/mcp_cloud/tests/test_model_profiles_tool.py new file mode 100644 index 00000000..340e16be --- /dev/null +++ b/mcp_cloud/tests/test_model_profiles_tool.py @@ -0,0 +1,59 @@ +import asyncio +import unittest +from unittest.mock import patch + +from mcp_cloud.app import handle_list_tools, handle_model_profiles + + +class 
TestModelProfilesTool(unittest.TestCase): + def test_model_profiles_tool_listed(self): + tools = asyncio.run(handle_list_tools()) + tool_names = {tool.name for tool in tools} + self.assertIn("model_profiles", tool_names) + + def test_model_profiles_returns_structured_content(self): + payload = { + "default_profile": "baseline", + "profiles": [ + { + "profile": "baseline", + "title": "Baseline", + "summary": "Cheap and fast; recommended default when creating a plan.", + "model_count": 1, + "models": [ + { + "key": "openrouter-gpt-oss-20b", + "provider_class": "OpenRouter", + "model": "openai/gpt-oss-20b", + "priority": 0, + } + ], + } + ], + "message": "Use one of these profile values in task_create.model_profile.", + } + + with patch("mcp_cloud.app._get_model_profiles_sync", return_value=payload): + result = asyncio.run(handle_model_profiles({})) + + self.assertFalse(result.isError) + self.assertEqual(result.structuredContent["default_profile"], "baseline") + self.assertEqual(result.structuredContent["profiles"][0]["profile"], "baseline") + self.assertNotIn("available", result.structuredContent["profiles"][0]) + + def test_model_profiles_returns_error_when_none_available(self): + payload = { + "default_profile": "baseline", + "profiles": [], + "message": "Use one of these profile values in task_create.model_profile.", + } + + with patch("mcp_cloud.app._get_model_profiles_sync", return_value=payload): + result = asyncio.run(handle_model_profiles({})) + + self.assertTrue(result.isError) + self.assertEqual(result.structuredContent["error"]["code"], "MODEL_PROFILES_UNAVAILABLE") + + +if __name__ == "__main__": + unittest.main() diff --git a/mcp_cloud/tests/test_speed_vs_detail.py b/mcp_cloud/tests/test_speed_vs_detail.py index 4f911913..04393b36 100644 --- a/mcp_cloud/tests/test_speed_vs_detail.py +++ b/mcp_cloud/tests/test_speed_vs_detail.py @@ -5,6 +5,7 @@ from mcp_cloud.app import ( SPEED_VS_DETAIL_DEFAULT, TaskCreateRequest, + 
_extract_task_create_metadata_overrides, _merge_task_create_config, resolve_speed_vs_detail, ) @@ -40,15 +41,6 @@ def test_merge_task_create_config_ignores_blank(self): class TestTaskCreateRequest(unittest.TestCase): - def test_speed_vs_detail_accepts_enum(self): - for value in ("ping", "fast", "all"): - req = TaskCreateRequest(prompt="demo", speed_vs_detail=value) - self.assertEqual(req.speed_vs_detail, value) - - def test_speed_vs_detail_rejects_invalid(self): - with self.assertRaises(ValidationError): - TaskCreateRequest(prompt="demo", speed_vs_detail="slow") - def test_model_profile_accepts_enum(self): for value in ("baseline", "premium", "frontier", "custom"): req = TaskCreateRequest(prompt="demo", model_profile=value) @@ -59,5 +51,31 @@ def test_model_profile_rejects_invalid(self): TaskCreateRequest(prompt="demo", model_profile="enterprise") +class TestTaskCreateMetadataOverrides(unittest.TestCase): + def test_extracts_nested_task_create_metadata(self): + overrides = _extract_task_create_metadata_overrides( + {"metadata": {"task_create": {"speed_vs_detail": "fast"}}} + ) + self.assertEqual(overrides.get("speed_vs_detail"), "fast") + + def test_extracts_top_level_metadata(self): + overrides = _extract_task_create_metadata_overrides( + {"_meta": {"speed": "all", "model_profile": "premium"}} + ) + self.assertEqual(overrides.get("speed"), "all") + self.assertEqual(overrides.get("model_profile"), "premium") + + def test_nested_namespace_overrides_top_level(self): + overrides = _extract_task_create_metadata_overrides( + { + "metadata": { + "speed_vs_detail": "fast", + "task_create": {"speed_vs_detail": "ping"}, + } + } + ) + self.assertEqual(overrides.get("speed_vs_detail"), "ping") + + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tests/test_task_create_tool.py b/mcp_cloud/tests/test_task_create_tool.py index 54274d91..8387d507 100644 --- a/mcp_cloud/tests/test_task_create_tool.py +++ b/mcp_cloud/tests/test_task_create_tool.py @@ -6,10 +6,18 @@ 
from unittest.mock import MagicMock, patch from mcp.types import CallToolResult -from mcp_cloud.app import handle_task_create +from mcp_cloud.app import handle_list_tools, handle_task_create class TestTaskCreateTool(unittest.TestCase): + def test_task_create_visible_schema_hides_speed_and_exposes_model_profile(self): + tools = asyncio.run(handle_list_tools()) + task_create_tool = next(tool for tool in tools if tool.name == "task_create") + properties = task_create_tool.inputSchema.get("properties", {}) + self.assertIn("prompt", properties) + self.assertIn("model_profile", properties) + self.assertNotIn("speed_vs_detail", properties) + def test_task_create_returns_structured_content(self): arguments = {"prompt": "xcv", "config": None, "metadata": None} fake_session = MagicMock() @@ -35,6 +43,22 @@ def __init__(self, prompt: str, state, user_id: str, parameters): self.assertIn("created_at", result.structuredContent) self.assertIsInstance(uuid.UUID(result.structuredContent["task_id"]), uuid.UUID) + def test_task_create_uses_hidden_metadata_speed_override(self): + fake_response = {"task_id": str(uuid.uuid4()), "created_at": "2026-01-01T00:00:00Z"} + with patch("mcp_cloud.app._create_task_sync", return_value=fake_response) as create_task_sync: + result = asyncio.run( + handle_task_create( + { + "prompt": "demo", + "metadata": {"task_create": {"speed_vs_detail": "ping"}}, + } + ) + ) + + self.assertFalse(result.isError) + _, merged_config, _ = create_task_sync.call_args.args + self.assertEqual(merged_config, {"speed_vs_detail": "ping"}) + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tests/test_task_file_info_tool.py b/mcp_cloud/tests/test_task_file_info_tool.py index 1abd2d17..016656e8 100644 --- a/mcp_cloud/tests/test_task_file_info_tool.py +++ b/mcp_cloud/tests/test_task_file_info_tool.py @@ -97,6 +97,32 @@ def test_report_read_zip_for_failed_task(self): self.assertEqual(payload["download_size"], len(content_bytes)) 
self.assertEqual(payload["content_type"], ZIP_CONTENT_TYPE) + def test_task_file_info_returns_empty_object_when_pending(self): + task_id = str(uuid.uuid4()) + task_snapshot = { + "id": "task-id", + "state": TaskState.pending, + "progress_message": None, + } + with patch("mcp_cloud.app._get_task_for_report_sync", return_value=task_snapshot): + result = asyncio.run(handle_task_file_info({"task_id": task_id})) + + self.assertFalse(result.isError) + self.assertEqual(result.structuredContent, {}) + + def test_task_file_info_returns_generation_failed_payload(self): + task_id = str(uuid.uuid4()) + task_snapshot = { + "id": "task-id", + "state": TaskState.failed, + "progress_message": "Pipeline failed", + } + with patch("mcp_cloud.app._get_task_for_report_sync", return_value=task_snapshot): + result = asyncio.run(handle_task_file_info({"task_id": task_id, "artifact": "report"})) + + self.assertFalse(result.isError) + self.assertEqual(result.structuredContent["error"]["code"], "generation_failed") + def test_sanitize_legacy_zip_snapshot_removes_track_activity_jsonl(self): buffer = BytesIO() with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zip_file: diff --git a/mcp_cloud/tests/test_task_status_tool.py b/mcp_cloud/tests/test_task_status_tool.py index 1a765e5c..d48309ca 100644 --- a/mcp_cloud/tests/test_task_status_tool.py +++ b/mcp_cloud/tests/test_task_status_tool.py @@ -63,6 +63,34 @@ def test_task_status_falls_back_to_zip_snapshot_files_when_primary_source_empty( self.assertEqual(len(files), 1) self.assertEqual(files[0]["path"], "001-2-plan.txt") + def test_task_status_uses_processing_state_name(self): + task_id = str(uuid.uuid4()) + task_snapshot = { + "id": task_id, + "state": TaskState.processing, + "stop_requested": True, + "progress_percentage": 10.0, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.app._get_task_status_snapshot_sync", + return_value=task_snapshot, + ), patch( + "mcp_cloud.app.fetch_file_list_from_worker_plan", + 
new=AsyncMock(return_value=[]), + ): + result = asyncio.run(handle_task_status({"task_id": task_id})) + + self.assertEqual(result.structuredContent["state"], "processing") + + def test_task_status_returns_task_not_found_error(self): + task_id = str(uuid.uuid4()) + with patch("mcp_cloud.app._get_task_status_snapshot_sync", return_value=None): + result = asyncio.run(handle_task_status({"task_id": task_id})) + + self.assertTrue(result.isError) + self.assertEqual(result.structuredContent["error"]["code"], "TASK_NOT_FOUND") + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py new file mode 100644 index 00000000..f4c1c698 --- /dev/null +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -0,0 +1,147 @@ +import unittest + +import mcp_cloud.app as cloud_app +import mcp_local.planexe_mcp_local as local_app + + +def _tool_desc(tool_defs, name: str) -> str: + for definition in tool_defs: + if definition.name == name: + return definition.description + raise AssertionError(f"Tool not found: {name}") + + +def _tool_def(tool_defs, name: str): + for definition in tool_defs: + if definition.name == name: + return definition + raise AssertionError(f"Tool not found: {name}") + + +class TestAllToolsHaveOutputSchema(unittest.TestCase): + """Every tool must declare an output_schema so callers know the response shape.""" + + def test_cloud_all_tools_have_output_schema(self): + for definition in cloud_app.TOOL_DEFINITIONS: + with self.subTest(tool=definition.name): + self.assertIsNotNone( + definition.output_schema, + f"Cloud tool {definition.name!r} is missing output_schema", + ) + + def test_local_all_tools_have_output_schema(self): + for definition in local_app.TOOL_DEFINITIONS: + with self.subTest(tool=definition.name): + self.assertIsNotNone( + definition.output_schema, + f"Local tool {definition.name!r} is missing output_schema", + ) + + +class 
TestTaskCreateInputSchemaHasUserApiKey(unittest.TestCase): + """user_api_key must be in the visible task_create input schema.""" + + def test_cloud_task_create_schema_has_user_api_key(self): + props = cloud_app.TASK_CREATE_INPUT_SCHEMA.get("properties", {}) + self.assertIn("user_api_key", props) + + def test_local_task_create_schema_has_user_api_key(self): + props = local_app.TASK_CREATE_INPUT_SCHEMA.get("properties", {}) + self.assertIn("user_api_key", props) + + +class TestCloudToolSurfaceConsistency(unittest.TestCase): + def test_cloud_exposes_model_profiles_tool(self): + cloud_tool_names = {definition.name for definition in cloud_app.TOOL_DEFINITIONS} + self.assertIn("model_profiles", cloud_tool_names) + + def test_cloud_exposes_task_file_info_not_task_download(self): + cloud_tool_names = {definition.name for definition in cloud_app.TOOL_DEFINITIONS} + self.assertIn("task_file_info", cloud_tool_names) + self.assertNotIn("task_download", cloud_tool_names) + + def test_cloud_instructions_reference_cloud_download_tool(self): + self.assertIn("task_file_info", cloud_app.PLANEXE_SERVER_INSTRUCTIONS) + self.assertNotIn("task_download", cloud_app.PLANEXE_SERVER_INSTRUCTIONS) + + def test_cloud_task_create_description_references_cloud_download_tool(self): + description = _tool_desc(cloud_app.TOOL_DEFINITIONS, "task_create") + self.assertIn("task_file_info", description) + self.assertNotIn("task_download", description) + + def test_cloud_instructions_include_task_status_state_contract(self): + instructions = cloud_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("pending/processing", instructions) + self.assertIn("completed", instructions) + self.assertIn("failed", instructions) + self.assertNotIn("running/stopping", instructions) + self.assertIn("pending for longer than 5 minutes", instructions) + self.assertIn("longer than 20 minutes", instructions) + self.assertIn("PlanExeOrg/PlanExe/issues", instructions) + + def 
test_cloud_task_status_description_includes_state_contract(self): + description = _tool_desc(cloud_app.TOOL_DEFINITIONS, "task_status") + self.assertIn("pending/processing", description) + self.assertIn("completed", description) + self.assertIn("failed", description) + self.assertNotIn("running/stopping", description) + self.assertIn("pending for >5 minutes", description) + self.assertIn(">20 minutes", description) + self.assertIn("PlanExeOrg/PlanExe/issues", description) + + def test_cloud_instructions_include_model_profiles_unavailable_guidance(self): + instructions = cloud_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("MODEL_PROFILES_UNAVAILABLE", instructions) + + def test_cloud_prompt_schema_includes_prompt_shape_guidance(self): + prompt_schema = cloud_app.TASK_CREATE_INPUT_SCHEMA["properties"]["prompt"]["description"] + self.assertIn("300-800 words", prompt_schema) + self.assertIn("objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria", prompt_schema) + + +class TestLocalToolSurfaceConsistency(unittest.TestCase): + def test_local_exposes_model_profiles_tool(self): + local_tool_names = {definition.name for definition in local_app.TOOL_DEFINITIONS} + self.assertIn("model_profiles", local_tool_names) + + def test_local_exposes_task_download_not_task_file_info(self): + local_tool_names = {definition.name for definition in local_app.TOOL_DEFINITIONS} + self.assertIn("task_download", local_tool_names) + self.assertNotIn("task_file_info", local_tool_names) + + def test_local_instructions_reference_local_download_tool(self): + self.assertIn("task_download", local_app.PLANEXE_SERVER_INSTRUCTIONS) + self.assertNotIn("task_file_info", local_app.PLANEXE_SERVER_INSTRUCTIONS) + + def test_local_task_create_description_references_local_download_tool(self): + description = _tool_desc(local_app.TOOL_DEFINITIONS, "task_create") + self.assertIn("task_download", description) + self.assertNotIn("task_file_info", description) + + def 
test_local_instructions_include_task_status_state_contract(self): + instructions = local_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("pending/processing", instructions) + self.assertIn("completed", instructions) + self.assertIn("failed", instructions) + self.assertNotIn("running/stopping", instructions) + self.assertIn("pending for longer than 5 minutes", instructions) + self.assertIn("longer than 20 minutes", instructions) + self.assertIn("PlanExeOrg/PlanExe/issues", instructions) + + def test_local_task_status_description_includes_state_contract(self): + description = _tool_desc(local_app.TOOL_DEFINITIONS, "task_status") + self.assertIn("pending/processing", description) + self.assertIn("completed", description) + self.assertIn("failed", description) + self.assertNotIn("running/stopping", description) + self.assertIn("pending for >5 minutes", description) + self.assertIn(">20 minutes", description) + self.assertIn("PlanExeOrg/PlanExe/issues", description) + + def test_local_instructions_include_model_profiles_unavailable_guidance(self): + instructions = local_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("MODEL_PROFILES_UNAVAILABLE", instructions) + + +if __name__ == "__main__": + unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index ea5289f0..fd267876 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Any, Literal from pydantic import BaseModel, Field @@ -6,6 +6,7 @@ class ErrorDetail(BaseModel): code: str message: str + details: dict[str, Any] | None = None class PromptExamplesOutput(BaseModel): @@ -13,7 +14,9 @@ class PromptExamplesOutput(BaseModel): ..., description=( "Example prompts that define the baseline for what a good prompt looks like. " - "Take inspiration from these when writing your own prompt for task_create." + "Take inspiration from these when writing your own prompt for task_create " + "(typically ~300-800 words). 
Good prompt shape: objective, scope, constraints, " + "timeline, stakeholders, budget/resources, and success criteria." ), ) message: str @@ -24,6 +27,50 @@ class PromptExamplesInput(BaseModel): pass +class ModelProfilesInput(BaseModel): + """No input parameters.""" + pass + + +class ModelProfileModelEntry(BaseModel): + key: str = Field(..., description="Model key from llm_config/.json.") + provider_class: str | None = Field( + default=None, + description="Provider class (for example OpenRouter, OpenAI, Ollama).", + ) + model: str | None = Field(default=None, description="Provider model identifier when present.") + priority: int | None = Field( + default=None, + description="Priority from config (lower number means earlier in selection order).", + ) + + +class ModelProfileInfo(BaseModel): + profile: Literal["baseline", "premium", "frontier", "custom"] = Field( + ..., + description="Model profile value accepted by task_create.model_profile.", + ) + title: str = Field(..., description="Human-friendly profile label.") + summary: str = Field(..., description="Short profile guidance for callers.") + model_count: int = Field(..., description="Number of models currently available in this profile.") + models: list[ModelProfileModelEntry] = Field( + ..., + description="Models currently available to this profile.", + ) + + +class ModelProfilesOutput(BaseModel): + default_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( + ..., + description="Default model profile used when task_create.model_profile is omitted/invalid.", + ) + profiles: list[ModelProfileInfo] = Field( + ..., + description="Available profile options and their model inventory.", + ) + message: str = Field(..., description="Caller guidance for selecting task_create.model_profile.") + + class TaskStatusInput(BaseModel): task_id: str = Field( ..., @@ -52,7 +99,7 @@ class TaskFileInfoInput(BaseModel): class TaskCreateOutput(BaseModel): task_id: str = Field( ..., - description="Task UUID 
returned by task_create. Stable across task_status/task_stop/task_download." + description="Task UUID returned by task_create. Stable across task_status/task_stop/task_file_info." ) created_at: str @@ -72,10 +119,26 @@ class TaskStatusSuccess(BaseModel): ..., description="Task UUID returned by task_create." ) - state: Literal["stopped", "running", "completed", "failed", "stopping"] - progress_percentage: float + state: Literal["pending", "processing", "completed", "failed"] = Field( + ..., + description=( + "Caller contract: pending/processing => keep polling; " + "completed => download is ready; failed => terminal error." + ), + ) + progress_percentage: float = Field( + ..., + description="Completion progress from 0 to 100. Monotonically increasing; 100 when state is completed.", + ) timing: TaskStatusTiming - files: list[TaskStatusFile] + files: list[TaskStatusFile] = Field( + ..., + description=( + "Intermediate output files produced so far. " + "Use updated_at timestamps to detect stalls. " + "These files are included in the zip artifact when the task completes." + ), + ) class TaskStatusOutput(BaseModel): @@ -83,30 +146,59 @@ class TaskStatusOutput(BaseModel): default=None, description="Task UUID returned by task_create." ) - state: Literal["stopped", "running", "completed", "failed", "stopping"] | None = None - progress_percentage: float | None = None + state: Literal["pending", "processing", "completed", "failed"] | None = Field( + default=None, + description=( + "Caller contract: pending/processing => keep polling; " + "completed => download is ready; failed => terminal error." + ), + ) + progress_percentage: float | None = Field( + default=None, + description="Completion progress from 0 to 100. 
Monotonically increasing; 100 when state is completed.", + ) timing: TaskStatusTiming | None = None - files: list[TaskStatusFile] | None = None + files: list[TaskStatusFile] | None = Field( + default=None, + description=( + "Intermediate output files produced so far. " + "Use updated_at timestamps to detect stalls. " + "These files are included in the zip artifact when the task completes." + ), + ) error: ErrorDetail | None = None class TaskStopOutput(BaseModel): - state: Literal["stopped"] | None = None + state: Literal["pending", "processing", "completed", "failed"] | None = Field( + default=None, + description="Current task state after stop request.", + ) + stop_requested: bool | None = Field( + default=None, + description="True when stop request flag was set for a pending/processing task.", + ) error: ErrorDetail | None = None class TaskFileInfoReadyOutput(BaseModel): - content_type: str - sha256: str - download_size: int - download_url: str | None = None + content_type: str = Field(..., description="Artifact content type.") + sha256: str = Field(..., description="SHA-256 hash of artifact bytes.") + download_size: int = Field(..., description="Artifact size in bytes.") + download_url: str | None = Field( + default=None, + description="Absolute URL where the requested artifact can be downloaded.", + ) class TaskFileInfoOutput(BaseModel): - content_type: str | None = None - sha256: str | None = None - download_size: int | None = None - download_url: str | None = None + content_type: str | None = Field(default=None, description="Artifact content type.") + sha256: str | None = Field(default=None, description="SHA-256 hash of artifact bytes.") + download_size: int | None = Field(default=None, description="Artifact size in bytes.") + download_url: str | None = Field( + default=None, + description="Absolute URL where the requested artifact can be downloaded.", + ) error: ErrorDetail | None = None @@ -116,16 +208,21 @@ class TaskCreateInput(BaseModel): description=( 
"What the plan should cover (goal, context, constraints). " "Use prompt_examples to get example prompts; use these as examples for task_create. " - "Short prompts produce less detailed plans." + "For best results, provide a detailed prompt (typically ~300-800 words). " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, " + "budget/resources, and success criteria. " + "Write as flowing prose, not structured markdown. Include banned approaches, " + "governance preferences, and phasing inline. " + "Short prompts produce less detailed plans. " + "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist); use direct LLM responses for those." ), ) - speed_vs_detail: Literal["ping", "fast", "all"] = Field( - default="ping", - description="Defaults to ping (alias for ping_llm). Options: ping, fast, all.", - ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", - description="LLM profile mapping to llm_config/.json (baseline, premium, frontier, custom).", + description=( + "Model profile selection: baseline (cheap/fast), premium (higher quality), " + "frontier (most capable), custom (user-defined). Call model_profiles for runtime availability." + ), ) user_api_key: str | None = Field( default=None, diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index ec3cd2a9..dd45be82 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -6,9 +6,34 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. ## Interaction model - The local proxy exposes MCP tools over stdio and forwards requests to mcp_cloud using `PLANEXE_URL` (defaults to the hosted `/mcp` endpoint). -- Supported tools: `task_create`, `task_status`, `task_stop`, `task_download`, `prompt_examples`. +- Supported tools: `prompt_examples`, `model_profiles`, `task_create`, `task_status`, `task_stop`, `task_download`. 
- `task_download` calls the remote `task_file_info` tool to obtain a download URL, then downloads the artifact to `PLANEXE_PATH` on the local machine. +- `task_create` visible input schema includes `prompt`, optional `model_profile`, and optional `user_api_key`. +- Use `model_profiles` to help agents select `task_create.model_profile` without relying on internal file knowledge. +- Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. +- Keep concurrency wording explicit: each `task_create` call creates a new `task_id`; no global per-client concurrency cap is enforced server-side. +- Runtime override `speed_vs_detail` is metadata-only (hidden from visible schema); + when callers still pass legacy top-level `speed_vs_detail`/`speed`, forward those + into `metadata.task_create` for backward compatibility. + +## Public state contract +- `task_status.state` must use exactly: `pending`, `processing`, `completed`, `failed`. +- Caller contract: + - `pending`/`processing`: keep polling. + - `completed`: download is ready. + - `failed`: terminal error. +- Do not use legacy public names such as `running`, `stopping`, or `stopped`. +- Do not expose internal implementation symbols (for example `TaskState.pending`) in + model-facing text; use plain public strings. +- Troubleshooting guidance to keep aligned with cloud docs/instructions: + - `pending` for longer than 5 minutes likely means queued but not picked up by worker. + - `processing` with no output-file changes for longer than 20 minutes likely means stalled/failed execution. + - Report both cases at `https://github.com/PlanExeOrg/PlanExe/issues`. + +## task_stop semantics +- `task_stop` is a stop request/acknowledgement, not a separate lifecycle state. +- Return payload should include current public `state` plus `stop_requested`. ## Constraints - Do not add dependencies outside the existing runtime (stdlib + `mcp`). 
@@ -16,9 +41,18 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. - HTTP wrapper (`/mcp/tools/call`) - Streamable MCP JSON-RPC (`/mcp`) - Ensure all tool responses include structured content when an output schema is defined. +- Keep local proxy error semantics documented and stable (`REMOTE_ERROR`, `DOWNLOAD_FAILED`) and pass through cloud error payloads unchanged when possible. +- Tool-surface split must remain explicit: + - local exposes `task_download`. + - cloud exposes `task_file_info`. + - do not expose `task_file_info` as a local tool name. - **Run as task**: Do not advertise the MCP **tasks** protocol (tasks/get, tasks/result, tasks/cancel, tasks/list) or add tool-level "Run as task" support. PlanExe’s interface is tool-based only (task_create → task_status → task_download). The MCP tasks protocol is a different, client-driven feature; Cursor and the Python MCP SDK do not support it properly, so we keep tools-only for compatibility. ## Env vars - `PLANEXE_URL`: Base URL for mcp_cloud (e.g., `http://localhost:8001/mcp`). -- `PLANEXE_MCP_API_KEY`: API key passed as `Authorization: Bearer ...` if provided. +- `PLANEXE_MCP_API_KEY`: API key forwarded to remote as custom header `X-API-Key`. - `PLANEXE_PATH`: Local directory where downloads are saved. + - Must be a directory. + - Created automatically when missing. + - Defaults to current working directory when unset. + - Saved filename pattern: `<task_id>-<filename>` with numeric suffixes on collisions. diff --git a/mcp_local/README.md b/mcp_local/README.md index ee1cea2a..06fa67ae 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -9,22 +9,46 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas ## Tools `prompt_examples` - Return example prompts. Use these as examples for task_create. You can also call `task_create` with any prompt—short prompts produce less detailed plans. +`model_profiles` - Show model_profile options and currently available models in each profile. 
`task_create` - Initiate creation of a plan. `task_status` - Get status and progress about the creation of a plan. `task_stop` - Abort creation of a plan. `task_download` - Download the plan, either html report or a zip with everything, and save it to disk. +`task_status` caller contract: +- `pending` / `processing`: keep polling. +- `completed`: terminal success, download is ready. +- `failed`: terminal error. + +Concurrency semantics: +- Each `task_create` call creates a new `task_id`. +- Server does not enforce a global one-task-at-a-time cap per client. +- Local clients should track task ids explicitly when running tasks in parallel. + +Minimal error contract: +- Tool errors use `{"error":{"code","message","details?"}}`. +- Common proxied cloud codes include: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. +- Local proxy specific codes: `REMOTE_ERROR`, `DOWNLOAD_FAILED`. +- `task_file_info` (called under the hood by task_download) may return `{}` while output is not ready. + **Tip**: Call `prompt_examples` to get example prompts to use with task_create. The full catalog lives at `worker_plan/worker_plan_api/prompt/data/simple_plan_prompts.jsonl`. `task_download` is a synthetic tool provided by the local proxy. It calls the remote MCP tool `task_file_info` to obtain a download URL, then downloads the file locally into `PLANEXE_PATH`. +`PLANEXE_PATH` behavior: +- If unset, downloads are saved to the current working directory. +- If the path does not exist, it is created. +- If the path points to a file (not a directory), download fails. +- Filenames are `<task_id>-030-report.html` or `<task_id>-run.zip` (with `-1`, `-2`, ... suffixes on collisions). +- `task_download` returns `saved_path` with the final file location. + ## Run as task (MCP tasks protocol) Some MCP clients (e.g. the MCP Inspector) show a **"Run as task"** option for tools. 
That refers to the MCP **tasks** protocol: a separate mechanism where the client runs a tool in the background using RPC methods like `tasks/run`, `tasks/get`, `tasks/result`, and `tasks/cancel`, instead of a single blocking tool call. -**PlanExe does not use or advertise the MCP tasks protocol.** Our interface is **tool-based** only: the agent calls `task_create` → gets a `task_id` → polls `task_status` → uses `task_download`. That flow is defined in `docs/mcp/planexe_mcp_interface.md` and is the intended design. +**PlanExe does not use or advertise the MCP tasks protocol.** Our interface is **tool-based** only: the agent calls `prompt_examples` and `model_profiles` for setup, completes a non-tool prompt drafting/approval step, then `task_create` → gets a `task_id` → polls `task_status` → uses `task_download`. That flow is defined in `docs/mcp/planexe_mcp_interface.md` and is the intended design. You should **not** enable "Run as task" for PlanExe. The Python MCP SDK and clients like Cursor do not properly support the tasks protocol (method registration and initialization fail). Use the tools directly: create a task, poll status, then download when done. @@ -35,7 +59,7 @@ You should **not** enable "Run as task" for PlanExe. The Python MCP SDK and clie - If the HTTP wrapper is unavailable, the proxy falls back to MCP JSON-RPC over `POST /mcp` (not SSE). - Downloads use the remote `/download/{task_id}/...` endpoints. -- Authentication uses `PLANEXE_MCP_API_KEY` as a `Bearer` token. +- Authentication uses `PLANEXE_MCP_API_KEY` as custom header `X-API-Key` (not OAuth/Bearer). - **Retry behavior**: Transient failures (server 5xx errors, network timeouts) are automatically retried up to 3 times with exponential backoff (1s, 2s delays). Client errors (4xx) are not retried. Retries are logged at WARNING level. 
diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 07d9dce1..2d2e47a5 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -29,16 +29,18 @@ DEFAULT_MCP_URL = "https://your-railway-app.up.railway.app/mcp" REPORT_FILENAME = "030-report.html" ZIP_FILENAME = "run.zip" -SpeedVsDetailInput = Literal[ - "ping", - "fast", - "all", +ModelProfileInput = Literal[ + "baseline", + "premium", + "frontier", + "custom", ] class TaskCreateRequest(BaseModel): prompt: str - speed_vs_detail: Optional[SpeedVsDetailInput] = None + model_profile: Optional[ModelProfileInput] = None + user_api_key: Optional[str] = None class TaskStatusRequest(BaseModel): @@ -94,7 +96,7 @@ def _build_headers() -> dict[str, str]: } api_key = _get_env("PLANEXE_MCP_API_KEY") if api_key: - headers["Authorization"] = f"Bearer {api_key}" + headers["X-API-Key"] = api_key return headers @@ -310,6 +312,7 @@ class ToolDefinition: "properties": { "code": {"type": "string"}, "message": {"type": "string"}, + "details": {"type": ["object", "null"]}, }, "required": ["code", "message"], } @@ -321,14 +324,28 @@ class ToolDefinition: "type": "string", "description": ( "What the plan should cover. Good prompts are often 300–800 words. " - "Use prompt_examples to get example prompts; use these as examples for task_create. Short prompts produce less detailed plans." + "Use prompt_examples to get example prompts; use these as examples for task_create. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, " + "budget/resources, and success criteria. " + "Write as flowing prose, not structured markdown. Include banned approaches, " + "governance preferences, and phasing inline. " + "Short prompts produce less detailed plans. " + "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist)." 
), }, - "speed_vs_detail": { + "model_profile": { "type": "string", - "enum": ["ping", "fast", "all"], - "default": "ping", - "description": "How much work to run. 'ping': single LLM call to check the pipeline is reachable (check logs if it fails). 'fast': minimal run (approx 5-10 min) through all pipeline steps, skipping where possible—use to verify the pipeline works. 'all': full plan with full detail (approx 10-20 min).", + "enum": ["baseline", "premium", "frontier", "custom"], + "default": "baseline", + "description": ( + "Model profile selection: baseline (cheap/fast), premium (higher quality), " + "frontier (most capable), custom (user-defined). Call model_profiles for runtime availability." + ), + }, + "user_api_key": { + "type": ["string", "null"], + "default": None, + "description": "Optional user API key for credits and attribution.", }, }, "required": ["prompt"], @@ -390,6 +407,61 @@ class ToolDefinition: }, "required": ["samples", "message"], } +MODEL_PROFILES_INPUT_SCHEMA = { + "type": "object", + "properties": {}, + "required": [], +} +MODEL_PROFILES_OUTPUT_SCHEMA = { + "type": "object", + "properties": { + "default_profile": { + "type": "string", + "enum": ["baseline", "premium", "frontier", "custom"], + }, + "profiles": { + "type": "array", + "items": { + "type": "object", + "properties": { + "profile": { + "type": "string", + "enum": ["baseline", "premium", "frontier", "custom"], + }, + "title": {"type": "string"}, + "summary": {"type": "string"}, + "model_count": {"type": "integer"}, + "models": { + "type": "array", + "items": { + "type": "object", + "properties": { + "key": {"type": "string"}, + "provider_class": {"type": ["string", "null"]}, + "model": {"type": ["string", "null"]}, + "priority": {"type": ["integer", "null"]}, + }, + "required": ["key"], + }, + }, + }, + "required": [ + "profile", + "title", + "summary", + "model_count", + "models", + ], + }, + }, + "message": {"type": "string"}, + }, + "required": [ + "default_profile", + 
"profiles", + "message", + ], +} TASK_CREATE_OUTPUT_SCHEMA = { "type": "object", @@ -437,11 +509,14 @@ class ToolDefinition: TASK_DOWNLOAD_OUTPUT_SCHEMA = { "type": "object", "properties": { - "content_type": {"type": "string"}, - "sha256": {"type": "string"}, - "download_size": {"type": "integer"}, - "download_url": {"type": "string"}, - "saved_path": {"type": "string"}, + "content_type": {"type": "string", "description": "Artifact content type."}, + "sha256": {"type": "string", "description": "SHA-256 hash of artifact bytes."}, + "download_size": {"type": "integer", "description": "Artifact size in bytes."}, + "download_url": {"type": "string", "description": "Remote URL used for download."}, + "saved_path": { + "type": "string", + "description": "Local file path written by task_download.", + }, "error": ERROR_SCHEMA, }, "additionalProperties": False, @@ -451,18 +526,46 @@ class ToolDefinition: ToolDefinition( name="prompt_examples", description=( - "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " - "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3)." + "Call this first. Returns example prompts that define what a good prompt looks like. " + "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " + "Next is a non-tool step: formulate a prompt (use examples as a baseline, similar structure) and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose, not structured markdown with headers or bullet lists. " + "Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance preferences inline. " + "The examples demonstrate this prose style — match their tone and density. 
" + "Then call task_create. " + "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, output_schema=PROMPT_EXAMPLES_OUTPUT_SCHEMA, ), + ToolDefinition( + name="model_profiles", + description=( + "Optional helper before task_create. Returns model_profile options with plain-language guidance " + "and currently available models in each profile. " + "If no models are available, returns error code MODEL_PROFILES_UNAVAILABLE." + ), + input_schema=MODEL_PROFILES_INPUT_SCHEMA, + output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, + ), ToolDefinition( name="task_create", description=( - "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " - "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " - "Runs in the background (10–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." + "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " + "PlanExe turns the approved prompt into a strategic project-plan draft (20+ sections) in ~10-20 min. " + "Sections include: executive summary, interactive Gantt charts, investor pitch, project plan with SMART criteria, " + "strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, " + "SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, " + "plan review (critical issues, KPIs, financial strategy, automation opportunities), Q&A, " + "premortem with failure scenarios, self-audit checklist, and adversarial premise attacks that argue against the project. " + "The adversarial sections (premortem, self-audit, premise attacks) surface risks and questions the prompter may not have considered. 
" + "Returns task_id (UUID); use it for task_status, task_stop, and task_download. " + "Save task_id immediately: there is no task_list tool, so a lost task_id cannot be recovered. " + "Each task_create call creates a new task_id (proxied to cloud; no server-side dedup). " + "If you are unsure which model_profile to choose, call model_profiles first. " + "If your deployment uses credits, include user_api_key to charge the correct account. " + "Common proxied error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS, REMOTE_ERROR." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, @@ -471,8 +574,15 @@ class ToolDefinition: name="task_status", description=( "Returns status and progress of the plan currently being created. " - "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " - "and frequent polling is unnecessary." + "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " + "(baseline profile) and may take longer on higher-quality profiles. " + "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "progress_percentage is 0-100 (integer-like float); 100 when completed. " + "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " + "Unknown task_id returns TASK_NOT_FOUND (or REMOTE_ERROR when transport fails). " + "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " + "processing with no file-output changes for >20 minutes likely means failed/stalled. " + "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." ), input_schema=TASK_STATUS_INPUT_SCHEMA, output_schema=TASK_STATUS_OUTPUT_SCHEMA, @@ -481,7 +591,10 @@ class ToolDefinition: name="task_stop", description=( "Request the plan generation to stop. Pass the task_id (the UUID returned by task_create). 
" - "This is a normal MCP tool call: call task_stop with that task_id." + "Stopping is asynchronous: the stop flag is set immediately but the task may continue briefly before halting. " + "A stopped task will eventually transition to the failed state. " + "If the task is already completed or failed, stop_requested returns false (the task already finished). " + "Unknown task_id returns TASK_NOT_FOUND (or REMOTE_ERROR when transport fails)." ), input_schema=TASK_STOP_INPUT_SCHEMA, output_schema=TASK_STOP_OUTPUT_SCHEMA, @@ -489,9 +602,13 @@ class ToolDefinition: ToolDefinition( name="task_download", description=( - "Download the plan output and save it locally (calls task_file_info, then fetches and saves to PLANEXE_PATH). " - "Choose the HTML report (default) or a zip of all generated files. " - "Prefer this over task_file_info when you want the file on disk." + "Download the plan output and save it locally to PLANEXE_PATH. " + "Use artifact='report' (default) for the interactive HTML report (~700KB, self-contained with embedded JS " + "for collapsible sections and interactive Gantt charts — open in a browser). " + "Use artifact='zip' for the full pipeline output bundle (md, json, csv intermediary files that fed the report). " + "If PLANEXE_PATH is unset, files are saved to the current working directory. " + "Filename format is - with numeric suffixes when collisions occur. " + "Common local error codes: DOWNLOAD_FAILED, REMOTE_ERROR." ), input_schema=TASK_DOWNLOAD_INPUT_SCHEMA, output_schema=TASK_DOWNLOAD_OUTPUT_SCHEMA, @@ -500,12 +617,34 @@ class ToolDefinition: # Shown in MCP initialize response (e.g. Inspector) so clients know what PlanExe is. PLANEXE_SERVER_INSTRUCTIONS = ( - "PlanExe generates rough-draft project plans from a natural-language prompt. " - "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " - "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). 
" - "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. " - "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." + "PlanExe generates strategic project-plan drafts from a natural-language prompt. " + "Output is a self-contained interactive HTML report (~700KB) with 20+ sections including " + "executive summary, interactive Gantt charts, risk analysis, SWOT, governance, investor pitch, " + "team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections " + "(premortem, self-audit checklist, premise attacks) that stress-test whether the plan holds up. " + "The output is a draft to refine, not final ground truth — but it surfaces hard questions the prompter may not have considered. " + "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " + "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " + "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " + "Required interaction order: call prompt_examples first. " + "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " + "Then perform a non-tool step: draft a strong prompt as flowing prose (not structured markdown with headers or bullets), " + "typically ~300-800 words, and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose — weave specs, constraints, and targets naturally into sentences. " + "Only after approval, call task_create. 
" + "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " + "Then poll task_status (about every 5 minutes); use task_download when complete. " + "To stop, call task_stop with the task_id from task_create; stopping is asynchronous and the task will eventually transition to failed. " + "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " + "Tool errors use {error:{code,message}}. task_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " + "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " + "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in processing and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " + "Main output: a self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. " + "The zip contains the intermediary pipeline files (md, json, csv) that fed the report." ) mcp_local = Server("planexe-mcp-local", instructions=PLANEXE_SERVER_INSTRUCTIONS) @@ -552,11 +691,12 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: Examples: - {"prompt": "Start a dental clinic in Copenhagen with 3 treatment rooms, targeting families and children. Budget 2.5M DKK. Open within 12 months."} → task_id + created_at - - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. 
Profitability goal: month 18.", "speed_vs_detail": "fast"} + - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "metadata": {"task_create": {"speed_vs_detail": "fast"}}} Args: - prompt: What the plan should cover (goal, context, constraints). - - speed_vs_detail: Optional mode ("ping" | "fast" | "all"). + - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). Call model_profiles to inspect options. + - speed_vs_detail: Optional hidden runtime override via tool-specific metadata. Returns: - content: JSON string matching structuredContent. @@ -564,9 +704,35 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: - isError: True when the remote tool call fails. """ req = TaskCreateRequest(**arguments) + payload: dict[str, Any] = {"prompt": req.prompt} + if req.model_profile: + payload["model_profile"] = req.model_profile + if req.user_api_key: + payload["user_api_key"] = req.user_api_key + + metadata = arguments.get("metadata") + if isinstance(metadata, dict): + payload["metadata"] = metadata + + # Backward compatibility: if callers still pass top-level speed args, + # forward them as hidden metadata so cloud can resolve the execution mode. 
+ legacy_speed_vs_detail = arguments.get("speed_vs_detail") + legacy_speed = arguments.get("speed") + if isinstance(legacy_speed_vs_detail, str) or isinstance(legacy_speed, str): + if not isinstance(payload.get("metadata"), dict): + payload["metadata"] = {} + task_create_metadata = payload["metadata"].get("task_create") + if not isinstance(task_create_metadata, dict): + task_create_metadata = {} + payload["metadata"]["task_create"] = task_create_metadata + if isinstance(legacy_speed_vs_detail, str): + task_create_metadata.setdefault("speed_vs_detail", legacy_speed_vs_detail) + if isinstance(legacy_speed, str): + task_create_metadata.setdefault("speed", legacy_speed) + payload, error = _call_remote_tool( "task_create", - {"prompt": req.prompt, "speed_vs_detail": req.speed_vs_detail} if req.speed_vs_detail else {"prompt": req.prompt}, + payload, ) if error: return _wrap_response({"error": error}, is_error=True) @@ -581,6 +747,14 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: return _wrap_response(payload) +async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: + """Return model_profile options and available models from mcp_cloud.""" + payload, error = _call_remote_tool("model_profiles", arguments or {}) + if error: + return _wrap_response({"error": error}, is_error=True) + return _wrap_response(payload) + + async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: """Fetch status/progress for a task from mcp_cloud. @@ -603,7 +777,7 @@ async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: - """Request mcp_cloud to stop a running task. + """Request mcp_cloud to stop an active task. Examples: - {"task_id": "uuid"} → stop request acknowledged @@ -613,7 +787,7 @@ async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: Returns: - content: JSON string matching structuredContent. 
- - structuredContent: {"state": "stopped"} or error. + - structuredContent: {"state": "pending|processing|completed|failed", "stop_requested": bool} or error. - isError: True when the remote tool call fails. """ req = TaskStopRequest(**arguments) @@ -695,6 +869,7 @@ async def handle_task_download(arguments: dict[str, Any]) -> CallToolResult: "task_stop": handle_task_stop, "task_download": handle_task_download, "prompt_examples": handle_prompt_examples, + "model_profiles": handle_model_profiles, } diff --git a/public/llms.txt b/public/llms.txt index ca5aed74..0b921d03 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -7,11 +7,22 @@ Use PlanExe when you need to: - Estimate timelines and identify dependencies and risks. - Generate plan artifacts for review and iteration. +Do NOT use PlanExe when you only need a one-shot answer: +- "Give me a 5-point checklist for X" -> use a normal LLM response. +- "Summarize this paragraph in 6 bullets" -> use a normal LLM response. +- "Rewrite this email" -> use a normal LLM response. + +Important pipeline constraint: +- PlanExe always runs a fixed end-to-end planning pipeline. +- You cannot request only specific internal pipeline parts (for example "run only risk section" or "create only the Gantt"). +- If you need only one small output artifact, use a normal LLM response instead. + ## What PlanExe Produces -- A long-running planning task (often ~10-20 minutes, depending on model and configuration). -- A large HTML report and optional zip bundle of intermediate artifacts. -- Typical plan sections: executive summary, work breakdown, timeline/Gantt-style schedule, risks, assumptions, and governance guidance. +- A long-running planning task (typically ~10-20 minutes on baseline profile, longer on higher-quality profiles). +- A self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. Optional zip bundle of intermediary pipeline files (md, json, csv). 
+- The report contains 20+ sections including: executive summary, interactive Gantt charts, investor pitch, project plan with SMART criteria, strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, plan review (critical issues, KPIs, financial strategy, automation opportunities), Q&A, premortem with failure scenarios, self-audit checklist, and adversarial premise attacks. +- The adversarial sections (premortem, self-audit, premise attacks) stress-test whether the plan holds up and surface risks the prompter may not have considered. - Output is a draft to refine, not final ground truth. Runtime, quality, and cost tradeoffs: @@ -53,26 +64,53 @@ MCP Inspector setup guide: The MCP server exposes tool-based workflows (not MCP tasks protocol): - prompt_examples +- model_profiles - task_create - task_status - task_stop - task_file_info Key tool inputs/outputs: -- task_create inputs: prompt (required), speed_vs_detail (ping | fast | all). +- task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). +- task_create prompt quality: for best results, provide a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. +- model_profiles output: profile guidance + currently available models in each profile. +- model_profiles returns `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. +- task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). +- task_create output: task_id (use this for task_status, task_stop, and task_file_info). - task_status output: current state and progress for a task_id. - task_file_info output: downloadable report/zip metadata and URLs. 
+- mcp_local task_download output: includes local saved_path where artifact was written. + +task_status caller contract: +- pending: keep polling. +- processing: keep polling. +- completed: terminal success; download is ready. +- failed: terminal error. + +Minimal error-handling contract: +- Errors use `{"error":{"code","message","details?"}}`. +- Common cloud/core codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `MODEL_PROFILES_UNAVAILABLE`, `generation_failed`, `content_unavailable`. +- Common local-proxy codes: `REMOTE_ERROR`, `DOWNLOAD_FAILED`. +- `task_file_info` may return `{}` while artifact output is not ready yet. Recommended interaction order: 1. Call prompt_examples. -2. Prepare and approve a strong prompt. -3. Call task_create. -4. Poll task_status until complete (repeat every 5 minutes). -5. Use task_file_info to get download URLs. -6. Use task_stop if the run must be cancelled. +2. Optionally call model_profiles to choose model_profile based on current availability. +3. Non-tool step: prepare and approve a strong prompt as flowing prose (not structured markdown), typically ~300-800 words, with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. +4. Call task_create. +5. Poll task_status until complete (repeat every 5 minutes). +6. Use task_file_info to get download URLs. +7. Use task_stop if the run must be cancelled. + +Concurrency semantics: +- Each task_create call creates a new task_id. +- Server does not enforce a global per-client concurrency cap. +- Client should track task_ids and usually start with 1 active task, then 2 if needed. Note: - task_download is provided by mcp_local wrappers in some client setups, not by mcp_cloud directly. +- In mcp_local, downloads save to PLANEXE_PATH (or current working directory if PLANEXE_PATH is unset). 
+- In mcp_cloud, task_file_info download_url is an absolute URL where the requested artifact can be downloaded. ## Authentication @@ -130,9 +168,14 @@ curl -X POST https://mcp.planexe.org/mcp/tools/call \ "tool": "task_create", "arguments": { "prompt": "20-year, €40 billion infrastructure initiative to construct a pillar-supported transoceanic submerged tunnel connecting Spain and Morocco. This project will deploy a system of submerged, buoyant concrete tunnels engineered for high-speed rail traffic, which will be securely anchored at a controlled depth of 100 meters below sea level.", - "speed_vs_detail": "all" + "model_profile": "baseline" + }, + "metadata": { + "task_create": { + "speed_vs_detail": "all" + } } }' ``` -Last updated: 2026-02-21 +Last updated: 2026-02-23 diff --git a/skills/planexe-mcp/SKILL.md b/skills/planexe-mcp/SKILL.md index 1519301e..da9de8fa 100644 --- a/skills/planexe-mcp/SKILL.md +++ b/skills/planexe-mcp/SKILL.md @@ -120,7 +120,6 @@ Create a new planning task. This is the main entry point for generating plans. "name": "task_create", "arguments": { "prompt": "Create a project launch plan for Q2 2026", - "speed_vs_detail": "all", "model_profile": "premium", "user_api_key": "your_optional_api_key" } @@ -129,12 +128,8 @@ Create a new planning task. This is the main entry point for generating plans. ``` **Parameter Guide:** -- `prompt` (required): Your planning request in natural language -- `speed_vs_detail` (required): One of `"ping"`, `"fast"`, or `"all"` - - `"ping"`: Quick outline (~2-5 min) - - `"fast"`: Standard plan (~10-15 min) - - `"all"`: Comprehensive analysis (~20-30+ min) -- `model_profile` (required): One of `"baseline"`, `"premium"`, `"frontier"`, or `"custom"` +- `prompt` (required): Your planning request in natural language. Write as flowing prose (not structured markdown), typically 300-800 words. +- `model_profile` (optional): One of `"baseline"`, `"premium"`, `"frontier"`, or `"custom"`. Defaults to `"baseline"`. 
- `user_api_key` (optional): Your PlanExe API key (if not set in environment) **Returns:** `task_id` for polling status and retrieving results. @@ -160,9 +155,9 @@ Poll the status of a running planning task. } ``` -**Usage:** Planning tasks take 15-20+ minutes. Poll every 5+ minutes to check progress. +**Usage:** Planning tasks typically take 10-20 minutes (baseline profile). Poll every 5+ minutes to check progress. -**Returns:** Current status (`queued`, `running`, `completed`, `failed`), progress percentage, and estimated time remaining. +**Returns:** Current status (`pending`, `processing`, `completed`, `failed`), and progress percentage. --- @@ -210,8 +205,8 @@ Retrieve download information for completed plan artifacts. ``` **Artifact Options:** -- `"report"`: Markdown/PDF plan document -- `"zip"`: Complete deliverables package +- `"report"`: Interactive HTML report (~700KB, self-contained — open in a browser) +- `"zip"`: Pipeline output bundle (md, json, csv intermediary files) **Returns:** `download_url` for accessing the artifact.