From a72feb044660488a794d3fec52c870b3f9ec76a5 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 15:03:30 +0100 Subject: [PATCH 01/38] Removed the confusing speed_vs_detail parameter. And made "all" the default. The docs was missing the "model_profile" parameter. --- docs/mcp/antigravity.md | 14 ++-- docs/mcp/cursor.md | 2 +- docs/mcp/inspector.md | 3 + docs/mcp/mcp_details.md | 48 ++++++++++++- docs/mcp/mcp_setup.md | 6 ++ docs/mcp/mcp_welcome.md | 4 +- docs/mcp/planexe_mcp_interface.md | 44 +++++++++--- mcp_cloud/README.md | 4 +- mcp_cloud/app.py | 90 ++++++++++++++++-------- mcp_cloud/http_server.py | 33 ++++++--- mcp_cloud/tests/test_speed_vs_detail.py | 36 +++++++--- mcp_cloud/tests/test_task_create_tool.py | 26 ++++++- mcp_cloud/tool_models.py | 6 +- mcp_local/planexe_mcp_local.py | 52 ++++++++++---- public/llms.txt | 13 +++- 15 files changed, 292 insertions(+), 89 deletions(-) diff --git a/docs/mcp/antigravity.md b/docs/mcp/antigravity.md index 40e1dca6..eaeca9b7 100644 --- a/docs/mcp/antigravity.md +++ b/docs/mcp/antigravity.md @@ -18,15 +18,13 @@ My interaction history: 4. I didn't meant outbreak, I meant vulcanic 5. your prompt is a bit shorter than the example prompts 6. go ahead create the plan -7. stop that plan you are creating. -8. now create the plan again, this time with ALL details. Last time you had FAST selected that would leave out most details. -9. check status +7. check status +8. status +9. status 10. status -11. status -12. status -13. download the report -14. summarize the report -15. does it correspond to your expectations? +11. download the report +12. summarize the report +13. does it correspond to your expectations? I had to manually ask about `check status` to get details how the plan creation was going. It's not something that Antigravity can do. 
diff --git a/docs/mcp/cursor.md b/docs/mcp/cursor.md index 7365904b..ee75da6d 100644 --- a/docs/mcp/cursor.md +++ b/docs/mcp/cursor.md @@ -51,7 +51,7 @@ My interaction with Cursor for creating a plan is like this: 2. I want you to come up with a good prompt 3. I want something ala winter olympics in Italy 2026 4. Slightly different idea. I want Denmark to switch from DKK to EUR. Use the persona of a person representing Denmark's ministers. -5. go ahead create plan with all details +5. go ahead create the plan 6. *wait for 18 minutes until the plan has been created* 7. download the plan diff --git a/docs/mcp/inspector.md b/docs/mcp/inspector.md index e013ff85..088afeab 100644 --- a/docs/mcp/inspector.md +++ b/docs/mcp/inspector.md @@ -74,6 +74,9 @@ task_stop task_file_info ``` +When you inspect `task_create`, the visible input schema includes `prompt` and optional `model_profile`. +The `speed_vs_detail` parameter is intentionally hidden and only set via tool-specific metadata, since it confuses AI agents. + Follow these steps: ![screenshot of mcp inspector invoke tool](inspector_step5_mcp_planexe_org.webp) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 6efdc8d6..adb93dd1 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -41,11 +41,57 @@ Example call: {"prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes..."} ``` -Optional argument: +Optional visible argument: +```text +model_profile: "baseline" | "premium" | "frontier" | "custom" ``` + +Developer-only hidden metadata (not part of visible tool schema shown to agents): +```text speed_vs_detail: "ping" | "fast" | "all" ``` +Example with visible `model_profile`: +```json +{"prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", "model_profile": "premium"} +``` + +Example with hidden metadata override. 
The `ping` only checks if the LLMs are connected and doesn't trigger a full plan to be created: +```json +{ + "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", + "metadata": { + "task_create": { + "speed_vs_detail": "ping" + } + } +} +``` + +Example with hidden metadata override. The `fast` triggers a plan to be created, where the entire Luigi pipeline gets exercised, but skipping ever detail that is possible: +```json +{ + "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", + "metadata": { + "task_create": { + "speed_vs_detail": "fast" + } + } +} +``` + +Example with hidden metadata override. The `all` is the default setting. Creates a plan with **ALL** details: +```json +{ + "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", + "metadata": { + "task_create": { + "speed_vs_detail": "all" + } + } +} +``` + ### task_status Fetch status/progress and recent files for a task. diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index 933d25ef..7d19dfc0 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -25,6 +25,12 @@ This is the shortest path to a working PlanExe MCP integration. 3. `task_status` 4. `task_download` +For `task_create`: + +- Visible arguments: `prompt` (required), `model_profile` (optional). +- Hidden developer metadata: `speed_vs_detail` (`ping` | `fast` | `all`). +- Reference: [PlanExe MCP interface](planexe_mcp_interface.md#62-task_create) + --- ## 3. Success criteria diff --git a/docs/mcp/mcp_welcome.md b/docs/mcp/mcp_welcome.md index 997d1e6f..b4081d10 100644 --- a/docs/mcp/mcp_welcome.md +++ b/docs/mcp/mcp_welcome.md @@ -20,10 +20,12 @@ No MCP experience is required to get started. ## What you can do - **Get example prompts** — See what good prompts look like (detailed, typically 300–800 words). 
It is the **caller’s responsibility** to take inspiration from these examples and ensure the prompt sent to PlanExe is of similar or better quality. The agent can refine a vague idea into a high-quality prompt and show it to the user for approval before creating the plan. -- **Create a plan** — Send a prompt; PlanExe starts creating the plan (takes about 15–20 minutes). If the input prompt is of low quality, the output plan will be crap too. +- **Create a plan** — Send a prompt; PlanExe starts creating the plan (takes about 15–20 minutes). If the input prompt is of low quality, the output plan will be crap too. Visible `task_create` options include `model_profile`. - **Check progress** — Ask for status and see how far the plan has gotten. - **Download the report** — When the plan is ready, the user specifies whether to download the HTML report or the zip of intermediary files (JSON, MD, CSV). +Developer note: `speed_vs_detail` is intentionally hidden from the visible `task_create` interface and is provided via tool-specific metadata when needed. + --- ## What you get diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 11eb7876..f6758e65 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -70,10 +70,10 @@ The interface is designed to support: The MCP specification defines two different mechanisms: -- **MCP tools** (e.g. task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_download. This document specifies those tools. +- **MCP tools** (e.g. task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_download or task_file_info. 
This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. The intended flow is: Step 1 — call prompt_examples; Step 2 — formulate a good prompt (user approval); Step 3 — call task_create; then poll task_status and call task_download when complete. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. The intended flow is: Step 1 — call prompt_examples; Step 2 — formulate a good prompt (user approval); Step 3 — call task_create; then poll task_status and call task_download or task_file_info when complete. --- @@ -180,7 +180,7 @@ All tool names below are normative. ### 6.2 task_create -**Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2).** Start creating a new plan with the approved prompt. speed_vs_detail modes: 'all' runs the full pipeline with all details (slower, higher token usage/cost). 'fast' runs the full pipeline with minimal work per step (faster, fewer details), useful to verify the pipeline is working. 'ping' runs the pipeline entrypoint and makes a single LLM call to verify the worker_plan_database is processing tasks and can reach the LLM. 
+**Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2).** Start creating a new plan with the approved prompt. **Request** @@ -191,11 +191,12 @@ All tool names below are normative. "type": "object", "properties": { "prompt": { "type": "string" }, - "speed_vs_detail": { + "model_profile": { "type": "string", - "enum": ["ping", "fast", "all"], - "default": "ping" - } + "enum": ["baseline", "premium", "frontier", "custom"], + "default": "baseline" + }, + "user_api_key": { "type": "string" } }, "required": ["prompt"] } @@ -206,11 +207,35 @@ All tool names below are normative. ```json { "prompt": "string", - "speed_vs_detail": "ping", + "model_profile": "baseline", "user_api_key": "pex_..." } ``` +**Tool-specific metadata (developer-only, hidden from model-visible schema)** + +Use tool-specific metadata when you need runtime overrides that should not be visible in the tool interface shown to AI agents. + +`speed_vs_detail` is read from metadata, not from the visible input schema. + +- `speed_vs_detail` accepted values: + - `ping`: single LLM call to verify the pipeline/LLM path. + - `fast`: reduced-detail run through the full pipeline. + - `all`: full-detail run through the full pipeline. + +**Metadata example** + +```json +{ + "prompt": "string", + "metadata": { + "task_create": { + "speed_vs_detail": "ping" + } + } +} +``` + **Prompt quality** The `prompt` parameter should be a detailed description of what the plan should cover. Good prompts are typically 300–800 words and include: @@ -224,6 +249,7 @@ Short one-liners (e.g., "Construct a bridge") tend to produce poor output becaus **Optional** +- model_profile: LLM profile (`baseline` | `premium` | `frontier` | `custom`). - user_api_key: user API key for credits and attribution (if your deployment requires it). Clients can call the MCP tool **prompt_examples** to retrieve example prompts. 
Use these as examples for task_create; they can also call task_create with any prompt—short prompts produce less detailed plans. @@ -243,7 +269,7 @@ For the full catalog file: **Important** -- task_id is a UUID returned by task_create. Use this exact UUID for task_status/task_stop/task_download. +- task_id is a UUID returned by task_create. Use this exact UUID for task_status/task_stop/task_download/task_file_info. **Behavior** diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index fcb0cc9b..d316fa1c 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -14,7 +14,7 @@ mcp_cloud provides a standardized MCP interface for PlanExe's plan generation wo ## Run as task (MCP tasks protocol) -MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `task_create`, `task_status`, `task_stop`, `task_download`. The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. +MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `task_create`, `task_status`, `task_stop`, `task_file_info` (or `task_download` via `mcp_local`). The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. ## Client Choice Guide @@ -90,7 +90,7 @@ Some MCP clients (e.g. 
OpenClaw/mcporter) connect by doing a **GET** to the serv **You do not need SSE for tools.** MCP over HTTP can use plain JSON: - **List tools:** `GET http://:8001/mcp/tools` → returns `{"tools": [...]}` (JSON). -- **Call a tool:** `POST http://:8001/mcp/tools/call` with body `{"tool": "task_create", "arguments": {"prompt": "…", "speed_vs_detail": "ping"}}` → returns JSON. +- **Call a tool:** `POST http://:8001/mcp/tools/call` with body `{"tool": "task_create", "arguments": {"prompt": "…"}, "metadata": {"task_create": {"speed_vs_detail": "ping"}}}` → returns JSON. If your client only supports Streamable HTTP and fails on `/mcp`, you have two options: diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index b78fb4f0..de221049 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -139,16 +139,6 @@ def ensure_taskitem_stop_columns() -> None: "fast_but_skip_details", "all_details_but_slow", ) -SPEED_VS_DETAIL_INPUT_VALUES = ( - "ping", - "fast", - "all", -) -SpeedVsDetailInput = Literal[ - "ping", - "fast", - "all", -] ModelProfileInput = Literal[ "baseline", "premium", @@ -163,7 +153,6 @@ def ensure_taskitem_stop_columns() -> None: class TaskCreateRequest(BaseModel): prompt: str - speed_vs_detail: Optional[SpeedVsDetailInput] = None model_profile: Optional[ModelProfileInput] = None user_api_key: Optional[str] = None @@ -634,6 +623,37 @@ def resolve_speed_vs_detail(config: Optional[dict[str, Any]]) -> str: return value return SPEED_VS_DETAIL_DEFAULT + +def _extract_task_create_metadata_overrides(arguments: dict[str, Any]) -> dict[str, Any]: + """Extract task_create runtime overrides from hidden metadata containers. 
+ + Supported hidden containers: + - arguments.tool_metadata + - arguments.metadata + - arguments._meta + + If a container includes nested namespaces, these are checked first: + - task_create + - planexe_task_create + - planexe + """ + merged: dict[str, Any] = {} + metadata_candidates: list[dict[str, Any]] = [] + + for key in ("tool_metadata", "metadata", "_meta"): + candidate = arguments.get(key) + if isinstance(candidate, dict): + metadata_candidates.append(candidate) + + for candidate in metadata_candidates: + merged.update(candidate) + for nested_key in ("task_create", "planexe_task_create", "planexe"): + nested = candidate.get(nested_key) + if isinstance(nested, dict): + merged.update(nested) + + return merged + def _merge_task_create_config( config: Optional[dict[str, Any]], speed_vs_detail: Optional[str], @@ -834,14 +854,10 @@ class ToolDefinition: description=( "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " - "Returns task_id (UUID); use it for task_status, task_stop, and task_download. " + "Returns task_id (UUID); use it for task_status, task_stop, task_download, and task_file_info. " "If your deployment uses credits, include user_api_key to charge the correct account. " - "speed_vs_detail modes: " - "'all' runs the full pipeline with all details (slower, higher token usage/cost). " - "'fast' runs the full pipeline with minimal work per step (faster, fewer details), " - "useful to verify the pipeline is working. " - "'ping' runs the pipeline entrypoint and makes a single LLM call to verify the " - "worker_plan_database is processing tasks and can reach the LLM." 
+ "Optional runtime overrides such as speed_vs_detail are intentionally hidden from the visible tool schema " + "and can be provided via tool-specific metadata by developers." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, @@ -917,12 +933,12 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: Examples: - {"prompt": "Start a dental clinic in Copenhagen with 3 treatment rooms, targeting families and children. Budget 2.5M DKK. Open within 12 months."} → returns task_id (UUID) + created_at - - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "speed_vs_detail": "fast"} → faster run + - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "metadata": {"task_create": {"speed_vs_detail": "fast"}}} → faster run Args: - prompt: What the plan should cover (goal, context, constraints). - - speed_vs_detail: Optional mode ("ping" | "fast" | "all"). - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). + - speed_vs_detail: Optional hidden runtime override via tool-specific metadata. Returns: - content: JSON string matching structuredContent. @@ -930,8 +946,26 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: - isError: False on success. 
""" req = TaskCreateRequest(**arguments) - - merged_config = _merge_task_create_config(None, req.speed_vs_detail, req.model_profile) + metadata_overrides = _extract_task_create_metadata_overrides(arguments) + metadata_model_profile = metadata_overrides.get("model_profile") + model_profile = req.model_profile + if model_profile is None and isinstance(metadata_model_profile, str): + model_profile = metadata_model_profile + + speed_vs_detail = metadata_overrides.get("speed_vs_detail") + if not isinstance(speed_vs_detail, str): + speed_alias = metadata_overrides.get("speed") + if isinstance(speed_alias, str): + speed_vs_detail = speed_alias + else: + # Backward-compatible hidden override when callers still send legacy top-level args. + legacy_speed = arguments.get("speed_vs_detail") + if isinstance(legacy_speed, str): + speed_vs_detail = legacy_speed + elif isinstance(arguments.get("speed"), str): + speed_vs_detail = arguments.get("speed") + + merged_config = _merge_task_create_config(None, speed_vs_detail, model_profile) require_user_key = os.environ.get("PLANEXE_MCP_REQUIRE_USER_KEY", "false").lower() in ("1", "true", "yes", "on") user_context = None if req.user_api_key: @@ -952,12 +986,12 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: ) if user_context and float(user_context.get("credits_balance", 0.0)) <= 0.0: - response = {"error": {"code": "INSUFFICIENT_CREDITS", "message": "Not enough credits."}} - return CallToolResult( - content=[TextContent(type="text", text=json.dumps(response))], - structuredContent=response, - isError=True, - ) + response = {"error": {"code": "INSUFFICIENT_CREDITS", "message": "Not enough credits."}} + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(response))], + structuredContent=response, + isError=True, + ) response = await asyncio.to_thread( _create_task_sync, diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index c62b9086..8eebc7f4 100644 --- 
a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -236,6 +236,7 @@ async def _enforce_body_size(request: Request) -> Optional[JSONResponse]: class MCPToolCallRequest(BaseModel): tool: str arguments: dict[str, Any] + metadata: Optional[dict[str, Any]] = None class MCPToolCallResponse(BaseModel): @@ -315,19 +316,12 @@ def _normalize_tool_result(result: Any) -> tuple[list[dict[str, Any]], Optional[ return content, error -SpeedVsDetailInput = Literal["ping", "fast", "all"] ModelProfileInput = Literal["baseline", "premium", "frontier", "custom"] ResultArtifactInput = Literal["report", "zip"] async def task_create( prompt: str, - speed_vs_detail: Annotated[ - SpeedVsDetailInput, - Field( - description="Defaults to ping (alias for ping_llm). Options: ping, fast, all.", - ), - ] = "ping", model_profile: Annotated[ ModelProfileInput, Field(description="LLM profile: baseline, premium, frontier, custom."), @@ -337,7 +331,6 @@ async def task_create( authenticated_user_api_key = _get_authenticated_user_api_key() arguments: dict[str, Any] = { "prompt": prompt, - "speed_vs_detail": speed_vs_detail, "model_profile": model_profile, } if authenticated_user_api_key: @@ -560,6 +553,30 @@ async def call_tool( authenticated_user_api_key = _get_authenticated_user_api_key() if authenticated_user_api_key and not arguments.get("user_api_key"): arguments["user_api_key"] = authenticated_user_api_key + if isinstance(payload.metadata, dict): + arguments["metadata"] = dict(payload.metadata) + + # Backward compatibility: move legacy speed args into hidden metadata. 
+ legacy_speed_vs_detail = arguments.pop("speed_vs_detail", None) + legacy_speed = arguments.pop("speed", None) + if isinstance(legacy_speed_vs_detail, str) or isinstance(legacy_speed, str): + metadata = arguments.get("metadata") + if not isinstance(metadata, dict): + metadata = {} + arguments["metadata"] = metadata + task_create_metadata = metadata.get("task_create") + if not isinstance(task_create_metadata, dict): + task_create_metadata = {} + metadata["task_create"] = task_create_metadata + if isinstance(legacy_speed_vs_detail, str): + task_create_metadata.setdefault("speed_vs_detail", legacy_speed_vs_detail) + if isinstance(legacy_speed, str): + task_create_metadata.setdefault("speed", legacy_speed) + + result = await handle_task_create(arguments) + content, error = _normalize_tool_result(result) + return MCPToolCallResponse(content=content, error=error) + return await call_tool_via_registry(fastmcp_server, payload.tool, arguments) diff --git a/mcp_cloud/tests/test_speed_vs_detail.py b/mcp_cloud/tests/test_speed_vs_detail.py index 4f911913..04393b36 100644 --- a/mcp_cloud/tests/test_speed_vs_detail.py +++ b/mcp_cloud/tests/test_speed_vs_detail.py @@ -5,6 +5,7 @@ from mcp_cloud.app import ( SPEED_VS_DETAIL_DEFAULT, TaskCreateRequest, + _extract_task_create_metadata_overrides, _merge_task_create_config, resolve_speed_vs_detail, ) @@ -40,15 +41,6 @@ def test_merge_task_create_config_ignores_blank(self): class TestTaskCreateRequest(unittest.TestCase): - def test_speed_vs_detail_accepts_enum(self): - for value in ("ping", "fast", "all"): - req = TaskCreateRequest(prompt="demo", speed_vs_detail=value) - self.assertEqual(req.speed_vs_detail, value) - - def test_speed_vs_detail_rejects_invalid(self): - with self.assertRaises(ValidationError): - TaskCreateRequest(prompt="demo", speed_vs_detail="slow") - def test_model_profile_accepts_enum(self): for value in ("baseline", "premium", "frontier", "custom"): req = TaskCreateRequest(prompt="demo", model_profile=value) @@ 
-59,5 +51,31 @@ def test_model_profile_rejects_invalid(self): TaskCreateRequest(prompt="demo", model_profile="enterprise") +class TestTaskCreateMetadataOverrides(unittest.TestCase): + def test_extracts_nested_task_create_metadata(self): + overrides = _extract_task_create_metadata_overrides( + {"metadata": {"task_create": {"speed_vs_detail": "fast"}}} + ) + self.assertEqual(overrides.get("speed_vs_detail"), "fast") + + def test_extracts_top_level_metadata(self): + overrides = _extract_task_create_metadata_overrides( + {"_meta": {"speed": "all", "model_profile": "premium"}} + ) + self.assertEqual(overrides.get("speed"), "all") + self.assertEqual(overrides.get("model_profile"), "premium") + + def test_nested_namespace_overrides_top_level(self): + overrides = _extract_task_create_metadata_overrides( + { + "metadata": { + "speed_vs_detail": "fast", + "task_create": {"speed_vs_detail": "ping"}, + } + } + ) + self.assertEqual(overrides.get("speed_vs_detail"), "ping") + + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tests/test_task_create_tool.py b/mcp_cloud/tests/test_task_create_tool.py index 54274d91..8387d507 100644 --- a/mcp_cloud/tests/test_task_create_tool.py +++ b/mcp_cloud/tests/test_task_create_tool.py @@ -6,10 +6,18 @@ from unittest.mock import MagicMock, patch from mcp.types import CallToolResult -from mcp_cloud.app import handle_task_create +from mcp_cloud.app import handle_list_tools, handle_task_create class TestTaskCreateTool(unittest.TestCase): + def test_task_create_visible_schema_hides_speed_and_exposes_model_profile(self): + tools = asyncio.run(handle_list_tools()) + task_create_tool = next(tool for tool in tools if tool.name == "task_create") + properties = task_create_tool.inputSchema.get("properties", {}) + self.assertIn("prompt", properties) + self.assertIn("model_profile", properties) + self.assertNotIn("speed_vs_detail", properties) + def test_task_create_returns_structured_content(self): arguments = {"prompt": "xcv", 
"config": None, "metadata": None} fake_session = MagicMock() @@ -35,6 +43,22 @@ def __init__(self, prompt: str, state, user_id: str, parameters): self.assertIn("created_at", result.structuredContent) self.assertIsInstance(uuid.UUID(result.structuredContent["task_id"]), uuid.UUID) + def test_task_create_uses_hidden_metadata_speed_override(self): + fake_response = {"task_id": str(uuid.uuid4()), "created_at": "2026-01-01T00:00:00Z"} + with patch("mcp_cloud.app._create_task_sync", return_value=fake_response) as create_task_sync: + result = asyncio.run( + handle_task_create( + { + "prompt": "demo", + "metadata": {"task_create": {"speed_vs_detail": "ping"}}, + } + ) + ) + + self.assertFalse(result.isError) + _, merged_config, _ = create_task_sync.call_args.args + self.assertEqual(merged_config, {"speed_vs_detail": "ping"}) + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index ea5289f0..d602f7ab 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -52,7 +52,7 @@ class TaskFileInfoInput(BaseModel): class TaskCreateOutput(BaseModel): task_id: str = Field( ..., - description="Task UUID returned by task_create. Stable across task_status/task_stop/task_download." + description="Task UUID returned by task_create. Stable across task_status/task_stop/task_download/task_file_info." ) created_at: str @@ -119,10 +119,6 @@ class TaskCreateInput(BaseModel): "Short prompts produce less detailed plans." ), ) - speed_vs_detail: Literal["ping", "fast", "all"] = Field( - default="ping", - description="Defaults to ping (alias for ping_llm). 
Options: ping, fast, all.", - ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", description="LLM profile mapping to llm_config/.json (baseline, premium, frontier, custom).", diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 07d9dce1..15bce825 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -29,16 +29,17 @@ DEFAULT_MCP_URL = "https://your-railway-app.up.railway.app/mcp" REPORT_FILENAME = "030-report.html" ZIP_FILENAME = "run.zip" -SpeedVsDetailInput = Literal[ - "ping", - "fast", - "all", +ModelProfileInput = Literal[ + "baseline", + "premium", + "frontier", + "custom", ] class TaskCreateRequest(BaseModel): prompt: str - speed_vs_detail: Optional[SpeedVsDetailInput] = None + model_profile: Optional[ModelProfileInput] = None class TaskStatusRequest(BaseModel): @@ -324,11 +325,11 @@ class ToolDefinition: "Use prompt_examples to get example prompts; use these as examples for task_create. Short prompts produce less detailed plans." ), }, - "speed_vs_detail": { + "model_profile": { "type": "string", - "enum": ["ping", "fast", "all"], - "default": "ping", - "description": "How much work to run. 'ping': single LLM call to check the pipeline is reachable (check logs if it fails). 'fast': minimal run (approx 5-10 min) through all pipeline steps, skipping where possible—use to verify the pipeline works. 'all': full plan with full detail (approx 10-20 min).", + "enum": ["baseline", "premium", "frontier", "custom"], + "default": "baseline", + "description": "LLM profile mapping to llm_config/.json.", }, }, "required": ["prompt"], @@ -462,7 +463,7 @@ class ToolDefinition: description=( "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. 
" - "Runs in the background (10–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." + "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download (or task_file_info when calling mcp_cloud directly)." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, @@ -552,11 +553,12 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: Examples: - {"prompt": "Start a dental clinic in Copenhagen with 3 treatment rooms, targeting families and children. Budget 2.5M DKK. Open within 12 months."} → task_id + created_at - - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "speed_vs_detail": "fast"} + - {"prompt": "Launch a bike repair shop in Amsterdam with retail sales, service bays, and mobile repair van. Budget 150k EUR. Profitability goal: month 18.", "metadata": {"task_create": {"speed_vs_detail": "fast"}}} Args: - prompt: What the plan should cover (goal, context, constraints). - - speed_vs_detail: Optional mode ("ping" | "fast" | "all"). + - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). + - speed_vs_detail: Optional hidden runtime override via tool-specific metadata. Returns: - content: JSON string matching structuredContent. @@ -564,9 +566,33 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: - isError: True when the remote tool call fails. """ req = TaskCreateRequest(**arguments) + payload: dict[str, Any] = {"prompt": req.prompt} + if req.model_profile: + payload["model_profile"] = req.model_profile + + metadata = arguments.get("metadata") + if isinstance(metadata, dict): + payload["metadata"] = metadata + + # Backward compatibility: if callers still pass top-level speed args, + # forward them as hidden metadata so cloud can resolve the execution mode. 
+ legacy_speed_vs_detail = arguments.get("speed_vs_detail") + legacy_speed = arguments.get("speed") + if isinstance(legacy_speed_vs_detail, str) or isinstance(legacy_speed, str): + if not isinstance(payload.get("metadata"), dict): + payload["metadata"] = {} + task_create_metadata = payload["metadata"].get("task_create") + if not isinstance(task_create_metadata, dict): + task_create_metadata = {} + payload["metadata"]["task_create"] = task_create_metadata + if isinstance(legacy_speed_vs_detail, str): + task_create_metadata.setdefault("speed_vs_detail", legacy_speed_vs_detail) + if isinstance(legacy_speed, str): + task_create_metadata.setdefault("speed", legacy_speed) + payload, error = _call_remote_tool( "task_create", - {"prompt": req.prompt, "speed_vs_detail": req.speed_vs_detail} if req.speed_vs_detail else {"prompt": req.prompt}, + payload, ) if error: return _wrap_response({"error": error}, is_error=True) diff --git a/public/llms.txt b/public/llms.txt index ca5aed74..9cd8292e 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -59,7 +59,9 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): - task_file_info Key tool inputs/outputs: -- task_create inputs: prompt (required), speed_vs_detail (ping | fast | all). +- task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). +- task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). +- task_create output: task_id (use this for task_status, task_stop, and task_file_info). - task_status output: current state and progress for a task_id. - task_file_info output: downloadable report/zip metadata and URLs. @@ -130,9 +132,14 @@ curl -X POST https://mcp.planexe.org/mcp/tools/call \ "tool": "task_create", "arguments": { "prompt": "20-year, €40 billion infrastructure initiative to construct a pillar-supported transoceanic submerged tunnel connecting Spain and Morocco. 
This project will deploy a system of submerged, buoyant concrete tunnels engineered for high-speed rail traffic, which will be securely anchored at a controlled depth of 100 meters below sea level.", - "speed_vs_detail": "all" + "model_profile": "baseline" + }, + "metadata": { + "task_create": { + "speed_vs_detail": "all" + } } }' ``` -Last updated: 2026-02-21 +Last updated: 2026-02-23 From 875773ac2d63a1573908e03b875d3141c1b591ac Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 16:57:20 +0100 Subject: [PATCH 02/38] Counterexamples showcasing how NOT to use PlanExe. --- docs/mcp/mcp_details.md | 16 +++++++++++++++- docs/mcp/planexe_mcp_interface.md | 20 +++++++++++++++++++- mcp_cloud/app.py | 12 +++++++++--- mcp_cloud/http_server.py | 3 +++ mcp_cloud/tool_models.py | 3 ++- mcp_local/planexe_mcp_local.py | 12 +++++++++--- public/llms.txt | 10 ++++++++++ 7 files changed, 67 insertions(+), 9 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index adb93dd1..1d9d083a 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -68,7 +68,7 @@ Example with hidden metadata override. The `ping` only checks if the LLMs are co } ``` -Example with hidden metadata override. The `fast` triggers a plan to be created, where the entire Luigi pipeline gets exercised, but skipping ever detail that is possible: +Example with hidden metadata override. The `fast` triggers a plan to be created, where the entire Luigi pipeline gets exercised, while skipping as much detail as possible: ```json { "prompt": "Weekly meetup for humans where participants are randomly paired every 5 minutes...", @@ -92,6 +92,20 @@ Example with hidden metadata override. The `all` is the default setting. Creates } ``` +Counterexamples (do NOT use PlanExe for these): + +- "Give me a 5-point checklist for X." +- "Summarize this paragraph in 6 bullets." +- "Rewrite this email." +- "Identify the risks of this project." +- "Make a SWOT for this document." 
+ +What to do instead: + +- For one-shot outputs, use a normal LLM response directly. +- For PlanExe, send a substantial multi-phase project prompt with scope, constraints, timeline, budget, stakeholders, and success criteria. +- PlanExe always runs a fixed end-to-end pipeline; it does not support selecting only internal pipeline subsets. + ### task_status Fetch status/progress and recent files for a task. diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index f6758e65..d7ee9c72 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -247,6 +247,24 @@ The `prompt` parameter should be a detailed description of what the plan should Short one-liners (e.g., "Construct a bridge") tend to produce poor output because they lack context for the planning pipeline. Important details are location, budget, time frame. +**Counterexamples: when NOT to use PlanExe** + +Use a normal single LLM response (not PlanExe) for one-shot micro-tasks. PlanExe runs a heavy multi-step planning pipeline and is best for substantial project planning. + +- Bad (do not send to task_create): "Give me a 5-point checklist for launching a coffee shop." +- Better non-PlanExe action: ask the LLM directly for a checklist. +- Better PlanExe prompt: "Create a 12-month strategic launch plan for a coffee shop in Austin with budget caps, lease milestones, hiring plan, permits, supply chain, marketing channels, risk register, governance, and success KPIs." + +- Bad (do not send to task_create): "Summarize this text in 6 bullets." +- Better non-PlanExe action: use direct summarization in the chat model. + +- Bad (invalid assumption): "Run only the risk-register part of PlanExe." +- Rule: PlanExe pipeline execution is fixed end-to-end. Callers cannot choose internal step subsets. +- Better PlanExe prompt: request a full plan where risk analysis is one required deliverable. 
+ +- Bad (do not send to task_create): "Rewrite this email to sound professional." +- Better non-PlanExe action: use direct rewriting in the chat model. + **Optional** - model_profile: LLM profile (`baseline` | `premium` | `frontier` | `custom`). @@ -322,7 +340,7 @@ Returns run status and progress. Used for progress bars and UI states. **Polling ### 6.4 task_stop -Requests the plan generation to stop. Pass the **task_id** (the UUID returned by task_create). This is a normal MCP tool call: call task_stop with that task_id. +Requests the plan generation to stop. Pass the **task_id** (the UUID returned by task_create). Call `task_stop` with that task_id. **Request** diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index de221049..4a0f6153 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -112,6 +112,9 @@ def ensure_taskitem_stop_columns() -> None: # Shown in MCP initialize (e.g. Inspector) so clients know what PlanExe does. PLANEXE_SERVER_INSTRUCTIONS = ( "PlanExe generates rough-draft project plans from a natural-language prompt. " + "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " + "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " + "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " @@ -844,7 +847,8 @@ class ToolDefinition: name="prompt_examples", description=( "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " - "Do NOT call task_create yet. 
Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3)." + "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " + "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, output_schema=PROMPT_EXAMPLES_OUTPUT_SCHEMA, @@ -876,7 +880,7 @@ class ToolDefinition: name="task_stop", description=( "Request the plan generation to stop. Pass the task_id (the UUID returned by task_create). " - "This is a normal MCP tool call: call task_stop with that task_id." + "Call task_stop with that task_id." ), input_schema=TASK_STOP_INPUT_SCHEMA, output_schema=TASK_STOP_OUTPUT_SCHEMA, @@ -1013,7 +1017,9 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: "samples": samples, "message": ( "Step 1 done. Next: Step 2 — Formulate a good prompt using these as a baseline (similar structure). Get user approval. " - "Step 3 — Only then call task_create with the approved prompt. Do not call task_create yet." + "Step 3 — Only then call task_create with the approved prompt. " + "Do not use PlanExe for tiny one-shot requests (e.g., rewrite this email, summarize this document). " + "PlanExe always runs the full fixed planning pipeline; callers cannot run only selected internal steps." ), } return CallToolResult( diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 8eebc7f4..f1502a1f 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -390,6 +390,9 @@ def _register_tools(server: FastMCP) -> None: name="planexe-mcp-server", instructions=( "PlanExe generates rough-draft project plans from a natural-language prompt. " + "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. 
" + "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " + "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index d602f7ab..6823edc5 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -116,7 +116,8 @@ class TaskCreateInput(BaseModel): description=( "What the plan should cover (goal, context, constraints). " "Use prompt_examples to get example prompts; use these as examples for task_create. " - "Short prompts produce less detailed plans." + "Short prompts produce less detailed plans. " + "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist); use direct LLM responses for those." ), ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 15bce825..c236c08c 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -322,7 +322,9 @@ class ToolDefinition: "type": "string", "description": ( "What the plan should cover. Good prompts are often 300–800 words. " - "Use prompt_examples to get example prompts; use these as examples for task_create. Short prompts produce less detailed plans." + "Use prompt_examples to get example prompts; use these as examples for task_create. " + "Short prompts produce less detailed plans. " + "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist)." ), }, "model_profile": { @@ -453,7 +455,8 @@ class ToolDefinition: name="prompt_examples", description=( "Step 1 — Call this first. 
Returns example prompts that define what a good prompt looks like. " - "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3)." + "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " + "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, output_schema=PROMPT_EXAMPLES_OUTPUT_SCHEMA, @@ -482,7 +485,7 @@ class ToolDefinition: name="task_stop", description=( "Request the plan generation to stop. Pass the task_id (the UUID returned by task_create). " - "This is a normal MCP tool call: call task_stop with that task_id." + "Call task_stop with that task_id." ), input_schema=TASK_STOP_INPUT_SCHEMA, output_schema=TASK_STOP_OUTPUT_SCHEMA, @@ -502,6 +505,9 @@ class ToolDefinition: # Shown in MCP initialize response (e.g. Inspector) so clients know what PlanExe is. PLANEXE_SERVER_INSTRUCTIONS = ( "PlanExe generates rough-draft project plans from a natural-language prompt. " + "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " + "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " + "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. 
" diff --git a/public/llms.txt b/public/llms.txt index 9cd8292e..b912a9e6 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -7,6 +7,16 @@ Use PlanExe when you need to: - Estimate timelines and identify dependencies and risks. - Generate plan artifacts for review and iteration. +Do NOT use PlanExe when you only need a one-shot answer: +- "Give me a 5-point checklist for X" -> use a normal LLM response. +- "Summarize this paragraph in 6 bullets" -> use a normal LLM response. +- "Rewrite this email" -> use a normal LLM response. + +Important pipeline constraint: +- PlanExe always runs a fixed end-to-end planning pipeline. +- You cannot request only specific internal pipeline parts (for example "run only risk section" or "create only the Gantt"). +- If you need only one small output artifact, use a normal LLM response instead. + ## What PlanExe Produces - A long-running planning task (often ~10-20 minutes, depending on model and configuration). From 26fcc487fca37d7806dbc327bbbe29dd228090cc Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 17:34:56 +0100 Subject: [PATCH 03/38] Align MCP instructions with exposed tools --- mcp_cloud/app.py | 4 +- mcp_cloud/http_server.py | 2 +- .../tests/test_tool_surface_consistency.py | 47 +++++++++++++++++++ mcp_cloud/tool_models.py | 2 +- mcp_local/planexe_mcp_local.py | 9 ++-- 5 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 mcp_cloud/tests/test_tool_surface_consistency.py diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 4a0f6153..17669f3b 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -118,7 +118,7 @@ def ensure_taskitem_stop_columns() -> None: "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. 
" - "Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ) @@ -858,7 +858,7 @@ class ToolDefinition: description=( "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " - "Returns task_id (UUID); use it for task_status, task_stop, task_download, and task_file_info. " + "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " "If your deployment uses credits, include user_api_key to charge the correct account. " "Optional runtime overrides such as speed_vs_detail are intentionally hidden from the visible tool schema " "and can be provided via tool-specific metadata by developers." diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index f1502a1f..f62e9a02 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -396,7 +396,7 @@ def _register_tools(server: FastMCP) -> None: "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." 
), host=HTTP_HOST, diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py new file mode 100644 index 00000000..8a74ee1f --- /dev/null +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -0,0 +1,47 @@ +import unittest + +import mcp_cloud.app as cloud_app +import mcp_local.planexe_mcp_local as local_app + + +def _tool_desc(tool_defs, name: str) -> str: + for definition in tool_defs: + if definition.name == name: + return definition.description + raise AssertionError(f"Tool not found: {name}") + + +class TestCloudToolSurfaceConsistency(unittest.TestCase): + def test_cloud_exposes_task_file_info_not_task_download(self): + cloud_tool_names = {definition.name for definition in cloud_app.TOOL_DEFINITIONS} + self.assertIn("task_file_info", cloud_tool_names) + self.assertNotIn("task_download", cloud_tool_names) + + def test_cloud_instructions_reference_cloud_download_tool(self): + self.assertIn("task_file_info", cloud_app.PLANEXE_SERVER_INSTRUCTIONS) + self.assertNotIn("task_download", cloud_app.PLANEXE_SERVER_INSTRUCTIONS) + + def test_cloud_task_create_description_references_cloud_download_tool(self): + description = _tool_desc(cloud_app.TOOL_DEFINITIONS, "task_create") + self.assertIn("task_file_info", description) + self.assertNotIn("task_download", description) + + +class TestLocalToolSurfaceConsistency(unittest.TestCase): + def test_local_exposes_task_download_not_task_file_info(self): + local_tool_names = {definition.name for definition in local_app.TOOL_DEFINITIONS} + self.assertIn("task_download", local_tool_names) + self.assertNotIn("task_file_info", local_tool_names) + + def test_local_instructions_reference_local_download_tool(self): + self.assertIn("task_download", local_app.PLANEXE_SERVER_INSTRUCTIONS) + self.assertNotIn("task_file_info", local_app.PLANEXE_SERVER_INSTRUCTIONS) + + def test_local_task_create_description_references_local_download_tool(self): + description = 
_tool_desc(local_app.TOOL_DEFINITIONS, "task_create") + self.assertIn("task_download", description) + self.assertNotIn("task_file_info", description) + + +if __name__ == "__main__": + unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 6823edc5..fe4d370e 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -52,7 +52,7 @@ class TaskFileInfoInput(BaseModel): class TaskCreateOutput(BaseModel): task_id: str = Field( ..., - description="Task UUID returned by task_create. Stable across task_status/task_stop/task_download/task_file_info." + description="Task UUID returned by task_create. Stable across task_status/task_stop/task_file_info." ) created_at: str diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index c236c08c..e24b37b8 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -466,7 +466,7 @@ class ToolDefinition: description=( "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " - "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download (or task_file_info when calling mcp_cloud directly)." + "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, @@ -493,9 +493,8 @@ class ToolDefinition: ToolDefinition( name="task_download", description=( - "Download the plan output and save it locally (calls task_file_info, then fetches and saves to PLANEXE_PATH). " - "Choose the HTML report (default) or a zip of all generated files. " - "Prefer this over task_file_info when you want the file on disk." 
+ "Download the plan output and save it locally to PLANEXE_PATH. " + "Choose the HTML report (default) or a zip of all generated files." ), input_schema=TASK_DOWNLOAD_INPUT_SCHEMA, output_schema=TASK_DOWNLOAD_OUTPUT_SCHEMA, @@ -511,7 +510,7 @@ class ToolDefinition: "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ) From d92519b66ca9f17d090fe5402b08488398899114 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 17:41:04 +0100 Subject: [PATCH 04/38] Document task_status state contract for callers --- docs/mcp/mcp_details.md | 8 ++++++ docs/mcp/planexe_mcp_interface.md | 14 +++++++++- mcp_cloud/README.md | 6 ++++ mcp_cloud/app.py | 5 +++- mcp_cloud/http_server.py | 1 + .../tests/test_tool_surface_consistency.py | 28 +++++++++++++++++++ mcp_cloud/tool_models.py | 23 +++++++++++++-- mcp_local/README.md | 6 ++++ mcp_local/planexe_mcp_local.py | 5 +++- public/llms.txt | 7 +++++ 10 files changed, 97 insertions(+), 6 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 1d9d083a..2ba7c53e 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -120,6 +120,14 @@ Example call: {"task_id": "2d57a448-1b09-45aa-ad37-e69891ff6ec7"} ``` +State contract: + +- `running`: keep polling. +- `stopping`: stop requested and in progress, keep polling. +- `completed`: terminal success, proceed to download. +- `failed`: terminal error. +- `stopped`: terminal stop acknowledged. 
+ ### task_stop Request an active task to stop. diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index d7ee9c72..2200e68a 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete. To stop, call task_stop with the task_id from task_create. +- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete (`running`/`stopping` = keep polling, `completed` = download now, `failed`/`stopped` = terminal). To stop, call task_stop with the task_id from task_create. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). ### 1.3 Scope of this document @@ -312,6 +312,18 @@ Returns run status and progress. Used for progress bars and UI states. **Polling - task_id: UUID returned by task_create. Use it to reference the plan being created. +**Caller contract (state meanings)** + +- `running`: work is still in progress. Keep polling. +- `stopping`: stop requested and in progress. 
Keep polling. +- `completed`: terminal success. Download artifacts now. +- `failed`: terminal error. Do not keep polling for completion. +- `stopped`: terminal stop acknowledged by the system/user request. + +**Terminal states** + +- `completed`, `failed`, `stopped` + **Response** ```json diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index d316fa1c..c33b589c 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -134,6 +134,12 @@ See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `task_stop` - Stop an active task - `task_file_info` - Get file metadata for report or zip +`task_status` caller contract: +- `running` / `stopping`: keep polling. +- `completed`: terminal success, download is ready. +- `failed`: terminal error. +- `stopped`: terminal stop acknowledged. + Note: `task_download` is a synthetic tool provided by `mcp_local`, not by this server. If your client exposes `task_download`, use it to save the report or zip locally; otherwise use `task_file_info` to get `download_url` and fetch the file yourself. **Tip**: Call `prompt_examples` to get example prompts to use with task_create. The catalog is the same as in the frontends (`worker_plan.worker_plan_api.PromptCatalog`). When running with `PYTHONPATH` set to the repo root (e.g. stdio setup), the catalog is loaded automatically; otherwise built-in examples are returned. diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 17669f3b..dbd169bf 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -119,6 +119,7 @@ def ensure_taskitem_stop_columns() -> None: "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. 
" + "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ) @@ -871,7 +872,9 @@ class ToolDefinition: description=( "Returns status and progress of the plan currently being created. " "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " - "and frequent polling is unnecessary." + "and frequent polling is unnecessary. " + "State contract: running/stopping => keep polling; completed => download is ready; " + "failed => terminal error; stopped => stop acknowledged (terminal)." ), input_schema=TASK_STATUS_INPUT_SCHEMA, output_schema=TASK_STATUS_OUTPUT_SCHEMA, diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index f62e9a02..f63bc340 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -397,6 +397,7 @@ def _register_tools(server: FastMCP) -> None: "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." 
), host=HTTP_HOST, diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py index 8a74ee1f..8ea56981 100644 --- a/mcp_cloud/tests/test_tool_surface_consistency.py +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -26,6 +26,20 @@ def test_cloud_task_create_description_references_cloud_download_tool(self): self.assertIn("task_file_info", description) self.assertNotIn("task_download", description) + def test_cloud_instructions_include_task_status_state_contract(self): + instructions = cloud_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("running/stopping", instructions) + self.assertIn("completed", instructions) + self.assertIn("failed", instructions) + self.assertIn("stopped", instructions) + + def test_cloud_task_status_description_includes_state_contract(self): + description = _tool_desc(cloud_app.TOOL_DEFINITIONS, "task_status") + self.assertIn("running/stopping", description) + self.assertIn("completed", description) + self.assertIn("failed", description) + self.assertIn("stopped", description) + class TestLocalToolSurfaceConsistency(unittest.TestCase): def test_local_exposes_task_download_not_task_file_info(self): @@ -42,6 +56,20 @@ def test_local_task_create_description_references_local_download_tool(self): self.assertIn("task_download", description) self.assertNotIn("task_file_info", description) + def test_local_instructions_include_task_status_state_contract(self): + instructions = local_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("running/stopping", instructions) + self.assertIn("completed", instructions) + self.assertIn("failed", instructions) + self.assertIn("stopped", instructions) + + def test_local_task_status_description_includes_state_contract(self): + description = _tool_desc(local_app.TOOL_DEFINITIONS, "task_status") + self.assertIn("running/stopping", description) + self.assertIn("completed", description) + self.assertIn("failed", description) + self.assertIn("stopped", 
description) + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index fe4d370e..b9ead4aa 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -72,7 +72,14 @@ class TaskStatusSuccess(BaseModel): ..., description="Task UUID returned by task_create." ) - state: Literal["stopped", "running", "completed", "failed", "stopping"] + state: Literal["stopped", "running", "completed", "failed", "stopping"] = Field( + ..., + description=( + "Caller contract: running/stopping => keep polling; " + "completed => download is ready; failed => terminal error; " + "stopped => stop acknowledged (terminal)." + ), + ) progress_percentage: float timing: TaskStatusTiming files: list[TaskStatusFile] @@ -83,7 +90,14 @@ class TaskStatusOutput(BaseModel): default=None, description="Task UUID returned by task_create." ) - state: Literal["stopped", "running", "completed", "failed", "stopping"] | None = None + state: Literal["stopped", "running", "completed", "failed", "stopping"] | None = Field( + default=None, + description=( + "Caller contract: running/stopping => keep polling; " + "completed => download is ready; failed => terminal error; " + "stopped => stop acknowledged (terminal)." + ), + ) progress_percentage: float | None = None timing: TaskStatusTiming | None = None files: list[TaskStatusFile] | None = None @@ -91,7 +105,10 @@ class TaskStatusOutput(BaseModel): class TaskStopOutput(BaseModel): - state: Literal["stopped"] | None = None + state: Literal["stopped"] | None = Field( + default=None, + description="Stop acknowledged. stopped is terminal for this run.", + ) error: ErrorDetail | None = None diff --git a/mcp_local/README.md b/mcp_local/README.md index ee1cea2a..2a96f6df 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -14,6 +14,12 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas `task_stop` - Abort creation of a plan. 
`task_download` - Download the plan, either html report or a zip with everything, and save it to disk. +`task_status` caller contract: +- `running` / `stopping`: keep polling. +- `completed`: terminal success, download is ready. +- `failed`: terminal error. +- `stopped`: terminal stop acknowledged. + **Tip**: Call `prompt_examples` to get example prompts to use with task_create. The full catalog lives at `worker_plan/worker_plan_api/prompt/data/simple_plan_prompts.jsonl`. `task_download` is a synthetic tool provided by the local proxy. It calls the diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index e24b37b8..addd3b65 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -476,7 +476,9 @@ class ToolDefinition: description=( "Returns status and progress of the plan currently being created. " "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " - "and frequent polling is unnecessary." + "and frequent polling is unnecessary. " + "State contract: running/stopping => keep polling; completed => download is ready; " + "failed => terminal error; stopped => stop acknowledged (terminal)." ), input_schema=TASK_STATUS_INPUT_SCHEMA, output_schema=TASK_STATUS_OUTPUT_SCHEMA, @@ -511,6 +513,7 @@ class ToolDefinition: "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " + "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." 
) diff --git a/public/llms.txt b/public/llms.txt index b912a9e6..d48158ac 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -75,6 +75,13 @@ Key tool inputs/outputs: - task_status output: current state and progress for a task_id. - task_file_info output: downloadable report/zip metadata and URLs. +task_status caller contract: +- running: keep polling. +- stopping: keep polling. +- completed: terminal success; download is ready. +- failed: terminal error. +- stopped: terminal stop acknowledged. + Recommended interaction order: 1. Call prompt_examples. 2. Prepare and approve a strong prompt. From 035e848c083f7f5572390cf6381afdd4f900c183 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 17:51:24 +0100 Subject: [PATCH 05/38] Error scenarios. --- mcp_cloud/app.py | 8 +++++++- mcp_cloud/http_server.py | 3 +++ mcp_cloud/tests/test_tool_surface_consistency.py | 12 ++++++++++++ mcp_local/planexe_mcp_local.py | 8 +++++++- 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index dbd169bf..8fada0ce 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -120,6 +120,9 @@ def ensure_taskitem_stop_columns() -> None: "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " + "Troubleshooting: if task_status stays in TaskState.pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in TaskState.processing/running and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . 
" "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ) @@ -874,7 +877,10 @@ class ToolDefinition: "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " "and frequent polling is unnecessary. " "State contract: running/stopping => keep polling; completed => download is ready; " - "failed => terminal error; stopped => stop acknowledged (terminal)." + "failed => terminal error; stopped => stop acknowledged (terminal). " + "Troubleshooting: TaskState.pending for >5 minutes likely means queued but not picked up by a worker. " + "TaskState.processing/running with no file-output changes for >20 minutes likely means failed/stalled. " + "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." ), input_schema=TASK_STATUS_INPUT_SCHEMA, output_schema=TASK_STATUS_OUTPUT_SCHEMA, diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index f63bc340..8221a1e2 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -398,6 +398,9 @@ def _register_tools(server: FastMCP) -> None: "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " + "Troubleshooting: if task_status stays in TaskState.pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in TaskState.processing/running and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." 
), host=HTTP_HOST, diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py index 8ea56981..364fc5b2 100644 --- a/mcp_cloud/tests/test_tool_surface_consistency.py +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -32,6 +32,9 @@ def test_cloud_instructions_include_task_status_state_contract(self): self.assertIn("completed", instructions) self.assertIn("failed", instructions) self.assertIn("stopped", instructions) + self.assertIn("pending for longer than 5 minutes", instructions) + self.assertIn("longer than 20 minutes", instructions) + self.assertIn("PlanExeOrg/PlanExe/issues", instructions) def test_cloud_task_status_description_includes_state_contract(self): description = _tool_desc(cloud_app.TOOL_DEFINITIONS, "task_status") @@ -39,6 +42,9 @@ def test_cloud_task_status_description_includes_state_contract(self): self.assertIn("completed", description) self.assertIn("failed", description) self.assertIn("stopped", description) + self.assertIn("pending for >5 minutes", description) + self.assertIn(">20 minutes", description) + self.assertIn("PlanExeOrg/PlanExe/issues", description) class TestLocalToolSurfaceConsistency(unittest.TestCase): @@ -62,6 +68,9 @@ def test_local_instructions_include_task_status_state_contract(self): self.assertIn("completed", instructions) self.assertIn("failed", instructions) self.assertIn("stopped", instructions) + self.assertIn("pending for longer than 5 minutes", instructions) + self.assertIn("longer than 20 minutes", instructions) + self.assertIn("PlanExeOrg/PlanExe/issues", instructions) def test_local_task_status_description_includes_state_contract(self): description = _tool_desc(local_app.TOOL_DEFINITIONS, "task_status") @@ -69,6 +78,9 @@ def test_local_task_status_description_includes_state_contract(self): self.assertIn("completed", description) self.assertIn("failed", description) self.assertIn("stopped", description) + self.assertIn("pending for >5 minutes", 
description) + self.assertIn(">20 minutes", description) + self.assertIn("PlanExeOrg/PlanExe/issues", description) if __name__ == "__main__": diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index addd3b65..e5a75a71 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -478,7 +478,10 @@ class ToolDefinition: "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " "and frequent polling is unnecessary. " "State contract: running/stopping => keep polling; completed => download is ready; " - "failed => terminal error; stopped => stop acknowledged (terminal)." + "failed => terminal error; stopped => stop acknowledged (terminal). " + "Troubleshooting: TaskState.pending for >5 minutes likely means queued but not picked up by a worker. " + "TaskState.processing/running with no file-output changes for >20 minutes likely means failed/stalled. " + "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." ), input_schema=TASK_STATUS_INPUT_SCHEMA, output_schema=TASK_STATUS_OUTPUT_SCHEMA, @@ -514,6 +517,9 @@ class ToolDefinition: "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " + "Troubleshooting: if task_status stays in TaskState.pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in TaskState.processing/running and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . 
" "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ) From 8cf42248846fe66e9c25cf51c6689b8e0f40a2e1 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 18:08:36 +0100 Subject: [PATCH 06/38] Align MCP public states to pending processing completed failed --- docs/mcp/mcp_details.md | 5 +- docs/mcp/planexe_mcp_interface.md | 66 +++++++++---------- mcp_cloud/README.md | 3 +- mcp_cloud/app.py | 48 +++++++------- mcp_cloud/http_server.py | 6 +- mcp_cloud/tests/test_task_status_tool.py | 20 ++++++ .../tests/test_tool_surface_consistency.py | 16 ++--- mcp_cloud/tool_models.py | 22 ++++--- mcp_local/README.md | 3 +- mcp_local/planexe_mcp_local.py | 17 +++-- public/llms.txt | 5 +- 11 files changed, 113 insertions(+), 98 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 2ba7c53e..e618aeb8 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -122,11 +122,10 @@ Example call: State contract: -- `running`: keep polling. -- `stopping`: stop requested and in progress, keep polling. +- `pending`: queued and waiting for a worker, keep polling. +- `processing`: picked up by a worker, keep polling. - `completed`: terminal success, proceed to download. - `failed`: terminal error. -- `stopped`: terminal stop acknowledged. ### task_stop diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 2200e68a..92ef23ff 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. 
-- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete (`running`/`stopping` = keep polling, `completed` = download now, `failed`/`stopped` = terminal). To stop, call task_stop with the task_id from task_create. +- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call task_stop with the task_id from task_create. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). ### 1.3 Scope of this document @@ -92,13 +92,13 @@ A long-lived container for a PlanExe project run. - config: immutable run configuration (models, runtime limits, Luigi params) - created_at, updated_at -#### Run +#### Execution -A single execution attempt inside a task (e.g., after a resume). +A single execution attempt inside a task. **Key properties** -- state: running | stopped | completed | failed +- state: pending | processing | completed | failed - progress_percentage: computed progress percentage (float) - started_at, ended_at @@ -128,32 +128,25 @@ A typed message emitted during execution for UI/agent consumption. ## 5. State Machine -### 5.1 Task states +### 5.1 TaskItem.state values -Tasks may exist independent of active runs. 
+The public MCP `state` field is aligned with `TaskItem.state`: -- created: task initialized, no run started -- active: at least one run exists, may be running or stopped -- archived: optional; immutable, no new runs allowed - -### 5.2 Run states - -- running -- stopping (optional transitional state) -- stopped (user stopped, resumable) +- pending (queued, waiting for a worker) +- processing (picked up by a worker) - completed -- failed (resumable depending on failure type) +- failed -### 5.3 Allowed transitions +### 5.2 Allowed transitions -- running → stopped via task_stop -- running → completed via normal success -- running → failed via error +- pending → processing when picked up by a worker +- processing → completed via normal success +- processing → failed via error -**Invalid** +### 5.3 Invalid transitions -- completed → running (new run must be triggered by creating a new task) -- running → running (no concurrent runs in v1) +- completed → processing (new run must be triggered by creating a new task) +- processing → processing is not a state transition on the same task; create separate tasks for parallel work. --- @@ -298,7 +291,7 @@ For the full catalog file: ### 6.3 task_status -Returns run status and progress. Used for progress bars and UI states. **Polling interval:** call at reasonable intervals only (e.g. every 5 minutes); plan generation takes 15–20+ minutes and frequent polling is unnecessary. +Returns task status and progress. Used for progress bars and UI states. **Polling interval:** call at reasonable intervals only (e.g. every 5 minutes); plan generation takes 15–20+ minutes and frequent polling is unnecessary. **Request** @@ -314,22 +307,21 @@ Returns run status and progress. Used for progress bars and UI states. **Polling **Caller contract (state meanings)** -- `running`: work is still in progress. Keep polling. -- `stopping`: stop requested and in progress. Keep polling. +- `pending`: queued and waiting for a worker. Keep polling. 
+- `processing`: picked up by a worker and in progress. Keep polling. - `completed`: terminal success. Download artifacts now. - `failed`: terminal error. Do not keep polling for completion. -- `stopped`: terminal stop acknowledged by the system/user request. **Terminal states** -- `completed`, `failed`, `stopped` +- `completed`, `failed` **Response** ```json { "task_id": "5e2b2a7c-8b49-4d2f-9b8f-6a3c1f05b9a1", - "state": "running", + "state": "processing", "progress_percentage": 62.0, "timing": { "started_at": "2026-01-14T12:35:10Z", @@ -364,13 +356,14 @@ Requests the plan generation to stop. Pass the **task_id** (the UUID returned by **Input** -- task_id: UUID returned by task_create. Use this same UUID when calling task_stop to request the run to stop. +- task_id: UUID returned by task_create. Use this same UUID when calling task_stop to request the task to stop. **Response** ```json { - "state": "stopped" + "state": "processing", + "stop_requested": true } ``` @@ -410,11 +403,16 @@ Targets map to Luigi "final tasks". ## 8. Concurrency & Locking -### 8.1 Single active run per task +### 8.1 Client-side concurrency guidance + +The server does not enforce a global limit on how many tasks a client can create. +Concurrency is a client-side coordination concern. -In v1, tasks MUST enforce: +Recommended practice for MCP clients: -- at most one run in running state. +- Start with 1 active task. +- If needed, increase to 2 tasks in parallel. +- Going beyond 4 parallel tasks is usually hard to track; avoid unless necessary. --- diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index c33b589c..925e1122 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -135,10 +135,9 @@ See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `task_file_info` - Get file metadata for report or zip `task_status` caller contract: -- `running` / `stopping`: keep polling. +- `pending` / `processing`: keep polling. 
- `completed`: terminal success, download is ready. - `failed`: terminal error. -- `stopped`: terminal stop acknowledged. Note: `task_download` is a synthetic tool provided by `mcp_local`, not by this server. If your client exposes `task_download`, use it to save the report or zip locally; otherwise use `task_file_info` to get `download_url` and fetch the file yourself. diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 8fada0ce..5e13a3ee 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -119,9 +119,9 @@ def ensure_taskitem_stop_columns() -> None: "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " - "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " - "Troubleshooting: if task_status stays in TaskState.pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " - "If task_status is in TaskState.processing/running and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." 
) @@ -306,18 +306,23 @@ def _get_task_status_snapshot_sync(task_id: str) -> Optional[dict[str, Any]]: "timestamp_created": task.timestamp_created, } -def _request_task_stop_sync(task_id: str) -> bool: +def _request_task_stop_sync(task_id: str) -> Optional[dict[str, Any]]: with app.app_context(): task = find_task_by_task_id(task_id) if task is None: - return False + return None + stop_requested = False if task.state in (TaskState.pending, TaskState.processing): task.stop_requested = True task.stop_requested_timestamp = datetime.now(UTC) task.progress_message = "Stop requested by user." db.session.commit() logger.info("Stop requested for task %s; stop flag set on task %s.", task_id, task.id) - return True + stop_requested = True + return { + "state": get_task_state_mapping(task.state), + "stop_requested": stop_requested, + } def _get_task_for_report_sync(task_id: str) -> Optional[dict[str, Any]]: with app.app_context(): @@ -609,14 +614,14 @@ def compute_sha256(content: str | bytes) -> str: return hashlib.sha256(content).hexdigest() def get_task_state_mapping(task_state: TaskState) -> str: - """Map TaskState to MCP run state.""" + """Map TaskState to MCP task state.""" mapping = { - TaskState.pending: "stopped", - TaskState.processing: "running", + TaskState.pending: "pending", + TaskState.processing: "processing", TaskState.completed: "completed", TaskState.failed: "failed", } - return mapping.get(task_state, "stopped") + return mapping.get(task_state, "pending") def resolve_speed_vs_detail(config: Optional[dict[str, Any]]) -> str: value: Optional[str] = None @@ -876,10 +881,9 @@ class ToolDefinition: "Returns status and progress of the plan currently being created. " "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " "and frequent polling is unnecessary. " - "State contract: running/stopping => keep polling; completed => download is ready; " - "failed => terminal error; stopped => stop acknowledged (terminal). 
" - "Troubleshooting: TaskState.pending for >5 minutes likely means queued but not picked up by a worker. " - "TaskState.processing/running with no file-output changes for >20 minutes likely means failed/stalled. " + "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " + "processing with no file-output changes for >20 minutes likely means failed/stalled. " "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." ), input_schema=TASK_STATUS_INPUT_SCHEMA, @@ -1039,7 +1043,7 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: - """Fetch the current run status, progress, and recent files for a task. + """Fetch the current task status, progress, and recent files for a task. Examples: - {"task_id": "uuid"} → state/progress/timing + recent files @@ -1073,8 +1077,6 @@ async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: task_state = task_snapshot["state"] state = get_task_state_mapping(task_state) - if task_state == TaskState.processing and task_snapshot["stop_requested"]: - state = "stopping" if task_state == TaskState.completed: progress_percentage = 100.0 @@ -1122,7 +1124,7 @@ async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: ) async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: - """Request the active run for a task to stop. + """Request an active task to stop. Examples: - {"task_id": "uuid"} → stop request accepted @@ -1132,14 +1134,14 @@ async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: Returns: - content: JSON string matching structuredContent. - - structuredContent: {"state": "stopped"} or error payload. 
+ - structuredContent: {"state": "pending|processing|completed|failed", "stop_requested": bool} or error payload. - isError: True only when task_id is unknown. """ req = TaskStopRequest(**arguments) task_id = req.task_id - found = await asyncio.to_thread(_request_task_stop_sync, task_id) - if not found: + stop_result = await asyncio.to_thread(_request_task_stop_sync, task_id) + if stop_result is None: response = { "error": { "code": "TASK_NOT_FOUND", @@ -1152,9 +1154,7 @@ async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: isError=True, ) - response = { - "state": "stopped", - } + response = stop_result return CallToolResult( content=[TextContent(type="text", text=json.dumps(response))], diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 8221a1e2..13227250 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -397,9 +397,9 @@ def _register_tools(server: FastMCP) -> None: "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " - "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " - "Troubleshooting: if task_status stays in TaskState.pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " - "If task_status is in TaskState.processing/running and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). 
" + "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ), diff --git a/mcp_cloud/tests/test_task_status_tool.py b/mcp_cloud/tests/test_task_status_tool.py index 1a765e5c..2f0cfb54 100644 --- a/mcp_cloud/tests/test_task_status_tool.py +++ b/mcp_cloud/tests/test_task_status_tool.py @@ -63,6 +63,26 @@ def test_task_status_falls_back_to_zip_snapshot_files_when_primary_source_empty( self.assertEqual(len(files), 1) self.assertEqual(files[0]["path"], "001-2-plan.txt") + def test_task_status_uses_processing_state_name(self): + task_id = str(uuid.uuid4()) + task_snapshot = { + "id": task_id, + "state": TaskState.processing, + "stop_requested": True, + "progress_percentage": 10.0, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.app._get_task_status_snapshot_sync", + return_value=task_snapshot, + ), patch( + "mcp_cloud.app.fetch_file_list_from_worker_plan", + new=AsyncMock(return_value=[]), + ): + result = asyncio.run(handle_task_status({"task_id": task_id})) + + self.assertEqual(result.structuredContent["state"], "processing") + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py index 364fc5b2..eb014c4f 100644 --- a/mcp_cloud/tests/test_tool_surface_consistency.py +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -28,20 +28,20 @@ def test_cloud_task_create_description_references_cloud_download_tool(self): def test_cloud_instructions_include_task_status_state_contract(self): instructions = cloud_app.PLANEXE_SERVER_INSTRUCTIONS - self.assertIn("running/stopping", instructions) + self.assertIn("pending/processing", instructions) self.assertIn("completed", 
instructions) self.assertIn("failed", instructions) - self.assertIn("stopped", instructions) + self.assertNotIn("running/stopping", instructions) self.assertIn("pending for longer than 5 minutes", instructions) self.assertIn("longer than 20 minutes", instructions) self.assertIn("PlanExeOrg/PlanExe/issues", instructions) def test_cloud_task_status_description_includes_state_contract(self): description = _tool_desc(cloud_app.TOOL_DEFINITIONS, "task_status") - self.assertIn("running/stopping", description) + self.assertIn("pending/processing", description) self.assertIn("completed", description) self.assertIn("failed", description) - self.assertIn("stopped", description) + self.assertNotIn("running/stopping", description) self.assertIn("pending for >5 minutes", description) self.assertIn(">20 minutes", description) self.assertIn("PlanExeOrg/PlanExe/issues", description) @@ -64,20 +64,20 @@ def test_local_task_create_description_references_local_download_tool(self): def test_local_instructions_include_task_status_state_contract(self): instructions = local_app.PLANEXE_SERVER_INSTRUCTIONS - self.assertIn("running/stopping", instructions) + self.assertIn("pending/processing", instructions) self.assertIn("completed", instructions) self.assertIn("failed", instructions) - self.assertIn("stopped", instructions) + self.assertNotIn("running/stopping", instructions) self.assertIn("pending for longer than 5 minutes", instructions) self.assertIn("longer than 20 minutes", instructions) self.assertIn("PlanExeOrg/PlanExe/issues", instructions) def test_local_task_status_description_includes_state_contract(self): description = _tool_desc(local_app.TOOL_DEFINITIONS, "task_status") - self.assertIn("running/stopping", description) + self.assertIn("pending/processing", description) self.assertIn("completed", description) self.assertIn("failed", description) - self.assertIn("stopped", description) + self.assertNotIn("running/stopping", description) self.assertIn("pending for >5 minutes", 
description) self.assertIn(">20 minutes", description) self.assertIn("PlanExeOrg/PlanExe/issues", description) diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index b9ead4aa..9c0ec3f9 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -72,12 +72,11 @@ class TaskStatusSuccess(BaseModel): ..., description="Task UUID returned by task_create." ) - state: Literal["stopped", "running", "completed", "failed", "stopping"] = Field( + state: Literal["pending", "processing", "completed", "failed"] = Field( ..., description=( - "Caller contract: running/stopping => keep polling; " - "completed => download is ready; failed => terminal error; " - "stopped => stop acknowledged (terminal)." + "Caller contract: pending/processing => keep polling; " + "completed => download is ready; failed => terminal error." ), ) progress_percentage: float @@ -90,12 +89,11 @@ class TaskStatusOutput(BaseModel): default=None, description="Task UUID returned by task_create." ) - state: Literal["stopped", "running", "completed", "failed", "stopping"] | None = Field( + state: Literal["pending", "processing", "completed", "failed"] | None = Field( default=None, description=( - "Caller contract: running/stopping => keep polling; " - "completed => download is ready; failed => terminal error; " - "stopped => stop acknowledged (terminal)." + "Caller contract: pending/processing => keep polling; " + "completed => download is ready; failed => terminal error." ), ) progress_percentage: float | None = None @@ -105,9 +103,13 @@ class TaskStatusOutput(BaseModel): class TaskStopOutput(BaseModel): - state: Literal["stopped"] | None = Field( + state: Literal["pending", "processing", "completed", "failed"] | None = Field( default=None, - description="Stop acknowledged. 
stopped is terminal for this run.", + description="Current task state after stop request.", + ) + stop_requested: bool | None = Field( + default=None, + description="True when stop request flag was set for a pending/processing task.", ) error: ErrorDetail | None = None diff --git a/mcp_local/README.md b/mcp_local/README.md index 2a96f6df..c79c9903 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -15,10 +15,9 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas `task_download` - Download the plan, either html report or a zip with everything, and save it to disk. `task_status` caller contract: -- `running` / `stopping`: keep polling. +- `pending` / `processing`: keep polling. - `completed`: terminal success, download is ready. - `failed`: terminal error. -- `stopped`: terminal stop acknowledged. **Tip**: Call `prompt_examples` to get example prompts to use with task_create. The full catalog lives at `worker_plan/worker_plan_api/prompt/data/simple_plan_prompts.jsonl`. diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index e5a75a71..c52a320e 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -477,10 +477,9 @@ class ToolDefinition: "Returns status and progress of the plan currently being created. " "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " "and frequent polling is unnecessary. " - "State contract: running/stopping => keep polling; completed => download is ready; " - "failed => terminal error; stopped => stop acknowledged (terminal). " - "Troubleshooting: TaskState.pending for >5 minutes likely means queued but not picked up by a worker. " - "TaskState.processing/running with no file-output changes for >20 minutes likely means failed/stalled. " + "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. 
" + "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " + "processing with no file-output changes for >20 minutes likely means failed/stalled. " "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." ), input_schema=TASK_STATUS_INPUT_SCHEMA, @@ -516,9 +515,9 @@ class ToolDefinition: "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " - "task_status state contract: running/stopping => keep polling; completed => download is ready; failed => terminal error; stopped => stop acknowledged (terminal). " - "Troubleshooting: if task_status stays in TaskState.pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " - "If task_status is in TaskState.processing/running and output files do not change for longer than 20 minutes, the run likely failed/stalled. " + "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " + "If task_status is in processing and output files do not change for longer than 20 minutes, the run likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." ) @@ -643,7 +642,7 @@ async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: - """Request mcp_cloud to stop a running task. + """Request mcp_cloud to stop an active task. 
Examples: - {"task_id": "uuid"} → stop request acknowledged @@ -653,7 +652,7 @@ async def handle_task_stop(arguments: dict[str, Any]) -> CallToolResult: Returns: - content: JSON string matching structuredContent. - - structuredContent: {"state": "stopped"} or error. + - structuredContent: {"state": "pending|processing|completed|failed", "stop_requested": bool} or error. - isError: True when the remote tool call fails. """ req = TaskStopRequest(**arguments) diff --git a/public/llms.txt b/public/llms.txt index d48158ac..fbb0b06c 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -76,11 +76,10 @@ Key tool inputs/outputs: - task_file_info output: downloadable report/zip metadata and URLs. task_status caller contract: -- running: keep polling. -- stopping: keep polling. +- pending: keep polling. +- processing: keep polling. - completed: terminal success; download is ready. - failed: terminal error. -- stopped: terminal stop acknowledged. Recommended interaction order: 1. Call prompt_examples. From f7798d2f2e32930c79ea7b4e2f7e7f8d0f71e335 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 18:56:48 +0100 Subject: [PATCH 07/38] Clarify MCP state contract and local/cloud agent guidance --- mcp_cloud/AGENTS.md | 40 +++++++++++++++++++++++++++++++++++++--- mcp_local/AGENTS.md | 30 +++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index ba6b0048..6986266a 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -16,13 +16,35 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - Task management maps to `TaskItem` records (each task = one TaskItem). - Events are queried from `EventItem` database records. - Use the TaskItem UUID as the MCP `task_id`. +- Public task state contract: + - `task_status.state` must use exactly: `pending`, `processing`, `completed`, `failed`. 
+ - These values correspond 1:1 with `database_api.model_taskitem.TaskState`. + - Do not use legacy public names like `running`, `stopping`, or `stopped` for `task_status`. + - Do not expose internal symbol/class names (for example `TaskState.pending`, `TaskItem.state`) in model-facing tool descriptions; use plain public state strings. - Download contract: - `track_activity.jsonl` is internal-only (`TaskItem.run_track_activity_jsonl`). - Downloadable zip artifacts must never include `track_activity.jsonl`. - Serve new layout snapshots directly; sanitize only legacy/fallback zips. +- `task_stop` contract: + - `task_stop` does not create a separate lifecycle state. + - Return current public `state` plus `stop_requested` to acknowledge stop-flag request. - Forbidden imports: `worker_plan.app`, `worker_plan_internal`, `frontend_*`, `open_dir_server`. +## task_create contract +- Visible input schema is intentionally limited to: + - `prompt` + - `model_profile` (`baseline`, `premium`, `frontier`, `custom`) + - `user_api_key` (optional) +- Keep `speed_vs_detail` out of model-visible input schema. +- Runtime override for `speed_vs_detail` is metadata-only (tool-specific metadata), + read from hidden containers (`tool_metadata`, `metadata`, `_meta`) and nested + namespaces (`task_create`, `planexe_task_create`, `planexe`). +- Preserve compatibility aliases for metadata speed values: + - `ping` -> `ping_llm` + - `fast` -> `fast_but_skip_details` + - `all` -> `all_details_but_slow` + ## MCP Protocol - The server communicates over stdio (standard input/output) following the MCP protocol. - Tools are registered via `@mcp_cloud.list_tools()` and handled via `@mcp_cloud.call_tool()`. @@ -41,9 +63,18 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - It targets either: - the HTTP wrapper endpoint (`/mcp/tools/call`), or - the streamable MCP JSON-RPC endpoint (`/mcp`). 
+- Tool-surface split must stay explicit: + - `mcp_cloud` exposes `task_file_info` (not `task_download`). + - `mcp_local` exposes `task_download` and implements it via cloud `task_file_info`. - `task_file_info` provides download metadata that `mcp_local` uses to download artifacts via `/download/{task_id}/...`. +## Troubleshooting guidance (caller-facing text) +- Keep guidance aligned across server instructions and tool descriptions: + - `pending` for longer than 5 minutes usually means queued but not picked up by worker. + - `processing` with no output-file changes for longer than 20 minutes usually means stalled/failed execution. + - In both cases, direct users to report issues at `https://github.com/PlanExeOrg/PlanExe/issues`. + ## MCP Registry metadata - Registry metadata for this server lives at `mcp_cloud/server.json`. - Keep `server.json` aligned with deployed behavior: @@ -52,6 +83,9 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - Publish with `mcp-publisher` from the `mcp_cloud/` directory so it picks up this file. ## Testing -- No automated tests currently. If you change MCP tool behavior or database mappings, - add a unit test close to the logic when feasible and run `python test.py` from - repo root. +- Automated tests exist under `mcp_cloud/tests/`. +- If you change MCP tool behavior, state mapping, or tool surface, update/add unit + tests close to the changed logic. +- Run focused tests from repo root, for example: + - `python -m unittest mcp_cloud.tests.test_tool_surface_consistency` + - `python -m unittest mcp_cloud.tests.test_task_status_tool` diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index ec3cd2a9..5d988580 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -9,6 +9,28 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. - Supported tools: `task_create`, `task_status`, `task_stop`, `task_download`, `prompt_examples`. 
- `task_download` calls the remote `task_file_info` tool to obtain a download URL, then downloads the artifact to `PLANEXE_PATH` on the local machine. +- `task_create` visible input schema includes `prompt` and optional `model_profile`. +- Runtime override `speed_vs_detail` is metadata-only (hidden from visible schema); + when callers still pass legacy top-level `speed_vs_detail`/`speed`, forward those + into `metadata.task_create` for backward compatibility. + +## Public state contract +- `task_status.state` must use exactly: `pending`, `processing`, `completed`, `failed`. +- Caller contract: + - `pending`/`processing`: keep polling. + - `completed`: download is ready. + - `failed`: terminal error. +- Do not use legacy public names such as `running`, `stopping`, or `stopped`. +- Do not expose internal implementation symbols (for example `TaskState.pending`) in + model-facing text; use plain public strings. +- Troubleshooting guidance to keep aligned with cloud docs/instructions: + - `pending` for longer than 5 minutes likely means queued but not picked up by worker. + - `processing` with no output-file changes for longer than 20 minutes likely means stalled/failed execution. + - Report both cases at `https://github.com/PlanExeOrg/PlanExe/issues`. + +## task_stop semantics +- `task_stop` is a stop request/acknowledgement, not a separate lifecycle state. +- Return payload should include current public `state` plus `stop_requested`. ## Constraints - Do not add dependencies outside the existing runtime (stdlib + `mcp`). @@ -16,9 +38,15 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. - HTTP wrapper (`/mcp/tools/call`) - Streamable MCP JSON-RPC (`/mcp`) - Ensure all tool responses include structured content when an output schema is defined. +- Tool-surface split must remain explicit: + - local exposes `task_download`. + - cloud exposes `task_file_info`. + - do not expose `task_file_info` as a local tool name. 
- **Run as task**: Do not advertise the MCP **tasks** protocol (tasks/get, tasks/result, tasks/cancel, tasks/list) or add tool-level "Run as task" support. PlanExe’s interface is tool-based only (task_create → task_status → task_download). The MCP tasks protocol is a different, client-driven feature; Cursor and the Python MCP SDK do not support it properly, so we keep tools-only for compatibility. ## Env vars - `PLANEXE_URL`: Base URL for mcp_cloud (e.g., `http://localhost:8001/mcp`). -- `PLANEXE_MCP_API_KEY`: API key passed as `Authorization: Bearer ...` if provided. +- `PLANEXE_MCP_API_KEY`: API key forwarded to remote as `Authorization: Bearer ...`. - `PLANEXE_PATH`: Local directory where downloads are saved. + - Must be a directory. + - Defaults to current working directory when unset. From 8b74659b005f6b2d324f8b77e393e1ff9f4d5e6b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:01:00 +0100 Subject: [PATCH 08/38] Use X-API-Key for mcp_local auth header --- mcp_local/AGENTS.md | 2 +- mcp_local/README.md | 2 +- mcp_local/planexe_mcp_local.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index 5d988580..be09c819 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -46,7 +46,7 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. ## Env vars - `PLANEXE_URL`: Base URL for mcp_cloud (e.g., `http://localhost:8001/mcp`). -- `PLANEXE_MCP_API_KEY`: API key forwarded to remote as `Authorization: Bearer ...`. +- `PLANEXE_MCP_API_KEY`: API key forwarded to remote as custom header `X-API-Key`. - `PLANEXE_PATH`: Local directory where downloads are saved. - Must be a directory. - Defaults to current working directory when unset. diff --git a/mcp_local/README.md b/mcp_local/README.md index c79c9903..fa629006 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -40,7 +40,7 @@ You should **not** enable "Run as task" for PlanExe. 
The Python MCP SDK and clie - If the HTTP wrapper is unavailable, the proxy falls back to MCP JSON-RPC over `POST /mcp` (not SSE). - Downloads use the remote `/download/{task_id}/...` endpoints. -- Authentication uses `PLANEXE_MCP_API_KEY` as a `Bearer` token. +- Authentication uses `PLANEXE_MCP_API_KEY` as custom header `X-API-Key` (not OAuth/Bearer). - **Retry behavior**: Transient failures (server 5xx errors, network timeouts) are automatically retried up to 3 times with exponential backoff (1s, 2s delays). Client errors (4xx) are not retried. Retries are logged at WARNING level. diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index c52a320e..19ddb5f3 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -95,7 +95,7 @@ def _build_headers() -> dict[str, str]: } api_key = _get_env("PLANEXE_MCP_API_KEY") if api_key: - headers["Authorization"] = f"Bearer {api_key}" + headers["X-API-Key"] = api_key return headers From dd6bb0f871d5560d5eb2e53f1dac6665913081c2 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:05:11 +0100 Subject: [PATCH 09/38] Clarify MCP download path and URL environment behavior --- docs/mcp/mcp_details.md | 7 +++++++ docs/mcp/planexe_mcp_interface.md | 16 ++++++++++++++++ mcp_cloud/AGENTS.md | 5 +++++ mcp_cloud/README.md | 3 ++- mcp_cloud/app.py | 5 ++++- mcp_cloud/tool_models.py | 28 ++++++++++++++++++++-------- mcp_local/AGENTS.md | 2 ++ mcp_local/README.md | 7 +++++++ mcp_local/planexe_mcp_local.py | 18 ++++++++++++------ public/llms.txt | 3 +++ 10 files changed, 78 insertions(+), 16 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index e618aeb8..4197a4db 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -202,6 +202,13 @@ Example call: {"task_id": "2d57a448-1b09-45aa-ad37-e69891ff6ec7", "artifact": "report"} ``` +`PLANEXE_PATH` behavior for `task_download`: +- Save directory is `PLANEXE_PATH`, or current working 
directory if unset. +- Non-existing directories are created automatically. +- If `PLANEXE_PATH` points to a file, download fails. +- Filename is prefixed with task id (for example `<task_id>-030-report.html`). +- Response includes `saved_path` with the exact local file location. + ## Typical Flow ### 1. Get example prompts diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 92ef23ff..5f0bb6e1 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -385,6 +385,22 @@ Requests the plan generation to stop. Pass the **task_id** (the UUID returned by - task_id: UUID returned by task_create. Use it to download the created plan. - artifact: "report" or "zip" (default "report"). +**task_download local path behavior (mcp_local)** + +- Save directory is `PLANEXE_PATH`. +- If `PLANEXE_PATH` is unset, save to current working directory. +- If `PLANEXE_PATH` points to a file (not a directory), return an error. +- Filenames are `<task_id>-030-report.html` or `<task_id>-run.zip`. +- If a filename already exists, append `-1`, `-2`, ... before extension. +- Successful responses include `saved_path`. + +**task_file_info URL behavior (mcp_cloud)** + +- `download_url` is generated from `PLANEXE_MCP_PUBLIC_BASE_URL` when set. +- Otherwise, cloud HTTP mode uses request host/scheme when available. +- If no public base URL can be determined (for example some stdio-only flows), `download_url` may be absent. +- In deployments behind proxies/CDNs, set `PLANEXE_MCP_PUBLIC_BASE_URL` so clients receive a reachable URL. + --- ## 7. Targets diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index 6986266a..fe0196a8 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -58,6 +58,11 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - OAuth is not supported for the MCP API. Do not document, imply, or advertise OAuth support. 
- In docs and user-facing error/help text, instruct clients to use `X-API-Key` custom headers. +## Download URL environment behavior +- `task_file_info.download_url` should be built from `PLANEXE_MCP_PUBLIC_BASE_URL` when set. +- If `PLANEXE_MCP_PUBLIC_BASE_URL` is unset in HTTP mode, use request host/scheme. +- If no public base URL is available, `download_url` may be absent; document this and guide operators to set `PLANEXE_MCP_PUBLIC_BASE_URL`. + ## mcp_local integration - `mcp_local` runs on the user's machine and forwards tool calls to this server over HTTP. - It targets either: diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index 925e1122..a53e78cf 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -105,7 +105,7 @@ If your client only supports Streamable HTTP and fails on `/mcp`, you have two o - `PLANEXE_MCP_API_KEY`: Optional shared secret for auth. When auth is enabled, clients can use this key instead of a UserApiKey. For production with user accounts, keys from home.planexe.org (UserApiKey) are validated against the database. - `PLANEXE_MCP_HTTP_HOST`: HTTP server host (default: `127.0.0.1`). Use `0.0.0.0` to bind all interfaces (containers/cloud). - `PLANEXE_MCP_HTTP_PORT`: HTTP server port (default: `8001`). Railway will override with `PORT` env var. -- `PLANEXE_MCP_PUBLIC_BASE_URL`: Public base URL for report/zip download links in `task_file_info` (e.g. `http://192.168.1.40:8001`). When unset, the HTTP server uses the request’s host (scheme + authority), so clients connecting at `http://192.168.1.40:8001/mcp/` get download URLs like `http://192.168.1.40:8001/download/...` instead of localhost. If clients still see localhost in download URLs (e.g. behind a proxy), uncomment and set this in the repo’s `.env.docker-example` or `.env.developer-example` (copy to `.env` and fill in your public URL). +- `PLANEXE_MCP_PUBLIC_BASE_URL`: Public base URL for report/zip download links in `task_file_info` (e.g. `http://192.168.1.40:8001`). 
When set, `download_url` is built from this value. When unset, the HTTP server uses the request’s host (scheme + authority), so clients connecting at `http://192.168.1.40:8001/mcp/` get download URLs like `http://192.168.1.40:8001/download/...` instead of localhost. If clients still see localhost in download URLs (e.g. behind a proxy), set this env var explicitly in `.env`. - `PORT`: Railway-provided port (takes precedence over `PLANEXE_MCP_HTTP_PORT`) - `PLANEXE_MCP_CORS_ORIGINS`: Comma-separated list of allowed origins. When unset, uses `*` (all origins) so browser-based tools like the MCP Inspector can connect. If you set it (e.g. for a specific frontend), include `http://localhost:6274` and `http://127.0.0.1:6274` for the Inspector. - `PLANEXE_MCP_MAX_BODY_BYTES`: Max request size for `POST /mcp/tools/call` (default: `1048576`). @@ -145,6 +145,7 @@ Note: `task_download` is a synthetic tool provided by `mcp_local`, not by this s Download flow: call `task_file_info` to obtain the `download_url`, then fetch the report via `GET /download/{task_id}/030-report.html` (API key required if configured). +If `download_url` is missing, configure `PLANEXE_MCP_PUBLIC_BASE_URL` so the server can emit a reachable absolute URL. ## Debugging with the MCP Inspector diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 5e13a3ee..5e5b9f21 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -119,6 +119,8 @@ def ensure_taskitem_stop_columns() -> None: "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "task_file_info download_url is absolute when PLANEXE_MCP_PUBLIC_BASE_URL is configured or request host is available. " + "If download_url is missing, configure PLANEXE_MCP_PUBLIC_BASE_URL on the server. 
" "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " @@ -903,7 +905,8 @@ class ToolDefinition: description=( "Returns file metadata (content_type, download_url, download_size) for the report or zip. " "If your client exposes task_download (e.g. mcp_local), use that to save the file locally; " - "otherwise use this tool to get download_url and fetch the file yourself." + "otherwise use this tool to get download_url and fetch the file yourself. " + "download_url is generated from PLANEXE_MCP_PUBLIC_BASE_URL (or request host when available)." ), input_schema=TASK_FILE_INFO_INPUT_SCHEMA, output_schema=TASK_FILE_INFO_OUTPUT_SCHEMA, diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 9c0ec3f9..2cda11a8 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -115,17 +115,29 @@ class TaskStopOutput(BaseModel): class TaskFileInfoReadyOutput(BaseModel): - content_type: str - sha256: str - download_size: int - download_url: str | None = None + content_type: str = Field(..., description="Artifact content type.") + sha256: str = Field(..., description="SHA-256 hash of artifact bytes.") + download_size: int = Field(..., description="Artifact size in bytes.") + download_url: str | None = Field( + default=None, + description=( + "Absolute artifact download URL when server base URL is known " + "(PLANEXE_MCP_PUBLIC_BASE_URL or request host)." 
+ ), + ) class TaskFileInfoOutput(BaseModel): - content_type: str | None = None - sha256: str | None = None - download_size: int | None = None - download_url: str | None = None + content_type: str | None = Field(default=None, description="Artifact content type.") + sha256: str | None = Field(default=None, description="SHA-256 hash of artifact bytes.") + download_size: int | None = Field(default=None, description="Artifact size in bytes.") + download_url: str | None = Field( + default=None, + description=( + "Absolute artifact download URL when server base URL is known. " + "May be omitted in some deployments." + ), + ) error: ErrorDetail | None = None diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index be09c819..bf273503 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -49,4 +49,6 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. - `PLANEXE_MCP_API_KEY`: API key forwarded to remote as custom header `X-API-Key`. - `PLANEXE_PATH`: Local directory where downloads are saved. - Must be a directory. + - Created automatically when missing. - Defaults to current working directory when unset. + - Saved filename pattern: `<task_id>-<filename>` with numeric suffixes on collisions. diff --git a/mcp_local/README.md b/mcp_local/README.md index fa629006..0bbed730 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -25,6 +25,13 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas remote MCP tool `task_file_info` to obtain a download URL, then downloads the file locally into `PLANEXE_PATH`. +`PLANEXE_PATH` behavior: +- If unset, downloads are saved to the current working directory. +- If the path does not exist, it is created. +- If the path points to a file (not a directory), download fails. +- Filenames are `<task_id>-030-report.html` or `<task_id>-run.zip` (with `-1`, `-2`, ... suffixes on collisions). +- `task_download` returns `saved_path` with the final file location. + ## Run as task (MCP tasks protocol) Some MCP clients (e.g. 
the MCP Inspector) show a **"Run as task"** option for tools. That refers to the MCP **tasks** protocol: a separate mechanism where the client runs a tool in the background using RPC methods like `tasks/run`, `tasks/get`, `tasks/result`, and `tasks/cancel`, instead of a single blocking tool call. diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 19ddb5f3..02efe2cd 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -440,11 +440,14 @@ class ToolDefinition: TASK_DOWNLOAD_OUTPUT_SCHEMA = { "type": "object", "properties": { - "content_type": {"type": "string"}, - "sha256": {"type": "string"}, - "download_size": {"type": "integer"}, - "download_url": {"type": "string"}, - "saved_path": {"type": "string"}, + "content_type": {"type": "string", "description": "Artifact content type."}, + "sha256": {"type": "string", "description": "SHA-256 hash of artifact bytes."}, + "download_size": {"type": "integer", "description": "Artifact size in bytes."}, + "download_url": {"type": "string", "description": "Remote URL used for download."}, + "saved_path": { + "type": "string", + "description": "Local file path written by task_download.", + }, "error": ERROR_SCHEMA, }, "additionalProperties": False, @@ -498,7 +501,9 @@ class ToolDefinition: name="task_download", description=( "Download the plan output and save it locally to PLANEXE_PATH. " - "Choose the HTML report (default) or a zip of all generated files." + "Choose the HTML report (default) or a zip of all generated files. " + "If PLANEXE_PATH is unset, files are saved to the current working directory. " + "Filename format is - with numeric suffixes when collisions occur." ), input_schema=TASK_DOWNLOAD_INPUT_SCHEMA, output_schema=TASK_DOWNLOAD_OUTPUT_SCHEMA, @@ -515,6 +520,7 @@ class ToolDefinition: "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. 
" "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " + "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " "If task_status is in processing and output files do not change for longer than 20 minutes, the run likely failed/stalled. " diff --git a/public/llms.txt b/public/llms.txt index fbb0b06c..9d141839 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -74,6 +74,7 @@ Key tool inputs/outputs: - task_create output: task_id (use this for task_status, task_stop, and task_file_info). - task_status output: current state and progress for a task_id. - task_file_info output: downloadable report/zip metadata and URLs. +- mcp_local task_download output: includes local saved_path where artifact was written. task_status caller contract: - pending: keep polling. @@ -91,6 +92,8 @@ Recommended interaction order: Note: - task_download is provided by mcp_local wrappers in some client setups, not by mcp_cloud directly. +- In mcp_local, downloads save to PLANEXE_PATH (or current working directory if PLANEXE_PATH is unset). +- In mcp_cloud, task_file_info download_url depends on deployment URL settings; configure PLANEXE_MCP_PUBLIC_BASE_URL for stable absolute URLs. 
## Authentication From 1e99b2dbf3426ca0fca3542267f8301ee22be54c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:16:39 +0100 Subject: [PATCH 10/38] Add model_profiles tool for runtime profile selection --- README.md | 7 +- docs/mcp/mcp_details.md | 47 +++- docs/mcp/mcp_setup.md | 16 +- docs/mcp/planexe_mcp_interface.md | 46 +++- mcp_cloud/AGENTS.md | 2 + mcp_cloud/README.md | 5 +- mcp_cloud/app.py | 204 +++++++++++++++++- mcp_cloud/http_server.py | 11 +- mcp_cloud/tests/test_model_profiles_tool.py | 50 +++++ .../tests/test_tool_surface_consistency.py | 8 + mcp_cloud/tool_models.py | 63 +++++- mcp_local/AGENTS.md | 3 +- mcp_local/README.md | 3 +- mcp_local/planexe_mcp_local.py | 98 ++++++++- public/llms.txt | 13 +- 15 files changed, 542 insertions(+), 34 deletions(-) create mode 100644 mcp_cloud/tests/test_model_profiles_tool.py diff --git a/README.md b/README.md index 60801ec6..459d9725 100644 --- a/README.md +++ b/README.md @@ -51,9 +51,10 @@ Assuming you have an MCP-compatible client (OpenClaw, Cursor, Codex, LM Studio, The Tool workflow (tools-only, not MCP tasks protocol) 1. `prompt_examples` -2. `task_create` -3. `task_status` (poll every 5 minutes until done) -4. download the result via `task_download` or via `task_file_info` +2. `model_profiles` (optional, helps choose `model_profile`) +3. `task_create` +4. `task_status` (poll every 5 minutes until done) +5. download the result via `task_download` or via `task_file_info` ### Option A: Remote MCP (fastest path) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 4197a4db..dfe64b58 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -29,6 +29,35 @@ Example call: Response includes `samples` (array of prompt strings, each 300–800 words) and `message`. +### model_profiles + +Returns profile guidance and model availability for `task_create.model_profile`. +This helps agents pick a profile without knowing internal `llm_config/*.json` details. 
+ +Example prompt: +``` +List available model profiles and models. +``` + +Example call: +```json +{} +``` + +Response includes: +- `default_profile` +- `whitelist_active` +- `whitelisted_classes` +- `profiles[]` with: + - `profile` + - `title` + - `summary` + - `config_filename` + - `available` + - `model_count` + - `filtered_out_count` + - `models[]` (`key`, `provider_class`, `model`, `priority`) + ### task_create Create a new plan task. @@ -225,7 +254,19 @@ Tool call: {} ``` -### 2. Create a plan +### 2. Inspect model profiles (optional but recommended) + +Prompt: +``` +Show model profile options and available models. +``` + +Tool call: +```json +{} +``` + +### 3. Create a plan The user reviews the prompt and either asks for further changes or confirms it’s good to go. When the user confirms, the agent calls `task_create` with that prompt. @@ -234,7 +275,7 @@ Tool call: {"prompt": "..."} ``` -### 3. Get status +### 4. Get status Prompt: ``` @@ -246,7 +287,7 @@ Tool call: {"task_id": ""} ``` -### 4. Download the report +### 5. Download the report Prompt: ``` diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index 7d19dfc0..0f2b09c6 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -11,19 +11,21 @@ This is the shortest path to a working PlanExe MCP integration. ## 1. Understand the flow 1. Ask for prompt examples. -2. Expand the user idea into a high‑quality prompt. -3. Create the plan task. -4. Poll for status. -5. Download the report (HTML or zip). +2. Inspect `model_profile` options and available models. +3. Expand the user idea into a high‑quality prompt. +4. Create the plan task. +5. Poll for status. +6. Download the report (HTML or zip). --- ## 2. Minimal tool usage 1. `prompt_examples` -2. `task_create` -3. `task_status` -4. `task_download` +2. `model_profiles` +3. `task_create` +4. `task_status` +5. 
`task_download` For `task_create`: diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 5f0bb6e1..a56edc84 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call task_stop with the task_id from task_create. +- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Optional before task_create: call model_profiles to inspect profile guidance and available models under current whitelist settings. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call task_stop with the task_id from task_create. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). ### 1.3 Scope of this document @@ -73,7 +73,7 @@ The MCP specification defines two different mechanisms: - **MCP tools** (e.g. 
task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_download or task_file_info. This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. The intended flow is: Step 1 — call prompt_examples; Step 2 — formulate a good prompt (user approval); Step 3 — call task_create; then poll task_status and call task_download or task_file_info when complete. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. The intended flow is: Step 1 — call prompt_examples; optional before task_create — call model_profiles; Step 2 — formulate a good prompt (user approval); Step 3 — call task_create; then poll task_status and call task_download or task_file_info when complete. --- @@ -171,6 +171,46 @@ All tool names below are normative. --- +### 6.1.1 model_profiles + +Optional helper tool to discover valid `model_profile` choices and currently available models without relying on internal config knowledge. + +**Request:** no parameters (empty object). 
+ +**Response (shape)** + +```json +{ + "default_profile": "baseline", + "whitelist_active": true, + "whitelisted_classes": ["openrouter"], + "profiles": [ + { + "profile": "baseline", + "title": "Baseline", + "summary": "Cheap and fast; recommended default for most runs.", + "config_filename": "baseline.json", + "available": true, + "model_count": 5, + "filtered_out_count": 2, + "models": [ + { + "key": "openrouter-gpt-oss-20b", + "provider_class": "OpenRouter", + "model": "openai/gpt-oss-20b", + "priority": 0 + } + ] + } + ], + "message": "..." +} +``` + +Use the returned `profile` values directly in `task_create.model_profile`. + +--- + ### 6.2 task_create **Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2).** Start creating a new plan with the approved prompt. @@ -260,7 +300,7 @@ Use a normal single LLM response (not PlanExe) for one-shot micro-tasks. PlanExe **Optional** -- model_profile: LLM profile (`baseline` | `premium` | `frontier` | `custom`). +- model_profile: LLM profile (`baseline` | `premium` | `frontier` | `custom`). If unsure, call `model_profiles` first. - user_api_key: user API key for credits and attribution (if your deployment requires it). Clients can call the MCP tool **prompt_examples** to retrieve example prompts. Use these as examples for task_create; they can also call task_create with any prompt—short prompts produce less detailed plans. diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index fe0196a8..4047157d 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -32,6 +32,8 @@ for AI agents and developer tools to interact with PlanExe. Communicates with `open_dir_server`. ## task_create contract +- Expose `model_profiles` as the discovery tool for profile selection. +- `model_profiles` must report profile guidance and currently available models after class whitelist filtering. 
- Visible input schema is intentionally limited to: - `prompt` - `model_profile` (`baseline`, `premium`, `frontier`, `custom`) diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index a53e78cf..758fd407 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -14,7 +14,7 @@ mcp_cloud provides a standardized MCP interface for PlanExe's plan generation wo ## Run as task (MCP tasks protocol) -MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `task_create`, `task_status`, `task_stop`, `task_file_info` (or `task_download` via `mcp_local`). The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. +MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `prompt_examples`, `model_profiles`, `task_create`, `task_status`, `task_stop`, `task_file_info` (or `task_download` via `mcp_local`). The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. ## Client Choice Guide @@ -129,6 +129,7 @@ mcp_cloud uses the same database configuration as other PlanExe services: See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `prompt_examples` - Return example prompts. Use these as examples for task_create. +- `model_profiles` - List profile options and currently available models after whitelist filtering. 
- `task_create` - Create a new task (returns task_id as UUID; may require user_api_key for credits) - `task_status` - Get task status and progress - `task_stop` - Stop an active task @@ -141,7 +142,7 @@ See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: Note: `task_download` is a synthetic tool provided by `mcp_local`, not by this server. If your client exposes `task_download`, use it to save the report or zip locally; otherwise use `task_file_info` to get `download_url` and fetch the file yourself. -**Tip**: Call `prompt_examples` to get example prompts to use with task_create. The catalog is the same as in the frontends (`worker_plan.worker_plan_api.PromptCatalog`). When running with `PYTHONPATH` set to the repo root (e.g. stdio setup), the catalog is loaded automatically; otherwise built-in examples are returned. +**Tip**: Call `prompt_examples` to get example prompts to use with task_create, then call `model_profiles` to choose `model_profile` based on current runtime availability. The prompt catalog is the same as in the frontends (`worker_plan.worker_plan_api.PromptCatalog`). When running with `PYTHONPATH` set to the repo root (e.g. stdio setup), the catalog is loaded automatically; otherwise built-in examples are returned. Download flow: call `task_file_info` to obtain the `download_url`, then fetch the report via `GET /download/{task_id}/030-report.html` (API key required if configured). 
diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 5e5b9f21..fce54f43 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -15,7 +15,6 @@ import tempfile import uuid import zipfile -import hashlib from dataclasses import dataclass from datetime import UTC, datetime from pathlib import Path @@ -29,7 +28,18 @@ from mcp.server.stdio import stdio_server from mcp.types import CallToolResult, Tool, TextContent from pydantic import BaseModel -from worker_plan_api.model_profile import normalize_model_profile +from worker_plan_api.model_profile import ( + ModelProfileEnum, + default_filename_for_profile, + normalize_model_profile, + resolve_model_profile_from_env, +) +from worker_plan_api.planexe_config import PlanExeConfig +from worker_plan_api.llm_class_filter import ( + ENV_PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES, + is_llm_class_allowed, + parse_llm_class_whitelist, +) from mcp_cloud.dotenv_utils import load_planexe_dotenv _dotenv_loaded, _dotenv_paths = load_planexe_dotenv(Path(__file__).parent) @@ -52,6 +62,8 @@ from database_api.model_user_api_key import UserApiKey from flask import Flask, has_app_context from mcp_cloud.tool_models import ( + ModelProfilesInput, + ModelProfilesOutput, PromptExamplesInput, PromptExamplesOutput, TaskCreateInput, @@ -116,6 +128,7 @@ def ensure_taskitem_stop_columns() -> None: "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " + "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. 
" "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " @@ -159,6 +172,18 @@ def ensure_taskitem_stop_columns() -> None: "fast": "fast_but_skip_details", "all": "all_details_but_slow", } +MODEL_PROFILE_TITLES = { + ModelProfileEnum.BASELINE.value: "Baseline", + ModelProfileEnum.PREMIUM.value: "Premium", + ModelProfileEnum.FRONTIER.value: "Frontier", + ModelProfileEnum.CUSTOM.value: "Custom", +} +MODEL_PROFILE_SUMMARIES = { + ModelProfileEnum.BASELINE.value: "Cheap and fast; recommended default for most runs.", + ModelProfileEnum.PREMIUM.value: "Higher-cost profile tuned for stronger output quality.", + ModelProfileEnum.FRONTIER.value: "Most capable models first; usually slowest/most expensive.", + ModelProfileEnum.CUSTOM.value: "User-managed profile file for custom model ordering.", +} class TaskCreateRequest(BaseModel): prompt: str @@ -175,6 +200,11 @@ class TaskFileInfoRequest(BaseModel): task_id: str artifact: Optional[str] = None + +class ModelProfilesRequest(BaseModel): + """No input parameters.""" + pass + # Helper functions def find_task_by_task_id(task_id: str) -> Optional[TaskItem]: """Find TaskItem by MCP task_id (UUID), with legacy fallback.""" @@ -684,6 +714,147 @@ def _merge_task_create_config( merged["model_profile"] = candidate_profile return merged or None + +def _sort_llm_config_entries(items: list[tuple[str, Any]]) -> list[tuple[str, Any]]: + def sort_key(item: tuple[str, Any]) -> tuple[int, str]: + key, model_data = item + priority = None + if isinstance(model_data, dict): + maybe_priority = model_data.get("priority") + if isinstance(maybe_priority, int): + priority = maybe_priority + if priority is None: + priority = 999999 + return priority, key + + return sorted(items, key=sort_key) + + +def _extract_model_profile_entries( + model_map: dict[str, Any], + whitelist: Optional[set[str]], +) -> tuple[list[dict[str, Any]], int]: + models: list[dict[str, Any]] = [] + filtered_out_count 
= 0 + + for model_key, model_data in _sort_llm_config_entries(list(model_map.items())): + class_name = model_data.get("class") if isinstance(model_data, dict) else None + if not is_llm_class_allowed(class_name, whitelist): + filtered_out_count += 1 + continue + + model_name = None + priority = None + if isinstance(model_data, dict): + arguments = model_data.get("arguments") + if isinstance(arguments, dict): + maybe_model = arguments.get("model") + if isinstance(maybe_model, str): + model_name = maybe_model + maybe_priority = model_data.get("priority") + if isinstance(maybe_priority, int): + priority = maybe_priority + elif isinstance(model_data.get("prio"), int): + priority = model_data["prio"] + + models.append( + { + "key": model_key, + "provider_class": class_name if isinstance(class_name, str) else None, + "model": model_name, + "priority": priority, + } + ) + + return models, filtered_out_count + + +def _profile_models_payload( + profile: ModelProfileEnum, + whitelist: Optional[set[str]], +) -> dict[str, Any]: + config_filename = default_filename_for_profile(profile) + planexe_config_path = PlanExeConfig.resolve_planexe_config_path() + config_path = PlanExeConfig.find_file_in_search_order(config_filename, planexe_config_path) + if config_path is None: + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "config_filename": config_filename, + "available": False, + "model_count": 0, + "filtered_out_count": 0, + "models": [], + } + + try: + with config_path.open("r", encoding="utf-8") as fh: + model_map = json.load(fh) + except Exception as exc: + logger.warning( + "Unable to read profile config %s for model profile %s: %s", + config_filename, + profile.value, + exc, + ) + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "config_filename": config_filename, + "available": False, + 
"model_count": 0, + "filtered_out_count": 0, + "models": [], + } + + if not isinstance(model_map, dict): + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "config_filename": config_filename, + "available": False, + "model_count": 0, + "filtered_out_count": 0, + "models": [], + } + + models, filtered_out_count = _extract_model_profile_entries(model_map, whitelist) + return { + "profile": profile.value, + "title": MODEL_PROFILE_TITLES[profile.value], + "summary": MODEL_PROFILE_SUMMARIES[profile.value], + "config_filename": config_filename, + "available": True, + "model_count": len(models), + "filtered_out_count": filtered_out_count, + "models": models, + } + + +def _get_model_profiles_sync() -> dict[str, Any]: + raw_whitelist = os.environ.get(ENV_PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES) + whitelist = parse_llm_class_whitelist(raw_whitelist) + default_profile = resolve_model_profile_from_env().value + profiles = [ + _profile_models_payload(profile, whitelist) + for profile in ModelProfileEnum + ] + whitelist_values = sorted(whitelist) if whitelist is not None else [] + + return { + "default_profile": default_profile, + "whitelist_active": whitelist is not None, + "whitelisted_classes": whitelist_values, + "profiles": profiles, + "message": ( + "Use one of these profile values in task_create.model_profile. " + "Model lists reflect current PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES filtering." + ), + } + # Context var set by HTTP server so download URLs use the request's host when # PLANEXE_MCP_PUBLIC_BASE_URL is not set (avoids localhost for remote clients). 
_download_base_url_ctx: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar( @@ -845,6 +1016,8 @@ def _builtin_mcp_example_prompts() -> list[str]: PROMPT_EXAMPLES_INPUT_SCHEMA = PromptExamplesInput.model_json_schema() PROMPT_EXAMPLES_OUTPUT_SCHEMA = PromptExamplesOutput.model_json_schema() +MODEL_PROFILES_INPUT_SCHEMA = ModelProfilesInput.model_json_schema() +MODEL_PROFILES_OUTPUT_SCHEMA = ModelProfilesOutput.model_json_schema() @dataclass(frozen=True) class ToolDefinition: @@ -858,18 +1031,29 @@ class ToolDefinition: name="prompt_examples", description=( "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " - "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " + "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " + "Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, output_schema=PROMPT_EXAMPLES_OUTPUT_SCHEMA, ), + ToolDefinition( + name="model_profiles", + description=( + "Optional helper before task_create. Returns model_profile options with plain-language guidance " + "and currently available models after PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES filtering." + ), + input_schema=MODEL_PROFILES_INPUT_SCHEMA, + output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, + ), ToolDefinition( name="task_create", description=( "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. 
" "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " + "If you are unsure which model_profile to choose, call model_profiles first. " "If your deployment uses credits, include user_api_key to charge the correct account. " "Optional runtime overrides such as speed_vs_detail are intentionally hidden from the visible tool schema " "and can be provided via tool-specific metadata by developers." @@ -957,7 +1141,7 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: Args: - prompt: What the plan should cover (goal, context, constraints). - - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). + - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). Call model_profiles to inspect options. - speed_vs_detail: Optional hidden runtime override via tool-specific metadata. Returns: @@ -1045,6 +1229,17 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: ) +async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: + """Return model profile options and available models after whitelist filtering.""" + _ = ModelProfilesRequest(**(arguments or {})) + payload = await asyncio.to_thread(_get_model_profiles_sync) + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(payload))], + structuredContent=payload, + isError=False, + ) + + async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: """Fetch the current task status, progress, and recent files for a task. 
@@ -1292,6 +1487,7 @@ async def handle_task_file_info(arguments: dict[str, Any]) -> CallToolResult: "task_stop": handle_task_stop, "task_file_info": handle_task_file_info, "prompt_examples": handle_prompt_examples, + "model_profiles": handle_model_profiles, } async def main(): diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 13227250..7934ce6b 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -25,6 +25,7 @@ from mcp_cloud.http_utils import strip_redundant_content from mcp_cloud.tool_models import ( + ModelProfilesOutput, TaskCreateOutput, TaskFileInfoOutput, TaskStatusOutput, @@ -55,6 +56,7 @@ fetch_artifact_from_worker_plan, fetch_user_downloadable_zip, handle_task_create, + handle_model_profiles, handle_task_status, handle_task_stop, handle_task_file_info, @@ -324,7 +326,7 @@ async def task_create( prompt: str, model_profile: Annotated[ ModelProfileInput, - Field(description="LLM profile: baseline, premium, frontier, custom."), + Field(description="Model profile: baseline, premium, frontier, custom. Call model_profiles to inspect options."), ] = "baseline", ) -> Annotated[CallToolResult, TaskCreateOutput]: """Create a new PlanExe task. 
Use prompt_examples first for example prompts.""" @@ -367,6 +369,11 @@ async def prompt_examples() -> CallToolResult: return await handle_prompt_examples({}) +async def model_profiles() -> Annotated[CallToolResult, ModelProfilesOutput]: + """Return model_profile options with currently available models.""" + return await handle_model_profiles({}) + + def _register_tools(server: FastMCP) -> None: handler_map = { "task_create": task_create, @@ -374,6 +381,7 @@ def _register_tools(server: FastMCP) -> None: "task_stop": task_stop, "task_file_info": task_file_info, "prompt_examples": prompt_examples, + "model_profiles": model_profiles, } for tool in TOOL_DEFINITIONS: handler = handler_map.get(tool.name) @@ -394,6 +402,7 @@ def _register_tools(server: FastMCP) -> None: "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " + "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. 
" diff --git a/mcp_cloud/tests/test_model_profiles_tool.py b/mcp_cloud/tests/test_model_profiles_tool.py new file mode 100644 index 00000000..087e1636 --- /dev/null +++ b/mcp_cloud/tests/test_model_profiles_tool.py @@ -0,0 +1,50 @@ +import asyncio +import unittest +from unittest.mock import patch + +from mcp_cloud.app import handle_list_tools, handle_model_profiles + + +class TestModelProfilesTool(unittest.TestCase): + def test_model_profiles_tool_listed(self): + tools = asyncio.run(handle_list_tools()) + tool_names = {tool.name for tool in tools} + self.assertIn("model_profiles", tool_names) + + def test_model_profiles_returns_structured_content(self): + payload = { + "default_profile": "baseline", + "whitelist_active": True, + "whitelisted_classes": ["openrouter"], + "profiles": [ + { + "profile": "baseline", + "title": "Baseline", + "summary": "Cheap and fast; recommended default for most runs.", + "config_filename": "baseline.json", + "available": True, + "model_count": 1, + "filtered_out_count": 0, + "models": [ + { + "key": "openrouter-gpt-oss-20b", + "provider_class": "OpenRouter", + "model": "openai/gpt-oss-20b", + "priority": 0, + } + ], + } + ], + "message": "Use one of these profile values in task_create.model_profile.", + } + + with patch("mcp_cloud.app._get_model_profiles_sync", return_value=payload): + result = asyncio.run(handle_model_profiles({})) + + self.assertFalse(result.isError) + self.assertEqual(result.structuredContent["default_profile"], "baseline") + self.assertEqual(result.structuredContent["profiles"][0]["profile"], "baseline") + + +if __name__ == "__main__": + unittest.main() diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py index eb014c4f..1140091d 100644 --- a/mcp_cloud/tests/test_tool_surface_consistency.py +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -12,6 +12,10 @@ def _tool_desc(tool_defs, name: str) -> str: class 
TestCloudToolSurfaceConsistency(unittest.TestCase): + def test_cloud_exposes_model_profiles_tool(self): + cloud_tool_names = {definition.name for definition in cloud_app.TOOL_DEFINITIONS} + self.assertIn("model_profiles", cloud_tool_names) + def test_cloud_exposes_task_file_info_not_task_download(self): cloud_tool_names = {definition.name for definition in cloud_app.TOOL_DEFINITIONS} self.assertIn("task_file_info", cloud_tool_names) @@ -48,6 +52,10 @@ def test_cloud_task_status_description_includes_state_contract(self): class TestLocalToolSurfaceConsistency(unittest.TestCase): + def test_local_exposes_model_profiles_tool(self): + local_tool_names = {definition.name for definition in local_app.TOOL_DEFINITIONS} + self.assertIn("model_profiles", local_tool_names) + def test_local_exposes_task_download_not_task_file_info(self): local_tool_names = {definition.name for definition in local_app.TOOL_DEFINITIONS} self.assertIn("task_download", local_tool_names) diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 2cda11a8..8fcb9fd3 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -24,6 +24,64 @@ class PromptExamplesInput(BaseModel): pass +class ModelProfilesInput(BaseModel): + """No input parameters.""" + pass + + +class ModelProfileModelEntry(BaseModel): + key: str = Field(..., description="Model key from llm_config/.json.") + provider_class: str | None = Field( + default=None, + description="Provider class (for example OpenRouter, OpenAI, Ollama).", + ) + model: str | None = Field(default=None, description="Provider model identifier when present.") + priority: int | None = Field( + default=None, + description="Priority from config (lower number means earlier in selection order).", + ) + + +class ModelProfileInfo(BaseModel): + profile: Literal["baseline", "premium", "frontier", "custom"] = Field( + ..., + description="Model profile value accepted by task_create.model_profile.", + ) + title: str = Field(..., 
description="Human-friendly profile label.") + summary: str = Field(..., description="Short profile guidance for callers.") + config_filename: str = Field(..., description="Filename resolved for this profile.") + available: bool = Field(..., description="True when the profile config file was found and parsed.") + model_count: int = Field(..., description="Number of models available after whitelist filtering.") + filtered_out_count: int = Field( + ..., + description="How many config entries were filtered out by class whitelist.", + ) + models: list[ModelProfileModelEntry] = Field( + ..., + description="Models available to this profile after whitelist filtering.", + ) + + +class ModelProfilesOutput(BaseModel): + default_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( + ..., + description="Default model profile used when task_create.model_profile is omitted/invalid.", + ) + whitelist_active: bool = Field( + ..., + description="True when PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES is set.", + ) + whitelisted_classes: list[str] = Field( + ..., + description="Normalized whitelist class names currently applied.", + ) + profiles: list[ModelProfileInfo] = Field( + ..., + description="Available profile options and their model inventory.", + ) + message: str = Field(..., description="Caller guidance for selecting task_create.model_profile.") + + class TaskStatusInput(BaseModel): task_id: str = Field( ..., @@ -153,7 +211,10 @@ class TaskCreateInput(BaseModel): ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", - description="LLM profile mapping to llm_config/.json (baseline, premium, frontier, custom).", + description=( + "Model profile selection: baseline (cheap/fast), premium (higher quality), " + "frontier (most capable), custom (user-defined). Call model_profiles for runtime availability." 
+ ), ) user_api_key: str | None = Field( default=None, diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index bf273503..c412722e 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -6,10 +6,11 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. ## Interaction model - The local proxy exposes MCP tools over stdio and forwards requests to mcp_cloud using `PLANEXE_URL` (defaults to the hosted `/mcp` endpoint). -- Supported tools: `task_create`, `task_status`, `task_stop`, `task_download`, `prompt_examples`. +- Supported tools: `prompt_examples`, `model_profiles`, `task_create`, `task_status`, `task_stop`, `task_download`. - `task_download` calls the remote `task_file_info` tool to obtain a download URL, then downloads the artifact to `PLANEXE_PATH` on the local machine. - `task_create` visible input schema includes `prompt` and optional `model_profile`. +- Use `model_profiles` to help agents select `task_create.model_profile` without relying on internal file knowledge. - Runtime override `speed_vs_detail` is metadata-only (hidden from visible schema); when callers still pass legacy top-level `speed_vs_detail`/`speed`, forward those into `metadata.task_create` for backward compatibility. diff --git a/mcp_local/README.md b/mcp_local/README.md index 0bbed730..1838a4b5 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -9,6 +9,7 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas ## Tools `prompt_examples` - Return example prompts. Use these as examples for task_create. You can also call `task_create` with any prompt—short prompts produce less detailed plans. +`model_profiles` - Show model_profile options and currently available models after whitelist filtering. `task_create` - Initiate creation of a plan. `task_status` - Get status and progress about the creation of a plan. `task_stop` - Abort creation of a plan. @@ -36,7 +37,7 @@ file locally into `PLANEXE_PATH`. Some MCP clients (e.g. 
the MCP Inspector) show a **"Run as task"** option for tools. That refers to the MCP **tasks** protocol: a separate mechanism where the client runs a tool in the background using RPC methods like `tasks/run`, `tasks/get`, `tasks/result`, and `tasks/cancel`, instead of a single blocking tool call. -**PlanExe does not use or advertise the MCP tasks protocol.** Our interface is **tool-based** only: the agent calls `task_create` → gets a `task_id` → polls `task_status` → uses `task_download`. That flow is defined in `docs/mcp/planexe_mcp_interface.md` and is the intended design. +**PlanExe does not use or advertise the MCP tasks protocol.** Our interface is **tool-based** only: the agent calls `prompt_examples` and `model_profiles` for setup, then `task_create` → gets a `task_id` → polls `task_status` → uses `task_download`. That flow is defined in `docs/mcp/planexe_mcp_interface.md` and is the intended design. You should **not** enable "Run as task" for PlanExe. The Python MCP SDK and clients like Cursor do not properly support the tasks protocol (method registration and initialization fail). Use the tools directly: create a task, poll status, then download when done. diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 02efe2cd..a1e20eb1 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -331,7 +331,10 @@ class ToolDefinition: "type": "string", "enum": ["baseline", "premium", "frontier", "custom"], "default": "baseline", - "description": "LLM profile mapping to llm_config/.json.", + "description": ( + "Model profile selection: baseline (cheap/fast), premium (higher quality), " + "frontier (most capable), custom (user-defined). Call model_profiles for runtime availability." 
+ ), }, }, "required": ["prompt"], @@ -393,6 +396,74 @@ class ToolDefinition: }, "required": ["samples", "message"], } +MODEL_PROFILES_INPUT_SCHEMA = { + "type": "object", + "properties": {}, + "required": [], +} +MODEL_PROFILES_OUTPUT_SCHEMA = { + "type": "object", + "properties": { + "default_profile": { + "type": "string", + "enum": ["baseline", "premium", "frontier", "custom"], + }, + "whitelist_active": {"type": "boolean"}, + "whitelisted_classes": { + "type": "array", + "items": {"type": "string"}, + }, + "profiles": { + "type": "array", + "items": { + "type": "object", + "properties": { + "profile": { + "type": "string", + "enum": ["baseline", "premium", "frontier", "custom"], + }, + "title": {"type": "string"}, + "summary": {"type": "string"}, + "config_filename": {"type": "string"}, + "available": {"type": "boolean"}, + "model_count": {"type": "integer"}, + "filtered_out_count": {"type": "integer"}, + "models": { + "type": "array", + "items": { + "type": "object", + "properties": { + "key": {"type": "string"}, + "provider_class": {"type": ["string", "null"]}, + "model": {"type": ["string", "null"]}, + "priority": {"type": ["integer", "null"]}, + }, + "required": ["key"], + }, + }, + }, + "required": [ + "profile", + "title", + "summary", + "config_filename", + "available", + "model_count", + "filtered_out_count", + "models", + ], + }, + }, + "message": {"type": "string"}, + }, + "required": [ + "default_profile", + "whitelist_active", + "whitelisted_classes", + "profiles", + "message", + ], +} TASK_CREATE_OUTPUT_SCHEMA = { "type": "object", @@ -458,17 +529,28 @@ class ToolDefinition: name="prompt_examples", description=( "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " - "Do NOT call task_create yet. Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " + "Do NOT call task_create yet. 
Optional before task_create: call model_profiles to choose model_profile. " + "Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, output_schema=PROMPT_EXAMPLES_OUTPUT_SCHEMA, ), + ToolDefinition( + name="model_profiles", + description=( + "Optional helper before task_create. Returns model_profile options with plain-language guidance " + "and currently available models after whitelist filtering." + ), + input_schema=MODEL_PROFILES_INPUT_SCHEMA, + output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, + ), ToolDefinition( name="task_create", description=( "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " + "If you are unsure which model_profile to choose, call model_profiles first. " "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." ), input_schema=TASK_CREATE_INPUT_SCHEMA, @@ -517,6 +599,7 @@ class ToolDefinition: "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " + "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. 
" "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " @@ -576,7 +659,7 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: Args: - prompt: What the plan should cover (goal, context, constraints). - - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). + - model_profile: Optional profile ("baseline" | "premium" | "frontier" | "custom"). Call model_profiles to inspect options. - speed_vs_detail: Optional hidden runtime override via tool-specific metadata. Returns: @@ -626,6 +709,14 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: return _wrap_response(payload) +async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: + """Return model_profile options and available models from mcp_cloud.""" + payload, error = _call_remote_tool("model_profiles", arguments or {}) + if error: + return _wrap_response({"error": error}, is_error=True) + return _wrap_response(payload) + + async def handle_task_status(arguments: dict[str, Any]) -> CallToolResult: """Fetch status/progress for a task from mcp_cloud. @@ -740,6 +831,7 @@ async def handle_task_download(arguments: dict[str, Any]) -> CallToolResult: "task_stop": handle_task_stop, "task_download": handle_task_download, "prompt_examples": handle_prompt_examples, + "model_profiles": handle_model_profiles, } diff --git a/public/llms.txt b/public/llms.txt index 9d141839..97b0f8f9 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -63,6 +63,7 @@ MCP Inspector setup guide: The MCP server exposes tool-based workflows (not MCP tasks protocol): - prompt_examples +- model_profiles - task_create - task_status - task_stop @@ -70,6 +71,7 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): Key tool inputs/outputs: - task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). 
+- model_profiles output: profile guidance + currently available models after whitelist filtering. - task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). - task_create output: task_id (use this for task_status, task_stop, and task_file_info). - task_status output: current state and progress for a task_id. @@ -84,11 +86,12 @@ task_status caller contract: Recommended interaction order: 1. Call prompt_examples. -2. Prepare and approve a strong prompt. -3. Call task_create. -4. Poll task_status until complete (repeat every 5 minutes). -5. Use task_file_info to get download URLs. -6. Use task_stop if the run must be cancelled. +2. Optionally call model_profiles to choose model_profile based on current availability. +3. Prepare and approve a strong prompt. +4. Call task_create. +5. Poll task_status until complete (repeat every 5 minutes). +6. Use task_file_info to get download URLs. +7. Use task_stop if the run must be cancelled. Note: - task_download is provided by mcp_local wrappers in some client setups, not by mcp_cloud directly. 
From ce1d782f5dca8a04716681f2a357a24a4190015c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:28:57 +0100 Subject: [PATCH 11/38] Document minimal MCP error-handling contract --- docs/mcp/mcp_details.md | 23 +++++++ docs/mcp/mcp_setup.md | 1 + docs/mcp/planexe_mcp_interface.md | 69 +++++++++++++++------ mcp_cloud/AGENTS.md | 1 + mcp_cloud/README.md | 5 ++ mcp_cloud/app.py | 6 +- mcp_cloud/tests/test_task_file_info_tool.py | 26 ++++++++ mcp_cloud/tests/test_task_status_tool.py | 8 +++ mcp_cloud/tool_models.py | 3 +- mcp_local/AGENTS.md | 1 + mcp_local/README.md | 6 ++ mcp_local/planexe_mcp_local.py | 7 ++- public/llms.txt | 6 ++ 13 files changed, 139 insertions(+), 23 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index dfe64b58..73e9215d 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -238,6 +238,29 @@ Example call: - Filename is prefixed with task id (for example `-030-report.html`). - Response includes `saved_path` with the exact local file location. +## Minimal error-handling contract + +Error payload shape: +```json +{"error": {"code": "SOME_CODE", "message": "Human readable message", "details": {}}} +``` + +Common cloud/core error codes: +- `TASK_NOT_FOUND` +- `INVALID_USER_API_KEY` +- `USER_API_KEY_REQUIRED` +- `INSUFFICIENT_CREDITS` +- `INTERNAL_ERROR` +- `generation_failed` +- `content_unavailable` + +Common local proxy error codes: +- `REMOTE_ERROR` +- `DOWNLOAD_FAILED` + +Special case: +- `task_file_info` may return `{}` while the artifact is not ready yet (not an error). + ## Typical Flow ### 1. Get example prompts diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index 0f2b09c6..a31e6516 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -40,6 +40,7 @@ For `task_create`: - You can fetch example prompts. - You can create a plan task. - You can download the report artifact. 
+- Your client can parse `error.code` and `error.message` and handle `{}` from `task_file_info` as "not ready yet". --- diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index a56edc84..2a7a3de9 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -474,36 +474,65 @@ Recommended practice for MCP clients: ## 9. Error Model -Errors MUST return: +### 9.1 Error object shape -- code: stable machine-readable -- message: human-readable -- details: optional +Tool errors return: -**Example:** +- `error.code`: stable machine-readable string +- `error.message`: human-readable message +- `error.details`: optional object + +Example: ```json { "error": { - "code": "RUN_ALREADY_ACTIVE", - "message": "A run is currently active for this task.", - "details": { "run_id": "run_0001" } + "code": "TASK_NOT_FOUND", + "message": "Task not found: " } } ``` -### 9.1 Required error codes - -- TASK_NOT_FOUND -- RUN_NOT_FOUND -- RUN_ALREADY_ACTIVE -- RUN_NOT_ACTIVE -- INVALID_TARGET -- INVALID_ARTIFACT_URI -- CONFLICT -- PERMISSION_DENIED -- RUNNING_READONLY -- INTERNAL_ERROR +### 9.2 isError behavior + +- `task_create`, `task_status`, `task_stop`: unknown/invalid requests return `isError=true` with `error`. +- `task_file_info`: uses mixed behavior: + - returns `{}` (not an error) while artifacts are not ready. + - may return `{"error": ...}` with `isError=false` for terminal artifact-level problems. + - returns `isError=true` for unknown task id (`TASK_NOT_FOUND`). +- `mcp_local` may return proxy/transport failures as `REMOTE_ERROR` and local download write failures as `DOWNLOAD_FAILED`. + +### 9.3 Minimal code contract (current) + +Cloud/core tool codes: + +- `INVALID_TOOL`: unknown MCP tool name. +- `INTERNAL_ERROR`: uncaught server error. +- `TASK_NOT_FOUND`: task id not found. +- `INVALID_USER_API_KEY`: provided user_api_key is invalid. +- `USER_API_KEY_REQUIRED`: deployment requires user_api_key for task_create. 
+- `INSUFFICIENT_CREDITS`: caller account has no credits for task_create. +- `generation_failed`: task_file_info report path when task ended in failed. +- `content_unavailable`: task_file_info cannot read requested artifact bytes. + +Local proxy specific codes: + +- `REMOTE_ERROR`: mcp_local could not call mcp_cloud (network/HTTP/protocol layer failure). +- `DOWNLOAD_FAILED`: mcp_local could not write/download artifact to local filesystem. + +### 9.4 Caller handling guidance + +- Retry with backoff: + - `INTERNAL_ERROR` + - `REMOTE_ERROR` + - `content_unavailable` (short retry window) +- Do not retry unchanged request: + - `INVALID_USER_API_KEY` + - `USER_API_KEY_REQUIRED` + - `INSUFFICIENT_CREDITS` + - `INVALID_TOOL` +- For `TASK_NOT_FOUND`: verify task_id source and stop polling that id. +- For `generation_failed`: treat as terminal failure and surface task progress_message to user. --- diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index 4047157d..ed4440a0 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -51,6 +51,7 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - The server communicates over stdio (standard input/output) following the MCP protocol. - Tools are registered via `@mcp_cloud.list_tools()` and handled via `@mcp_cloud.call_tool()`. - All tool responses must be JSON-serializable and follow the error model in the spec. +- Keep tool error codes/docs aligned with actual runtime payloads (for example `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `generation_failed`, `content_unavailable`, `INTERNAL_ERROR`). - Event cursors use format `cursor_{event_id}` for incremental polling. - **Run as task**: We expose MCP **tools** only (task_create, task_status, task_stop, etc.), not the MCP **tasks** protocol (tasks/get, tasks/result, etc.). Do not advertise the tasks capability or add "Run as task" support; the spec and clients (e.g. 
Cursor) are aligned on tools-only. diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index 758fd407..c6a8d877 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -140,6 +140,11 @@ See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `completed`: terminal success, download is ready. - `failed`: terminal error. +Minimal error contract: +- Tool errors use `{"error":{"code","message","details?"}}`. +- Common codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. +- `task_file_info` may return `{}` while output is not ready (not an error payload). + Note: `task_download` is a synthetic tool provided by `mcp_local`, not by this server. If your client exposes `task_download`, use it to save the report or zip locally; otherwise use `task_file_info` to get `download_url` and fetch the file yourself. **Tip**: Call `prompt_examples` to get example prompts to use with task_create, then call `model_profiles` to choose `model_profile` based on current runtime availability. The prompt catalog is the same as in the frontends (`worker_plan.worker_plan_api.PromptCatalog`). When running with `PYTHONPATH` set to the repo root (e.g. stdio setup), the catalog is loaded automatically; otherwise built-in examples are returned. diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index fce54f43..87f2b8aa 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -132,6 +132,7 @@ def ensure_taskitem_stop_columns() -> None: "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. 
" "task_file_info download_url is absolute when PLANEXE_MCP_PUBLIC_BASE_URL is configured or request host is available. " "If download_url is missing, configure PLANEXE_MCP_PUBLIC_BASE_URL on the server. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " @@ -1055,6 +1056,7 @@ class ToolDefinition: "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " "If you are unsure which model_profile to choose, call model_profiles first. " "If your deployment uses credits, include user_api_key to charge the correct account. " + "Common error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS. " "Optional runtime overrides such as speed_vs_detail are intentionally hidden from the visible tool schema " "and can be provided via tool-specific metadata by developers." ), @@ -1068,6 +1070,7 @@ class ToolDefinition: "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " "and frequent polling is unnecessary. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "Unknown task_id returns error code TASK_NOT_FOUND. " "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " "processing with no file-output changes for >20 minutes likely means failed/stalled. " "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." @@ -1090,7 +1093,8 @@ class ToolDefinition: "Returns file metadata (content_type, download_url, download_size) for the report or zip. " "If your client exposes task_download (e.g. mcp_local), use that to save the file locally; " "otherwise use this tool to get download_url and fetch the file yourself. " - "download_url is generated from PLANEXE_MCP_PUBLIC_BASE_URL (or request host when available)." + "download_url is generated from PLANEXE_MCP_PUBLIC_BASE_URL (or request host when available). 
" + "Returns {} while artifact is not ready. Terminal tool-level error payloads use codes generation_failed or content_unavailable." ), input_schema=TASK_FILE_INFO_INPUT_SCHEMA, output_schema=TASK_FILE_INFO_OUTPUT_SCHEMA, diff --git a/mcp_cloud/tests/test_task_file_info_tool.py b/mcp_cloud/tests/test_task_file_info_tool.py index 1abd2d17..016656e8 100644 --- a/mcp_cloud/tests/test_task_file_info_tool.py +++ b/mcp_cloud/tests/test_task_file_info_tool.py @@ -97,6 +97,32 @@ def test_report_read_zip_for_failed_task(self): self.assertEqual(payload["download_size"], len(content_bytes)) self.assertEqual(payload["content_type"], ZIP_CONTENT_TYPE) + def test_task_file_info_returns_empty_object_when_pending(self): + task_id = str(uuid.uuid4()) + task_snapshot = { + "id": "task-id", + "state": TaskState.pending, + "progress_message": None, + } + with patch("mcp_cloud.app._get_task_for_report_sync", return_value=task_snapshot): + result = asyncio.run(handle_task_file_info({"task_id": task_id})) + + self.assertFalse(result.isError) + self.assertEqual(result.structuredContent, {}) + + def test_task_file_info_returns_generation_failed_payload(self): + task_id = str(uuid.uuid4()) + task_snapshot = { + "id": "task-id", + "state": TaskState.failed, + "progress_message": "Pipeline failed", + } + with patch("mcp_cloud.app._get_task_for_report_sync", return_value=task_snapshot): + result = asyncio.run(handle_task_file_info({"task_id": task_id, "artifact": "report"})) + + self.assertFalse(result.isError) + self.assertEqual(result.structuredContent["error"]["code"], "generation_failed") + def test_sanitize_legacy_zip_snapshot_removes_track_activity_jsonl(self): buffer = BytesIO() with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zip_file: diff --git a/mcp_cloud/tests/test_task_status_tool.py b/mcp_cloud/tests/test_task_status_tool.py index 2f0cfb54..d48309ca 100644 --- a/mcp_cloud/tests/test_task_status_tool.py +++ b/mcp_cloud/tests/test_task_status_tool.py @@ -83,6 +83,14 @@ 
def test_task_status_uses_processing_state_name(self): self.assertEqual(result.structuredContent["state"], "processing") + def test_task_status_returns_task_not_found_error(self): + task_id = str(uuid.uuid4()) + with patch("mcp_cloud.app._get_task_status_snapshot_sync", return_value=None): + result = asyncio.run(handle_task_status({"task_id": task_id})) + + self.assertTrue(result.isError) + self.assertEqual(result.structuredContent["error"]["code"], "TASK_NOT_FOUND") + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 8fcb9fd3..ddc76625 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Any, Literal from pydantic import BaseModel, Field @@ -6,6 +6,7 @@ class ErrorDetail(BaseModel): code: str message: str + details: dict[str, Any] | None = None class PromptExamplesOutput(BaseModel): diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index c412722e..7cea3fb0 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -39,6 +39,7 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. - HTTP wrapper (`/mcp/tools/call`) - Streamable MCP JSON-RPC (`/mcp`) - Ensure all tool responses include structured content when an output schema is defined. +- Keep local proxy error semantics documented and stable (`REMOTE_ERROR`, `DOWNLOAD_FAILED`) and pass through cloud error payloads unchanged when possible. - Tool-surface split must remain explicit: - local exposes `task_download`. - cloud exposes `task_file_info`. diff --git a/mcp_local/README.md b/mcp_local/README.md index 1838a4b5..a126e463 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -20,6 +20,12 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas - `completed`: terminal success, download is ready. - `failed`: terminal error. +Minimal error contract: +- Tool errors use `{"error":{"code","message","details?"}}`. 
+- Common proxied cloud codes include: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. +- Local proxy specific codes: `REMOTE_ERROR`, `DOWNLOAD_FAILED`. +- `task_file_info` (called under the hood by task_download) may return `{}` while output is not ready. + **Tip**: Call `prompt_examples` to get example prompts to use with task_create. The full catalog lives at `worker_plan/worker_plan_api/prompt/data/simple_plan_prompts.jsonl`. `task_download` is a synthetic tool provided by the local proxy. It calls the diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index a1e20eb1..b2c761a7 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -311,6 +311,7 @@ class ToolDefinition: "properties": { "code": {"type": "string"}, "message": {"type": "string"}, + "details": {"type": ["object", "null"]}, }, "required": ["code", "message"], } @@ -551,6 +552,7 @@ class ToolDefinition: "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " "If you are unsure which model_profile to choose, call model_profiles first. " + "Common proxied error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS, REMOTE_ERROR. " "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." ), input_schema=TASK_CREATE_INPUT_SCHEMA, @@ -563,6 +565,7 @@ class ToolDefinition: "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " "and frequent polling is unnecessary. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. 
" + "Unknown task_id returns TASK_NOT_FOUND (or REMOTE_ERROR when transport fails). " "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " "processing with no file-output changes for >20 minutes likely means failed/stalled. " "Report these issues to https://github.com/PlanExeOrg/PlanExe/issues ." @@ -585,7 +588,8 @@ class ToolDefinition: "Download the plan output and save it locally to PLANEXE_PATH. " "Choose the HTML report (default) or a zip of all generated files. " "If PLANEXE_PATH is unset, files are saved to the current working directory. " - "Filename format is - with numeric suffixes when collisions occur." + "Filename format is - with numeric suffixes when collisions occur. " + "Common local error codes: DOWNLOAD_FAILED, REMOTE_ERROR." ), input_schema=TASK_DOWNLOAD_INPUT_SCHEMA, output_schema=TASK_DOWNLOAD_OUTPUT_SCHEMA, @@ -603,6 +607,7 @@ class ToolDefinition: "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " "Step 3 — Only then call task_create with the approved prompt. " "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " + "Tool errors use {error:{code,message}}. task_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " diff --git a/public/llms.txt b/public/llms.txt index 97b0f8f9..f2c9a476 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -84,6 +84,12 @@ task_status caller contract: - completed: terminal success; download is ready. - failed: terminal error. 
+Minimal error-handling contract: +- Errors use `{"error":{"code","message","details?"}}`. +- Common cloud/core codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. +- Common local-proxy codes: `REMOTE_ERROR`, `DOWNLOAD_FAILED`. +- `task_file_info` may return `{}` while artifact output is not ready yet. + Recommended interaction order: 1. Call prompt_examples. 2. Optionally call model_profiles to choose model_profile based on current availability. From 70ebb0f58069908dd837c64875d1a85b2d188965 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:33:25 +0100 Subject: [PATCH 12/38] Clarify MCP flow wording and non-tool approval step --- README.md | 7 ++++--- docs/mcp/mcp_details.md | 11 ++++++++--- docs/mcp/mcp_setup.md | 4 ++-- docs/mcp/planexe_mcp_interface.md | 8 ++++---- mcp_cloud/AGENTS.md | 1 + mcp_cloud/README.md | 1 + mcp_cloud/app.py | 19 ++++++++++--------- mcp_cloud/http_server.py | 8 ++++---- mcp_local/AGENTS.md | 1 + mcp_local/README.md | 2 +- mcp_local/planexe_mcp_local.py | 15 ++++++++------- public/llms.txt | 2 +- 12 files changed, 45 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 459d9725..68f7f2fe 100644 --- a/README.md +++ b/README.md @@ -52,9 +52,10 @@ The Tool workflow (tools-only, not MCP tasks protocol) 1. `prompt_examples` 2. `model_profiles` (optional, helps choose `model_profile`) -3. `task_create` -4. `task_status` (poll every 5 minutes until done) -5. download the result via `task_download` or via `task_file_info` +3. non-tool step: draft/approve prompt +4. `task_create` +5. `task_status` (poll every 5 minutes until done) +6. 
download the result via `task_download` or via `task_file_info` ### Option A: Remote MCP (fastest path) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 73e9215d..841e4b8d 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -10,6 +10,7 @@ This document lists the MCP tools exposed by PlanExe and example prompts for age - The primary MCP server runs in the cloud (see `mcp_cloud`). - The local MCP proxy (`mcp_local`) forwards calls to the server and adds a local download helper. - Tool responses return JSON in both `content.text` and `structuredContent`. +- Workflow note: drafting and user approval of the prompt is a non-tool step between setup tools and `task_create`. ## Tool Catalog, `mcp_cloud` @@ -289,7 +290,11 @@ Tool call: {} ``` -### 3. Create a plan +### 3. Draft and approve the prompt (non-tool step) + +At this step, the agent writes a high-quality prompt draft, shows it to the user, and waits for approval. + +### 4. Create a plan The user reviews the prompt and either asks for further changes or confirms it’s good to go. When the user confirms, the agent calls `task_create` with that prompt. @@ -298,7 +303,7 @@ Tool call: {"prompt": "..."} ``` -### 4. Get status +### 5. Get status Prompt: ``` @@ -310,7 +315,7 @@ Tool call: {"task_id": ""} ``` -### 5. Download the report +### 6. Download the report Prompt: ``` diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index a31e6516..711c2ddf 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -12,9 +12,9 @@ This is the shortest path to a working PlanExe MCP integration. 1. Ask for prompt examples. 2. Inspect `model_profile` options and available models. -3. Expand the user idea into a high‑quality prompt. +3. Expand the user idea into a high‑quality prompt (non-tool step) and get user approval. 4. Create the plan task. -5. Poll for status. +5. Poll for status (about every 5 minutes). 6. Download the report (HTML or zip). 
--- diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 2a7a3de9..2f4383a1 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Step 1 — Call prompt_examples to fetch example prompts. Optional before task_create: call model_profiles to inspect profile guidance and available models under current whitelist settings. Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. Then poll task_status; use task_download or task_file_info when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call task_stop with the task_id from task_create. +- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models under current whitelist settings. Then complete a non-tool step: formulate a good prompt (use examples as a baseline; similar structure) and get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` or `task_file_info` when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). 
### 1.3 Scope of this document @@ -73,7 +73,7 @@ The MCP specification defines two different mechanisms: - **MCP tools** (e.g. task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_download or task_file_info. This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. The intended flow is: Step 1 — call prompt_examples; optional before task_create — call model_profiles; Step 2 — formulate a good prompt (user approval); Step 3 — call task_create; then poll task_status and call task_download or task_file_info when complete. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: call `prompt_examples`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `task_create`; poll `task_status`; then call `task_download` or `task_file_info`. --- @@ -156,7 +156,7 @@ All tool names below are normative. ### 6.1 prompt_examples -**Step 1 — Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. Correct flow: Step 1 — call this tool to fetch examples. 
Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). Step 3 — Only then call task_create with the approved prompt. If you call task_create before formulating and approving a prompt, the resulting plan will be lower quality than it could be. +**Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. Correct flow: call this tool; optionally call `model_profiles`; then complete a non-tool step (draft and approve prompt); only then call `task_create`. If you call `task_create` before formulating and approving a prompt, the resulting plan will be lower quality than it could be. **Request:** no parameters (empty object). @@ -213,7 +213,7 @@ Use the returned `profile` values directly in `task_create.model_profile`. ### 6.2 task_create -**Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2).** Start creating a new plan with the approved prompt. +**Call only after prompt_examples and after the non-tool drafting/approval step.** Start creating a new plan with the approved prompt. **Request** diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index ed4440a0..0d5c57d7 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -34,6 +34,7 @@ for AI agents and developer tools to interact with PlanExe. Communicates with ## task_create contract - Expose `model_profiles` as the discovery tool for profile selection. - `model_profiles` must report profile guidance and currently available models after class whitelist filtering. +- Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. 
- Visible input schema is intentionally limited to: - `prompt` - `model_profile` (`baseline`, `premium`, `frontier`, `custom`) diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index c6a8d877..2a934384 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -15,6 +15,7 @@ mcp_cloud provides a standardized MCP interface for PlanExe's plan generation wo ## Run as task (MCP tasks protocol) MCP has two ways to run long-running work: **tools** (what we use) and the **tasks** protocol ("Run as task" in some UIs). PlanExe uses **tools only**: `prompt_examples`, `model_profiles`, `task_create`, `task_status`, `task_stop`, `task_file_info` (or `task_download` via `mcp_local`). The agent creates a task, polls status, then downloads; that is the intended flow per `docs/mcp/planexe_mcp_interface.md`. We do not advertise or implement the MCP tasks protocol (tasks/get, tasks/result, etc.). Clients like Cursor do not support it properly—use the tools directly. +Workflow clarity: prompt drafting + user approval is a non-tool step between setup tools and `task_create`. ## Client Choice Guide diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 87f2b8aa..e0d8430c 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -127,11 +127,11 @@ def ensure_taskitem_stop_columns() -> None: "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " - "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " + "Required interaction order: call prompt_examples first. " "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. 
" - "Step 2 — Formulate a good prompt (use the examples as a baseline; draft a prompt with similar structure; get user approval). " - "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Then perform a non-tool step: draft a strong prompt and get user approval. " + "Only after approval, call task_create. " + "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. " "task_file_info download_url is absolute when PLANEXE_MCP_PUBLIC_BASE_URL is configured or request host is available. " "If download_url is missing, configure PLANEXE_MCP_PUBLIC_BASE_URL on the server. " @@ -1031,9 +1031,10 @@ class ToolDefinition: ToolDefinition( name="prompt_examples", description=( - "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " + "Call this first. Returns example prompts that define what a good prompt looks like. " "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " - "Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " + "Next is a non-tool step: formulate a prompt (use examples as a baseline, similar structure) and get user approval. " + "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, @@ -1051,7 +1052,7 @@ class ToolDefinition: ToolDefinition( name="task_create", description=( - "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). 
" + "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " "If you are unsure which model_profile to choose, call model_profiles first. " @@ -1220,8 +1221,8 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: payload = { "samples": samples, "message": ( - "Step 1 done. Next: Step 2 — Formulate a good prompt using these as a baseline (similar structure). Get user approval. " - "Step 3 — Only then call task_create with the approved prompt. " + "Next: complete the non-tool step by drafting a good prompt using these as a baseline (similar structure), then get user approval. " + "Only after approval, call task_create. " "Do not use PlanExe for tiny one-shot requests (e.g., rewrite this email, summarize this document). " "PlanExe always runs the full fixed planning pipeline; callers cannot run only selected internal steps." ), diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 7934ce6b..60274e0f 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -401,11 +401,11 @@ def _register_tools(server: FastMCP) -> None: "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " - "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " + "Required interaction order: call prompt_examples first. 
" "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " - "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " - "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Then perform a non-tool step: draft a strong prompt and get user approval. " + "Only after approval, call task_create. " + "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index 7cea3fb0..5da2620b 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -11,6 +11,7 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. then downloads the artifact to `PLANEXE_PATH` on the local machine. - `task_create` visible input schema includes `prompt` and optional `model_profile`. - Use `model_profiles` to help agents select `task_create.model_profile` without relying on internal file knowledge. +- Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. - Runtime override `speed_vs_detail` is metadata-only (hidden from visible schema); when callers still pass legacy top-level `speed_vs_detail`/`speed`, forward those into `metadata.task_create` for backward compatibility. 
diff --git a/mcp_local/README.md b/mcp_local/README.md index a126e463..3a94fe35 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -43,7 +43,7 @@ file locally into `PLANEXE_PATH`. Some MCP clients (e.g. the MCP Inspector) show a **"Run as task"** option for tools. That refers to the MCP **tasks** protocol: a separate mechanism where the client runs a tool in the background using RPC methods like `tasks/run`, `tasks/get`, `tasks/result`, and `tasks/cancel`, instead of a single blocking tool call. -**PlanExe does not use or advertise the MCP tasks protocol.** Our interface is **tool-based** only: the agent calls `prompt_examples` and `model_profiles` for setup, then `task_create` → gets a `task_id` → polls `task_status` → uses `task_download`. That flow is defined in `docs/mcp/planexe_mcp_interface.md` and is the intended design. +**PlanExe does not use or advertise the MCP tasks protocol.** Our interface is **tool-based** only: the agent calls `prompt_examples` and `model_profiles` for setup, completes a non-tool prompt drafting/approval step, then `task_create` → gets a `task_id` → polls `task_status` → uses `task_download`. That flow is defined in `docs/mcp/planexe_mcp_interface.md` and is the intended design. You should **not** enable "Run as task" for PlanExe. The Python MCP SDK and clients like Cursor do not properly support the tasks protocol (method registration and initialization fail). Use the tools directly: create a task, poll status, then download when done. diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index b2c761a7..a463e809 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -529,9 +529,10 @@ class ToolDefinition: ToolDefinition( name="prompt_examples", description=( - "Step 1 — Call this first. Returns example prompts that define what a good prompt looks like. " + "Call this first. Returns example prompts that define what a good prompt looks like. 
" "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " - "Next: formulate a prompt (use examples as a baseline, similar structure), get user approval, then call task_create (Step 3). " + "Next is a non-tool step: formulate a prompt (use examples as a baseline, similar structure) and get user approval. " + "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), input_schema=PROMPT_EXAMPLES_INPUT_SCHEMA, @@ -549,7 +550,7 @@ class ToolDefinition: ToolDefinition( name="task_create", description=( - "Step 3 — Call only after prompt_examples (Step 1) and after you have formulated a good prompt and got user approval (Step 2). " + "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " "If you are unsure which model_profile to choose, call model_profiles first. " "Common proxied error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS, REMOTE_ERROR. " @@ -602,11 +603,11 @@ class ToolDefinition: "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " - "Required interaction order: Step 1 — Call prompt_examples to fetch example prompts. " + "Required interaction order: call prompt_examples first. " "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. 
" - "Step 2 — Formulate a good prompt (use examples as a baseline; similar structure; get user approval). " - "Step 3 — Only then call task_create with the approved prompt. " - "Then poll task_status; use task_download when complete. To stop, call task_stop with the task_id from task_create. " + "Then perform a non-tool step: draft a strong prompt and get user approval. " + "Only after approval, call task_create. " + "Then poll task_status (about every 5 minutes); use task_download when complete. To stop, call task_stop with the task_id from task_create. " "Tool errors use {error:{code,message}}. task_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " diff --git a/public/llms.txt b/public/llms.txt index f2c9a476..e69d7186 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -93,7 +93,7 @@ Minimal error-handling contract: Recommended interaction order: 1. Call prompt_examples. 2. Optionally call model_profiles to choose model_profile based on current availability. -3. Prepare and approve a strong prompt. +3. Non-tool step: prepare and approve a strong prompt. 4. Call task_create. 5. Poll task_status until complete (repeat every 5 minutes). 6. Use task_file_info to get download URLs. 
From 64cc1cfce9ab9cb2ec05347904475a987c9e96af Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:39:43 +0100 Subject: [PATCH 13/38] Clarify MCP concurrency semantics and client responsibilities --- README.md | 2 ++ docs/mcp/mcp_details.md | 9 +++++++++ docs/mcp/mcp_setup.md | 1 + docs/mcp/planexe_mcp_interface.md | 7 +++++++ mcp_cloud/AGENTS.md | 1 + mcp_cloud/README.md | 5 +++++ mcp_cloud/app.py | 2 ++ mcp_cloud/http_server.py | 1 + mcp_local/AGENTS.md | 1 + mcp_local/README.md | 5 +++++ mcp_local/planexe_mcp_local.py | 2 ++ public/llms.txt | 5 +++++ 12 files changed, 41 insertions(+) diff --git a/README.md b/README.md index 68f7f2fe..08227f26 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,8 @@ The Tool workflow (tools-only, not MCP tasks protocol) 5. `task_status` (poll every 5 minutes until done) 6. download the result via `task_download` or via `task_file_info` +Concurrency note: each `task_create` call returns a new `task_id`; server-side global per-client concurrency is not capped, so clients should track their own parallel tasks. + ### Option A: Remote MCP (fastest path) #### Prerequisites diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 841e4b8d..a9941389 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -262,6 +262,15 @@ Common local proxy error codes: Special case: - `task_file_info` may return `{}` while the artifact is not ready yet (not an error). +## Concurrency semantics (practical) + +- Each `task_create` call creates a new task with a new `task_id`. +- The server does not enforce a global “one active task per client” cap. +- Parallelism is a client orchestration concern: + - start with 1 task + - scale to 2 in parallel if needed + - avoid more than 4 unless you have strong task-tracking UX + ## Typical Flow ### 1. 
Get example prompts diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index 711c2ddf..fe1997c2 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -41,6 +41,7 @@ For `task_create`: - You can create a plan task. - You can download the report artifact. - Your client can parse `error.code` and `error.message` and handle `{}` from `task_file_info` as "not ready yet". +- If running parallel work, your client tracks multiple `task_id`s explicitly (server-side global cap is not enforced). --- diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 2f4383a1..4edf8a72 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -326,6 +326,7 @@ For the full catalog file: - Must be idempotent only if client supplies an optional client_request_id (optional extension). - Task config is immutable after creation in v1. +- By default, repeated `task_create` calls produce new tasks (new `task_id`s). --- @@ -470,6 +471,12 @@ Recommended practice for MCP clients: - If needed, increase to 2 tasks in parallel. - Going beyond 4 parallel tasks is usually hard to track; avoid unless necessary. +Additional semantics: + +- Every `task_create` call creates a new independent task with a new `task_id`. +- The server does not deduplicate “same prompt” requests into a single shared task. +- Keep your own task registry/client state if you run multiple tasks concurrently. + --- ## 9. Error Model diff --git a/mcp_cloud/AGENTS.md b/mcp_cloud/AGENTS.md index 0d5c57d7..386f50e1 100644 --- a/mcp_cloud/AGENTS.md +++ b/mcp_cloud/AGENTS.md @@ -35,6 +35,7 @@ for AI agents and developer tools to interact with PlanExe. Communicates with - Expose `model_profiles` as the discovery tool for profile selection. - `model_profiles` must report profile guidance and currently available models after class whitelist filtering. 
- Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. +- Keep concurrency wording explicit: each `task_create` call creates a new `task_id`; no global per-client concurrency cap is enforced server-side. - Visible input schema is intentionally limited to: - `prompt` - `model_profile` (`baseline`, `premium`, `frontier`, `custom`) diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index 2a934384..00ec6a65 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -141,6 +141,11 @@ See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `completed`: terminal success, download is ready. - `failed`: terminal error. +Concurrency semantics: +- Each `task_create` call creates a new `task_id`. +- Server does not enforce a global one-task-at-a-time cap per client. +- Client should track task ids explicitly when running tasks in parallel. + Minimal error contract: - Tool errors use `{"error":{"code","message","details?"}}`. - Common codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index e0d8430c..1cfb4388 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -131,6 +131,7 @@ def ensure_taskitem_stop_columns() -> None: "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " "Then perform a non-tool step: draft a strong prompt and get user approval. " "Only after approval, call task_create. " + "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. 
" "task_file_info download_url is absolute when PLANEXE_MCP_PUBLIC_BASE_URL is configured or request host is available. " @@ -1055,6 +1056,7 @@ class ToolDefinition: "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " + "Each task_create call creates a new task_id (no server-side dedup). " "If you are unsure which model_profile to choose, call model_profiles first. " "If your deployment uses credits, include user_api_key to charge the correct account. " "Common error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS. " diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 60274e0f..d3d1ffbe 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -405,6 +405,7 @@ def _register_tools(server: FastMCP) -> None: "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " "Then perform a non-tool step: draft a strong prompt and get user approval. " "Only after approval, call task_create. " + "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). 
" diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index 5da2620b..e7fbac93 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -12,6 +12,7 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. - `task_create` visible input schema includes `prompt` and optional `model_profile`. - Use `model_profiles` to help agents select `task_create.model_profile` without relying on internal file knowledge. - Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. +- Keep concurrency wording explicit: each `task_create` call creates a new `task_id`; no global per-client concurrency cap is enforced server-side. - Runtime override `speed_vs_detail` is metadata-only (hidden from visible schema); when callers still pass legacy top-level `speed_vs_detail`/`speed`, forward those into `metadata.task_create` for backward compatibility. diff --git a/mcp_local/README.md b/mcp_local/README.md index 3a94fe35..77770b13 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -20,6 +20,11 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas - `completed`: terminal success, download is ready. - `failed`: terminal error. +Concurrency semantics: +- Each `task_create` call creates a new `task_id`. +- Server does not enforce a global one-task-at-a-time cap per client. +- Local clients should track task ids explicitly when running tasks in parallel. + Minimal error contract: - Tool errors use `{"error":{"code","message","details?"}}`. - Common proxied cloud codes include: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. 
diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index a463e809..75268cd0 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -552,6 +552,7 @@ class ToolDefinition: description=( "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " + "Each task_create call creates a new task_id (proxied to cloud; no server-side dedup). " "If you are unsure which model_profile to choose, call model_profiles first. " "Common proxied error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS, REMOTE_ERROR. " "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." @@ -607,6 +608,7 @@ class ToolDefinition: "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " "Then perform a non-tool step: draft a strong prompt and get user approval. " "Only after approval, call task_create. " + "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_download when complete. To stop, call task_stop with the task_id from task_create. " "Tool errors use {error:{code,message}}. task_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " diff --git a/public/llms.txt b/public/llms.txt index e69d7186..e0f2bf38 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -99,6 +99,11 @@ Recommended interaction order: 6. Use task_file_info to get download URLs. 7. Use task_stop if the run must be cancelled. +Concurrency semantics: +- Each task_create call creates a new task_id. 
+- Server does not enforce a global per-client concurrency cap. +- Client should track task_ids and usually start with 1 active task, then 2 if needed. + Note: - task_download is provided by mcp_local wrappers in some client setups, not by mcp_cloud directly. - In mcp_local, downloads save to PLANEXE_PATH (or current working directory if PLANEXE_PATH is unset). From 4424425156298c23cca7d16cd072f4129ce43559 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 19:52:09 +0100 Subject: [PATCH 14/38] Hide internal whitelist details from model_profiles UX --- docs/mcp/mcp_details.md | 4 --- docs/mcp/planexe_mcp_interface.md | 6 +---- mcp_cloud/README.md | 2 +- mcp_cloud/app.py | 27 ++++++--------------- mcp_cloud/http_server.py | 2 +- mcp_cloud/tests/test_model_profiles_tool.py | 4 --- mcp_cloud/tool_models.py | 17 ++----------- mcp_local/README.md | 2 +- mcp_local/planexe_mcp_local.py | 15 ++---------- public/llms.txt | 2 +- 10 files changed, 16 insertions(+), 65 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index a9941389..b161efab 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -47,16 +47,12 @@ Example call: Response includes: - `default_profile` -- `whitelist_active` -- `whitelisted_classes` - `profiles[]` with: - `profile` - `title` - `summary` - - `config_filename` - `available` - `model_count` - - `filtered_out_count` - `models[]` (`key`, `provider_class`, `model`, `priority`) ### task_create diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 4edf8a72..308cf637 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, 
governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models under current whitelist settings. Then complete a non-tool step: formulate a good prompt (use examples as a baseline; similar structure) and get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` or `task_file_info` when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. +- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a good prompt (use examples as a baseline; similar structure) and get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` or `task_file_info` when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). 
### 1.3 Scope of this document @@ -182,17 +182,13 @@ Optional helper tool to discover valid `model_profile` choices and currently ava ```json { "default_profile": "baseline", - "whitelist_active": true, - "whitelisted_classes": ["openrouter"], "profiles": [ { "profile": "baseline", "title": "Baseline", "summary": "Cheap and fast; recommended default for most runs.", - "config_filename": "baseline.json", "available": true, "model_count": 5, - "filtered_out_count": 2, "models": [ { "key": "openrouter-gpt-oss-20b", diff --git a/mcp_cloud/README.md b/mcp_cloud/README.md index 00ec6a65..4dd85075 100644 --- a/mcp_cloud/README.md +++ b/mcp_cloud/README.md @@ -130,7 +130,7 @@ mcp_cloud uses the same database configuration as other PlanExe services: See `docs/mcp/planexe_mcp_interface.md` for full specification. Available tools: - `prompt_examples` - Return example prompts. Use these as examples for task_create. -- `model_profiles` - List profile options and currently available models after whitelist filtering. +- `model_profiles` - List profile options and currently available models in each profile. - `task_create` - Create a new task (returns task_id as UUID; may require user_api_key for credits) - `task_status` - Get task status and progress - `task_stop` - Stop an active task diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 1cfb4388..0ea51571 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -128,7 +128,7 @@ def ensure_taskitem_stop_columns() -> None: "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: call prompt_examples first. " - "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. 
" + "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " "Then perform a non-tool step: draft a strong prompt and get user approval. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " @@ -735,14 +735,12 @@ def sort_key(item: tuple[str, Any]) -> tuple[int, str]: def _extract_model_profile_entries( model_map: dict[str, Any], whitelist: Optional[set[str]], -) -> tuple[list[dict[str, Any]], int]: +) -> list[dict[str, Any]]: models: list[dict[str, Any]] = [] - filtered_out_count = 0 for model_key, model_data in _sort_llm_config_entries(list(model_map.items())): class_name = model_data.get("class") if isinstance(model_data, dict) else None if not is_llm_class_allowed(class_name, whitelist): - filtered_out_count += 1 continue model_name = None @@ -768,7 +766,7 @@ def _extract_model_profile_entries( } ) - return models, filtered_out_count + return models def _profile_models_payload( @@ -783,10 +781,8 @@ def _profile_models_payload( "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "config_filename": config_filename, "available": False, "model_count": 0, - "filtered_out_count": 0, "models": [], } @@ -804,10 +800,8 @@ def _profile_models_payload( "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "config_filename": config_filename, "available": False, "model_count": 0, - "filtered_out_count": 0, "models": [], } @@ -816,22 +810,18 @@ def _profile_models_payload( "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "config_filename": config_filename, "available": False, "model_count": 0, - "filtered_out_count": 0, "models": [], } - models, filtered_out_count = 
_extract_model_profile_entries(model_map, whitelist) + models = _extract_model_profile_entries(model_map, whitelist) return { "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "config_filename": config_filename, "available": True, "model_count": len(models), - "filtered_out_count": filtered_out_count, "models": models, } @@ -844,16 +834,13 @@ def _get_model_profiles_sync() -> dict[str, Any]: _profile_models_payload(profile, whitelist) for profile in ModelProfileEnum ] - whitelist_values = sorted(whitelist) if whitelist is not None else [] return { "default_profile": default_profile, - "whitelist_active": whitelist is not None, - "whitelisted_classes": whitelist_values, "profiles": profiles, "message": ( "Use one of these profile values in task_create.model_profile. " - "Model lists reflect current PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES filtering." + "Model lists show what is currently available in each profile." ), } @@ -1045,7 +1032,7 @@ class ToolDefinition: name="model_profiles", description=( "Optional helper before task_create. Returns model_profile options with plain-language guidance " - "and currently available models after PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES filtering." + "and currently available models in each profile." 
), input_schema=MODEL_PROFILES_INPUT_SCHEMA, output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, @@ -1237,7 +1224,7 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: - """Return model profile options and available models after whitelist filtering.""" + """Return model profile options and currently available models in each profile.""" _ = ModelProfilesRequest(**(arguments or {})) payload = await asyncio.to_thread(_get_model_profiles_sync) return CallToolResult( diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index d3d1ffbe..385e9269 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -402,7 +402,7 @@ def _register_tools(server: FastMCP) -> None: "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: call prompt_examples first. " - "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. " + "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " "Then perform a non-tool step: draft a strong prompt and get user approval. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. 
" diff --git a/mcp_cloud/tests/test_model_profiles_tool.py b/mcp_cloud/tests/test_model_profiles_tool.py index 087e1636..fe3d3fcf 100644 --- a/mcp_cloud/tests/test_model_profiles_tool.py +++ b/mcp_cloud/tests/test_model_profiles_tool.py @@ -14,17 +14,13 @@ def test_model_profiles_tool_listed(self): def test_model_profiles_returns_structured_content(self): payload = { "default_profile": "baseline", - "whitelist_active": True, - "whitelisted_classes": ["openrouter"], "profiles": [ { "profile": "baseline", "title": "Baseline", "summary": "Cheap and fast; recommended default for most runs.", - "config_filename": "baseline.json", "available": True, "model_count": 1, - "filtered_out_count": 0, "models": [ { "key": "openrouter-gpt-oss-20b", diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index ddc76625..177851a5 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -50,16 +50,11 @@ class ModelProfileInfo(BaseModel): ) title: str = Field(..., description="Human-friendly profile label.") summary: str = Field(..., description="Short profile guidance for callers.") - config_filename: str = Field(..., description="Filename resolved for this profile.") available: bool = Field(..., description="True when the profile config file was found and parsed.") - model_count: int = Field(..., description="Number of models available after whitelist filtering.") - filtered_out_count: int = Field( - ..., - description="How many config entries were filtered out by class whitelist.", - ) + model_count: int = Field(..., description="Number of models currently available in this profile.") models: list[ModelProfileModelEntry] = Field( ..., - description="Models available to this profile after whitelist filtering.", + description="Models currently available to this profile.", ) @@ -68,14 +63,6 @@ class ModelProfilesOutput(BaseModel): ..., description="Default model profile used when task_create.model_profile is omitted/invalid.", ) - whitelist_active: bool = 
Field( - ..., - description="True when PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES is set.", - ) - whitelisted_classes: list[str] = Field( - ..., - description="Normalized whitelist class names currently applied.", - ) profiles: list[ModelProfileInfo] = Field( ..., description="Available profile options and their model inventory.", diff --git a/mcp_local/README.md b/mcp_local/README.md index 77770b13..06fa67ae 100644 --- a/mcp_local/README.md +++ b/mcp_local/README.md @@ -9,7 +9,7 @@ proxy forwards tool calls over HTTP and downloads artifacts from `/download/{tas ## Tools `prompt_examples` - Return example prompts. Use these as examples for task_create. You can also call `task_create` with any prompt—short prompts produce less detailed plans. -`model_profiles` - Show model_profile options and currently available models after whitelist filtering. +`model_profiles` - Show model_profile options and currently available models in each profile. `task_create` - Initiate creation of a plan. `task_status` - Get status and progress about the creation of a plan. `task_stop` - Abort creation of a plan. 
diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 75268cd0..330ef230 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -409,11 +409,6 @@ class ToolDefinition: "type": "string", "enum": ["baseline", "premium", "frontier", "custom"], }, - "whitelist_active": {"type": "boolean"}, - "whitelisted_classes": { - "type": "array", - "items": {"type": "string"}, - }, "profiles": { "type": "array", "items": { @@ -425,10 +420,8 @@ class ToolDefinition: }, "title": {"type": "string"}, "summary": {"type": "string"}, - "config_filename": {"type": "string"}, "available": {"type": "boolean"}, "model_count": {"type": "integer"}, - "filtered_out_count": {"type": "integer"}, "models": { "type": "array", "items": { @@ -447,10 +440,8 @@ class ToolDefinition: "profile", "title", "summary", - "config_filename", "available", "model_count", - "filtered_out_count", "models", ], }, @@ -459,8 +450,6 @@ class ToolDefinition: }, "required": [ "default_profile", - "whitelist_active", - "whitelisted_classes", "profiles", "message", ], @@ -542,7 +531,7 @@ class ToolDefinition: name="model_profiles", description=( "Optional helper before task_create. Returns model_profile options with plain-language guidance " - "and currently available models after whitelist filtering." + "and currently available models in each profile." ), input_schema=MODEL_PROFILES_INPUT_SCHEMA, output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, @@ -605,7 +594,7 @@ class ToolDefinition: "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: call prompt_examples first. " - "Optional before task_create: call model_profiles to see profile guidance and available models under current whitelist settings. 
" + "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " "Then perform a non-tool step: draft a strong prompt and get user approval. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " diff --git a/public/llms.txt b/public/llms.txt index e0f2bf38..2e6fe62c 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -71,7 +71,7 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): Key tool inputs/outputs: - task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). -- model_profiles output: profile guidance + currently available models after whitelist filtering. +- model_profiles output: profile guidance + currently available models in each profile. - task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). - task_create output: task_id (use this for task_status, task_stop, and task_file_info). - task_status output: current state and progress for a task_id. From 5c56800db9456091384b15f8d42890b6115eba5e Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 20:06:01 +0100 Subject: [PATCH 15/38] Removed "available" field. --- docs/mcp/mcp_details.md | 2 +- docs/mcp/planexe_mcp_interface.md | 4 ++-- mcp_cloud/app.py | 9 +++------ mcp_cloud/tests/test_model_profiles_tool.py | 4 ++-- mcp_cloud/tool_models.py | 1 - mcp_local/planexe_mcp_local.py | 2 -- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index b161efab..a491fc1b 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -34,6 +34,7 @@ Response includes `samples` (array of prompt strings, each 300–800 words) and Returns profile guidance and model availability for `task_create.model_profile`. 
This helps agents pick a profile without knowing internal `llm_config/*.json` details. +Profiles with zero models are omitted from the `profiles` list. Example prompt: ``` @@ -51,7 +52,6 @@ Response includes: - `profile` - `title` - `summary` - - `available` - `model_count` - `models[]` (`key`, `provider_class`, `model`, `priority`) diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 308cf637..d53f9bfb 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -174,6 +174,7 @@ All tool names below are normative. ### 6.1.1 model_profiles Optional helper tool to discover valid `model_profile` choices and currently available models without relying on internal config knowledge. +Profiles with zero available models are omitted from the returned `profiles` array. **Request:** no parameters (empty object). @@ -186,8 +187,7 @@ Optional helper tool to discover valid `model_profile` choices and currently ava { "profile": "baseline", "title": "Baseline", - "summary": "Cheap and fast; recommended default for most runs.", - "available": true, + "summary": "Cheap and fast; recommended default when creating a plan.", "model_count": 5, "models": [ { diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 0ea51571..fff30e00 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -181,7 +181,7 @@ def ensure_taskitem_stop_columns() -> None: ModelProfileEnum.CUSTOM.value: "Custom", } MODEL_PROFILE_SUMMARIES = { - ModelProfileEnum.BASELINE.value: "Cheap and fast; recommended default for most runs.", + ModelProfileEnum.BASELINE.value: "Cheap and fast; recommended default when creating a plan.", ModelProfileEnum.PREMIUM.value: "Higher-cost profile tuned for stronger output quality.", ModelProfileEnum.FRONTIER.value: "Most capable models first; usually slowest/most expensive.", ModelProfileEnum.CUSTOM.value: "User-managed profile file for custom model ordering.", @@ -781,7 +781,6 @@ def _profile_models_payload( "profile": 
profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "available": False, "model_count": 0, "models": [], } @@ -800,7 +799,6 @@ def _profile_models_payload( "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "available": False, "model_count": 0, "models": [], } @@ -810,7 +808,6 @@ def _profile_models_payload( "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "available": False, "model_count": 0, "models": [], } @@ -820,7 +817,6 @@ def _profile_models_payload( "profile": profile.value, "title": MODEL_PROFILE_TITLES[profile.value], "summary": MODEL_PROFILE_SUMMARIES[profile.value], - "available": True, "model_count": len(models), "models": models, } @@ -830,10 +826,11 @@ def _get_model_profiles_sync() -> dict[str, Any]: raw_whitelist = os.environ.get(ENV_PLANEXE_LLM_CONFIG_WHITELISTED_CLASSES) whitelist = parse_llm_class_whitelist(raw_whitelist) default_profile = resolve_model_profile_from_env().value - profiles = [ + profiles_all = [ _profile_models_payload(profile, whitelist) for profile in ModelProfileEnum ] + profiles = [profile for profile in profiles_all if int(profile.get("model_count") or 0) > 0] return { "default_profile": default_profile, diff --git a/mcp_cloud/tests/test_model_profiles_tool.py b/mcp_cloud/tests/test_model_profiles_tool.py index fe3d3fcf..0e445f41 100644 --- a/mcp_cloud/tests/test_model_profiles_tool.py +++ b/mcp_cloud/tests/test_model_profiles_tool.py @@ -18,8 +18,7 @@ def test_model_profiles_returns_structured_content(self): { "profile": "baseline", "title": "Baseline", - "summary": "Cheap and fast; recommended default for most runs.", - "available": True, + "summary": "Cheap and fast; recommended default when creating a plan.", "model_count": 1, "models": [ { @@ -40,6 +39,7 @@ def 
test_model_profiles_returns_structured_content(self): self.assertFalse(result.isError) self.assertEqual(result.structuredContent["default_profile"], "baseline") self.assertEqual(result.structuredContent["profiles"][0]["profile"], "baseline") + self.assertNotIn("available", result.structuredContent["profiles"][0]) if __name__ == "__main__": diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 177851a5..8e1e99c7 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -50,7 +50,6 @@ class ModelProfileInfo(BaseModel): ) title: str = Field(..., description="Human-friendly profile label.") summary: str = Field(..., description="Short profile guidance for callers.") - available: bool = Field(..., description="True when the profile config file was found and parsed.") model_count: int = Field(..., description="Number of models currently available in this profile.") models: list[ModelProfileModelEntry] = Field( ..., diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 330ef230..e71e8655 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -420,7 +420,6 @@ class ToolDefinition: }, "title": {"type": "string"}, "summary": {"type": "string"}, - "available": {"type": "boolean"}, "model_count": {"type": "integer"}, "models": { "type": "array", @@ -440,7 +439,6 @@ class ToolDefinition: "profile", "title", "summary", - "available", "model_count", "models", ], From ea62e722037c047c8417524252e0b82e9963ea70 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 20:10:11 +0100 Subject: [PATCH 16/38] task_file_info had PLANEXE_MCP_PUBLIC_BASE_URL in its description. The PLANEXE_MCP_PUBLIC_BASE_URL is an internal thing of PlanExe, and not something to expose to the outside. The outside world has no idea what's going on inside PlanExe. 
--- docs/mcp/planexe_mcp_interface.md | 5 +---- mcp_cloud/app.py | 5 ++--- mcp_cloud/http_server.py | 1 - mcp_cloud/tool_models.py | 10 ++-------- public/llms.txt | 2 +- 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index d53f9bfb..7e24445d 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -433,10 +433,7 @@ Requests the plan generation to stop. Pass the **task_id** (the UUID returned by **task_file_info URL behavior (mcp_cloud)** -- `download_url` is generated from `PLANEXE_MCP_PUBLIC_BASE_URL` when set. -- Otherwise, cloud HTTP mode uses request host/scheme when available. -- If no public base URL can be determined (for example some stdio-only flows), `download_url` may be absent. -- In deployments behind proxies/CDNs, set `PLANEXE_MCP_PUBLIC_BASE_URL` so clients receive a reachable URL. +- `download_url` is an absolute URL where the requested artifact can be downloaded. --- diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index fff30e00..41631108 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -134,8 +134,7 @@ def ensure_taskitem_stop_columns() -> None: "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. " - "task_file_info download_url is absolute when PLANEXE_MCP_PUBLIC_BASE_URL is configured or request host is available. " - "If download_url is missing, configure PLANEXE_MCP_PUBLIC_BASE_URL on the server. " + "task_file_info download_url is the absolute URL where the requested artifact can be downloaded. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. 
" "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " @@ -1080,7 +1079,7 @@ class ToolDefinition: "Returns file metadata (content_type, download_url, download_size) for the report or zip. " "If your client exposes task_download (e.g. mcp_local), use that to save the file locally; " "otherwise use this tool to get download_url and fetch the file yourself. " - "download_url is generated from PLANEXE_MCP_PUBLIC_BASE_URL (or request host when available). " + "download_url is an absolute URL where the requested artifact can be downloaded. " "Returns {} while artifact is not ready. Terminal tool-level error payloads use codes generation_failed or content_unavailable." ), input_schema=TASK_FILE_INFO_INPUT_SCHEMA, diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 385e9269..4125dd3c 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -563,7 +563,6 @@ async def call_tool( Call an MCP tool by name with arguments. This endpoint wraps the stdio-based MCP tool handlers for HTTP access. - Download URLs use the request host when PLANEXE_MCP_PUBLIC_BASE_URL is not set (set in middleware). """ arguments = dict(payload.arguments or {}) if payload.tool == "task_create": diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 8e1e99c7..2184651a 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -165,10 +165,7 @@ class TaskFileInfoReadyOutput(BaseModel): download_size: int = Field(..., description="Artifact size in bytes.") download_url: str | None = Field( default=None, - description=( - "Absolute artifact download URL when server base URL is known " - "(PLANEXE_MCP_PUBLIC_BASE_URL or request host)." 
- ), + description="Absolute URL where the requested artifact can be downloaded.", ) @@ -178,10 +175,7 @@ class TaskFileInfoOutput(BaseModel): download_size: int | None = Field(default=None, description="Artifact size in bytes.") download_url: str | None = Field( default=None, - description=( - "Absolute artifact download URL when server base URL is known. " - "May be omitted in some deployments." - ), + description="Absolute URL where the requested artifact can be downloaded.", ) error: ErrorDetail | None = None diff --git a/public/llms.txt b/public/llms.txt index 2e6fe62c..1441d5ef 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -107,7 +107,7 @@ Concurrency semantics: Note: - task_download is provided by mcp_local wrappers in some client setups, not by mcp_cloud directly. - In mcp_local, downloads save to PLANEXE_PATH (or current working directory if PLANEXE_PATH is unset). -- In mcp_cloud, task_file_info download_url depends on deployment URL settings; configure PLANEXE_MCP_PUBLIC_BASE_URL for stable absolute URLs. +- In mcp_cloud, task_file_info download_url is an absolute URL where the requested artifact can be downloaded. 
## Authentication From 6370d01d845c2ffd8575855205a52f3b3c3dd68c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 20:27:29 +0100 Subject: [PATCH 17/38] No model_profiles shown in the mcp model_profiles tool --- docker-compose.yml | 2 ++ docs/mcp/mcp_details.md | 2 ++ docs/mcp/planexe_mcp_interface.md | 2 ++ mcp_cloud/Dockerfile | 1 + mcp_cloud/app.py | 20 +++++++++++++++++++- mcp_cloud/tests/test_model_profiles_tool.py | 13 +++++++++++++ public/llms.txt | 3 ++- 7 files changed, 41 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 23588ae1..2bd9d77e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -242,6 +242,8 @@ services: PLANEXE_WORKER_PLAN_URL: ${PLANEXE_WORKER_PLAN_URL:-http://worker_plan:8000} ports: - "${PLANEXE_MCP_HTTP_PORT:-8001}:8001" + volumes: + - ./llm_config:/app/llm_config:ro restart: unless-stopped healthcheck: test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8001/healthcheck').read()"] diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index a491fc1b..57a45de8 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -35,6 +35,7 @@ Response includes `samples` (array of prompt strings, each 300–800 words) and Returns profile guidance and model availability for `task_create.model_profile`. This helps agents pick a profile without knowing internal `llm_config/*.json` details. Profiles with zero models are omitted from the `profiles` list. +If no models are available in any profile, `model_profiles` returns `isError=true` with `error.code = MODEL_PROFILES_UNAVAILABLE`. 
Example prompt: ``` @@ -248,6 +249,7 @@ Common cloud/core error codes: - `USER_API_KEY_REQUIRED` - `INSUFFICIENT_CREDITS` - `INTERNAL_ERROR` +- `MODEL_PROFILES_UNAVAILABLE` - `generation_failed` - `content_unavailable` diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 7e24445d..90c370d7 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -175,6 +175,7 @@ All tool names below are normative. Optional helper tool to discover valid `model_profile` choices and currently available models without relying on internal config knowledge. Profiles with zero available models are omitted from the returned `profiles` array. +If no models are available in any profile, the tool returns `isError=true` with `error.code = MODEL_PROFILES_UNAVAILABLE`. **Request:** no parameters (empty object). @@ -512,6 +513,7 @@ Cloud/core tool codes: - `INVALID_USER_API_KEY`: provided user_api_key is invalid. - `USER_API_KEY_REQUIRED`: deployment requires user_api_key for task_create. - `INSUFFICIENT_CREDITS`: caller account has no credits for task_create. +- `MODEL_PROFILES_UNAVAILABLE`: model_profiles found zero available models across all profiles. - `generation_failed`: task_file_info report path when task ended in failed. - `content_unavailable`: task_file_info cannot read requested artifact bytes. diff --git a/mcp_cloud/Dockerfile b/mcp_cloud/Dockerfile index 02efb72c..fd6837a2 100644 --- a/mcp_cloud/Dockerfile +++ b/mcp_cloud/Dockerfile @@ -14,6 +14,7 @@ WORKDIR /app COPY database_api /app/database_api COPY worker_plan/worker_plan_api /app/worker_plan_api COPY mcp_cloud /app/mcp_cloud +COPY llm_config /app/llm_config COPY public/llms.txt /app/public/llms.txt # Install dependencies diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 41631108..fcdd9440 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -133,6 +133,7 @@ def ensure_taskitem_stop_columns() -> None: "Only after approval, call task_create. 
" "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, fix model profile configuration and retry. " "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. " "task_file_info download_url is the absolute URL where the requested artifact can be downloaded. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " @@ -1028,7 +1029,8 @@ class ToolDefinition: name="model_profiles", description=( "Optional helper before task_create. Returns model_profile options with plain-language guidance " - "and currently available models in each profile." + "and currently available models in each profile. " + "If no models are available, returns error code MODEL_PROFILES_UNAVAILABLE." ), input_schema=MODEL_PROFILES_INPUT_SCHEMA, output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, @@ -1223,6 +1225,22 @@ async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: """Return model profile options and currently available models in each profile.""" _ = ModelProfilesRequest(**(arguments or {})) payload = await asyncio.to_thread(_get_model_profiles_sync) + profiles = payload.get("profiles") + if not isinstance(profiles, list) or len(profiles) == 0: + response = { + "error": { + "code": "MODEL_PROFILES_UNAVAILABLE", + "message": ( + "No models are currently available in any model_profile. " + "Ensure profile config files are present and contain at least one enabled model, then retry model_profiles." 
+ ), + } + } + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(response))], + structuredContent=response, + isError=True, + ) return CallToolResult( content=[TextContent(type="text", text=json.dumps(payload))], structuredContent=payload, diff --git a/mcp_cloud/tests/test_model_profiles_tool.py b/mcp_cloud/tests/test_model_profiles_tool.py index 0e445f41..340e16be 100644 --- a/mcp_cloud/tests/test_model_profiles_tool.py +++ b/mcp_cloud/tests/test_model_profiles_tool.py @@ -41,6 +41,19 @@ def test_model_profiles_returns_structured_content(self): self.assertEqual(result.structuredContent["profiles"][0]["profile"], "baseline") self.assertNotIn("available", result.structuredContent["profiles"][0]) + def test_model_profiles_returns_error_when_none_available(self): + payload = { + "default_profile": "baseline", + "profiles": [], + "message": "Use one of these profile values in task_create.model_profile.", + } + + with patch("mcp_cloud.app._get_model_profiles_sync", return_value=payload): + result = asyncio.run(handle_model_profiles({})) + + self.assertTrue(result.isError) + self.assertEqual(result.structuredContent["error"]["code"], "MODEL_PROFILES_UNAVAILABLE") + if __name__ == "__main__": unittest.main() diff --git a/public/llms.txt b/public/llms.txt index 1441d5ef..b7b25327 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -72,6 +72,7 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): Key tool inputs/outputs: - task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). - model_profiles output: profile guidance + currently available models in each profile. +- model_profiles returns `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. - task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). 
- task_create output: task_id (use this for task_status, task_stop, and task_file_info). - task_status output: current state and progress for a task_id. @@ -86,7 +87,7 @@ task_status caller contract: Minimal error-handling contract: - Errors use `{"error":{"code","message","details?"}}`. -- Common cloud/core codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `generation_failed`, `content_unavailable`. +- Common cloud/core codes: `TASK_NOT_FOUND`, `INVALID_USER_API_KEY`, `USER_API_KEY_REQUIRED`, `INSUFFICIENT_CREDITS`, `INTERNAL_ERROR`, `MODEL_PROFILES_UNAVAILABLE`, `generation_failed`, `content_unavailable`. - Common local-proxy codes: `REMOTE_ERROR`, `DOWNLOAD_FAILED`. - `task_file_info` may return `{}` while artifact output is not ready yet. From 08b9edf23a6808563d74b779438acac6f4f90a6b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 20:43:43 +0100 Subject: [PATCH 18/38] prompt lengths --- mcp_cloud/app.py | 6 +++--- mcp_cloud/tool_models.py | 4 +++- public/llms.txt | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index fcdd9440..0173bcd2 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -129,7 +129,7 @@ def ensure_taskitem_stop_columns() -> None: "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: call prompt_examples first. " "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " - "Then perform a non-tool step: draft a strong prompt and get user approval. " + "Then perform a non-tool step: draft a strong prompt (typically ~300-800 words) and get user approval. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. 
" "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " @@ -1018,7 +1018,7 @@ class ToolDefinition: description=( "Call this first. Returns example prompts that define what a good prompt looks like. " "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " - "Next is a non-tool step: formulate a prompt (use examples as a baseline, similar structure) and get user approval. " + "Next is a non-tool step: formulate a detailed prompt (typically ~300-800 words; use examples as a baseline, similar structure) and get user approval. " "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), @@ -1208,7 +1208,7 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: payload = { "samples": samples, "message": ( - "Next: complete the non-tool step by drafting a good prompt using these as a baseline (similar structure), then get user approval. " + "Next: complete the non-tool step by drafting a detailed prompt (typically ~300-800 words) using these as a baseline (similar structure), then get user approval. " "Only after approval, call task_create. " "Do not use PlanExe for tiny one-shot requests (e.g., rewrite this email, summarize this document). " "PlanExe always runs the full fixed planning pipeline; callers cannot run only selected internal steps." diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 2184651a..a56aa700 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -14,7 +14,8 @@ class PromptExamplesOutput(BaseModel): ..., description=( "Example prompts that define the baseline for what a good prompt looks like. " - "Take inspiration from these when writing your own prompt for task_create." 
+ "Take inspiration from these when writing your own prompt for task_create " + "(typically ~300-800 words)." ), ) message: str @@ -186,6 +187,7 @@ class TaskCreateInput(BaseModel): description=( "What the plan should cover (goal, context, constraints). " "Use prompt_examples to get example prompts; use these as examples for task_create. " + "For best results, provide a detailed prompt (typically ~300-800 words). " "Short prompts produce less detailed plans. " "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist); use direct LLM responses for those." ), diff --git a/public/llms.txt b/public/llms.txt index b7b25327..76af171a 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -71,6 +71,7 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): Key tool inputs/outputs: - task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). +- task_create prompt quality: for best results, provide a detailed prompt (typically ~300-800 words). - model_profiles output: profile guidance + currently available models in each profile. - model_profiles returns `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. - task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). @@ -94,7 +95,7 @@ Minimal error-handling contract: Recommended interaction order: 1. Call prompt_examples. 2. Optionally call model_profiles to choose model_profile based on current availability. -3. Non-tool step: prepare and approve a strong prompt. +3. Non-tool step: prepare and approve a strong prompt (typically ~300-800 words). 4. Call task_create. 5. Poll task_status until complete (repeat every 5 minutes). 6. Use task_file_info to get download URLs. 
From d703f4f3309f870abbc84226add813bb23e316a9 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 20:47:59 +0100 Subject: [PATCH 19/38] prompt shape --- mcp_cloud/app.py | 3 +++ mcp_cloud/tool_models.py | 5 ++++- mcp_local/planexe_mcp_local.py | 4 ++++ public/llms.txt | 4 ++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 0173bcd2..2a967f4a 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -130,6 +130,7 @@ def ensure_taskitem_stop_columns() -> None: "Required interaction order: call prompt_examples first. " "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " "Then perform a non-tool step: draft a strong prompt (typically ~300-800 words) and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " @@ -1019,6 +1020,7 @@ class ToolDefinition: "Call this first. Returns example prompts that define what a good prompt looks like. " "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " "Next is a non-tool step: formulate a detailed prompt (typically ~300-800 words; use examples as a baseline, similar structure) and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." 
), @@ -1209,6 +1211,7 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: "samples": samples, "message": ( "Next: complete the non-tool step by drafting a detailed prompt (typically ~300-800 words) using these as a baseline (similar structure), then get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Only after approval, call task_create. " "Do not use PlanExe for tiny one-shot requests (e.g., rewrite this email, summarize this document). " "PlanExe always runs the full fixed planning pipeline; callers cannot run only selected internal steps." diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index a56aa700..68eaaf7d 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -15,7 +15,8 @@ class PromptExamplesOutput(BaseModel): description=( "Example prompts that define the baseline for what a good prompt looks like. " "Take inspiration from these when writing your own prompt for task_create " - "(typically ~300-800 words)." + "(typically ~300-800 words). Good prompt shape: objective, scope, constraints, " + "timeline, stakeholders, budget/resources, and success criteria." ), ) message: str @@ -188,6 +189,8 @@ class TaskCreateInput(BaseModel): "What the plan should cover (goal, context, constraints). " "Use prompt_examples to get example prompts; use these as examples for task_create. " "For best results, provide a detailed prompt (typically ~300-800 words). " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, " + "budget/resources, and success criteria. " "Short prompts produce less detailed plans. " "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist); use direct LLM responses for those." 
), diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index e71e8655..86d027b4 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -324,6 +324,8 @@ class ToolDefinition: "description": ( "What the plan should cover. Good prompts are often 300–800 words. " "Use prompt_examples to get example prompts; use these as examples for task_create. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, " + "budget/resources, and success criteria. " "Short prompts produce less detailed plans. " "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist)." ), @@ -519,6 +521,7 @@ class ToolDefinition: "Call this first. Returns example prompts that define what a good prompt looks like. " "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " "Next is a non-tool step: formulate a prompt (use examples as a baseline, similar structure) and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), @@ -594,6 +597,7 @@ class ToolDefinition: "Required interaction order: call prompt_examples first. " "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " "Then perform a non-tool step: draft a strong prompt and get user approval. " + "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_download when complete. 
To stop, call task_stop with the task_id from task_create. " diff --git a/public/llms.txt b/public/llms.txt index 76af171a..1035aed1 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -71,7 +71,7 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): Key tool inputs/outputs: - task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). -- task_create prompt quality: for best results, provide a detailed prompt (typically ~300-800 words). +- task_create prompt quality: for best results, provide a detailed prompt (typically ~300-800 words) with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. - model_profiles output: profile guidance + currently available models in each profile. - model_profiles returns `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. - task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). @@ -95,7 +95,7 @@ Minimal error-handling contract: Recommended interaction order: 1. Call prompt_examples. 2. Optionally call model_profiles to choose model_profile based on current availability. -3. Non-tool step: prepare and approve a strong prompt (typically ~300-800 words). +3. Non-tool step: prepare and approve a strong prompt (typically ~300-800 words) with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. 4. Call task_create. 5. Poll task_status until complete (repeat every 5 minutes). 6. Use task_file_info to get download URLs. 
From e5e7ace706d8ec7ab75405749ec14bfd0ee8c514 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 20:53:11 +0100 Subject: [PATCH 20/38] Docs cleanups --- docs/mcp/inspector.md | 6 ++++-- docs/mcp/mcp_details.md | 8 ++++---- docs/mcp/mcp_setup.md | 12 ++++++++---- docs/mcp/mcp_troubleshooting.md | 2 +- docs/mcp/mcp_welcome.md | 2 +- docs/mcp/planexe_mcp_interface.md | 17 ++++++++++------- 6 files changed, 28 insertions(+), 19 deletions(-) diff --git a/docs/mcp/inspector.md b/docs/mcp/inspector.md index 088afeab..aff07940 100644 --- a/docs/mcp/inspector.md +++ b/docs/mcp/inspector.md @@ -68,6 +68,7 @@ When connected follow these steps: Now there should be a list with tool names and descriptions: ``` prompt_examples +model_profiles task_create task_status task_stop @@ -81,8 +82,9 @@ Follow these steps: ![screenshot of mcp inspector invoke tool](inspector_step5_mcp_planexe_org.webp) 1. In the `Tools` panel; Click on the `prompt_examples` tool. -2. In the `prompt_examples` right sidepanel; Click on `Run Tool`. -3. The MCP server should respond with a list of list of example prompts. +2. In the `prompt_examples` right sidepanel; Click on `Run Tool`. +3. The MCP server should respond with a list of example prompts. +4. Optionally run `model_profiles` to inspect available `model_profile` choices before `task_create`. ## Approach 2. MCP server inside docker diff --git a/docs/mcp/mcp_details.md b/docs/mcp/mcp_details.md index 57a45de8..8b28a72c 100644 --- a/docs/mcp/mcp_details.md +++ b/docs/mcp/mcp_details.md @@ -16,7 +16,7 @@ This document lists the MCP tools exposed by PlanExe and example prompts for age ### prompt_examples -Returns around five example prompts that show what good prompts look like. Each sample is typically 300–800 words: detailed context, requirements, and success criteria. 
Usually the AI does the heavy lifting: the user has a vague idea, the agent calls `prompt_examples`, then expands that idea into a high-quality prompt (300–800 words). The prompt is shown to the user, who can ask for further changes or confirm it’s good to go. When the user confirms, the agent then calls `task_create`. Shorter or vaguer prompts produce lower-quality plans. +Returns around five example prompts that show what good prompts look like. Each sample is typically 300-800 words. Usually the AI does the heavy lifting: the user has a vague idea, the agent calls `prompt_examples`, then expands that idea into a high-quality prompt (300-800 words). A compact prompt shape works best: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. The prompt is shown to the user, who can ask for further changes or confirm it’s good to go. When the user confirms, the agent then calls `task_create`. Shorter or vaguer prompts produce lower-quality plans. Example prompt: ``` @@ -28,7 +28,7 @@ Example call: {} ``` -Response includes `samples` (array of prompt strings, each 300–800 words) and `message`. +Response includes `samples` (array of prompt strings, each ~300-800 words) and `message`. ### model_profiles @@ -273,7 +273,7 @@ Special case: ### 1. Get example prompts -The user often starts with a vague idea. The AI calls `prompt_examples` first to see what good prompts look like (around five samples, 300–800 words each), then expands the user’s idea into a high-quality prompt and shows it to the user. +The user often starts with a vague idea. The AI calls `prompt_examples` first to see what good prompts look like (around five samples, typically 300-800 words each), then expands the user’s idea into a high-quality prompt using this compact shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. Prompt: ``` @@ -299,7 +299,7 @@ Tool call: ### 3. 
Draft and approve the prompt (non-tool step) -At this step, the agent writes a high-quality prompt draft, shows it to the user, and waits for approval. +At this step, the agent writes a high-quality prompt draft (typically 300-800 words, with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria), shows it to the user, and waits for approval. ### 4. Create a plan diff --git a/docs/mcp/mcp_setup.md b/docs/mcp/mcp_setup.md index fe1997c2..f5cd57f3 100644 --- a/docs/mcp/mcp_setup.md +++ b/docs/mcp/mcp_setup.md @@ -12,10 +12,11 @@ This is the shortest path to a working PlanExe MCP integration. 1. Ask for prompt examples. 2. Inspect `model_profile` options and available models. -3. Expand the user idea into a high‑quality prompt (non-tool step) and get user approval. +3. Expand the user idea into a high-quality prompt (typically ~300-800 words) and get user approval. + Use this compact shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. 4. Create the plan task. 5. Poll for status (about every 5 minutes). -6. Download the report (HTML or zip). +6. Download artifacts via `task_file_info` (cloud) or `task_download` (mcp_local helper). --- @@ -25,7 +26,10 @@ This is the shortest path to a working PlanExe MCP integration. 2. `model_profiles` 3. `task_create` 4. `task_status` -5. `task_download` +5. `task_file_info` + +Optional local helper: +- `task_download` (provided by `mcp_local`, not `mcp_cloud`) For `task_create`: @@ -39,7 +43,7 @@ For `task_create`: - You can fetch example prompts. - You can create a plan task. -- You can download the report artifact. +- You can fetch artifact metadata/URLs with `task_file_info` (and optionally save locally via `task_download` when using `mcp_local`). - Your client can parse `error.code` and `error.message` and handle `{}` from `task_file_info` as "not ready yet". 
- If running parallel work, your client tracks multiple `task_id`s explicitly (server-side global cap is not enforced). diff --git a/docs/mcp/mcp_troubleshooting.md b/docs/mcp/mcp_troubleshooting.md index 1ecce90a..a2f90dcc 100644 --- a/docs/mcp/mcp_troubleshooting.md +++ b/docs/mcp/mcp_troubleshooting.md @@ -10,7 +10,7 @@ Common MCP integration issues and fixes. ## Cannot create a plan -- Ensure your prompt is detailed (300–800 words). +- Ensure your prompt is detailed (typically ~300-800 words) and includes objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. - Some topics may be refused by the model (harmful, unethical, or dangerous requests). - Try a smaller model or a more reliable paid model. - Confirm the MCP server is reachable from your client. diff --git a/docs/mcp/mcp_welcome.md b/docs/mcp/mcp_welcome.md index b4081d10..b28d7bfd 100644 --- a/docs/mcp/mcp_welcome.md +++ b/docs/mcp/mcp_welcome.md @@ -19,7 +19,7 @@ No MCP experience is required to get started. ## What you can do -- **Get example prompts** — See what good prompts look like (detailed, typically 300–800 words). It is the **caller’s responsibility** to take inspiration from these examples and ensure the prompt sent to PlanExe is of similar or better quality. The agent can refine a vague idea into a high-quality prompt and show it to the user for approval before creating the plan. +- **Get example prompts** — See what good prompts look like (detailed, typically ~300-800 words). It is the **caller’s responsibility** to take inspiration from these examples and ensure the prompt sent to PlanExe is of similar or better quality. A compact prompt shape works best: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. The agent can refine a vague idea into a high-quality prompt and show it to the user for approval before creating the plan. 
- **Create a plan** — Send a prompt; PlanExe starts creating the plan (takes about 15–20 minutes). If the input prompt is of low quality, the output plan will be crap too. Visible `task_create` options include `model_profile`. - **Check progress** — Ask for status and see how far the plan has gotten. - **Download the report** — When the plan is ready, the user specifies whether to download the HTML report or the zip of intermediary files (JSON, MD, CSV). diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 90c370d7..24bf6dcb 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a good prompt (use examples as a baseline; similar structure) and get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` or `task_file_info` when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. +- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. 
Then complete a non-tool step: formulate a detailed prompt (typically ~300-800 words) using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` or `task_file_info` when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). ### 1.3 Scope of this document @@ -156,7 +156,7 @@ All tool names below are normative. ### 6.1 prompt_examples -**Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. Correct flow: call this tool; optionally call `model_profiles`; then complete a non-tool step (draft and approve prompt); only then call `task_create`. If you call `task_create` before formulating and approving a prompt, the resulting plan will be lower quality than it could be. +**Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. Correct flow: call this tool; optionally call `model_profiles`; then complete a non-tool step (draft and approve a detailed prompt, typically ~300-800 words); only then call `task_create`. If you call `task_create` before formulating and approving a prompt, the resulting plan will be lower quality than it could be. **Request:** no parameters (empty object). @@ -268,12 +268,15 @@ Use tool-specific metadata when you need runtime overrides that should not be vi **Prompt quality** -The `prompt` parameter should be a detailed description of what the plan should cover. 
Good prompts are typically 300–800 words and include: +The `prompt` parameter should be a detailed description of what the plan should cover. Good prompts are typically 300-800 words and include: -- Clear context: background, constraints, and goals -- Specific requirements: budget, timeline, location, or technical constraints -- Success criteria: what "done" looks like -- Banned words or approaches (if any) +- Objective +- Scope +- Constraints +- Timeline +- Stakeholders +- Budget/resources +- Success criteria Short one-liners (e.g., "Construct a bridge") tend to produce poor output because they lack context for the planning pipeline. Important details are location, budget, time frame. From f98d4f8e213a617adb626fa0f6cf9d7467b59e6c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:01:45 +0100 Subject: [PATCH 21/38] mcp docs cleanups --- docs/mcp/planexe_mcp_interface.md | 9 +++++---- mcp_cloud/http_server.py | 19 ++----------------- mcp_local/planexe_mcp_local.py | 4 +++- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 24bf6dcb..19f506ba 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifa Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. 
Then complete a non-tool step: formulate a detailed prompt (typically ~300-800 words) using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` or `task_file_info` when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. +- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt (typically ~300-800 words) using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` (mcp_local helper) or `task_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. - **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). ### 1.3 Scope of this document @@ -70,10 +70,10 @@ The interface is designed to support: The MCP specification defines two different mechanisms: -- **MCP tools** (e.g. task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_download or task_file_info. This document specifies those tools. +- **MCP tools** (e.g. 
task_create, task_status, task_stop): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls task_create → receives task_id → polls task_status → uses task_file_info (and optionally task_download via mcp_local). This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: call `prompt_examples`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `task_create`; poll `task_status`; then call `task_download` or `task_file_info`. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: call `prompt_examples`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `task_create`; poll `task_status`; then call `task_file_info` (or `task_download` via mcp_local). --- @@ -320,7 +320,7 @@ For the full catalog file: **Important** -- task_id is a UUID returned by task_create. Use this exact UUID for task_status/task_stop/task_download/task_file_info. +- task_id is a UUID returned by task_create. Use this exact UUID for task_status/task_stop/task_file_info (and task_download when using mcp_local). 
**Behavior** @@ -500,6 +500,7 @@ Example: ### 9.2 isError behavior - `task_create`, `task_status`, `task_stop`: unknown/invalid requests return `isError=true` with `error`. +- `model_profiles`: returns `isError=true` with `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. - `task_file_info`: uses mixed behavior: - returns `{}` (not an error) while artifacts are not ready. - may return `{"error": ...}` with `isError=false` for terminal artifact-level problems. diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 4125dd3c..0e0212e9 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -47,6 +47,7 @@ ) from mcp_cloud.app import ( + PLANEXE_SERVER_INSTRUCTIONS, REPORT_CONTENT_TYPE, REPORT_FILENAME, TOOL_DEFINITIONS, @@ -396,23 +397,7 @@ def _register_tools(server: FastMCP) -> None: fastmcp_server = FastMCP( name="planexe-mcp-server", - instructions=( - "PlanExe generates rough-draft project plans from a natural-language prompt. " - "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " - "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " - "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " - "Required interaction order: call prompt_examples first. " - "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " - "Then perform a non-tool step: draft a strong prompt and get user approval. " - "Only after approval, call task_create. " - "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " - "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. 
" - "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " - "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " - "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " - "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " - "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." - ), + instructions=PLANEXE_SERVER_INSTRUCTIONS, host=HTTP_HOST, port=HTTP_PORT, streamable_http_path="/", diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 86d027b4..b4a551c7 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -532,7 +532,8 @@ class ToolDefinition: name="model_profiles", description=( "Optional helper before task_create. Returns model_profile options with plain-language guidance " - "and currently available models in each profile." + "and currently available models in each profile. " + "If no models are available, returns error code MODEL_PROFILES_UNAVAILABLE." ), input_schema=MODEL_PROFILES_INPUT_SCHEMA, output_schema=MODEL_PROFILES_OUTPUT_SCHEMA, @@ -601,6 +602,7 @@ class ToolDefinition: "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_download when complete. To stop, call task_stop with the task_id from task_create. " + "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, fix model profile configuration and retry. " "Tool errors use {error:{code,message}}. task_download may return REMOTE_ERROR or DOWNLOAD_FAILED. 
" "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " From 33a9f24bc68741c104d997fd782ad85131466bb0 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:15:59 +0100 Subject: [PATCH 22/38] mcp tests --- mcp_cloud/tests/test_tool_surface_consistency.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py index 1140091d..ac11e0ae 100644 --- a/mcp_cloud/tests/test_tool_surface_consistency.py +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -50,6 +50,15 @@ def test_cloud_task_status_description_includes_state_contract(self): self.assertIn(">20 minutes", description) self.assertIn("PlanExeOrg/PlanExe/issues", description) + def test_cloud_instructions_include_model_profiles_unavailable_guidance(self): + instructions = cloud_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("MODEL_PROFILES_UNAVAILABLE", instructions) + + def test_cloud_prompt_schema_includes_prompt_shape_guidance(self): + prompt_schema = cloud_app.TASK_CREATE_INPUT_SCHEMA["properties"]["prompt"]["description"] + self.assertIn("300-800 words", prompt_schema) + self.assertIn("objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria", prompt_schema) + class TestLocalToolSurfaceConsistency(unittest.TestCase): def test_local_exposes_model_profiles_tool(self): @@ -90,6 +99,10 @@ def test_local_task_status_description_includes_state_contract(self): self.assertIn(">20 minutes", description) self.assertIn("PlanExeOrg/PlanExe/issues", description) + def test_local_instructions_include_model_profiles_unavailable_guidance(self): + instructions = local_app.PLANEXE_SERVER_INSTRUCTIONS + self.assertIn("MODEL_PROFILES_UNAVAILABLE", instructions) + if __name__ == "__main__": 
unittest.main() From 8619f48cabf82f324f4f163fc20d132ded419e6b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:26:13 +0100 Subject: [PATCH 23/38] mcp: document task_stop terminal state and edge cases --- mcp_cloud/app.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 2a967f4a..743fc3fb 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -133,7 +133,8 @@ def ensure_taskitem_stop_columns() -> None: "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " - "Then poll task_status (about every 5 minutes); use task_file_info when complete. To stop, call task_stop with the task_id from task_create. " + "Then poll task_status (about every 5 minutes); use task_file_info when complete. " + "To stop, call task_stop with the task_id from task_create; stopping is asynchronous and the task will eventually transition to failed. " "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, fix model profile configuration and retry. " "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. " "task_file_info download_url is the absolute URL where the requested artifact can be downloaded. " @@ -1072,7 +1073,10 @@ class ToolDefinition: name="task_stop", description=( "Request the plan generation to stop. Pass the task_id (the UUID returned by task_create). " - "Call task_stop with that task_id." + "Stopping is asynchronous: the stop flag is set immediately but the task may continue briefly before halting. " + "A stopped task will eventually transition to the failed state. " + "If the task is already completed or failed, stop_requested returns false (the task already finished). " + "Unknown task_id returns error code TASK_NOT_FOUND." 
), input_schema=TASK_STOP_INPUT_SCHEMA, output_schema=TASK_STOP_OUTPUT_SCHEMA, From 0d594b31379dd3cde2aec2d15fb319bf0bb4a1df Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:26:40 +0100 Subject: [PATCH 24/38] mcp: clarify task_file_info empty-object behavior and add artifact/download guidance --- mcp_cloud/app.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 743fc3fb..ac8f8bc0 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -136,7 +136,7 @@ def ensure_taskitem_stop_columns() -> None: "Then poll task_status (about every 5 minutes); use task_file_info when complete. " "To stop, call task_stop with the task_id from task_create; stopping is asynchronous and the task will eventually transition to failed. " "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, fix model profile configuration and retry. " - "Tool errors use {error:{code,message}}. task_file_info returns {} while output is not ready. " + "Tool errors use {error:{code,message}}. task_file_info returns an empty object {} while the artifact is not ready; check readiness by testing whether download_url is present. " "task_file_info download_url is the absolute URL where the requested artifact can be downloaded. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " @@ -1084,11 +1084,14 @@ class ToolDefinition: ToolDefinition( name="task_file_info", description=( - "Returns file metadata (content_type, download_url, download_size) for the report or zip. " - "If your client exposes task_download (e.g. mcp_local), use that to save the file locally; " - "otherwise use this tool to get download_url and fetch the file yourself. 
" - "download_url is an absolute URL where the requested artifact can be downloaded. " - "Returns {} while artifact is not ready. Terminal tool-level error payloads use codes generation_failed or content_unavailable." + "Returns file metadata (content_type, download_url, download_size) for the report or zip artifact. " + "Use artifact='report' (default) for the final HTML deliverable; use artifact='zip' for underlying data files (md, json, csv). " + "While the task is still pending or processing, returns an empty object {} (no fields). " + "Check readiness by testing whether download_url is present in the response. " + "Once ready, present download_url to the user or fetch and save the file locally. " + "If your client exposes task_download (e.g. mcp_local), prefer that to save the file locally. " + "Terminal error codes: generation_failed (plan failed), content_unavailable (artifact missing). " + "Unknown task_id returns error code TASK_NOT_FOUND." ), input_schema=TASK_FILE_INFO_INPUT_SCHEMA, output_schema=TASK_FILE_INFO_OUTPUT_SCHEMA, From ea558e8c441534a1f8d48f8eb88ff773448178f9 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:26:54 +0100 Subject: [PATCH 25/38] mcp: remove developer-facing speed_vs_detail mention from task_create description --- mcp_cloud/app.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index ac8f8bc0..14ea9395 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -1047,9 +1047,7 @@ class ToolDefinition: "Each task_create call creates a new task_id (no server-side dedup). " "If you are unsure which model_profile to choose, call model_profiles first. " "If your deployment uses credits, include user_api_key to charge the correct account. " - "Common error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS. 
" - "Optional runtime overrides such as speed_vs_detail are intentionally hidden from the visible tool schema " - "and can be provided via tool-specific metadata by developers." + "Common error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, From 58c621f4102ddd48b4aa7ffd98c00460c6115c1d Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:27:14 +0100 Subject: [PATCH 26/38] mcp: reword MODEL_PROFILES_UNAVAILABLE guidance for LLM callers --- mcp_cloud/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 14ea9395..e85dd506 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -135,7 +135,7 @@ def ensure_taskitem_stop_columns() -> None: "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. " "To stop, call task_stop with the task_id from task_create; stopping is asynchronous and the task will eventually transition to failed. " - "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, fix model profile configuration and retry. " + "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " "Tool errors use {error:{code,message}}. task_file_info returns an empty object {} while the artifact is not ready; check readiness by testing whether download_url is present. " "task_file_info download_url is the absolute URL where the requested artifact can be downloaded. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. 
" @@ -1239,8 +1239,8 @@ async def handle_model_profiles(arguments: dict[str, Any]) -> CallToolResult: "error": { "code": "MODEL_PROFILES_UNAVAILABLE", "message": ( - "No models are currently available in any model_profile. " - "Ensure profile config files are present and contain at least one enabled model, then retry model_profiles." + "No models are currently configured. " + "Inform the user that the server administrator needs to set up model profiles before plans can be created." ), } } From c5f01b57c0aab6149ad281ba8f946d08f7b39cd1 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:27:27 +0100 Subject: [PATCH 27/38] mcp: warn that task_id cannot be recovered once lost --- mcp_cloud/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index e85dd506..13c04cc8 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -1044,6 +1044,7 @@ class ToolDefinition: "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " + "Save task_id immediately: there is no task_list tool, so a lost task_id cannot be recovered. " "Each task_create call creates a new task_id (no server-side dedup). " "If you are unsure which model_profile to choose, call model_profiles first. " "If your deployment uses credits, include user_api_key to charge the correct account. 
" From 6d6be467b86cd794cfa109d991d744168adbeebe Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:28:00 +0100 Subject: [PATCH 28/38] mcp: document progress_percentage range (0-100) and files array purpose --- mcp_cloud/app.py | 4 +++- mcp_cloud/tool_models.py | 28 ++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 13c04cc8..082ad497 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -1057,9 +1057,11 @@ class ToolDefinition: name="task_status", description=( "Returns status and progress of the plan currently being created. " - "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " + "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15-20+ minutes " "and frequent polling is unnecessary. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "progress_percentage is 0-100 (integer-like float); 100 when completed. " + "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " "Unknown task_id returns error code TASK_NOT_FOUND. " "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " "processing with no file-output changes for >20 minutes likely means failed/stalled. " diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 68eaaf7d..6ffbc727 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -126,9 +126,19 @@ class TaskStatusSuccess(BaseModel): "completed => download is ready; failed => terminal error." ), ) - progress_percentage: float + progress_percentage: float = Field( + ..., + description="Completion progress from 0 to 100. 
Monotonically increasing; 100 when state is completed.", + ) timing: TaskStatusTiming - files: list[TaskStatusFile] + files: list[TaskStatusFile] = Field( + ..., + description=( + "Intermediate output files produced so far. " + "Use updated_at timestamps to detect stalls. " + "These files are included in the zip artifact when the task completes." + ), + ) class TaskStatusOutput(BaseModel): @@ -143,9 +153,19 @@ class TaskStatusOutput(BaseModel): "completed => download is ready; failed => terminal error." ), ) - progress_percentage: float | None = None + progress_percentage: float | None = Field( + default=None, + description="Completion progress from 0 to 100. Monotonically increasing; 100 when state is completed.", + ) timing: TaskStatusTiming | None = None - files: list[TaskStatusFile] | None = None + files: list[TaskStatusFile] | None = Field( + default=None, + description=( + "Intermediate output files produced so far. " + "Use updated_at timestamps to detect stalls. " + "These files are included in the zip artifact when the task completes." + ), + ) error: ErrorDetail | None = None From 06bda9809498f79f3e60f7b0141cfc99ffb51d42 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:59:37 +0100 Subject: [PATCH 29/38] mcp_local: add user_api_key to task_create schema and sync tool descriptions with cloud --- mcp_local/planexe_mcp_local.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index b4a551c7..6c03f310 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -40,6 +40,7 @@ class TaskCreateRequest(BaseModel): prompt: str model_profile: Optional[ModelProfileInput] = None + user_api_key: Optional[str] = None class TaskStatusRequest(BaseModel): @@ -339,6 +340,11 @@ class ToolDefinition: "frontier (most capable), custom (user-defined). Call model_profiles for runtime availability." 
), }, + "user_api_key": { + "type": ["string", "null"], + "default": None, + "description": "Optional user API key for credits and attribution.", + }, }, "required": ["prompt"], } @@ -542,11 +548,13 @@ class ToolDefinition: name="task_create", description=( "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " - "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " + "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15-20 min. " + "Returns task_id (UUID); use it for task_status, task_stop, and task_download. " + "Save task_id immediately: there is no task_list tool, so a lost task_id cannot be recovered. " "Each task_create call creates a new task_id (proxied to cloud; no server-side dedup). " "If you are unsure which model_profile to choose, call model_profiles first. " - "Common proxied error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS, REMOTE_ERROR. " - "Runs in the background (15–20 min). Returns task_id (UUID); use it for task_status, task_stop, and task_download." + "If your deployment uses credits, include user_api_key to charge the correct account. " + "Common proxied error codes: INVALID_USER_API_KEY, USER_API_KEY_REQUIRED, INSUFFICIENT_CREDITS, REMOTE_ERROR." ), input_schema=TASK_CREATE_INPUT_SCHEMA, output_schema=TASK_CREATE_OUTPUT_SCHEMA, @@ -555,9 +563,11 @@ class ToolDefinition: name="task_status", description=( "Returns status and progress of the plan currently being created. " - "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15–20+ minutes " + "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15-20+ minutes " "and frequent polling is unnecessary. 
" "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "progress_percentage is 0-100 (integer-like float); 100 when completed. " + "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " "Unknown task_id returns TASK_NOT_FOUND (or REMOTE_ERROR when transport fails). " "Troubleshooting: pending for >5 minutes likely means queued but not picked up by a worker. " "processing with no file-output changes for >20 minutes likely means failed/stalled. " @@ -570,7 +580,10 @@ class ToolDefinition: name="task_stop", description=( "Request the plan generation to stop. Pass the task_id (the UUID returned by task_create). " - "Call task_stop with that task_id." + "Stopping is asynchronous: the stop flag is set immediately but the task may continue briefly before halting. " + "A stopped task will eventually transition to the failed state. " + "If the task is already completed or failed, stop_requested returns false (the task already finished). " + "Unknown task_id returns TASK_NOT_FOUND (or REMOTE_ERROR when transport fails)." ), input_schema=TASK_STOP_INPUT_SCHEMA, output_schema=TASK_STOP_OUTPUT_SCHEMA, @@ -579,7 +592,7 @@ class ToolDefinition: name="task_download", description=( "Download the plan output and save it locally to PLANEXE_PATH. " - "Choose the HTML report (default) or a zip of all generated files. " + "Use artifact='report' (default) for the final HTML deliverable; use artifact='zip' for underlying data files (md, json, csv). " "If PLANEXE_PATH is unset, files are saved to the current working directory. " "Filename format is - with numeric suffixes when collisions occur. " "Common local error codes: DOWNLOAD_FAILED, REMOTE_ERROR." @@ -601,8 +614,9 @@ class ToolDefinition: "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " "Only after approval, call task_create. 
" "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " - "Then poll task_status (about every 5 minutes); use task_download when complete. To stop, call task_stop with the task_id from task_create. " - "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, fix model profile configuration and retry. " + "Then poll task_status (about every 5 minutes); use task_download when complete. " + "To stop, call task_stop with the task_id from task_create; stopping is asynchronous and the task will eventually transition to failed. " + "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " "Tool errors use {error:{code,message}}. task_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " "task_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "task_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " @@ -672,6 +686,8 @@ async def handle_task_create(arguments: dict[str, Any]) -> CallToolResult: payload: dict[str, Any] = {"prompt": req.prompt} if req.model_profile: payload["model_profile"] = req.model_profile + if req.user_api_key: + payload["user_api_key"] = req.user_api_key metadata = arguments.get("metadata") if isinstance(metadata, dict): From aaf20f1b25d2de2884230cd5e09caf1f226c283a Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 21:59:48 +0100 Subject: [PATCH 30/38] mcp_local: update AGENTS.md to include user_api_key in visible schema --- mcp_local/AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcp_local/AGENTS.md b/mcp_local/AGENTS.md index e7fbac93..dd45be82 100644 --- a/mcp_local/AGENTS.md +++ b/mcp_local/AGENTS.md @@ -9,7 +9,7 @@ to mcp_cloud, a MCP server running in the cloud, over HTTP. 
- Supported tools: `prompt_examples`, `model_profiles`, `task_create`, `task_status`, `task_stop`, `task_download`. - `task_download` calls the remote `task_file_info` tool to obtain a download URL, then downloads the artifact to `PLANEXE_PATH` on the local machine. -- `task_create` visible input schema includes `prompt` and optional `model_profile`. +- `task_create` visible input schema includes `prompt`, optional `model_profile`, and optional `user_api_key`. - Use `model_profiles` to help agents select `task_create.model_profile` without relying on internal file knowledge. - Keep workflow wording explicit that prompt drafting + user approval is a non-tool step before `task_create`. - Keep concurrency wording explicit: each `task_create` call creates a new `task_id`; no global per-client concurrency cap is enforced server-side. From 7d23af82587fb221c00778c9afa8808b6ce730bb Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 22:01:05 +0100 Subject: [PATCH 31/38] mcp: add tests verifying all tools have output_schema and task_create has user_api_key --- .../tests/test_tool_surface_consistency.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mcp_cloud/tests/test_tool_surface_consistency.py b/mcp_cloud/tests/test_tool_surface_consistency.py index ac11e0ae..f4c1c698 100644 --- a/mcp_cloud/tests/test_tool_surface_consistency.py +++ b/mcp_cloud/tests/test_tool_surface_consistency.py @@ -11,6 +11,45 @@ def _tool_desc(tool_defs, name: str) -> str: raise AssertionError(f"Tool not found: {name}") +def _tool_def(tool_defs, name: str): + for definition in tool_defs: + if definition.name == name: + return definition + raise AssertionError(f"Tool not found: {name}") + + +class TestAllToolsHaveOutputSchema(unittest.TestCase): + """Every tool must declare an output_schema so callers know the response shape.""" + + def test_cloud_all_tools_have_output_schema(self): + for definition in cloud_app.TOOL_DEFINITIONS: + with 
self.subTest(tool=definition.name): + self.assertIsNotNone( + definition.output_schema, + f"Cloud tool {definition.name!r} is missing output_schema", + ) + + def test_local_all_tools_have_output_schema(self): + for definition in local_app.TOOL_DEFINITIONS: + with self.subTest(tool=definition.name): + self.assertIsNotNone( + definition.output_schema, + f"Local tool {definition.name!r} is missing output_schema", + ) + + +class TestTaskCreateInputSchemaHasUserApiKey(unittest.TestCase): + """user_api_key must be in the visible task_create input schema.""" + + def test_cloud_task_create_schema_has_user_api_key(self): + props = cloud_app.TASK_CREATE_INPUT_SCHEMA.get("properties", {}) + self.assertIn("user_api_key", props) + + def test_local_task_create_schema_has_user_api_key(self): + props = local_app.TASK_CREATE_INPUT_SCHEMA.get("properties", {}) + self.assertIn("user_api_key", props) + + class TestCloudToolSurfaceConsistency(unittest.TestCase): def test_cloud_exposes_model_profiles_tool(self): cloud_tool_names = {definition.name for definition in cloud_app.TOOL_DEFINITIONS} From 0f502688955d784c9b3b2684f843a0ae817402e5 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 22:28:44 +0100 Subject: [PATCH 32/38] mcp: change speed_vs_detail default from ping_llm to all_details_but_slow --- mcp_cloud/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 082ad497..20eb2893 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -158,8 +158,8 @@ def ensure_taskitem_stop_columns() -> None: ZIP_CONTENT_TYPE = "application/zip" ZIP_SNAPSHOT_MAX_BYTES = 100_000_000 -SPEED_VS_DETAIL_DEFAULT = "ping_llm" -SPEED_VS_DETAIL_DEFAULT_ALIAS = "ping" +SPEED_VS_DETAIL_DEFAULT = "all_details_but_slow" +SPEED_VS_DETAIL_DEFAULT_ALIAS = "all" SPEED_VS_DETAIL_VALUES = ( "ping_llm", "fast_but_skip_details", From 1e7f05398f8adc2c7c5b0de0c7b92d49cc17d139 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 
23 Feb 2026 23:30:23 +0100 Subject: [PATCH 33/38] mcp: improve tool descriptions to accurately reflect output breadth Replace "rough-draft project plans" with "strategic project-plan drafts" across all MCP-facing descriptions. Enumerate the 20+ report sections (including adversarial analysis: premortem, self-audit, premise attacks) so callers understand what PlanExe actually produces. Add prose-style guidance for prompt writing. Update generation time from "15-20+" to "10-20 minutes (baseline profile)". Describe the HTML report as interactive (collapsible sections, interactive Gantt charts). Fix SKILL.md inaccuracies (wrong state names, speed_vs_detail as visible required param, wrong artifact types). --- docs/mcp/mcp_welcome.md | 8 +++---- docs/mcp/planexe_mcp_interface.md | 16 ++++++++------ mcp_cloud/app.py | 36 +++++++++++++++++++++++++------ mcp_cloud/http_server.py | 2 +- mcp_cloud/server.json | 2 +- mcp_cloud/tool_models.py | 2 ++ mcp_local/planexe_mcp_local.py | 35 ++++++++++++++++++++++++------ skills/planexe-mcp/SKILL.md | 17 ++++++--------- 8 files changed, 80 insertions(+), 38 deletions(-) diff --git a/docs/mcp/mcp_welcome.md b/docs/mcp/mcp_welcome.md index b28d7bfd..c181fca4 100644 --- a/docs/mcp/mcp_welcome.md +++ b/docs/mcp/mcp_welcome.md @@ -4,7 +4,7 @@ title: Welcome to PlanExe MCP # Welcome to PlanExe MCP -PlanExe MCP lets [AI agents](https://en.wikipedia.org/wiki/AI_agent) (and the tools you build) create [strategic plans](https://en.wikipedia.org/wiki/Strategic_planning) from a plain-English prompt. You send a goal; PlanExe produces a draft plan. The MCP user then chooses whether to download the **HTML report** or a **zip** of intermediary files (JSON, MD, CSV) used to build that report. +PlanExe MCP lets [AI agents](https://en.wikipedia.org/wiki/AI_agent) (and the tools you build) create [strategic project-plan drafts](https://en.wikipedia.org/wiki/Strategic_planning) from a plain-English prompt. 
You send a goal; PlanExe produces a draft plan with 20+ sections — including adversarial analysis that stress-tests whether the plan holds up. The MCP user then chooses whether to download the **HTML report** or a **zip** of intermediary files (JSON, MD, CSV) used to build that report. No MCP experience is required to get started. @@ -20,7 +20,7 @@ No MCP experience is required to get started. ## What you can do - **Get example prompts** — See what good prompts look like (detailed, typically ~300-800 words). It is the **caller’s responsibility** to take inspiration from these examples and ensure the prompt sent to PlanExe is of similar or better quality. A compact prompt shape works best: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. The agent can refine a vague idea into a high-quality prompt and show it to the user for approval before creating the plan. -- **Create a plan** — Send a prompt; PlanExe starts creating the plan (takes about 15–20 minutes). If the input prompt is of low quality, the output plan will be crap too. Visible `task_create` options include `model_profile`. +- **Create a plan** — Send a prompt; PlanExe starts creating the plan (typically takes 10–20 minutes on baseline profile). If the input prompt is of low quality, the output plan will be crap too. Visible `task_create` options include `model_profile`. - **Check progress** — Ask for status and see how far the plan has gotten. - **Download the report** — When the plan is ready, the user specifies whether to download the HTML report or the zip of intermediary files (JSON, MD, CSV). @@ -32,8 +32,8 @@ Developer note: `speed_vs_detail` is intentionally hidden from the visible `task The MCP user chooses which artifact to download: -- **HTML report** (around 40 pages) — executive summary, Gantt chart, risks, next steps, and more. Opens in a browser. -- **Zip** — intermediary files (JSON, MD, CSV) used to build the HTML report, for deeper inspection. 
+- **HTML report** (~700KB, self-contained) — 20+ sections including executive summary, interactive Gantt charts, investor pitch, SWOT, governance, team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections (premortem, self-audit, premise attacks). Opens in a browser with collapsible sections and interactive charts. +- **Zip** — intermediary pipeline files (JSON, MD, CSV) that fed the HTML report, for deeper inspection. --- diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index 19f506ba..3bc711e1 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -4,19 +4,19 @@ ### 1.1 What is PlanExe -PlanExe is a service that generates **rough-draft project plans** from a natural-language prompt. You describe a large goal (e.g. open a clinic, launch a product, build a moon base)—the kind of project that in reality takes months or years. PlanExe produces a structured draft: steps, documents, and deliverables. The plan is not executable in its current form; it is a draft to refine and act on. Creating a plan is a long-running task (100+ LLM inference calls): create a task with a prompt, poll status, then download the HTML report and zip when done. +PlanExe is a service that generates **strategic project-plan drafts** from a natural-language prompt. You describe a large goal (e.g. open a clinic, launch a product, build a moon base)—the kind of project that in reality takes months or years. PlanExe produces a structured draft with 20+ sections: steps, documents, and deliverables. The plan is not executable in its current form; it is a draft to refine and act on. Creating a plan is a long-running task (100+ LLM inference calls): create a task with a prompt, poll status, then download the HTML report and zip when done. 
### 1.2 What kind of plan does it create -The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifacts including a Gantt chart, risk analysis, and other project management deliverables. The main output is a large HTML file (approx 700KB) containing many sections. There is also a zip file containing all intermediary files (md, json, csv). Plan quality depends on prompt quality; use the prompt_examples tool to see the baseline before calling task_create. +The plan is a **project plan**: a DAG of steps (Luigi tasks) that produce artifacts including a Gantt chart, risk analysis, and other project management deliverables. The main output is a self-contained interactive HTML report (~700KB) with collapsible sections, interactive Gantt charts, and embedded JavaScript. The report contains 20+ sections including executive summary, investor pitch, project plan with SMART criteria, strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, plan review, Q&A, premortem with failure scenarios, self-audit checklist, and adversarial premise attacks. There is also a zip file containing all intermediary pipeline files (md, json, csv) that fed the report. Plan quality depends on prompt quality; use the prompt_examples tool to see the baseline before calling task_create. #### 1.2.1 Agent-facing summary (for server instructions / tool descriptions) Implementors should expose the following to agents so they understand what PlanExe does: -- **What:** PlanExe turns a plain-English goal into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. The plan is a draft to refine, not an executable or final document. -- **Required interaction order:** Call `prompt_examples` first. 
Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt (typically ~300-800 words) using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `task_create`. Then poll `task_status` (about every 5 minutes); use `task_download` (mcp_local helper) or `task_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. -- **Output:** Large HTML report (~700KB) and optional zip of intermediate files (md, json, csv). +- **What:** PlanExe turns a plain-English goal into a strategic project-plan draft (20+ sections) in ~10–20 min. Sections include executive summary, interactive Gantt charts, investor pitch, SWOT, governance, team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections (premortem, self-audit, premise attacks) that stress-test the plan. The output is a draft to refine, not an executable or final document — but it surfaces hard questions the prompter may not have considered. +- **Required interaction order:** Call `prompt_examples` first. Optional before `task_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `task_create`. 
Then poll `task_status` (about every 5 minutes); use `task_download` (mcp_local helper) or `task_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). To stop, call `task_stop` with the `task_id` from `task_create`. +- **Output:** Self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. The zip contains the intermediary pipeline files (md, json, csv) that fed the report. ### 1.3 Scope of this document @@ -158,6 +158,8 @@ All tool names below are normative. **Call this first.** Returns example prompts that define the baseline for what a good prompt looks like. Do not call task_create yet. Correct flow: call this tool; optionally call `model_profiles`; then complete a non-tool step (draft and approve a detailed prompt, typically ~300-800 words); only then call `task_create`. If you call `task_create` before formulating and approving a prompt, the resulting plan will be lower quality than it could be. +Write the prompt as flowing prose, not structured markdown with headers or bullet lists. Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance structure inline. Typical length: 300–800 words. The examples demonstrate this prose style — match their tone and density. + **Request:** no parameters (empty object). **Response:** @@ -278,7 +280,7 @@ The `prompt` parameter should be a detailed description of what the plan should - Budget/resources - Success criteria -Short one-liners (e.g., "Construct a bridge") tend to produce poor output because they lack context for the planning pipeline. Important details are location, budget, time frame. +Write as flowing prose, not structured markdown. Include banned approaches, governance preferences, and phasing inline. 
Short one-liners (e.g., "Construct a bridge") tend to produce poor output because they lack context for the planning pipeline. Important details are location, budget, time frame. **Counterexamples: when NOT to use PlanExe** @@ -332,7 +334,7 @@ For the full catalog file: ### 6.3 task_status -Returns task status and progress. Used for progress bars and UI states. **Polling interval:** call at reasonable intervals only (e.g. every 5 minutes); plan generation takes 15–20+ minutes and frequent polling is unnecessary. +Returns task status and progress. Used for progress bars and UI states. **Polling interval:** call at reasonable intervals only (e.g. every 5 minutes); plan generation typically takes 10–20 minutes (baseline profile) and may take longer on higher-quality profiles. **Request** diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 20eb2893..76045108 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -123,14 +123,21 @@ def ensure_taskitem_stop_columns() -> None: # Shown in MCP initialize (e.g. Inspector) so clients know what PlanExe does. PLANEXE_SERVER_INSTRUCTIONS = ( - "PlanExe generates rough-draft project plans from a natural-language prompt. " + "PlanExe generates strategic project-plan drafts from a natural-language prompt. " + "Output is a self-contained interactive HTML report (~700KB) with 20+ sections including " + "executive summary, interactive Gantt charts, risk analysis, SWOT, governance, investor pitch, " + "team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections " + "(premortem, self-audit checklist, premise attacks) that stress-test whether the plan holds up. " + "The output is a draft to refine, not final ground truth — but it surfaces hard questions the prompter may not have considered. " "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. 
" "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: call prompt_examples first. " "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " - "Then perform a non-tool step: draft a strong prompt (typically ~300-800 words) and get user approval. " + "Then perform a non-tool step: draft a strong prompt as flowing prose (not structured markdown with headers or bullets), " + "typically ~300-800 words, and get user approval. " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose — weave specs, constraints, and targets naturally into sentences. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_file_info when complete. " @@ -142,7 +149,8 @@ def ensure_taskitem_stop_columns() -> None: "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " "If task_status is in processing and output files do not change for longer than 20 minutes, the task_create likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " - "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." + "Main output: a self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. " + "The zip contains the intermediary pipeline files (md, json, csv) that fed the report." 
) mcp_cloud = Server("planexe-mcp-cloud", instructions=PLANEXE_SERVER_INSTRUCTIONS) @@ -1022,6 +1030,9 @@ class ToolDefinition: "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " "Next is a non-tool step: formulate a detailed prompt (typically ~300-800 words; use examples as a baseline, similar structure) and get user approval. " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose, not structured markdown with headers or bullet lists. " + "Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance preferences inline. " + "The examples demonstrate this prose style — match their tone and density. " "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), @@ -1042,7 +1053,13 @@ class ToolDefinition: name="task_create", description=( "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " - "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15–20 min. " + "PlanExe turns the approved prompt into a strategic project-plan draft (20+ sections) in ~10-20 min. " + "Sections include: executive summary, interactive Gantt charts, investor pitch, project plan with SMART criteria, " + "strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, " + "SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, " + "plan review (critical issues, KPIs, financial strategy, automation opportunities), Q&A, " + "premortem with failure scenarios, self-audit checklist, and adversarial premise attacks that argue against the project. 
" + "The adversarial sections (premortem, self-audit, premise attacks) surface risks and questions the prompter may not have considered. " "Returns task_id (UUID); use it for task_status, task_stop, and task_file_info. " "Save task_id immediately: there is no task_list tool, so a lost task_id cannot be recovered. " "Each task_create call creates a new task_id (no server-side dedup). " @@ -1057,8 +1074,8 @@ class ToolDefinition: name="task_status", description=( "Returns status and progress of the plan currently being created. " - "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15-20+ minutes " - "and frequent polling is unnecessary. " + "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " + "(baseline profile) and may take longer on higher-quality profiles. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " @@ -1086,7 +1103,9 @@ class ToolDefinition: name="task_file_info", description=( "Returns file metadata (content_type, download_url, download_size) for the report or zip artifact. " - "Use artifact='report' (default) for the final HTML deliverable; use artifact='zip' for underlying data files (md, json, csv). " + "Use artifact='report' (default) for the interactive HTML report (~700KB, self-contained with embedded JS " + "for collapsible sections and interactive Gantt charts — open in a browser). " + "Use artifact='zip' for the full pipeline output bundle (md, json, csv intermediary files that fed the report). " "While the task is still pending or processing, returns an empty object {} (no fields). " "Check readiness by testing whether download_url is present in the response. 
" "Once ready, present download_url to the user or fetch and save the file locally. " @@ -1220,6 +1239,9 @@ async def handle_prompt_examples(arguments: dict[str, Any]) -> CallToolResult: "message": ( "Next: complete the non-tool step by drafting a detailed prompt (typically ~300-800 words) using these as a baseline (similar structure), then get user approval. " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose, not structured markdown with headers or bullet lists. " + "Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance preferences inline. " + "The examples demonstrate this prose style — match their tone and density. " "Only after approval, call task_create. " "Do not use PlanExe for tiny one-shot requests (e.g., rewrite this email, summarize this document). " "PlanExe always runs the full fixed planning pipeline; callers cannot run only selected internal steps." 
diff --git a/mcp_cloud/http_server.py b/mcp_cloud/http_server.py index 0e0212e9..5087bb7d 100644 --- a/mcp_cloud/http_server.py +++ b/mcp_cloud/http_server.py @@ -432,7 +432,7 @@ async def _lifespan(app: FastAPI): app = FastAPI( title="PlanExe – AI Project Planning", - description="MCP server that generates rough-draft project plans from a natural-language prompt", + description="MCP server that generates strategic project-plan drafts from a natural-language prompt", version="1.0.0", lifespan=_lifespan, ) diff --git a/mcp_cloud/server.json b/mcp_cloud/server.json index e98a533f..a4e18fb2 100644 --- a/mcp_cloud/server.json +++ b/mcp_cloud/server.json @@ -2,7 +2,7 @@ "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json", "name": "io.github.PlanExeOrg/planexe", "title": "PlanExe", - "description": "MCP server for generating rough-draft project plans from natural-language prompts.", + "description": "MCP server for generating strategic project-plan drafts (20+ sections including adversarial analysis) from natural-language prompts.", "repository": { "url": "https://github.com/PlanExeOrg/PlanExe", "source": "github" diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 6ffbc727..fd267876 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -211,6 +211,8 @@ class TaskCreateInput(BaseModel): "For best results, provide a detailed prompt (typically ~300-800 words). " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, " "budget/resources, and success criteria. " + "Write as flowing prose, not structured markdown. Include banned approaches, " + "governance preferences, and phasing inline. " "Short prompts produce less detailed plans. " "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist); use direct LLM responses for those." 
), diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 6c03f310..2d2e47a5 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -327,6 +327,8 @@ class ToolDefinition: "Use prompt_examples to get example prompts; use these as examples for task_create. " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, " "budget/resources, and success criteria. " + "Write as flowing prose, not structured markdown. Include banned approaches, " + "governance preferences, and phasing inline. " "Short prompts produce less detailed plans. " "Do not use task_create for tiny one-shot outputs (e.g., a 5-point checklist)." ), @@ -528,6 +530,9 @@ class ToolDefinition: "Do NOT call task_create yet. Optional before task_create: call model_profiles to choose model_profile. " "Next is a non-tool step: formulate a prompt (use examples as a baseline, similar structure) and get user approval. " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose, not structured markdown with headers or bullet lists. " + "Weave technical specs, constraints, and targets naturally into sentences. Include banned words/approaches and governance preferences inline. " + "The examples demonstrate this prose style — match their tone and density. " "Then call task_create. " "PlanExe is not for tiny one-shot outputs like a 5-point checklist; and it does not support selecting only some internal pipeline steps." ), @@ -548,7 +553,13 @@ class ToolDefinition: name="task_create", description=( "Call only after prompt_examples and after you have completed prompt drafting/approval (non-tool step). " - "PlanExe turns the approved prompt into a structured strategic-plan draft (executive summary, Gantt, risk register, governance, etc.) in ~15-20 min. 
" + "PlanExe turns the approved prompt into a strategic project-plan draft (20+ sections) in ~10-20 min. " + "Sections include: executive summary, interactive Gantt charts, investor pitch, project plan with SMART criteria, " + "strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, " + "SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, " + "plan review (critical issues, KPIs, financial strategy, automation opportunities), Q&A, " + "premortem with failure scenarios, self-audit checklist, and adversarial premise attacks that argue against the project. " + "The adversarial sections (premortem, self-audit, premise attacks) surface risks and questions the prompter may not have considered. " "Returns task_id (UUID); use it for task_status, task_stop, and task_download. " "Save task_id immediately: there is no task_list tool, so a lost task_id cannot be recovered. " "Each task_create call creates a new task_id (proxied to cloud; no server-side dedup). " @@ -563,8 +574,8 @@ class ToolDefinition: name="task_status", description=( "Returns status and progress of the plan currently being created. " - "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation takes 15-20+ minutes " - "and frequent polling is unnecessary. " + "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " + "(baseline profile) and may take longer on higher-quality profiles. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " @@ -592,7 +603,9 @@ class ToolDefinition: name="task_download", description=( "Download the plan output and save it locally to PLANEXE_PATH. 
" - "Use artifact='report' (default) for the final HTML deliverable; use artifact='zip' for underlying data files (md, json, csv). " + "Use artifact='report' (default) for the interactive HTML report (~700KB, self-contained with embedded JS " + "for collapsible sections and interactive Gantt charts — open in a browser). " + "Use artifact='zip' for the full pipeline output bundle (md, json, csv intermediary files that fed the report). " "If PLANEXE_PATH is unset, files are saved to the current working directory. " "Filename format is - with numeric suffixes when collisions occur. " "Common local error codes: DOWNLOAD_FAILED, REMOTE_ERROR." @@ -604,14 +617,21 @@ class ToolDefinition: # Shown in MCP initialize response (e.g. Inspector) so clients know what PlanExe is. PLANEXE_SERVER_INSTRUCTIONS = ( - "PlanExe generates rough-draft project plans from a natural-language prompt. " + "PlanExe generates strategic project-plan drafts from a natural-language prompt. " + "Output is a self-contained interactive HTML report (~700KB) with 20+ sections including " + "executive summary, interactive Gantt charts, risk analysis, SWOT, governance, investor pitch, " + "team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections " + "(premortem, self-audit checklist, premise attacks) that stress-test whether the plan holds up. " + "The output is a draft to refine, not final ground truth — but it surfaces hard questions the prompter may not have considered. " "Use PlanExe for substantial multi-phase projects with constraints, stakeholders, budgets, and timelines. " "Do not use PlanExe for tiny one-shot outputs (for example: 'give me a 5-point checklist'); use a normal LLM response for that. " "The planning pipeline is fixed end-to-end; callers cannot select individual internal pipeline steps to run. " "Required interaction order: call prompt_examples first. 
" "Optional before task_create: call model_profiles to see profile guidance and available models in each profile. " - "Then perform a non-tool step: draft a strong prompt and get user approval. " + "Then perform a non-tool step: draft a strong prompt as flowing prose (not structured markdown with headers or bullets), " + "typically ~300-800 words, and get user approval. " "Good prompt shape: objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. " + "Write the prompt as flowing prose — weave specs, constraints, and targets naturally into sentences. " "Only after approval, call task_create. " "Each task_create call creates a new task_id; the server does not enforce a global per-client concurrency limit. " "Then poll task_status (about every 5 minutes); use task_download when complete. " @@ -623,7 +643,8 @@ class ToolDefinition: "Troubleshooting: if task_status stays in pending for longer than 5 minutes, the task was likely queued but not picked up by a worker (server issue). " "If task_status is in processing and output files do not change for longer than 20 minutes, the run likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " - "Main output: large HTML report (~700KB) and zip of intermediary files (md, json, csv)." + "Main output: a self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. " + "The zip contains the intermediary pipeline files (md, json, csv) that fed the report." ) mcp_local = Server("planexe-mcp-local", instructions=PLANEXE_SERVER_INSTRUCTIONS) diff --git a/skills/planexe-mcp/SKILL.md b/skills/planexe-mcp/SKILL.md index 1519301e..da9de8fa 100644 --- a/skills/planexe-mcp/SKILL.md +++ b/skills/planexe-mcp/SKILL.md @@ -120,7 +120,6 @@ Create a new planning task. This is the main entry point for generating plans. 
"name": "task_create", "arguments": { "prompt": "Create a project launch plan for Q2 2026", - "speed_vs_detail": "all", "model_profile": "premium", "user_api_key": "your_optional_api_key" } @@ -129,12 +128,8 @@ Create a new planning task. This is the main entry point for generating plans. ``` **Parameter Guide:** -- `prompt` (required): Your planning request in natural language -- `speed_vs_detail` (required): One of `"ping"`, `"fast"`, or `"all"` - - `"ping"`: Quick outline (~2-5 min) - - `"fast"`: Standard plan (~10-15 min) - - `"all"`: Comprehensive analysis (~20-30+ min) -- `model_profile` (required): One of `"baseline"`, `"premium"`, `"frontier"`, or `"custom"` +- `prompt` (required): Your planning request in natural language. Write as flowing prose (not structured markdown), typically 300-800 words. +- `model_profile` (optional): One of `"baseline"`, `"premium"`, `"frontier"`, or `"custom"`. Defaults to `"baseline"`. - `user_api_key` (optional): Your PlanExe API key (if not set in environment) **Returns:** `task_id` for polling status and retrieving results. @@ -160,9 +155,9 @@ Poll the status of a running planning task. } ``` -**Usage:** Planning tasks take 15-20+ minutes. Poll every 5+ minutes to check progress. +**Usage:** Planning tasks typically take 10-20 minutes (baseline profile). Poll every 5+ minutes to check progress. -**Returns:** Current status (`queued`, `running`, `completed`, `failed`), progress percentage, and estimated time remaining. +**Returns:** Current status (`pending`, `processing`, `completed`, `failed`), and progress percentage. --- @@ -210,8 +205,8 @@ Retrieve download information for completed plan artifacts. ``` **Artifact Options:** -- `"report"`: Markdown/PDF plan document -- `"zip"`: Complete deliverables package +- `"report"`: Interactive HTML report (~700KB, self-contained — open in a browser) +- `"zip"`: Pipeline output bundle (md, json, csv intermediary files) **Returns:** `download_url` for accessing the artifact. 
From f25dedf8b94e5d7d27e8a441ccb1d6895dbe10d7 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 23:39:41 +0100 Subject: [PATCH 34/38] mcp task_create default to the "admin" user --- mcp_cloud/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcp_cloud/app.py b/mcp_cloud/app.py index 76045108..57191e12 100644 --- a/mcp_cloud/app.py +++ b/mcp_cloud/app.py @@ -305,7 +305,7 @@ def _create_task_sync( task = TaskItem( prompt=prompt, state=TaskState.pending, - user_id=metadata.get("user_id", "mcp_user") if metadata else "mcp_user", + user_id=metadata.get("user_id", "admin") if metadata else "admin", parameters=parameters, ) db.session.add(task) From 0ff45c59e4152dabf304818e3e24c6da31822b4d Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 23 Feb 2026 23:54:23 +0100 Subject: [PATCH 35/38] documentation tweaks --- public/llms.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/public/llms.txt b/public/llms.txt index 1035aed1..0b921d03 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -19,9 +19,10 @@ Important pipeline constraint: ## What PlanExe Produces -- A long-running planning task (often ~10-20 minutes, depending on model and configuration). -- A large HTML report and optional zip bundle of intermediate artifacts. -- Typical plan sections: executive summary, work breakdown, timeline/Gantt-style schedule, risks, assumptions, and governance guidance. +- A long-running planning task (typically ~10-20 minutes on baseline profile, longer on higher-quality profiles). +- A self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. Optional zip bundle of intermediary pipeline files (md, json, csv). 
+- The report contains 20+ sections including: executive summary, interactive Gantt charts, investor pitch, project plan with SMART criteria, strategic decision analysis, scenario comparison, assumptions with expert review, governance structure, SWOT analysis, team role profiles, simulated expert criticism, work breakdown structure, plan review (critical issues, KPIs, financial strategy, automation opportunities), Q&A, premortem with failure scenarios, self-audit checklist, and adversarial premise attacks. +- The adversarial sections (premortem, self-audit, premise attacks) stress-test whether the plan holds up and surface risks the prompter may not have considered. - Output is a draft to refine, not final ground truth. Runtime, quality, and cost tradeoffs: @@ -71,7 +72,7 @@ The MCP server exposes tool-based workflows (not MCP tasks protocol): Key tool inputs/outputs: - task_create inputs: prompt (required), model_profile (optional: baseline | premium | frontier | custom). -- task_create prompt quality: for best results, provide a detailed prompt (typically ~300-800 words) with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. +- task_create prompt quality: for best results, provide a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. - model_profiles output: profile guidance + currently available models in each profile. - model_profiles returns `MODEL_PROFILES_UNAVAILABLE` when no models are available in any profile. - task_create hidden developer metadata: speed_vs_detail (ping | fast | all), provided via tool-specific metadata (not visible tool schema). @@ -95,7 +96,7 @@ Minimal error-handling contract: Recommended interaction order: 1. Call prompt_examples. 2. Optionally call model_profiles to choose model_profile based on current availability. -3. 
Non-tool step: prepare and approve a strong prompt (typically ~300-800 words) with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. +3. Non-tool step: prepare and approve a strong prompt as flowing prose (not structured markdown), typically ~300-800 words, with objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria. 4. Call task_create. 5. Poll task_status until complete (repeat every 5 minutes). 6. Use task_file_info to get download URLs. From d7af3017bde089ec8e956967f5ac06828313508b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 24 Feb 2026 00:25:52 +0100 Subject: [PATCH 36/38] Prevent the UI from showing Internal Server Error, when there is garbage in the TaskItem.prompt field. Prevent inserting invalid UTF8 in the prompt field. --- database_api/model_taskitem.py | 32 ++++++ database_api/tests/test_taskitem_model.py | 33 ++++++ frontend_multi_user/src/app.py | 132 +++++++++++++++++----- 3 files changed, 168 insertions(+), 29 deletions(-) diff --git a/database_api/model_taskitem.py b/database_api/model_taskitem.py index 28ac5e4a..190bc2df 100644 --- a/database_api/model_taskitem.py +++ b/database_api/model_taskitem.py @@ -5,6 +5,31 @@ from sqlalchemy_utils import UUIDType from sqlalchemy import JSON from sqlalchemy.orm import column_property +from sqlalchemy import event + + +def _sanitize_utf8_text(value): + """Normalize values into valid UTF-8-safe text for persistence.""" + if value is None: + return None + + if isinstance(value, str): + text = value + elif isinstance(value, (bytes, bytearray, memoryview)): + text = bytes(value).decode("utf-8", errors="replace") + else: + text = str(value) + + # Postgres text does not support embedded NULL bytes. + if "\x00" in text: + text = text.replace("\x00", "") + + # Replace unpaired surrogates or other non-encodable code points. 
+ try: + text.encode("utf-8", errors="strict") + except UnicodeEncodeError: + text = text.encode("utf-8", errors="replace").decode("utf-8") + return text class TaskState(enum.Enum): pending = 1 @@ -113,3 +138,10 @@ def demo_items(cls) -> list['TaskItem']: } ) return [task1, task2, task3] + + +@event.listens_for(TaskItem, "before_insert") +@event.listens_for(TaskItem, "before_update") +def _sanitize_taskitem_fields(_mapper, _connection, target): + # Enforce valid UTF-8-safe prompt text regardless of writer path. + target.prompt = _sanitize_utf8_text(target.prompt) diff --git a/database_api/tests/test_taskitem_model.py b/database_api/tests/test_taskitem_model.py index 248b829e..5ad311a8 100644 --- a/database_api/tests/test_taskitem_model.py +++ b/database_api/tests/test_taskitem_model.py @@ -39,3 +39,36 @@ def test_stop_request_fields_default(self): self.assertTrue(hasattr(fetched, "run_activity_overview_json")) self.assertTrue(hasattr(fetched, "run_artifact_layout_version")) self.assertFalse(bool(fetched.stop_requested)) + + def test_prompt_invalid_bytes_are_sanitized(self): + with self.app.app_context(): + bad_bytes = b"Hello \xe2\x80 world" + task = TaskItem( + state=TaskState.pending, + prompt=bad_bytes, + user_id="test_user", + ) + db.session.add(task) + db.session.commit() + + fetched = db.session.get(TaskItem, task.id) + self.assertIsInstance(fetched.prompt, str) + # Must be encodable after sanitization. 
+ fetched.prompt.encode("utf-8") + self.assertIn("Hello", fetched.prompt) + self.assertIn("world", fetched.prompt) + + def test_prompt_surrogates_are_sanitized(self): + with self.app.app_context(): + task = TaskItem( + state=TaskState.pending, + prompt="prefix \ud800 suffix", + user_id="test_user", + ) + db.session.add(task) + db.session.commit() + + fetched = db.session.get(TaskItem, task.id) + self.assertIsInstance(fetched.prompt, str) + fetched.prompt.encode("utf-8") + self.assertFalse(any(0xD800 <= ord(ch) <= 0xDFFF for ch in fetched.prompt)) diff --git a/frontend_multi_user/src/app.py b/frontend_multi_user/src/app.py index bfdccc34..baea60df 100644 --- a/frontend_multi_user/src/app.py +++ b/frontend_multi_user/src/app.py @@ -37,7 +37,7 @@ from worker_plan_api.filenames import FilenameEnum, ExtraFilenameEnum from worker_plan_api.prompt_catalog import PromptCatalog from sqlalchemy import text, inspect, func -from sqlalchemy.exc import OperationalError +from sqlalchemy.exc import OperationalError, DataError from database_api.model_taskitem import TaskItem, TaskState from database_api.model_event import EventType, EventItem from database_api.model_worker import WorkerItem @@ -1152,6 +1152,31 @@ def _format_relative_time(value: Any) -> str: n = seconds return f"{n} sec" if n == 1 else f"{n} secs" + def _load_prompt_preview_safe(self, task_id: Any, max_chars: int = 240) -> str: + """Load a prompt preview for one task, tolerating corrupted UTF-8 rows.""" + try: + preview = ( + self.db.session.query(func.substr(TaskItem.prompt, 1, max_chars)) + .filter(TaskItem.id == task_id) + .scalar() + ) + text = (preview or "").strip() + if text: + return text + except DataError: + self.db.session.rollback() + logger.warning( + "Detected invalid UTF-8 in task_item.prompt for task_id=%s; using placeholder preview.", + task_id, + exc_info=True, + ) + return "[Prompt unavailable due to encoding issue]" + except Exception: + self.db.session.rollback() + logger.debug("Unable to load 
prompt preview for task_id=%s", task_id, exc_info=True) + + return "[Prompt unavailable]" + def _get_current_user_account(self) -> Optional[UserAccount]: if not current_user.is_authenticated: return None @@ -2016,25 +2041,50 @@ def index(): if user_id: # Generate a nonce so the user can start a plan from the dashboard nonce = 'DASH_' + str(uuid.uuid4()) - recent_task_rows = ( - self.db.session.query( - TaskItem.id, - TaskItem.state, - func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + try: + recent_task_rows = ( + self.db.session.query( + TaskItem.id, + TaskItem.state, + func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + ) + .filter(TaskItem.user_id == str(user_id)) + .order_by(TaskItem.timestamp_created.desc()) + .limit(10) + .all() ) - .filter(TaskItem.user_id == str(user_id)) - .order_by(TaskItem.timestamp_created.desc()) - .limit(10) - .all() - ) - recent_tasks = [ - SimpleNamespace( - id=str(task.id), - state=task.state if isinstance(task.state, TaskState) else None, - prompt=(task.prompt_preview or "").strip(), + except DataError: + self.db.session.rollback() + logger.warning( + "Detected invalid UTF-8 in task_item.prompt for user_id=%s while loading dashboard; " + "falling back without prompt previews.", + user_id, + exc_info=True, + ) + recent_task_rows = ( + self.db.session.query( + TaskItem.id, + TaskItem.state, + ) + .filter(TaskItem.user_id == str(user_id)) + .order_by(TaskItem.timestamp_created.desc()) + .limit(10) + .all() + ) + recent_tasks = [] + for task in recent_task_rows: + prompt_preview = getattr(task, "prompt_preview", None) + if prompt_preview is None: + prompt_text = self._load_prompt_preview_safe(task.id) + else: + prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" + recent_tasks.append( + SimpleNamespace( + id=str(task.id), + state=task.state if isinstance(task.state, TaskState) else None, + prompt=prompt_text, + ) ) - for task in recent_task_rows - ] total_tasks_count = ( TaskItem.query 
.filter_by(user_id=str(user_id)) @@ -2897,27 +2947,51 @@ def plan(): if not run_id: user_id = str(current_user.id) - tasks = ( - self.db.session.query( - TaskItem.id, - TaskItem.timestamp_created, - TaskItem.state, - func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + try: + tasks = ( + self.db.session.query( + TaskItem.id, + TaskItem.timestamp_created, + TaskItem.state, + func.substr(TaskItem.prompt, 1, 240).label("prompt_preview"), + ) + .filter(TaskItem.user_id == user_id) + .order_by(TaskItem.timestamp_created.desc()) + .all() + ) + except DataError: + self.db.session.rollback() + logger.warning( + "Detected invalid UTF-8 in task_item.prompt for user_id=%s while loading /plan; " + "falling back without prompt previews.", + user_id, + exc_info=True, + ) + tasks = ( + self.db.session.query( + TaskItem.id, + TaskItem.timestamp_created, + TaskItem.state, + ) + .filter(TaskItem.user_id == user_id) + .order_by(TaskItem.timestamp_created.desc()) + .all() ) - .filter(TaskItem.user_id == user_id) - .order_by(TaskItem.timestamp_created.desc()) - .all() - ) rows = [] for task in tasks: ts = task.timestamp_created created_compact = ts.strftime("%y%m%d-%H%M") if isinstance(ts, datetime) else "-" + prompt_preview = getattr(task, "prompt_preview", None) + if prompt_preview is None: + prompt_text = self._load_prompt_preview_safe(task.id) + else: + prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" rows.append({ "id": str(task.id), "created_compact": created_compact, "created_relative": self._format_relative_time(ts), "status": task.state.name if isinstance(task.state, TaskState) else "pending", - "prompt": (task.prompt_preview or "").strip(), + "prompt": prompt_text, }) return render_template("plan_list.html", plan_rows=rows) From 874d4299a08b93603d452f62bdab82d7f7f6782a Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 24 Feb 2026 01:04:52 +0100 Subject: [PATCH 37/38] Edit TaskItem rows, by using file upload/download. 
So the textfield doesn't have to show a 3mb file, which would cause the UI to hang forever. --- frontend_multi_user/src/planexe_modelviews.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/frontend_multi_user/src/planexe_modelviews.py b/frontend_multi_user/src/planexe_modelviews.py index e28b41e8..c8205039 100644 --- a/frontend_multi_user/src/planexe_modelviews.py +++ b/frontend_multi_user/src/planexe_modelviews.py @@ -14,6 +14,7 @@ from flask import url_for, abort, redirect, Response from flask_login import current_user from sqlalchemy.orm import defer +from wtforms import FileField, BooleanField class AdminOnlyModelView(ModelView): """Restrict admin views to authenticated admin users only.""" @@ -152,6 +153,19 @@ class TaskItemView(AdminOnlyModelView): f'Download' ) if m.has_run_track_activity_jsonl else '—', } + form_excluded_columns = [ + 'generated_report_html', + 'run_zip_snapshot', + 'run_track_activity_jsonl', + ] + form_extra_fields = { + 'generated_report_html_upload': FileField('Upload Report HTML'), + 'generated_report_html_clear': BooleanField('Clear existing report HTML'), + 'run_zip_snapshot_upload': FileField('Upload Run ZIP'), + 'run_zip_snapshot_clear': BooleanField('Clear existing run ZIP'), + 'run_track_activity_jsonl_upload': FileField('Upload Track Activity JSONL'), + 'run_track_activity_jsonl_clear': BooleanField('Clear existing track activity JSONL'), + } def get_query(self): return super().get_query().options( @@ -160,6 +174,68 @@ def get_query(self): defer(self.model.run_track_activity_jsonl), ) + def on_form_prefill(self, form, id): + model = self.get_one(id) + if model is None: + return + + if hasattr(form, "generated_report_html_upload"): + if model.has_generated_report_html: + href = url_for("download_task_report", task_id=str(model.id)) + form.generated_report_html_upload.description = Markup( + f'Current file: download report.html' + ) + else: + form.generated_report_html_upload.description = "Current file: 
none" + + if hasattr(form, "run_zip_snapshot_upload"): + if model.has_run_zip_snapshot: + href = url_for("download_task_run_zip", task_id=str(model.id)) + form.run_zip_snapshot_upload.description = Markup( + f'Current file: download run.zip' + ) + else: + form.run_zip_snapshot_upload.description = "Current file: none" + + if hasattr(form, "run_track_activity_jsonl_upload"): + if model.has_run_track_activity_jsonl: + href = url_for("download_task_track_activity", task_id=str(model.id)) + form.run_track_activity_jsonl_upload.description = Markup( + f'Current file: download track_activity.jsonl' + ) + else: + form.run_track_activity_jsonl_upload.description = "Current file: none" + + def on_model_change(self, form, model, is_created): + def _read_upload(field_name: str): + field = getattr(form, field_name, None) + data = getattr(field, "data", None) if field is not None else None + filename = getattr(data, "filename", None) if data is not None else None + if not data or not filename: + return None + return data.read() + + uploaded_report = _read_upload("generated_report_html_upload") + uploaded_zip = _read_upload("run_zip_snapshot_upload") + uploaded_track = _read_upload("run_track_activity_jsonl_upload") + + if uploaded_report is not None: + model.generated_report_html = uploaded_report.decode("utf-8", errors="replace") + elif bool(getattr(form.generated_report_html_clear, "data", False)): + model.generated_report_html = None + + if uploaded_zip is not None: + model.run_zip_snapshot = uploaded_zip + elif bool(getattr(form.run_zip_snapshot_clear, "data", False)): + model.run_zip_snapshot = None + + if uploaded_track is not None: + model.run_track_activity_jsonl = uploaded_track.decode("utf-8", errors="replace") + elif bool(getattr(form.run_track_activity_jsonl_clear, "data", False)): + model.run_track_activity_jsonl = None + + return super().on_model_change(form, model, is_created) + class NonceItemView(AdminOnlyModelView): """Custom ModelView for NonceItem""" def 
__init__(self, model, *args, **kwargs): From 2f8e6648a2b5b248aa2dd085b5abad257ad04b74 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 24 Feb 2026 01:27:56 +0100 Subject: [PATCH 38/38] CI typecheck error --- frontend_multi_user/src/planexe_modelviews.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend_multi_user/src/planexe_modelviews.py b/frontend_multi_user/src/planexe_modelviews.py index c8205039..7f17581b 100644 --- a/frontend_multi_user/src/planexe_modelviews.py +++ b/frontend_multi_user/src/planexe_modelviews.py @@ -8,6 +8,7 @@ from datetime import datetime from decimal import Decimal from enum import Enum +from typing import Any from flask_admin.contrib.sqla import ModelView from flask_admin.actions import action from markupsafe import Markup @@ -174,7 +175,7 @@ def get_query(self): defer(self.model.run_track_activity_jsonl), ) - def on_form_prefill(self, form, id): + def on_form_prefill(self, form: Any, id: Any) -> None: model = self.get_one(id) if model is None: return @@ -206,7 +207,7 @@ def on_form_prefill(self, form, id): else: form.run_track_activity_jsonl_upload.description = "Current file: none" - def on_model_change(self, form, model, is_created): + def on_model_change(self, form: Any, model: Any, is_created: bool) -> None: def _read_upload(field_name: str): field = getattr(form, field_name, None) data = getattr(field, "data", None) if field is not None else None