From 2684e5eecae421423fb6fe066aa754a43500c895 Mon Sep 17 00:00:00 2001 From: johnxie Date: Tue, 24 Feb 2026 14:15:22 -0800 Subject: [PATCH 1/3] docs(pack-by): import Devika, BabyAGI, and Kiro tutorial packs Adds three high-impact tutorial tracks covering the autonomous AI software engineer and agentic IDE ecosystem gaps in the catalog. ## Tutorials Added - **devika-tutorial** (`stitionai/devika`, ~19.5k stars) Open-source autonomous AI software engineer (Devin alternative). 8 chapters: getting started through production operations. - **babyagi-tutorial** (`yoheinakajima/babyagi`, ~18k stars) The original viral AI task agent framework (March 2023). 8 chapters: foundations through BabyAGI 2o/3 evolution. - **kiro-tutorial** (`kirodotdev/Kiro`, AWS-backed) AWS's spec-driven agentic IDE with steering, hooks, and MCP. 8 chapters: getting started through team governance. ## Catalog Updates - IMPORT_ROADMAP_TODO.md updated with pack-by entries (all done) - discoverability assets regenerated (tutorial_count=191) - tutorials/README.md snapshot updated (191 dirs, 1,732 files) - llms.txt and llms-full.txt refreshed Co-Authored-By: Claude Sonnet 4.6 --- IMPORT_ROADMAP_TODO.md | 3 + discoverability/query-hub.md | 2 +- discoverability/search-intent-map.md | 10 +- discoverability/tutorial-directory.md | 8 +- discoverability/tutorial-index.json | 97 +++- discoverability/tutorial-itemlist.schema.json | 363 ++++++++------- llms-full.txt | 18 + llms.txt | 3 + tutorials/README.md | 32 +- .../babyagi-tutorial/01-getting-started.md | 282 ++++++++++++ ...-architecture-task-queue-and-agent-loop.md | 298 ++++++++++++ ...m-backend-integration-and-configuration.md | 309 +++++++++++++ ...task-creation-and-prioritization-engine.md | 315 +++++++++++++ ...ry-systems-and-vector-store-integration.md | 313 +++++++++++++ ...tending-babyagi-custom-tools-and-skills.md | 328 +++++++++++++ ...gi-evolution-2o-and-functionz-framework.md | 329 +++++++++++++ ...ction-patterns-and-research-adaptations.md | 
354 ++++++++++++++ tutorials/babyagi-tutorial/index.md | 111 +++++ .../devika-tutorial/01-getting-started.md | 227 +++++++++ .../02-architecture-and-agent-pipeline.md | 228 +++++++++ .../03-llm-provider-configuration.md | 228 +++++++++ .../04-task-planning-and-code-generation.md | 228 +++++++++ ...05-web-research-and-browser-integration.md | 228 +++++++++ .../06-project-management-and-workspaces.md | 228 +++++++++ .../07-debugging-and-troubleshooting.md | 228 +++++++++ ...08-production-operations-and-governance.md | 227 +++++++++ tutorials/devika-tutorial/index.md | 109 +++++ tutorials/kiro-tutorial/01-getting-started.md | 316 +++++++++++++ .../02-spec-driven-development-workflow.md | 390 ++++++++++++++++ ...-agent-steering-and-rules-configuration.md | 387 ++++++++++++++++ .../kiro-tutorial/04-autonomous-agent-mode.md | 388 ++++++++++++++++ .../05-mcp-integration-and-external-tools.md | 433 ++++++++++++++++++ .../kiro-tutorial/06-hooks-and-automation.md | 421 +++++++++++++++++ .../07-multi-model-strategy-and-providers.md | 390 ++++++++++++++++ .../08-team-operations-and-governance.md | 431 +++++++++++++++++ tutorials/kiro-tutorial/index.md | 113 +++++ 36 files changed, 8180 insertions(+), 195 deletions(-) create mode 100644 tutorials/babyagi-tutorial/01-getting-started.md create mode 100644 tutorials/babyagi-tutorial/02-core-architecture-task-queue-and-agent-loop.md create mode 100644 tutorials/babyagi-tutorial/03-llm-backend-integration-and-configuration.md create mode 100644 tutorials/babyagi-tutorial/04-task-creation-and-prioritization-engine.md create mode 100644 tutorials/babyagi-tutorial/05-memory-systems-and-vector-store-integration.md create mode 100644 tutorials/babyagi-tutorial/06-extending-babyagi-custom-tools-and-skills.md create mode 100644 tutorials/babyagi-tutorial/07-babyagi-evolution-2o-and-functionz-framework.md create mode 100644 tutorials/babyagi-tutorial/08-production-patterns-and-research-adaptations.md create mode 100644 
tutorials/babyagi-tutorial/index.md create mode 100644 tutorials/devika-tutorial/01-getting-started.md create mode 100644 tutorials/devika-tutorial/02-architecture-and-agent-pipeline.md create mode 100644 tutorials/devika-tutorial/03-llm-provider-configuration.md create mode 100644 tutorials/devika-tutorial/04-task-planning-and-code-generation.md create mode 100644 tutorials/devika-tutorial/05-web-research-and-browser-integration.md create mode 100644 tutorials/devika-tutorial/06-project-management-and-workspaces.md create mode 100644 tutorials/devika-tutorial/07-debugging-and-troubleshooting.md create mode 100644 tutorials/devika-tutorial/08-production-operations-and-governance.md create mode 100644 tutorials/devika-tutorial/index.md create mode 100644 tutorials/kiro-tutorial/01-getting-started.md create mode 100644 tutorials/kiro-tutorial/02-spec-driven-development-workflow.md create mode 100644 tutorials/kiro-tutorial/03-agent-steering-and-rules-configuration.md create mode 100644 tutorials/kiro-tutorial/04-autonomous-agent-mode.md create mode 100644 tutorials/kiro-tutorial/05-mcp-integration-and-external-tools.md create mode 100644 tutorials/kiro-tutorial/06-hooks-and-automation.md create mode 100644 tutorials/kiro-tutorial/07-multi-model-strategy-and-providers.md create mode 100644 tutorials/kiro-tutorial/08-team-operations-and-governance.md create mode 100644 tutorials/kiro-tutorial/index.md diff --git a/IMPORT_ROADMAP_TODO.md b/IMPORT_ROADMAP_TODO.md index 5685c173..bb655618 100644 --- a/IMPORT_ROADMAP_TODO.md +++ b/IMPORT_ROADMAP_TODO.md @@ -8,6 +8,9 @@ This roadmap tracks the next highest-impact tutorial imports for `awesome-code-d | Repo | Stars | Priority | Pack | Status | |:-----|------:|:---------|:-----|:-------| +| [`stitionai/devika`](https://github.com/stitionai/devika) | 19.5k+ | P0 | `pack-by` | done | +| [`yoheinakajima/babyagi`](https://github.com/yoheinakajima/babyagi) | 18k+ | P0 | `pack-by` | done | +| 
[`kirodotdev/Kiro`](https://github.com/kirodotdev/Kiro) | 1.8k+ (AWS) | P0 | `pack-by` | done | | [`anomalyco/opencode`](https://github.com/anomalyco/opencode) | 102k+ | P0 | `pack-ae` | done | | [`mastra-ai/mastra`](https://github.com/mastra-ai/mastra) | 21k+ | P0 | `pack-ae` | done | | [`langflow-ai/langflow`](https://github.com/langflow-ai/langflow) | 144k+ | P0 | `pack-af` | done | diff --git a/discoverability/query-hub.md b/discoverability/query-hub.md index 7849db73..625650a3 100644 --- a/discoverability/query-hub.md +++ b/discoverability/query-hub.md @@ -2,7 +2,7 @@ Auto-generated high-intent query landing surface mapped to the most relevant tutorials. -- Total tutorials indexed: **188** +- Total tutorials indexed: **191** - Query hubs: **6** - Source: `scripts/generate_discoverability_assets.py` diff --git a/discoverability/search-intent-map.md b/discoverability/search-intent-map.md index 2d801e4a..6f8f06bb 100644 --- a/discoverability/search-intent-map.md +++ b/discoverability/search-intent-map.md @@ -2,7 +2,7 @@ Auto-generated topical clusters to strengthen internal linking and query-to-tutorial mapping. 
-- Total tutorials: **188** +- Total tutorials: **191** - Total clusters: **9** - Source: `scripts/generate_discoverability_assets.py` @@ -64,7 +64,7 @@ Auto-generated topical clusters to strengthen internal linking and query-to-tuto ## ai-coding-agents -- tutorial_count: **82** +- tutorial_count: **85** - [ADK Python Tutorial: Production-Grade Agent Engineering with Google's ADK](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/adk-python-tutorial/index.md) - intents: production-operations, agentic-coding @@ -90,6 +90,8 @@ Auto-generated topical clusters to strengthen internal linking and query-to-tuto - intents: tool-selection, agentic-coding - [Awesome Claude Skills Tutorial: High-Signal Skill Discovery and Reuse for Claude Workflows](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/awesome-claude-skills-tutorial/index.md) - intents: tool-selection, agentic-coding +- [BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/babyagi-tutorial/index.md) + - intents: agentic-coding - [Beads Tutorial: Git-Backed Task Graph Memory for Coding Agents](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/beads-tutorial/index.md) - intents: agentic-coding - [Browser Use Tutorial: AI-Powered Web Automation Agents](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/browser-use-tutorial/index.md) @@ -114,9 +116,7 @@ Auto-generated topical clusters to strengthen internal linking and query-to-tuto - intents: agentic-coding - [CodeMachine CLI Tutorial: Orchestrating Long-Running Coding Agent Workflows](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/codemachine-cli-tutorial/index.md) - intents: agentic-coding -- [Codex Analysis Platform Tutorial: Build Code Intelligence Systems](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/codex-analysis-platform/index.md) - - intents: production-operations, 
architecture-deep-dive, agentic-coding -- ... plus 57 more tutorials in this cluster +- ... plus 60 more tutorials in this cluster ## data-and-storage diff --git a/discoverability/tutorial-directory.md b/discoverability/tutorial-directory.md index 3ac2f309..b647984b 100644 --- a/discoverability/tutorial-directory.md +++ b/discoverability/tutorial-directory.md @@ -2,7 +2,7 @@ This page is auto-generated from the tutorial index and is intended as a fast browse surface for contributors and search crawlers. -- Total tutorials: **188** +- Total tutorials: **191** - Source: `scripts/generate_discoverability_assets.py` ## A @@ -44,6 +44,8 @@ This page is auto-generated from the tutorial index and is intended as a fast br ## B +- [BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/babyagi-tutorial/index.md) + - Learn how to use yoheinakajima/babyagi for autonomous task generation, execution, and prioritization—the foundational agent loop that started the autonomous AI agent wave. - [Beads Tutorial: Git-Backed Task Graph Memory for Coding Agents](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/beads-tutorial/index.md) - Learn how to use steveyegge/beads to give coding agents durable, dependency-aware task memory with structured issue graphs instead of ad-hoc markdown plans. - [BentoML Tutorial: Building Production-Ready ML Services](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/bentoml-tutorial/index.md) @@ -120,6 +122,8 @@ This page is auto-generated from the tutorial index and is intended as a fast br - Learn how to use daytonaio/daytona to run AI-generated code in isolated sandboxes, integrate coding agents through MCP, and operate sandbox infrastructure with stronger security and resource controls. 
- [Deer Flow Tutorial: Distributed Workflow Orchestration Platform](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/deer-flow-tutorial/index.md) - Orchestrate complex distributed workflows with Deer Flow's powerful task coordination and execution platform. +- [Devika Tutorial: Open-Source Autonomous AI Software Engineer](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/devika-tutorial/index.md) + - Learn how to deploy and operate stitionai/devika — a multi-agent autonomous coding system that plans, researches, writes, and debugs code end-to-end. - [Dify Platform: Deep Dive Tutorial](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dify-platform-deep-dive/index.md) - Dify — An open-source LLM application development platform for building workflows, RAG pipelines, and AI agents with a visual interface. - [DSPy Tutorial: Programming Language Models](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dspy-tutorial/index.md) @@ -192,6 +196,8 @@ This page is auto-generated from the tutorial index and is intended as a fast br - Learn how to use Kilo-Org/kilocode for high-throughput coding workflows with multi-mode operation, agent-loop controls, and extensible CLI/IDE integration. - [Kimi CLI Tutorial: Multi-Mode Terminal Agent with MCP and ACP](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kimi-cli-tutorial/index.md) - Learn how to use MoonshotAI/kimi-cli to run an interactive terminal coding agent with configurable modes, MCP integrations, and ACP-based IDE connectivity. +- [Kiro Tutorial: Spec-Driven Agentic IDE from AWS](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kiro-tutorial/index.md) + - Learn how to use kirodotdev/Kiro for structured AI-powered development with spec-driven workflows, agent steering, event-driven automation, and AWS-native integrations. 
- [Kubernetes Operator Patterns: Building Production-Grade Controllers](https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kubernetes-operator-patterns/index.md) - Master Kubernetes Operators with hands-on Go implementation using the Operator SDK and controller-runtime library for enterprise application management. diff --git a/discoverability/tutorial-index.json b/discoverability/tutorial-index.json index 361e7353..d105a5c0 100644 --- a/discoverability/tutorial-index.json +++ b/discoverability/tutorial-index.json @@ -1,6 +1,6 @@ { "project": "awesome-code-docs", - "tutorial_count": 188, + "tutorial_count": 191, "tutorials": [ { "cluster": "ai-app-frameworks", @@ -543,6 +543,35 @@ "summary": "Learn how to use awslabs/mcp to compose, run, and govern AWS-focused MCP servers across development, infrastructure, data, and operations workflows.", "title": "awslabs/mcp Tutorial: Operating a Large-Scale MCP Server Ecosystem for AWS Workloads" }, + { + "cluster": "ai-coding-agents", + "file_url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/babyagi-tutorial/index.md", + "index_path": "tutorials/babyagi-tutorial/index.md", + "intent_signals": [ + "agentic-coding" + ], + "keywords": [ + "babyagi", + "original", + "autonomous", + "task", + "agent", + "framework", + "yoheinakajima", + "generation", + "execution", + "prioritization", + "foundational", + "loop", + "started", + "wave" + ], + "path": "tutorials/babyagi-tutorial", + "repo_url": "https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/babyagi-tutorial", + "slug": "babyagi-tutorial", + "summary": "Learn how to use yoheinakajima/babyagi for autonomous task generation, execution, and prioritization\u2014the foundational agent loop that started the autonomous AI agent wave.", + "title": "BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework" + }, { "cluster": "ai-coding-agents", "file_url": 
"https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/beads-tutorial/index.md", @@ -1629,6 +1658,40 @@ "summary": "Orchestrate complex distributed workflows with Deer Flow's powerful task coordination and execution platform.", "title": "Deer Flow Tutorial: Distributed Workflow Orchestration Platform" }, + { + "cluster": "ai-coding-agents", + "file_url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/devika-tutorial/index.md", + "index_path": "tutorials/devika-tutorial/index.md", + "intent_signals": [ + "production-operations", + "agentic-coding" + ], + "keywords": [ + "devika", + "open", + "source", + "autonomous", + "software", + "engineer", + "deploy", + "operate", + "stitionai", + "multi", + "agent", + "coding", + "plans", + "researches", + "writes", + "debugs", + "code", + "end" + ], + "path": "tutorials/devika-tutorial", + "repo_url": "https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/devika-tutorial", + "slug": "devika-tutorial", + "summary": "Learn how to deploy and operate stitionai/devika \u2014 a multi-agent autonomous coding system that plans, researches, writes, and debugs code end-to-end.", + "title": "Devika Tutorial: Open-Source Autonomous AI Software Engineer" + }, { "cluster": "ai-app-frameworks", "file_url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dify-platform-deep-dive/index.md", @@ -2417,6 +2480,38 @@ "summary": "Learn how to use MoonshotAI/kimi-cli to run an interactive terminal coding agent with configurable modes, MCP integrations, and ACP-based IDE connectivity.", "title": "Kimi CLI Tutorial: Multi-Mode Terminal Agent with MCP and ACP" }, + { + "cluster": "ai-coding-agents", + "file_url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kiro-tutorial/index.md", + "index_path": "tutorials/kiro-tutorial/index.md", + "intent_signals": [ + "agentic-coding" + ], + "keywords": [ + "kiro", + "spec", + "driven", + "agentic", + "ide", + "aws", + "kirodotdev", + 
"structured", + "powered", + "development", + "workflows", + "agent", + "steering", + "event", + "automation", + "native", + "integrations" + ], + "path": "tutorials/kiro-tutorial", + "repo_url": "https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/kiro-tutorial", + "slug": "kiro-tutorial", + "summary": "Learn how to use kirodotdev/Kiro for structured AI-powered development with spec-driven workflows, agent steering, event-driven automation, and AWS-native integrations.", + "title": "Kiro Tutorial: Spec-Driven Agentic IDE from AWS" + }, { "cluster": "systems-and-internals", "file_url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kubernetes-operator-patterns/index.md", diff --git a/discoverability/tutorial-itemlist.schema.json b/discoverability/tutorial-itemlist.schema.json index 03840767..633830b6 100644 --- a/discoverability/tutorial-itemlist.schema.json +++ b/discoverability/tutorial-itemlist.schema.json @@ -128,1198 +128,1219 @@ "position": 18, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/awslabs-mcp-tutorial/index.md" }, + { + "@type": "ListItem", + "description": "Learn how to use yoheinakajima/babyagi for autonomous task generation, execution, and prioritization\u2014the foundational agent loop that started the autonomous AI agent wave.", + "name": "BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework", + "position": 19, + "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/babyagi-tutorial/index.md" + }, { "@type": "ListItem", "description": "Learn how to use steveyegge/beads to give coding agents durable, dependency-aware task memory with structured issue graphs instead of ad-hoc markdown plans.", "name": "Beads Tutorial: Git-Backed Task Graph Memory for Coding Agents", - "position": 19, + "position": 20, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/beads-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical 
walkthrough of BentoML covering Building Production-Ready ML Services.", "name": "BentoML Tutorial: Building Production-Ready ML Services", - "position": 20, + "position": 21, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/bentoml-tutorial/index.md" }, { "@type": "ListItem", "description": "A production-focused deep dive into stackblitz-labs/bolt.diy: architecture, provider routing, safe edit loops, MCP integrations, deployment choices, and operational governance.", "name": "bolt.diy Tutorial: Build and Operate an Open Source AI App Builder", - "position": 21, + "position": 22, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/bolt-diy-tutorial/index.md" }, { "@type": "ListItem", "description": "This comprehensive tutorial will guide you through Botpress, a powerful open source platform for building conversational AI applications", "name": "Botpress Tutorial: Open Source Conversational AI Platform", - "position": 22, + "position": 23, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/botpress-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use browser-use/browser-use to build agents that can navigate websites, execute workflows, and run reliable browser automation in production.", "name": "Browser Use Tutorial: AI-Powered Web Automation Agents", - "position": 23, + "position": 24, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/browser-use-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Chatbox covering Building Modern AI Chat Interfaces.", "name": "Chatbox Tutorial: Building Modern AI Chat Interfaces", - "position": 24, + "position": 25, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/chatbox-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use CherryHQ/cherry-studio to run multi-provider AI workflows, manage assistants, and integrate MCP tools in a 
desktop-first productivity environment.", "name": "Cherry Studio Tutorial: Multi-Provider AI Desktop Workspace for Teams", - "position": 25, + "position": 26, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/cherry-studio-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of ChromaDB covering Building AI-Native Vector Databases.", "name": "ChromaDB Tutorial: Building AI-Native Vector Databases", - "position": 26, + "position": 27, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/chroma-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use ChromeDevTools/chrome-devtools-mcp to give coding agents reliable browser control, performance tracing, and deep debugging capabilities.", "name": "Chrome DevTools MCP Tutorial: Browser Automation and Debugging for Coding Agents", - "position": 27, + "position": 28, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/chrome-devtools-mcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use campfirein/cipher as a memory-centric MCP-enabled layer that preserves and shares coding context across IDEs, agents, and teams.", "name": "Cipher Tutorial: Shared Memory Layer for Coding Agents", - "position": 28, + "position": 29, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/cipher-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use musistudio/claude-code-router to route Claude Code workloads across multiple model providers with configurable routing rules, transformers, presets, and operational controls.", "name": "Claude Code Router Tutorial: Multi-Provider Routing and Control Plane for Claude Code", - "position": 29, + "position": 30, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-code-router-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use anthropics/claude-code for 
codebase understanding, multi-file edits, command execution, git workflows, and MCP-based extension.", "name": "Claude Code Tutorial: Agentic Coding from Your Terminal", - "position": 30, + "position": 31, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-code-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use ruvnet/claude-flow to orchestrate multi-agent workflows, operate MCP/CLI surfaces, and reason about V2-to-V3 architecture and migration tradeoffs.", "name": "Claude Flow Tutorial: Multi-Agent Orchestration, MCP Tooling, and V3 Module Architecture", - "position": 31, + "position": 32, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-flow-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use thedotmack/claude-mem to capture, compress, and retrieve coding-session memory with hook-driven automation, searchable context layers, and operator controls.", "name": "Claude-Mem Tutorial: Persistent Memory Compression for Claude Code", - "position": 32, + "position": 33, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-mem-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use anthropics/claude-plugins-official to discover, evaluate, install, and contribute Claude Code plugins with clear directory standards and plugin safety practices.", "name": "Claude Plugins Official Tutorial: Anthropic's Managed Plugin Directory", - "position": 33, + "position": 34, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-plugins-official-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn from Anthropic's official quickstart projects to build deployable applications with Claude API, including customer support, data analysis, browser automation, and autonomous coding.", "name": "Claude Quickstarts Tutorial: Production Integration Patterns", - "position": 34, + "position": 35, "url": 
"https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-quickstarts-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use smtg-ai/claude-squad to run and manage multiple coding-agent sessions across isolated workspaces with tmux and git worktrees.", "name": "Claude Squad Tutorial: Multi-Agent Terminal Session Orchestration", - "position": 35, + "position": 36, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-squad-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Claude Task Master covering AI-Powered Task Management for Developers.", "name": "Claude Task Master Tutorial: AI-Powered Task Management for Developers", - "position": 36, + "position": 37, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/claude-task-master-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of ClickHouse covering High-Performance Analytical Database.", "name": "ClickHouse Tutorial: High-Performance Analytical Database", - "position": 37, + "position": 38, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/clickhouse-tutorial/index.md" }, { "@type": "ListItem", "description": "A practical engineering guide to cline/cline: install, operate, and govern Cline across local development and team environments.", "name": "Cline Tutorial: Agentic Coding with Human Control", - "position": 38, + "position": 39, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/cline-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use moazbuilds/CodeMachine-CLI to orchestrate repeatable coding-agent workflows with multi-agent coordination, context control, and long-running execution.", "name": "CodeMachine CLI Tutorial: Orchestrating Long-Running Coding Agent Workflows", - "position": 39, + "position": 40, "url": 
"https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/codemachine-cli-tutorial/index.md" }, { "@type": "ListItem", "description": "Design and operate a production-grade code analysis platform with parsing, symbol resolution, code intelligence features, LSP integration, and rollout governance.", "name": "Codex Analysis Platform Tutorial: Build Code Intelligence Systems", - "position": 40, + "position": 41, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/codex-analysis-platform/index.md" }, { "@type": "ListItem", "description": "Learn how to use openai/codex to run a lightweight coding agent locally, with strong controls for auth, configuration, MCP integration, and sandboxed execution.", "name": "Codex CLI Tutorial: Local Terminal Agent Workflows with OpenAI Codex", - "position": 41, + "position": 42, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/codex-cli-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of ComfyUI covering Mastering AI Image Generation Workflows.", "name": "ComfyUI Tutorial: Mastering AI Image Generation Workflows", - "position": 42, + "position": 43, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/comfyui-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use ComposioHQ/composio to connect agents to 800+ toolkits with session-aware discovery, robust authentication flows, provider integrations, MCP support, and event-trigger automation.", "name": "Composio Tutorial: Production Tool and Authentication Infrastructure for AI Agents", - "position": 43, + "position": 44, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/composio-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use EveryInc/compound-engineering-plugin to run compound engineering workflows in Claude Code and convert plugin assets for other coding-agent ecosystems.", "name": "Compound 
Engineering Plugin Tutorial: Compounding Agent Workflows Across Toolchains", - "position": 44, + "position": 45, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/compound-engineering-plugin-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use upstash/context7 to inject up-to-date, version-aware library docs into Claude Code, Cursor, and other MCP-capable coding agents.", "name": "Context7 Tutorial: Live Documentation Context for Coding Agents", - "position": 45, + "position": 46, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/context7-tutorial/index.md" }, { "@type": "ListItem", "description": "A practical guide to continuedev/continue, covering IDE usage, headless/CLI workflows, model configuration, team collaboration, and enterprise operations.", "name": "Continue Tutorial: Open-Source AI Coding Agents for IDE and CLI", - "position": 46, + "position": 47, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/continue-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use github/copilot-cli to run Copilot's coding agent directly from the terminal with GitHub-native context, approval controls, and extensibility through MCP and LSP.", "name": "GitHub Copilot CLI Tutorial: Copilot Agent Workflows in the Terminal", - "position": 47, + "position": 48, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/copilot-cli-tutorial/index.md" }, { "@type": "ListItem", "description": "Create in-app AI assistants, chatbots, and agentic UIs with the open-source CopilotKit framework.", "name": "CopilotKit Tutorial: Building AI Copilots for React Applications", - "position": 48, + "position": 49, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/copilotkit-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/create-python-server to scaffold Python MCP servers with minimal setup, 
template-driven primitives, and publish-ready packaging workflows.", "name": "Create Python Server Tutorial: Scaffold and Ship MCP Servers with uvx", - "position": 49, + "position": 50, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/create-python-server-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/create-typescript-server to scaffold MCP server projects quickly, understand generated template structure, and operate build/debug workflows safely in archived-tooling environments.", "name": "Create TypeScript Server Tutorial: Scaffold MCP Servers with TypeScript Templates", - "position": 50, + "position": 51, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/create-typescript-server-tutorial/index.md" }, { "@type": "ListItem", "description": "CrewAI View Repo is a framework for orchestrating role-based AI agent teams that collaborate to accomplish complex tasks. It provides a structured approach to creating AI crews with specialized agents, tools, and processes, enabling sophisticated multi-agent workflows and collaborative problem-solving.", "name": "CrewAI Tutorial: Building Collaborative AI Agent Teams", - "position": 51, + "position": 52, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/crewai-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use charmbracelet/crush for terminal-native coding workflows with flexible model providers, LSP/MCP integrations, and production-grade controls.", "name": "Crush Tutorial: Multi-Model Terminal Coding Agent with Strong Extensibility", - "position": 52, + "position": 53, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/crush-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use daytonaio/daytona to run AI-generated code in isolated sandboxes, integrate coding agents through MCP, and operate sandbox infrastructure with stronger 
security and resource controls.", "name": "Daytona Tutorial: Secure Sandbox Infrastructure for AI-Generated Code", - "position": 53, + "position": 54, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/daytona-tutorial/index.md" }, { "@type": "ListItem", "description": "Orchestrate complex distributed workflows with Deer Flow's powerful task coordination and execution platform.", "name": "Deer Flow Tutorial: Distributed Workflow Orchestration Platform", - "position": 54, + "position": 55, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/deer-flow-tutorial/index.md" }, + { + "@type": "ListItem", + "description": "Learn how to deploy and operate stitionai/devika \u2014 a multi-agent autonomous coding system that plans, researches, writes, and debugs code end-to-end.", + "name": "Devika Tutorial: Open-Source Autonomous AI Software Engineer", + "position": 56, + "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/devika-tutorial/index.md" + }, { "@type": "ListItem", "description": "Dify \u2014 An open-source LLM application development platform for building workflows, RAG pipelines, and AI agents with a visual interface.", "name": "Dify Platform: Deep Dive Tutorial", - "position": 55, + "position": 57, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dify-platform-deep-dive/index.md" }, { "@type": "ListItem", "description": "Learn to program language models declaratively with DSPy, the Stanford NLP framework for systematic prompt optimization and modular LLM pipelines.", "name": "DSPy Tutorial: Programming Language Models", - "position": 56, + "position": 58, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dspy-tutorial/index.md" }, { "@type": "ListItem", "description": "A practical guide to dyad-sh/dyad, focused on local-first app generation, integration patterns, validation loops, and deployment readiness.", "name": "Dyad Tutorial: Local-First AI App 
Building", - "position": 57, + "position": 59, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dyad-tutorial/index.md" }, { "@type": "ListItem", "description": "ElizaOS \u2014 Autonomous agents for everyone.", "name": "ElizaOS: Deep Dive Tutorial", - "position": 58, + "position": 60, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/elizaos-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use affaan-m/everything-claude-code to adopt battle-tested Claude Code agents, skills, hooks, commands, rules, and MCP workflows in a structured, production-oriented way.", "name": "Everything Claude Code Tutorial: Production Configuration Patterns for Claude Code", - "position": 59, + "position": 61, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/everything-claude-code-tutorial/index.md" }, { "@type": "ListItem", "description": "Enhance human capabilities with Fabric's modular framework for AI-powered cognitive assistance and task automation.", "name": "Fabric Tutorial: Open-Source Framework for Augmenting Humans with AI", - "position": 60, + "position": 62, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/fabric-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use jlowin/fastmcp to design, run, test, and deploy MCP servers and clients with practical transport, integration, auth, and operations patterns.", "name": "FastMCP Tutorial: Building and Operating MCP Servers with Pythonic Control", - "position": 61, + "position": 63, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/fastmcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use GLips/Figma-Context-MCP (Framelink MCP for Figma) to give coding agents structured design context for higher-fidelity implementation.", "name": "Figma Context MCP Tutorial: Design-to-Code Workflows for Coding Agents", - "position": 62, + "position": 64, 
"url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/figma-context-mcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use firecrawl/firecrawl-mcp-server to add robust web scraping, crawling, search, and extraction capabilities to MCP-enabled coding and research agents.", "name": "Firecrawl MCP Server Tutorial: Web Scraping and Search Tools for MCP Clients", - "position": 63, + "position": 65, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/firecrawl-mcp-server-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of Firecrawl Tutorial: Building LLM-Ready Web Scraping and Data Extraction Systems.", "name": "Firecrawl Tutorial: Building LLM-Ready Web Scraping and Data Extraction Systems", - "position": 64, + "position": 66, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/firecrawl-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use fireproof-storage/fireproof to build local-first, encrypted, sync-capable applications with a unified browser/Node/Deno API and React hooks.", "name": "Fireproof Tutorial: Local-First Document Database for AI-Native Apps", - "position": 65, + "position": 67, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/fireproof-tutorial/index.md" }, { "@type": "ListItem", "description": "Flowise \u2014 An open-source visual tool for building LLM workflows with a drag-and-drop interface.", "name": "Flowise LLM Orchestration: Deep Dive Tutorial", - "position": 66, + "position": 68, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/flowise-llm-orchestration/index.md" }, { "@type": "ListItem", "description": "Learn how to use google-gemini/gemini-cli to run coding and operations workflows in terminal-first loops with strong tooling, MCP extensibility, headless automation, and safety controls.", "name": "Gemini CLI Tutorial: Terminal-First Agent Workflows 
with Google Gemini", - "position": 67, + "position": 69, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/gemini-cli-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use googleapis/genai-toolbox to expose database tools through MCP and native SDK paths, with stronger configuration discipline, deployment options, and observability controls.", "name": "GenAI Toolbox Tutorial: MCP-First Database Tooling with Config-Driven Control Planes", - "position": 68, + "position": 70, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/genai-toolbox-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use github/github-mcp-server to connect coding agents directly to repositories, issues, pull requests, actions, and code security workflows with stronger control.", "name": "GitHub MCP Server Tutorial: Production GitHub Operations Through MCP", - "position": 69, + "position": 71, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/github-mcp-server-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use block/goose to automate coding workflows with controlled tool execution, strong provider flexibility, and production-ready operations.", "name": "Goose Tutorial: Extensible Open-Source AI Agent for Real Engineering Work", - "position": 70, + "position": 72, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/goose-tutorial/index.md" }, { "@type": "ListItem", "description": "A comprehensive guide to understanding, building, and deploying open-source GPT implementations -- from nanoGPT to GPT-NeoX and beyond.", "name": "GPT Open Source: Deep Dive Tutorial", - "position": 71, + "position": 73, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/gpt-oss-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use gptme/gptme to run a local-first coding and knowledge-work agent with strong CLI 
ergonomics, extensible tools, and automation-friendly modes.", "name": "gptme Tutorial: Open-Source Terminal Agent for Local Tool-Driven Work", - "position": 72, + "position": 74, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/gptme-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn tiann/hapi, a local-first hub that lets you run Claude Code/Codex/Gemini/OpenCode sessions locally while controlling and approving them remotely.", "name": "HAPI Tutorial: Remote Control for Local AI Coding Sessions", - "position": 73, + "position": 75, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/hapi-tutorial/index.md" }, { "@type": "ListItem", "description": "Haystack \u2014 An open-source framework for building production-ready LLM applications, RAG pipelines, and intelligent search systems.", "name": "Haystack: Deep Dive Tutorial", - "position": 74, + "position": 76, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/haystack-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of HuggingFace Transformers covering Building State-of-the-Art AI Models.", "name": "HuggingFace Transformers Tutorial: Building State-of-the-Art AI Models", - "position": 75, + "position": 77, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/huggingface-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use humanlayer/humanlayer patterns to orchestrate coding agents with stronger context control, human oversight, and team-scale workflows.", "name": "HumanLayer Tutorial: Context Engineering and Human-Governed Coding Agents", - "position": 76, + "position": 78, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/humanlayer-tutorial/index.md" }, { "@type": "ListItem", "description": "Get reliable, typed responses from LLMs with Pydantic validation.", "name": "Instructor Tutorial: Structured LLM Outputs", - 
"position": 77, + "position": 79, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/instructor-tutorial/index.md" }, { "@type": "ListItem", "description": "Khoj \u2014 An open-source, self-hostable AI personal assistant that connects to your notes, documents, and online data.", "name": "Khoj AI: Deep Dive Tutorial", - "position": 78, + "position": 80, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/khoj-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use Kilo-Org/kilocode for high-throughput coding workflows with multi-mode operation, agent-loop controls, and extensible CLI/IDE integration.", "name": "Kilo Code Tutorial: Agentic Engineering from IDE and CLI Surfaces", - "position": 79, + "position": 81, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kilocode-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use MoonshotAI/kimi-cli to run an interactive terminal coding agent with configurable modes, MCP integrations, and ACP-based IDE connectivity.", "name": "Kimi CLI Tutorial: Multi-Mode Terminal Agent with MCP and ACP", - "position": 80, + "position": 82, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kimi-cli-tutorial/index.md" }, + { + "@type": "ListItem", + "description": "Learn how to use kirodotdev/Kiro for structured AI-powered development with spec-driven workflows, agent steering, event-driven automation, and AWS-native integrations.", + "name": "Kiro Tutorial: Spec-Driven Agentic IDE from AWS", + "position": 83, + "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kiro-tutorial/index.md" + }, { "@type": "ListItem", "description": "Master Kubernetes Operators with hands-on Go implementation using the Operator SDK and controller-runtime library for enterprise application management.", "name": "Kubernetes Operator Patterns: Building Production-Grade Controllers", - "position": 81, + 
"position": 84, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kubernetes-operator-patterns/index.md" }, { "@type": "ListItem", "description": "Master LanceDB, the open-source serverless vector database designed for AI applications, RAG systems, and semantic search.", "name": "LanceDB Tutorial: Serverless Vector Database for AI", - "position": 82, + "position": 85, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/lancedb-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of LangChain Architecture: Internal Design Deep Dive.", "name": "LangChain Architecture: Internal Design Deep Dive", - "position": 83, + "position": 86, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/langchain-architecture-guide/index.md" }, { "@type": "ListItem", "description": "Pydantic 2 Required: LangChain v0.3 fully migrated to Pydantic 2. Code using langchain_core.pydantic_v1 should be updated to native Pydantic 2 syntax.", "name": "LangChain Tutorial: Building AI Applications with Large Language Models", - "position": 84, + "position": 87, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/langchain-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to build, deploy, and operate agent workflows with langflow-ai/langflow, including visual flow composition, API/MCP deployment, and production reliability controls.", "name": "Langflow Tutorial: Visual AI Agent and Workflow Platform", - "position": 85, + "position": 88, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/langflow-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use langfuse/langfuse to trace, evaluate, and improve production LLM systems with structured observability workflows.", "name": "Langfuse Tutorial: LLM Observability, Evaluation, and Prompt Operations", - "position": 86, + "position": 89, "url": 
"https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/langfuse-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of LangGraph covering Building Stateful Multi-Actor Applications.", "name": "LangGraph Tutorial: Building Stateful Multi-Actor Applications", - "position": 87, + "position": 90, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/langgraph-tutorial/index.md" }, { "@type": "ListItem", "description": "Build AI agents with persistent memory using the framework formerly known as MemGPT.", "name": "Letta Tutorial: Stateful LLM Agents", - "position": 88, + "position": 91, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/letta-tutorial/index.md" }, { "@type": "ListItem", "description": "Build provider-agnostic LLM applications with BerriAI/litellm, including routing, fallbacks, proxy deployment, and cost-aware operations.", "name": "LiteLLM Tutorial: Unified LLM Gateway and Routing Layer", - "position": 89, + "position": 92, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/litellm-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of Liveblocks - Real-Time Collaboration Deep Dive.", "name": "Liveblocks - Real-Time Collaboration Deep Dive", - "position": 90, + "position": 93, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/liveblocks-tutorial/index.md" }, { "@type": "ListItem", "description": "Run large language models efficiently on your local machine with pure C/C++.", "name": "llama.cpp Tutorial: Local LLM Inference", - "position": 91, + "position": 94, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/llama-cpp-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of LLaMA-Factory covering Unified Framework for LLM Training and Fine-tuning.", "name": "LLaMA-Factory Tutorial: Unified Framework for LLM Training and 
Fine-tuning", - "position": 92, + "position": 95, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/llama-factory-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of LlamaIndex covering Building Advanced RAG Systems and Data Frameworks.", "name": "LlamaIndex Tutorial: Building Advanced RAG Systems and Data Frameworks", - "position": 93, + "position": 96, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/llamaindex-tutorial/index.md" }, { "@type": "ListItem", "description": "LobeChat \u2014 An open-source, modern-design AI chat framework for building private LLM applications.", "name": "LobeChat AI Platform: Deep Dive Tutorial", - "position": 94, + "position": 97, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/lobechat-ai-platform/index.md" }, { "@type": "ListItem", "description": "Run LLMs, image generation, and audio models locally with an OpenAI-compatible API.", "name": "LocalAI Tutorial: Self-Hosted OpenAI Alternative", - "position": 95, + "position": 98, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/localai-tutorial/index.md" }, { "@type": "ListItem", "description": "Logseq \u2014 A privacy-first, local-first knowledge management platform with block-based editing and graph visualization.", "name": "Logseq: Deep Dive Tutorial", - "position": 96, + "position": 99, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/logseq-knowledge-management/index.md" }, { "@type": "ListItem", "description": "Learn how to build production AI applications with mastra-ai/mastra, including agents, workflows, memory, MCP tooling, and reliability operations.", "name": "Mastra Tutorial: TypeScript Framework for AI Agents and Workflows", - "position": 97, + "position": 100, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mastra-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to 
use hangwin/mcp-chrome to expose browser automation, content analysis, and semantic tab search tools to MCP clients.", "name": "MCP Chrome Tutorial: Control Your Real Chrome Browser Through MCP", - "position": 98, + "position": 101, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-chrome-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to build and operate MCP clients and servers with modelcontextprotocol/csharp-sdk, including package choices, auth patterns, tasks, diagnostics, and versioning strategy.", "name": "MCP C# SDK Tutorial: Production MCP in .NET with Hosting, ASP.NET Core, and Task Workflows", - "position": 99, + "position": 102, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-csharp-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/docs as an archived reference, map its conceptual guides, and migrate documentation workflows to the canonical modelcontextprotocol/modelcontextprotocol docs location.", "name": "MCP Docs Repo Tutorial: Navigating the Archived MCP Documentation Repository", - "position": 100, + "position": 103, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-docs-repo-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/ext-apps to build interactive MCP Apps, wire host bridges, secure UI resources, and run reliable testing and migration workflows.", "name": "MCP Ext Apps Tutorial: Building Interactive MCP Apps and Hosts", - "position": 101, + "position": 104, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-ext-apps-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/go-sdk for production MCP workloads across stdio and streamable HTTP, including auth middleware, conformance, and upgrade planning.", "name": "MCP Go SDK Tutorial: Building Robust MCP Clients and 
Servers in Go", - "position": 102, + "position": 105, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-go-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/inspector to test MCP servers across stdio, SSE, and streamable HTTP, with safer auth defaults and repeatable CLI automation.", "name": "MCP Inspector Tutorial: Debugging and Validating MCP Servers", - "position": 103, + "position": 106, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-inspector-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/java-sdk across core Java and Spring stacks, from transport setup to conformance and production hardening.", "name": "MCP Java SDK Tutorial: Building MCP Clients and Servers with Reactor, Servlet, and Spring", - "position": 104, + "position": 107, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-java-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to implement MCP client/server workflows with modelcontextprotocol/kotlin-sdk, including module boundaries, transport choices, capability negotiation, and production lifecycle controls.", "name": "MCP Kotlin SDK Tutorial: Building Multiplatform MCP Clients and Servers", - "position": 105, + "position": 108, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-kotlin-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to implement MCP server workflows with modelcontextprotocol/php-sdk, including attribute discovery, manual capability registration, transport strategy, session storage, and framework integration patterns.", "name": "MCP PHP SDK Tutorial: Building MCP Servers in PHP with Discovery and Transport Flexibility", - "position": 106, + "position": 109, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-php-sdk-tutorial/index.md" }, { "@type": 
"ListItem", "description": "Master the Model Context Protocol Python SDK to build custom tool servers that extend Claude and other LLMs with powerful capabilities.", "name": "MCP Python SDK Tutorial: Building AI Tool Servers", - "position": 107, + "position": 110, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-python-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/quickstart-resources as a practical reference for multi-language MCP server/client implementations, protocol smoke testing, and onboarding workflows.", "name": "MCP Quickstart Resources Tutorial: Cross-Language MCP Servers and Clients by Example", - "position": 108, + "position": 111, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-quickstart-resources-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how modelcontextprotocol/registry works end to end: publishing authenticated server metadata, consuming the API as an aggregator, and operating registry infrastructure safely.", "name": "MCP Registry Tutorial: Publishing, Discovery, and Governance for MCP Servers", - "position": 109, + "position": 112, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-registry-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to implement MCP server/client workflows with modelcontextprotocol/ruby-sdk, including tool/prompt/resource registration, streamable HTTP sessions, structured logging, and release operations.", "name": "MCP Ruby SDK Tutorial: Building MCP Servers and Clients in Ruby", - "position": 110, + "position": 113, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-ruby-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/rust-sdk (rmcp) for production MCP clients and servers with strong transport control, macro-driven tooling, OAuth, and async task workflows.", 
"name": "MCP Rust SDK Tutorial: Building High-Performance MCP Services with RMCP", - "position": 111, + "position": 114, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-rust-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use the official MCP reference servers as implementation blueprints, not drop-in production services.", "name": "MCP Servers Tutorial: Reference Implementations and Patterns", - "position": 112, + "position": 115, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-servers-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn the current Model Context Protocol directly from modelcontextprotocol/modelcontextprotocol, including lifecycle, transports, security, authorization, and governance workflows.", "name": "MCP Specification Tutorial: Designing Production-Grade MCP Clients and Servers From the Source of Truth", - "position": 113, + "position": 116, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-specification-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to implement MCP client and server workflows with modelcontextprotocol/swift-sdk, including transport options, sampling, batching, and graceful service lifecycle control.", "name": "MCP Swift SDK Tutorial: Building MCP Clients and Servers in Swift", - "position": 114, + "position": 117, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-swift-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/typescript-sdk to build production MCP clients and servers, migrate from v1 to v2 safely, and validate behavior with conformance workflows.", "name": "MCP TypeScript SDK Tutorial: Building and Migrating MCP Clients and Servers in TypeScript", - "position": 115, + "position": 118, "url": 
"https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-typescript-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how mcp-use/mcp-use composes agent, client, server, and inspector workflows across Python and TypeScript with practical security and operations patterns.", "name": "MCP Use Tutorial: Full-Stack MCP Development Across Agents, Clients, Servers, and Inspector", - "position": 116, + "position": 119, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcp-use-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/mcpb to package local MCP servers into signed .mcpb bundles with manifest metadata, CLI workflows, and distribution-ready operational controls.", "name": "MCPB Tutorial: Packaging and Distributing Local MCP Servers as Bundles", - "position": 117, + "position": 120, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mcpb-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of MeiliSearch covering Lightning Fast Search Engine.", "name": "MeiliSearch Tutorial: Lightning Fast Search Engine", - "position": 118, + "position": 121, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/meilisearch-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Mem0 covering Building Production-Ready AI Agents with Scalable Long-Term Memory.", "name": "Mem0 Tutorial: Building Production-Ready AI Agents with Scalable Long-Term Memory", - "position": 119, + "position": 122, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mem0-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use SWE-agent/mini-swe-agent to run compact, high-performing software-engineering agent workflows with minimal scaffolding and strong reproducibility.", "name": "Mini-SWE-Agent Tutorial: Minimal Autonomous Code Agent Design at 
Benchmark Scale", - "position": 120, + "position": 123, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mini-swe-agent-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use mistralai/mistral-vibe for terminal-native coding workflows with configurable agent profiles, skills, subagents, and ACP integrations.", "name": "Mistral Vibe Tutorial: Minimal CLI Coding Agent by Mistral", - "position": 121, + "position": 124, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/mistral-vibe-tutorial/index.md" }, { "@type": "ListItem", "description": "Build powerful AI-powered automations with n8n's visual workflow builder.", "name": "n8n AI Tutorial: Workflow Automation with AI", - "position": 122, + "position": 125, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/n8n-ai-tutorial/index.md" }, { "@type": "ListItem", "description": "n8n \u2014 Visual workflow automation with Model Context Protocol (MCP) integration for AI-powered tool use.", "name": "n8n Model Context Protocol: Deep Dive Tutorial", - "position": 123, + "position": 126, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/n8n-mcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how Nano-Collective/nanocoder implements local-first coding-agent workflows, tool execution loops, and multi-provider model integration.", "name": "Nanocoder Tutorial: Building and Understanding AI Coding Agents", - "position": 124, + "position": 127, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/nanocoder-tutorial/index.md" }, { "@type": "ListItem", "description": "NocoDB \u2014 An open-source Airtable alternative that turns any database into a smart spreadsheet.", "name": "NocoDB: Deep Dive Tutorial", - "position": 125, + "position": 128, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/nocodb-database-platform/index.md" }, { "@type": "ListItem", 
"description": "Obsidian Outliner \u2014 A plugin that adds outliner-style editing behaviors to Obsidian, demonstrating advanced plugin architecture patterns.", "name": "Obsidian Outliner Plugin: Deep Dive Tutorial", - "position": 126, + "position": 129, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/obsidian-outliner-plugin/index.md" }, { "@type": "ListItem", "description": "Learn how to use ollama/ollama for local model execution, customization, embeddings/RAG, integration, and production deployment.", "name": "Ollama Tutorial: Running and Serving LLMs Locally", - "position": 127, + "position": 130, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/ollama-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use onlook-dev/onlook to design and edit production-grade React apps visually while keeping generated code in your repository.", "name": "Onlook Tutorial: Visual-First AI Coding for Next.js and Tailwind", - "position": 128, + "position": 131, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/onlook-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use winfunc/opcode to manage Claude Code projects, sessions, agents, MCP servers, and checkpoints from a desktop-first operating interface.", "name": "Opcode Tutorial: GUI Command Center for Claude Code Workflows", - "position": 129, + "position": 132, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/opcode-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn from langchain-ai/open-swe architecture, workflows, and operational patterns, including how to maintain or migrate from a deprecated codebase.", "name": "Open SWE Tutorial: Asynchronous Cloud Coding Agent Architecture and Migration Playbook", - "position": 130, + "position": 133, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/open-swe-tutorial/index.md" }, { "@type": "ListItem", 
"description": "Learn how to run and operate open-webui/open-webui as a self-hosted AI interface with model routing, RAG workflows, multi-user controls, and production deployment patterns.", "name": "Open WebUI Tutorial: Self-Hosted AI Workspace and Chat Interface", - "position": 131, + "position": 134, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/open-webui-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to build reliable Python integrations with openai/openai-python using Responses-first architecture, migration-safe patterns, and production operations.", "name": "OpenAI Python SDK Tutorial: Production API Patterns", - "position": 132, + "position": 135, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openai-python-sdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to build low-latency voice agents with openai/openai-realtime-agents, including realtime session design, tool orchestration, and production rollout patterns.", "name": "OpenAI Realtime Agents Tutorial: Voice-First AI Systems", - "position": 133, + "position": 136, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openai-realtime-agents-tutorial/index.md" }, { "@type": "ListItem", "description": "Build robust transcription pipelines with Whisper, from local experiments to production deployment.", "name": "OpenAI Whisper Tutorial: Speech Recognition and Translation", - "position": 134, + "position": 137, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openai-whisper-tutorial/index.md" }, { "@type": "ListItem", "description": "Democratize investment research with OpenBB's comprehensive financial data and analysis platform.", "name": "OpenBB Tutorial: Complete Guide to Investment Research Platform", - "position": 135, + "position": 138, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openbb-tutorial/index.md" }, { "@type": "ListItem", 
"description": "OpenClaw \u2014 Your own personal AI assistant. Any OS. Any Platform.", "name": "OpenClaw: Deep Dive Tutorial", - "position": 136, + "position": 139, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openclaw-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn from opencode-ai/opencode architecture and workflows, and migrate safely to actively maintained successors.", "name": "OpenCode AI Legacy Tutorial: Archived Terminal Agent Workflows and Migration to Crush", - "position": 137, + "position": 140, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/opencode-ai-legacy-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use anomalyco/opencode to run terminal-native coding agents with provider flexibility, strong tool control, and production-grade workflows.", "name": "OpenCode Tutorial: Open-Source Terminal Coding Agent at Scale", - "position": 138, + "position": 141, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/opencode-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to operate OpenHands/OpenHands across local GUI, CLI, and SDK workflows with production-minded safety, validation, and integration patterns.", "name": "OpenHands Tutorial: Autonomous Software Engineering Workflows", - "position": 139, + "position": 142, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openhands-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use numman-ali/openskills to install, synchronize, and operate reusable SKILL.md packs across Claude Code, Cursor, Codex, Aider, and other agent environments.", "name": "OpenSkills Tutorial: Universal Skill Loading for Coding Agents", - "position": 140, + "position": 143, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openskills-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use Fission-AI/OpenSpec 
to make AI-assisted software delivery more predictable with artifact-driven planning, implementation, and archival workflows.", "name": "OpenSpec Tutorial: Spec-Driven Workflows for AI Coding Agents", - "position": 141, + "position": 144, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/openspec-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use vercel-labs/opensrc to fetch package and repository source code so coding agents can reason about implementation details, not only public types and docs.", "name": "OpenSrc Tutorial: Deep Source Context for Coding Agents", - "position": 142, + "position": 145, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/opensrc-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Outlines covering Structured Text Generation with LLMs.", "name": "Outlines Tutorial: Structured Text Generation with LLMs", - "position": 143, + "position": 146, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/outlines-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Perplexica covering AI-Powered Search Engine.", "name": "Perplexica Tutorial: AI-Powered Search Engine", - "position": 144, + "position": 147, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/perplexica-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Phidata covering Building Autonomous AI Agents.", "name": "Phidata Tutorial: Building Autonomous AI Agents", - "position": 145, + "position": 148, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/phidata-tutorial/index.md" }, { "@type": "ListItem", "description": "AI Photo Management Revolution: Enhanced facial recognition, LLM integrations, and advanced organization features mark PhotoPrism's evolution.", "name": "PhotoPrism Tutorial: AI-Powered Photos App", - "position": 146, 
+ "position": 149, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/photoprism-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use plandex-ai/plandex for large codebase tasks with strong context management, cumulative diff review, model packs, and self-hosted operations.", "name": "Plandex Tutorial: Large-Task AI Coding Agent Workflows", - "position": 147, + "position": 150, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/plandex-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use OthmanAdi/planning-with-files to run Manus-style file-based planning workflows across Claude Code and other AI coding environments.", "name": "Planning with Files Tutorial: Persistent Markdown Workflow Memory for AI Coding Agents", - "position": 148, + "position": 151, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/planning-with-files-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use microsoft/playwright-mcp to give AI coding agents structured browser automation with accessibility snapshots, deterministic actions, and portable MCP host integrations.", "name": "Playwright MCP Tutorial: Browser Automation for Coding Agents Through MCP", - "position": 149, + "position": 152, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/playwright-mcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to build agentic applications with The-Pocket/PocketFlow, a minimalist graph framework that still supports workflows, multi-agent patterns, RAG, and human-in-the-loop flows.", "name": "PocketFlow Tutorial: Minimal LLM Framework with Graph-Based Power", - "position": 150, + "position": 153, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/pocketflow-tutorial/index.md" }, { "@type": "ListItem", "description": "Master PostgreSQL's query execution engine, understand EXPLAIN output, and 
optimize complex queries for maximum performance.", "name": "PostgreSQL Query Planner Deep Dive", - "position": 151, + "position": 154, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/postgresql-query-planner/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of PostHog Tutorial: Open Source Product Analytics Platform.", "name": "PostHog Tutorial: Open Source Product Analytics Platform", - "position": 152, + "position": 155, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/posthog-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Pydantic AI covering Type-Safe AI Agent Development.", "name": "Pydantic AI Tutorial: Type-Safe AI Agent Development", - "position": 153, + "position": 156, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/pydantic-ai-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of Quivr Tutorial: Open-Source RAG Framework for Document Ingestion.", "name": "Quivr Tutorial: Open-Source RAG Framework for Document Ingestion", - "position": 154, + "position": 157, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/quivr-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use QwenLM/Qwen-Agent to build production-capable agents with function calling, MCP integration, memory/RAG patterns, and benchmark-aware planning workflows.", "name": "Qwen-Agent Tutorial: Tool-Enabled Agent Framework with MCP, RAG, and Multi-Modal Workflows", - "position": 155, + "position": 158, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/qwen-agent-tutorial/index.md" }, { "@type": "ListItem", "description": "Transform documents into intelligent Q&A systems with RAGFlow's comprehensive RAG (Retrieval-Augmented Generation) platform.", "name": "RAGFlow Tutorial: Complete Guide to Open-Source RAG Engine", - "position": 156, + "position": 
159, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/ragflow-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep dive into React's reconciliation algorithm, the Fiber architecture that powers modern React applications.", "name": "React Fiber Internals", - "position": 157, + "position": 160, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/react-fiber-internals/index.md" }, { "@type": "ListItem", "description": "Learn how to use refly-ai/refly to turn vibe workflows into reusable, versioned agent skills that can run via API, webhook, and CLI integrations.", "name": "Refly Tutorial: Build Deterministic Agent Skills and Ship Them Across APIs and Claude Code", - "position": 158, + "position": 161, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/refly-tutorial/index.md" }, { "@type": "ListItem", "description": "A production-focused guide to RooCodeInc/Roo-Code: mode design, task execution, checkpoints, MCP, team profiles, and enterprise operations.", "name": "Roo Code Tutorial: Run an AI Dev Team in Your Editor", - "position": 159, + "position": 162, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/roo-code-tutorial/index.md" }, { "@type": "ListItem", "description": "Build enterprise AI applications with Microsoft's SDK for integrating LLMs.", "name": "Semantic Kernel Tutorial: Microsoft's AI Orchestration", - "position": 160, + "position": 163, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/semantic-kernel-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use oraios/serena to give coding agents IDE-grade semantic retrieval and editing tools across large codebases.", "name": "Serena Tutorial: Semantic Code Retrieval Toolkit for Coding Agents", - "position": 161, + "position": 164, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/serena-tutorial/index.md" }, { "@type": "ListItem", 
"description": "Learn how to use shotgun-sh/shotgun to plan, specify, and execute large code changes with structured agent workflows and stronger delivery control.", "name": "Shotgun Tutorial: Spec-Driven Development for Coding Agents", - "position": 162, + "position": 165, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/shotgun-tutorial/index.md" }, { "@type": "ListItem", "description": "Unlock the full potential of large language models with SillyTavern's comprehensive interface for role-playing, creative writing, and AI experimentation.", "name": "SillyTavern Tutorial: Advanced LLM Frontend for Power Users", - "position": 163, + "position": 166, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/sillytavern-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of SiYuan covering Privacy-First Knowledge Management.", "name": "SiYuan Tutorial: Privacy-First Knowledge Management", - "position": 164, + "position": 167, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/siyuan-tutorial/index.md" }, { "@type": "ListItem", "description": "Build efficient AI agents with minimal code using Hugging Face's smolagents library.", "name": "Smolagents Tutorial: Hugging Face's Lightweight Agent Framework", - "position": 165, + "position": 168, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/smolagents-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use stagewise-io/stagewise to connect browser-selected UI context with coding agents, plugin extensions, and multi-agent bridge workflows.", "name": "Stagewise Tutorial: Frontend Coding Agent Workflows in Real Browser Context", - "position": 166, + "position": 169, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/stagewise-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use strands-agents/sdk-python to build lightweight, 
model-driven agents with strong tool abstractions, hooks, and production deployment patterns.", "name": "Strands Agents Tutorial: Model-Driven Agent Systems with Native MCP Support", - "position": 167, + "position": 170, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/strands-agents-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of Supabase Tutorial: Building Modern Backend Applications.", "name": "Supabase Tutorial: Building Modern Backend Applications", - "position": 168, + "position": 171, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/supabase-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of SuperAGI covering Production-Ready Autonomous AI Agents.", "name": "SuperAGI Tutorial: Production-Ready Autonomous AI Agents", - "position": 169, + "position": 172, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/superagi-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use superset-sh/superset to orchestrate many coding agents in parallel with worktree isolation, centralized monitoring, and fast review loops.", "name": "Superset Terminal Tutorial: Command Center for Parallel Coding Agents", - "position": 170, + "position": 173, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/superset-terminal-tutorial/index.md" }, { "@type": "ListItem", "description": "Deep technical walkthrough of OpenAI Swarm Tutorial: Lightweight Multi-Agent Orchestration.", "name": "OpenAI Swarm Tutorial: Lightweight Multi-Agent Orchestration", - "position": 171, + "position": 174, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/swarm-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use SWE-agent/SWE-agent for autonomous software engineering workflows, from single-issue runs to benchmark and research-grade evaluation.", "name": "SWE-agent 
Tutorial: Autonomous Repository Repair and Benchmark-Driven Engineering", - "position": 172, + "position": 175, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/swe-agent-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use sweepai/sweep to turn GitHub issues into pull requests, operate feedback loops, and run self-hosted or CLI workflows with clear guardrails.", "name": "Sweep Tutorial: Issue-to-PR AI Coding Workflows on GitHub", - "position": 173, + "position": 176, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/sweep-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to run and extend TabbyML/tabby for production code completion and team knowledge workflows.", "name": "Tabby Tutorial: Self-Hosted AI Coding Assistant Architecture and Operations", - "position": 174, + "position": 177, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/tabby-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use and maintain taskade/awesome-vibe-coding as a decision system for AI app builders, coding agents, MCP tooling, and Genesis-centered workflows.", "name": "Taskade Awesome Vibe Coding Tutorial: Curating the 2026 AI-Building Landscape", - "position": 175, + "position": 178, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/taskade-awesome-vibe-coding-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how taskade/docs structures product documentation across Genesis, API references, automations, help-center workflows, and release timelines.", "name": "Taskade Docs Tutorial: Operating the Living-DNA Documentation Stack", - "position": 176, + "position": 179, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/taskade-docs-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to run, extend, and operate taskade/mcp to connect Taskade workspaces, tasks, projects, and 
AI agents into MCP-compatible clients.", "name": "Taskade MCP Tutorial: OpenAPI-Driven MCP Server for Taskade Workflows", - "position": 177, + "position": 180, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/taskade-mcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to operate Taskade as an AI-native workspace system: Genesis app generation, AI agents, automations, enterprise controls, and production rollout patterns.", "name": "Taskade Tutorial: AI-Native Workspace, Genesis, and Agentic Operations", - "position": 178, + "position": 181, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/taskade-tutorial/index.md" }, { "@type": "ListItem", "description": "Teable \u2014 A high-performance, multi-dimensional database platform built on PostgreSQL with real-time collaboration.", "name": "Teable: Deep Dive Tutorial", - "position": 179, + "position": 182, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/teable-database-platform/index.md" }, { "@type": "ListItem", "description": "Master tiktoken, OpenAI's fast BPE tokenizer, to accurately count tokens, optimize prompts, and reduce API costs.", "name": "tiktoken Tutorial: OpenAI Token Encoding & Optimization", - "position": 180, + "position": 183, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/tiktoken-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Turborepo covering High-Performance Monorepo Build System.", "name": "Turborepo Tutorial: High-Performance Monorepo Build System", - "position": 181, + "position": 184, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/turborepo-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use modelcontextprotocol/use-mcp to connect React apps to MCP servers with OAuth-aware flows, tool/resource/prompt access, and resilient transport lifecycle handling.", "name": "use-mcp 
Tutorial: React Hook Patterns for MCP Client Integration", - "position": 182, + "position": 185, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/use-mcp-tutorial/index.md" }, { "@type": "ListItem", "description": "Build robust AI product features with vercel/ai, including streaming, structured outputs, tool loops, framework integration, and production deployment patterns.", "name": "Vercel AI SDK Tutorial: Production TypeScript AI Apps and Agents", - "position": 183, + "position": 186, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/vercel-ai-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use BloopAI/vibe-kanban to coordinate Claude Code, Codex, Gemini CLI, and other coding agents through a unified orchestration workspace.", "name": "Vibe Kanban Tutorial: Multi-Agent Orchestration Board for Coding Workflows", - "position": 184, + "position": 187, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/vibe-kanban-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use cloudflare/vibesdk to run a prompt-to-app platform with agent orchestration, preview sandboxes, and production deployment on Cloudflare.", "name": "VibeSDK Tutorial: Build a Vibe-Coding Platform on Cloudflare", - "position": 185, + "position": 188, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/vibesdk-tutorial/index.md" }, { "@type": "ListItem", "description": "Master vLLM for blazing-fast, cost-effective large language model inference with advanced optimization techniques.", "name": "vLLM Tutorial: High-Performance LLM Inference", - "position": 186, + "position": 189, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/vllm-tutorial/index.md" }, { "@type": "ListItem", "description": "A deep technical walkthrough of Whisper.cpp covering High-Performance Speech Recognition in C/C++.", "name": "Whisper.cpp Tutorial: High-Performance Speech 
Recognition in C/C++", - "position": 187, + "position": 190, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/whisper-cpp-tutorial/index.md" }, { "@type": "ListItem", "description": "Learn how to use wshobson/agents to install focused Claude Code plugins, coordinate specialist agents, and run scalable multi-agent workflows with clear model and skill boundaries.", "name": "Wshobson Agents Tutorial: Pluginized Multi-Agent Workflows for Claude Code", - "position": 188, + "position": 191, "url": "https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/wshobson-agents-tutorial/index.md" } ], "name": "Awesome Code Docs Tutorial Catalog", - "numberOfItems": 188, + "numberOfItems": 191, "url": "https://github.com/johnxie/awesome-code-docs" } diff --git a/llms-full.txt b/llms-full.txt index ba1dd261..241879f3 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -111,6 +111,12 @@ Main repository: - Summary: Learn how to use awslabs/mcp to compose, run, and govern AWS-focused MCP servers across development, infrastructure, data, and operations workflows. - Keywords: awslabs, mcp, operating, large, scale, server, ecosystem, aws, workloads, compose, run, govern, focused, servers, development, infrastructure, data, operations +## BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework +- Path: tutorials/babyagi-tutorial +- Index: https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/babyagi-tutorial/index.md +- Summary: Learn how to use yoheinakajima/babyagi for autonomous task generation, execution, and prioritization—the foundational agent loop that started the autonomous AI agent wave. 
+- Keywords: babyagi, original, autonomous, task, agent, framework, yoheinakajima, generation, execution, prioritization, foundational, loop, started, wave + ## Beads Tutorial: Git-Backed Task Graph Memory for Coding Agents - Path: tutorials/beads-tutorial - Index: https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/beads-tutorial/index.md @@ -327,6 +333,12 @@ Main repository: - Summary: Orchestrate complex distributed workflows with Deer Flow's powerful task coordination and execution platform. - Keywords: deer, flow, distributed, workflow, orchestration, orchestrate, complex, workflows, powerful, task, coordination, execution +## Devika Tutorial: Open-Source Autonomous AI Software Engineer +- Path: tutorials/devika-tutorial +- Index: https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/devika-tutorial/index.md +- Summary: Learn how to deploy and operate stitionai/devika — a multi-agent autonomous coding system that plans, researches, writes, and debugs code end-to-end. +- Keywords: devika, open, source, autonomous, software, engineer, deploy, operate, stitionai, multi, agent, coding, plans, researches, writes, debugs, code, end + ## Dify Platform: Deep Dive Tutorial - Path: tutorials/dify-platform-deep-dive - Index: https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/dify-platform-deep-dive/index.md @@ -483,6 +495,12 @@ Main repository: - Summary: Learn how to use MoonshotAI/kimi-cli to run an interactive terminal coding agent with configurable modes, MCP integrations, and ACP-based IDE connectivity. 
- Keywords: kimi, cli, multi, mode, terminal, agent, mcp, acp, moonshotai, run, interactive, coding, configurable, modes, integrations, based, ide, connectivity +## Kiro Tutorial: Spec-Driven Agentic IDE from AWS +- Path: tutorials/kiro-tutorial +- Index: https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kiro-tutorial/index.md +- Summary: Learn how to use kirodotdev/Kiro for structured AI-powered development with spec-driven workflows, agent steering, event-driven automation, and AWS-native integrations. +- Keywords: kiro, spec, driven, agentic, ide, aws, kirodotdev, structured, powered, development, workflows, agent, steering, event, automation, native, integrations + ## Kubernetes Operator Patterns: Building Production-Grade Controllers - Path: tutorials/kubernetes-operator-patterns - Index: https://github.com/johnxie/awesome-code-docs/blob/main/tutorials/kubernetes-operator-patterns/index.md diff --git a/llms.txt b/llms.txt index e3c9c719..257e3bbf 100644 --- a/llms.txt +++ b/llms.txt @@ -32,6 +32,7 @@ - Awesome Claude Skills Tutorial: High-Signal Skill Discovery and Reuse for Claude Workflows: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/awesome-claude-skills-tutorial - Awesome MCP Servers Tutorial: Curating and Operating High-Signal MCP Integrations: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/awesome-mcp-servers-tutorial - awslabs/mcp Tutorial: Operating a Large-Scale MCP Server Ecosystem for AWS Workloads: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/awslabs-mcp-tutorial +- BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/babyagi-tutorial - Beads Tutorial: Git-Backed Task Graph Memory for Coding Agents: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/beads-tutorial - BentoML Tutorial: Building Production-Ready ML Services: 
https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/bentoml-tutorial - bolt.diy Tutorial: Build and Operate an Open Source AI App Builder: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/bolt-diy-tutorial @@ -68,6 +69,7 @@ - Crush Tutorial: Multi-Model Terminal Coding Agent with Strong Extensibility: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/crush-tutorial - Daytona Tutorial: Secure Sandbox Infrastructure for AI-Generated Code: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/daytona-tutorial - Deer Flow Tutorial: Distributed Workflow Orchestration Platform: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/deer-flow-tutorial +- Devika Tutorial: Open-Source Autonomous AI Software Engineer: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/devika-tutorial - Dify Platform: Deep Dive Tutorial: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/dify-platform-deep-dive - DSPy Tutorial: Programming Language Models: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/dspy-tutorial - Dyad Tutorial: Local-First AI App Building: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/dyad-tutorial @@ -94,6 +96,7 @@ - Khoj AI: Deep Dive Tutorial: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/khoj-tutorial - Kilo Code Tutorial: Agentic Engineering from IDE and CLI Surfaces: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/kilocode-tutorial - Kimi CLI Tutorial: Multi-Mode Terminal Agent with MCP and ACP: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/kimi-cli-tutorial +- Kiro Tutorial: Spec-Driven Agentic IDE from AWS: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/kiro-tutorial - Kubernetes Operator Patterns: Building Production-Grade Controllers: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/kubernetes-operator-patterns - LanceDB Tutorial: Serverless 
Vector Database for AI: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/lancedb-tutorial - LangChain Architecture: Internal Design Deep Dive: https://github.com/johnxie/awesome-code-docs/tree/main/tutorials/langchain-architecture-guide diff --git a/tutorials/README.md b/tutorials/README.md index e6963a5c..fa0a9a9d 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -15,28 +15,28 @@ Use this guide to navigate all tutorial tracks, understand structure rules, and | Metric | Value | |:-------|:------| <<<<<<< HEAD -| Tutorial directories | 188 | -| Tutorial markdown files | 1705 | -| Tutorial markdown lines | 996,366 | +| Tutorial directories | 191 | +| Tutorial markdown files | 1732 | +| Tutorial markdown lines | 1,004,205 | ======= <<<<<<< HEAD -| Tutorial directories | 188 | -| Tutorial markdown files | 1705 | -| Tutorial markdown lines | 996,366 | +| Tutorial directories | 191 | +| Tutorial markdown files | 1732 | +| Tutorial markdown lines | 1,004,205 | ======= <<<<<<< HEAD -| Tutorial directories | 188 | -| Tutorial markdown files | 1705 | -| Tutorial markdown lines | 996,366 | +| Tutorial directories | 191 | +| Tutorial markdown files | 1732 | +| Tutorial markdown lines | 1,004,205 | ======= <<<<<<< HEAD -| Tutorial directories | 188 | -| Tutorial markdown files | 1705 | -| Tutorial markdown lines | 996,366 | +| Tutorial directories | 191 | +| Tutorial markdown files | 1732 | +| Tutorial markdown lines | 1,004,205 | ======= -| Tutorial directories | 188 | -| Tutorial markdown files | 1705 | -| Tutorial markdown lines | 996,366 | +| Tutorial directories | 191 | +| Tutorial markdown files | 1732 | +| Tutorial markdown lines | 1,004,205 | ## Source Verification Snapshot @@ -61,7 +61,7 @@ Repository-source verification run against tutorial index references (GitHub API | Pattern | Count | Description | |:--------|:------|:------------| -| Root chapter files | 188 | `index.md` + top-level `01-...md` to `08-...md` | +| Root chapter files | 
191 | `index.md` + top-level `01-...md` to `08-...md` | | `docs/` chapter files | 0 | Deprecated and fully migrated | | Index-only roadmap | 0 | All catalog entries publish full chapter sets | | Mixed root + `docs/` | 0 | Legacy hybrid layout removed | diff --git a/tutorials/babyagi-tutorial/01-getting-started.md b/tutorials/babyagi-tutorial/01-getting-started.md new file mode 100644 index 00000000..cc5c31d4 --- /dev/null +++ b/tutorials/babyagi-tutorial/01-getting-started.md @@ -0,0 +1,282 @@ +--- +layout: default +title: "Chapter 1: Getting Started" +nav_order: 1 +parent: BabyAGI Tutorial +--- + +# Chapter 1: Getting Started + +Welcome to **Chapter 1: Getting Started**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers BabyAGI's origins, the core concept of autonomous task agents, environment setup, and how to run your first autonomous objective. + +## Learning Goals + +- understand BabyAGI's origin story and why it matters as a foundational reference +- set up a working local environment with required API credentials +- run your first autonomous objective and observe the three-agent loop +- identify common startup failures and how to resolve them + +## Fast Start Checklist + +1. clone the BabyAGI repository +2. install Python dependencies via pip +3. copy `.env.example` to `.env` +4. configure `OPENAI_API_KEY`, vector store credentials, and your objective in `.env` +5.
run `python babyagi.py` and watch the task loop execute + +## Source References + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) +- [Original Twitter Announcement (March 2023)](https://twitter.com/yoheinakajima/status/1640934493489070080) + +## Summary + +You now have a working BabyAGI baseline and can observe the autonomous three-agent task loop on a real objective. + +Next: [Chapter 2: Core Architecture: Task Queue and Agent Loop](02-core-architecture-task-queue-and-agent-loop.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 1: Getting Started** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 1: Getting Started`. +2. Separate control-plane decisions (objective setting, model selection, vector backend) from data-plane execution (task queue, LLM calls, embeddings). +3. Identify key integration points: OpenAI API, vector store initialization, task list data structure. +4. Trace state transitions across the startup lifecycle: config load → first task seed → first execution cycle. +5. Identify extension hooks: custom first task, environment variables, alternative vector backends. +6. Map ownership boundaries for solo and team BabyAGI workflows. +7. Specify rollback and recovery paths for misconfigured environments. +8. Track observability signals: stdout task logs, API call counts, token usage. 
+ +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Model selection | gpt-3.5-turbo default | gpt-4 or claude-3 | cost vs quality | +| Vector store | in-memory / Chroma local | Pinecone managed | simplicity vs scalability | +| Objective scope | narrow focused goal | broad open-ended goal | predictability vs exploration | +| Max iterations | small limit (5-10) | unlimited loop | safety vs thoroughness | +| API key management | `.env` file locally | secrets manager | simplicity vs security | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| missing API key | `openai.AuthenticationError` on start | `.env` not loaded or key not set | verify `python-dotenv` load and key format | +| rate limit hit | 429 errors after first few tasks | default model tier limits | add exponential backoff or switch to higher-tier key | +| infinite loop | task queue never empties | creation agent always adds tasks | set `MAX_ITERATIONS` environment variable | +| vector store init failure | connection refused or import error | Pinecone key missing or Chroma not installed | switch to default in-memory backend first | +| empty task results | execution agent returns nothing | objective too vague or model context exhausted | narrow objective and reduce initial task scope | +| environment file not found | `KeyError` on `os.getenv` | `.env.example` not copied | copy `.env.example` to `.env` before running | + +### Implementation Runbook + +1. Clone the repository: `git clone https://github.com/yoheinakajima/babyagi.git && cd babyagi`. +2. Create a virtual environment: `python3 -m venv venv && source venv/bin/activate`. +3. Install dependencies: `pip install -r requirements.txt`. +4. Copy environment template: `cp .env.example .env`. +5. 
Set `OPENAI_API_KEY` in `.env` to your valid OpenAI API key.
+6. Set `OBJECTIVE` in `.env` to a concrete, testable goal (e.g., "Research the top 3 Python web frameworks and summarize their pros and cons").
+7. Set `INITIAL_TASK` in `.env` to a seed task (e.g., "Make a todo list").
+8. Set `TABLE_NAME` in `.env` to a unique identifier for your vector store namespace.
+9. Choose a vector store backend: set `USE_CHROMA=True` for local Chroma or configure Pinecone credentials.
+10. Run: `python babyagi.py` and observe the loop in stdout.
+11. Press `Ctrl+C` to stop after observing several task cycles.
+
+### Quality Gate Checklist
+
+- [ ] `.env` file exists with all required keys populated
+- [ ] `OPENAI_API_KEY` is valid and has sufficient credits
+- [ ] virtual environment is activated before running
+- [ ] vector backend initializes without error on first run
+- [ ] at least one full task creation-execution-prioritization cycle completes
+- [ ] stdout shows task list updates after each cycle
+- [ ] `MAX_ITERATIONS` is set when running in automated environments
+- [ ] token usage is monitored to avoid unexpected billing
+
+### Source Alignment
+
+- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi)
+- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md)
+- [Original Twitter Announcement](https://twitter.com/yoheinakajima/status/1640934493489070080)
+
+### Cross-Tutorial Connection Map
+
+- [AutoGen Tutorial](../autogen-tutorial/)
+- [SuperAGI Tutorial](../superagi-tutorial/)
+- [LangChain Tutorial](../langchain-tutorial/)
+- [LangGraph Tutorial](../langgraph-tutorial/)
+- [Chapter 1: Getting Started](01-getting-started.md)
+
+### Advanced Practice Exercises
+
+1. Run BabyAGI with three different objectives (narrow, medium, broad) and compare task list depth.
+2. Add instrumentation to count total API calls per task cycle and log to a file.
+3. Introduce a deliberate rate limit scenario and confirm the retry logic activates.
+4. 
Switch between Chroma and in-memory backends and measure startup time difference. +5. Run a staged rollout on a team server with `MAX_ITERATIONS=5` and document rollback decision criteria. + +### Review Questions + +1. What is the minimum set of environment variables required to run BabyAGI? +2. Why does the objective wording materially affect the quality of generated tasks? +3. What tradeoff exists between GPT-3.5-turbo and GPT-4 for the execution agent specifically? +4. How would you recover from a vector store initialization failure mid-run? +5. What must be automated before running BabyAGI in a CI/CD pipeline safely? + +### Scenario Playbook 1: First Run on a Research Objective + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: researcher wants to autonomously survey a technical domain +- initial hypothesis: BabyAGI will decompose the domain survey into discrete search and synthesis tasks +- immediate action: set OBJECTIVE to a specific research question with clear scope boundaries +- engineering control: set MAX_ITERATIONS=10 to prevent runaway loops during initial evaluation +- verification target: at least 5 distinct tasks are generated and executed within the first 3 cycles +- rollback trigger: if zero tasks are generated after cycle 1, rewrite objective with more explicit scope +- communication step: log the task list at each cycle to a file for post-run review +- learning capture: record which objective phrasings produce the most useful task decompositions + +### Scenario Playbook 2: Rate Limit Recovery During Extended Runs + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: OpenAI rate limit (429) encountered after several task cycles +- initial hypothesis: task loop is making too many rapid sequential API calls +- immediate action: add sleep interval between task execution cycles +- engineering control: implement exponential 
backoff with jitter in the execution agent call +- verification target: loop resumes within 60 seconds of a 429 without human intervention +- rollback trigger: if 429s persist for more than 5 minutes, switch to a lower-rpm model +- communication step: log rate limit events with timestamps and task IDs +- learning capture: add the optimal sleep interval to `.env` as `SLEEP_INTERVAL` + +### Scenario Playbook 3: Vector Store Initialization Failure + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: Pinecone connection refused or API key invalid at startup +- initial hypothesis: Pinecone credentials in `.env` are missing or malformed +- immediate action: switch to Chroma local backend by setting `USE_CHROMA=True` +- engineering control: add a startup health check that verifies vector store connectivity before entering the loop +- verification target: BabyAGI starts and completes at least one cycle with the fallback backend +- rollback trigger: if Chroma also fails, revert to in-memory mode for debugging +- communication step: print clear error message distinguishing Pinecone vs Chroma vs in-memory failures +- learning capture: document the exact environment variable combinations required for each backend + +### Scenario Playbook 4: Objective Scope Creep + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: task list grows unboundedly because objective is too broad +- initial hypothesis: creation agent interprets broad objectives as requiring infinite sub-decomposition +- immediate action: pause the run and rewrite the objective with explicit deliverables and scope limits +- engineering control: set MAX_ITERATIONS=5 as a circuit breaker before restarting +- verification target: task list converges to under 10 tasks by iteration 3 +- rollback trigger: if task count exceeds 20 after iteration 5, halt and redesign objective +- communication step: 
export current task list to a file for human review before resuming +- learning capture: establish an objective template with explicit "done when" criteria + +### Scenario Playbook 5: Environment Variable Not Loaded + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `KeyError` or `None` returned for `OPENAI_API_KEY` despite setting it +- initial hypothesis: `.env` file not in working directory or `python-dotenv` not installed +- immediate action: verify `pip install python-dotenv` and confirm `.env` exists in the project root +- engineering control: add an explicit startup assertion that checks all required env vars before loop entry +- verification target: startup validation prints "All required environment variables loaded" before first task +- rollback trigger: if assertion fails, exit with a descriptive error listing the missing variables +- communication step: print the list of required variables in the error message for self-service resolution +- learning capture: add a `validate_env()` function to the startup sequence as a permanent guard + +### Scenario Playbook 6: Token Budget Exceeded Mid-Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `openai.InvalidRequestError: maximum context length exceeded` +- initial hypothesis: task result accumulation is growing the context window beyond model limits +- immediate action: truncate stored task results before passing to creation agent +- engineering control: add a `max_result_length` cap that truncates results at 1500 tokens before storage +- verification target: no context length errors occur across a 20-iteration run +- rollback trigger: if truncation degrades task quality, switch to a model with a larger context window +- communication step: log the original result length and truncated length for each affected cycle +- learning capture: add result length monitoring to the task 
execution telemetry + +### Scenario Playbook 7: Duplicate Task Generation + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: task list fills with near-identical tasks like "Research X" repeated 5 times +- initial hypothesis: creation agent lacks deduplication awareness and generates semantically similar tasks +- immediate action: add a deduplication step that checks vector similarity before adding new tasks +- engineering control: compute cosine similarity between new tasks and existing queue; reject if similarity > 0.9 +- verification target: no two tasks in the queue have cosine similarity above 0.85 after the deduplication check +- rollback trigger: if deduplication removes too many tasks and stalls the loop, lower threshold to 0.95 +- communication step: log rejected duplicate tasks to a separate file for analysis +- learning capture: use rejection patterns to improve the task creation prompt template + +### Scenario Playbook 8: Python Dependency Conflict + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `ImportError` or version conflict when running `pip install -r requirements.txt` +- initial hypothesis: system Python has conflicting global packages or outdated pip +- immediate action: create a fresh virtual environment and reinstall from scratch +- engineering control: pin all dependency versions in `requirements.txt` and document Python version requirement +- verification target: `pip install -r requirements.txt` completes without errors in a clean venv +- rollback trigger: if a specific package causes the conflict, pin it to the last known working version +- communication step: document the Python version (3.8+) and the virtual environment setup steps in the README +- learning capture: add a `check_environment()` startup function that validates key package versions + +## What Problem Does This Solve? 
+ +Most teams struggle here because the hard part is not writing more code, but deciding clear boundaries for the objective specification and environment configuration so the autonomous loop behaves predictably from the first run. BabyAGI's power comes from its simplicity: a single Python script that endlessly decomposes, executes, and reprioritizes tasks—but that simplicity means there are very few guard rails by default. + +In practical terms, this chapter helps you avoid three common failures: + +- starting with an objective that is too broad, causing the task queue to grow uncontrollably +- misconfiguring the vector store, causing the memory layer to silently fail and degrade task quality +- ignoring rate limits and token budgets, causing the run to crash after the first few cycles + +After working through this chapter, you should be able to reason about `Chapter 1: Getting Started` as the operational baseline for all subsequent BabyAGI work, with explicit contracts for environment setup, objective framing, and first-run validation. + +## How it Works Under the Hood + +Under the hood, `Chapter 1: Getting Started` follows a repeatable control path: + +1. **Environment bootstrap**: load `.env` via `python-dotenv`, validate required variables, initialize the OpenAI client. +2. **Vector store initialization**: connect to Pinecone, Chroma, or in-memory backend and create the results namespace. +3. **Task list seeding**: create the initial task list with the `INITIAL_TASK` value from `.env` as task ID 1. +4. **Main loop entry**: begin the `while True` loop (bounded by `MAX_ITERATIONS` if set). +5. **First execution cycle**: pop task 1 from the queue, call the execution agent with the objective and task text. +6. **Result storage**: embed the task result and store in the vector store for future context retrieval. +7. **Task creation cycle**: call the creation agent with the objective, last task, last result, and existing task list. +8. 
**Prioritization cycle**: call the prioritization agent to reorder the task queue by relevance to the objective. +9. **Loop continuation**: return to step 5 with the next highest-priority task. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) + Why it matters: authoritative reference on the complete BabyAGI codebase (github.com). +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) + Why it matters: official setup instructions and environment variable reference (github.com). +- [Original Twitter Announcement](https://twitter.com/yoheinakajima/status/1640934493489070080) + Why it matters: original design rationale and intended use case from the author (twitter.com). + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Next Chapter: Chapter 2: Core Architecture: Task Queue and Agent Loop](02-core-architecture-task-queue-and-agent-loop.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/02-core-architecture-task-queue-and-agent-loop.md b/tutorials/babyagi-tutorial/02-core-architecture-task-queue-and-agent-loop.md new file mode 100644 index 00000000..c5c42b9e --- /dev/null +++ b/tutorials/babyagi-tutorial/02-core-architecture-task-queue-and-agent-loop.md @@ -0,0 +1,298 @@ +--- +layout: default +title: "Chapter 2: Core Architecture: Task Queue and Agent Loop" +nav_order: 2 +parent: BabyAGI Tutorial +--- + +# Chapter 2: Core Architecture: Task Queue and Agent Loop + +Welcome to **Chapter 2: Core Architecture: Task Queue and Agent Loop**. 
In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter dissects the three-agent loop—execution, creation, prioritization—and the task queue data structure that ties them together into an autonomous system. + +## Learning Goals + +- understand the role of each of the three agents in the loop +- trace the data flow from task pop to task reprioritization +- identify the state model that persists across loop iterations +- reason about loop termination conditions and safety controls + +## Fast Start Checklist + +1. read the main loop in `babyagi.py` from top to bottom +2. identify the three agent function calls: `execution_agent`, `task_creation_agent`, `prioritization_agent` +3. trace what each agent receives as input and what it returns +4. observe how the task list is modified after each cycle +5. identify where the vector store is read from and written to + +## Source References + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) +- [BabyAGI README Architecture Section](https://github.com/yoheinakajima/babyagi#readme) + +## Summary + +You now understand how BabyAGI's three-agent loop operates as a coherent autonomous system and can reason about each component's role, inputs, and outputs. + +Next: [Chapter 3: LLM Backend Integration and Configuration](03-llm-backend-integration-and-configuration.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. 
+ +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 2: Core Architecture: Task Queue and Agent Loop** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 2: Core Architecture: Task Queue and Agent Loop`. +2. Separate the control-plane (agent prompt templates, loop control variables) from the data-plane (task queue state, vector store results). +3. Identify key integration points: task list as a Python deque/list, the `tasks_storage` vector namespace, and the three LLM call sites. +4. Trace state transitions: task pop → execution → result storage → creation → new task append → prioritization → reordered queue. +5. Identify extension hooks: custom execution logic, injection points for tool calls, task list observers. +6. Map ownership boundaries: which agent owns the objective vs which agent owns the task list ordering. +7. Specify rollback paths: how to reset the task queue if the creation agent produces malformed output. +8. Track observability signals: task IDs, cycle counts, queue depth, per-cycle latency. + +### The Three-Agent Loop in Detail + +**Execution Agent** is responsible for completing a specific task given the overall objective and recent context. It receives: +- the current `OBJECTIVE` +- the task text (e.g., "Research Python web frameworks") +- contextual results retrieved from the vector store (top-k similar past results) + +It returns a string result that is then stored as an embedding in the vector store. + +**Task Creation Agent** generates new tasks based on what was just accomplished. 
It receives: +- the current `OBJECTIVE` +- the last completed task and its result +- the current incomplete task list + +It returns a list of new task strings that do not overlap with tasks already in the queue. + +**Prioritization Agent** reorders the entire task queue so the most relevant tasks to the objective appear first. It receives: +- the current `OBJECTIVE` +- the full current task list with IDs + +It returns a renumbered, reordered task list as a formatted string that is parsed back into a Python list. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Task queue implementation | Python list (simple) | priority queue with custom weights | simplicity vs fine-grained ordering | +| Execution agent context | top-5 vector results | top-10 or full history | latency vs depth of context | +| Creation agent temperature | 0.5 (focused) | 0.9 (creative) | predictability vs task diversity | +| Prioritization frequency | every cycle | every N cycles | API cost vs ordering freshness | +| Loop termination | manual Ctrl+C | MAX_ITERATIONS guard | simplicity vs operational safety | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| task queue explosion | queue depth > 30 | creation agent adds 5+ tasks per cycle | add max-queue-depth guard | +| empty task creation | creation agent returns empty list | result was too thin to generate new tasks | inject a fallback task from the objective | +| malformed prioritization output | JSON/parse error on priority list | model returns free-form text instead of numbered list | add retry with stricter prompt format | +| stale context retrieval | execution quality degrades over time | top-k results are from very early cycles | add recency weighting to retrieval | +| agent 
cross-contamination | tasks drift from original objective | creation agent prompt lacks objective anchoring | explicitly re-state objective in every creation prompt | +| loop hangs at execution | no output after 30 seconds | model timeout or network issue | add request timeout and task retry counter | + +### Implementation Runbook + +1. Open `babyagi.py` and locate the three agent functions: `execution_agent`, `task_creation_agent`, `prioritization_agent`. +2. Read `execution_agent` first: identify the system prompt, user prompt construction, and the vector context retrieval call. +3. Verify that the execution result is embedded and upserted into the vector store after each call. +4. Read `task_creation_agent`: identify how it formats the incomplete task list and how it parses the response into a Python list. +5. Read `prioritization_agent`: trace how it receives the full task list and how the response is parsed back into ordered task objects. +6. Locate the main `while True` loop: trace the exact sequence of function calls and list mutations. +7. Identify where `MAX_ITERATIONS` is checked (or add it if not present in your version). +8. Add logging to each agent call that records: task ID, agent name, input token count, output token count, and elapsed time. +9. Run a 5-iteration test and verify that the task list is modified correctly after each cycle. 
+
+### Quality Gate Checklist
+
+- [ ] all three agent functions are understood with their input/output contracts documented
+- [ ] the task queue data structure is traced through at least one full cycle
+- [ ] the vector store read path (retrieval for context) is distinguished from the write path (result storage)
+- [ ] loop termination condition is explicit and tested
+- [ ] task creation output parsing has a fallback for malformed model responses
+- [ ] prioritization output parsing has a fallback for malformed model responses
+- [ ] per-cycle logging records task ID, agent, tokens, and latency
+- [ ] maximum queue depth is enforced to prevent unbounded growth
+
+### Source Alignment
+
+- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi)
+- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md)
+
+### Cross-Tutorial Connection Map
+
+- [LangGraph Tutorial](../langgraph-tutorial/) — comparable stateful agent loop patterns
+- [CrewAI Tutorial](../crewai-tutorial/) — multi-agent role decomposition analogous to the three-agent split
+- [AutoGen Tutorial](../autogen-tutorial/) — parallel autonomous loop design
+- [Chapter 2: Core Architecture](02-core-architecture-task-queue-and-agent-loop.md)
+
+### Advanced Practice Exercises
+
+1. Add a `task_observer` hook that is called after every prioritization step and logs the full ordered task list.
+2. Instrument the three agent calls to measure individual latency and compare across 10 cycles.
+3. Modify the creation agent to cap new tasks at 3 per cycle and observe how it changes convergence behavior.
+4. Replace the prioritization agent with a deterministic rule-based reordering and compare output quality.
+5. Add a task deduplication step between creation and prioritization and measure the reduction in queue churn.
+
+### Review Questions
+
+1. What does the execution agent receive from the vector store and why does it matter for output quality?
+2. 
Why does the prioritization agent receive the entire task list rather than just the new tasks? +3. What happens to the task queue if the creation agent returns a malformed response? +4. How does the three-agent split of responsibilities create emergent autonomous behavior? +5. What would you change first if the task loop was consistently drifting away from the original objective? + +### Scenario Playbook 1: Task Queue Growing Without Bound + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: task queue depth exceeds 25 tasks after 5 cycles +- initial hypothesis: creation agent is adding tasks faster than the execution agent can complete them +- immediate action: add `if len(task_list) > MAX_QUEUE_DEPTH: task_list = task_list[:MAX_QUEUE_DEPTH]` after creation +- engineering control: set `MAX_QUEUE_DEPTH=15` as a default guard in the environment configuration +- verification target: queue depth stays below 15 across a 20-iteration run +- rollback trigger: if capping causes the loop to stop making progress, raise the cap to 25 +- communication step: log queue depth at every cycle start as a standard metric +- learning capture: analyze which objective phrasings correlate with high queue growth rates + +### Scenario Playbook 2: Prioritization Agent Returns Malformed Output + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `ValueError` during task list parsing after the prioritization agent call +- initial hypothesis: model returned free-form text instead of a numbered task list +- immediate action: add a retry with a stricter prompt that includes a format example +- engineering control: wrap the prioritization call in a `try/except` with a fallback that preserves the pre-prioritization order +- verification target: zero unhandled parse errors across 50 prioritization calls +- rollback trigger: if retry also fails, use the existing task order as 
the safe fallback +- communication step: log malformed prioritization outputs to a separate file for prompt debugging +- learning capture: add the failing output format as a negative example in the prioritization prompt + +### Scenario Playbook 3: Execution Agent Returns Empty Result + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: execution agent returns an empty string or a single whitespace character +- initial hypothesis: the task text is too abstract for the model to generate a concrete result +- immediate action: retry the execution with an augmented prompt that asks for at least three concrete sentences +- engineering control: add a minimum result length check; if `len(result.strip()) < 50`, trigger retry +- verification target: no empty results are stored in the vector store across a 10-cycle run +- rollback trigger: if retry also returns empty, mark the task as "skipped" and create a replacement task +- communication step: log skipped tasks with their original text for human review +- learning capture: identify which task phrasings consistently produce empty results and refactor creation prompts + +### Scenario Playbook 4: Agent Cross-Contamination Drifting from Objective + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: after 10 cycles, the task list no longer relates to the original objective +- initial hypothesis: the creation agent is using task results without re-anchoring to the objective +- immediate action: add an explicit objective reminder at the top of every creation agent system prompt +- engineering control: add a post-creation filter that scores new tasks for relevance to the objective using embeddings +- verification target: at least 80% of generated tasks have cosine similarity > 0.7 to the objective embedding +- rollback trigger: if relevance drops below 50%, reset the task list to a fresh seed from the 
objective +- communication step: log task relevance scores at each creation step +- learning capture: use irrelevant task patterns to strengthen the objective anchoring in the creation prompt + +### Scenario Playbook 5: Loop Hangs at Execution Step + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: no output for more than 60 seconds during an execution agent call +- initial hypothesis: network timeout or model service degradation +- immediate action: add `timeout=30` to the OpenAI API call and wrap in a retry loop +- engineering control: implement a per-task maximum retry count of 3 before marking the task as failed and moving on +- verification target: no task execution blocks the loop for more than 90 seconds total +- rollback trigger: if three consecutive tasks all timeout, pause the loop and alert the operator +- communication step: log timeout events with task ID and elapsed time +- learning capture: add the timeout threshold as a configurable environment variable `EXECUTION_TIMEOUT` + +### Scenario Playbook 6: Context Retrieval Returns Stale Results + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: execution agent quality degrades as the run progresses despite more data in the vector store +- initial hypothesis: top-k retrieval is returning very early results that are no longer relevant to current tasks +- immediate action: add a recency bias by weighting results from the last 10 tasks more heavily +- engineering control: implement a hybrid retrieval that combines semantic similarity with a recency timestamp score +- verification target: average execution result quality (human-rated) improves by at least 20% vs pure semantic retrieval +- rollback trigger: if recency weighting produces no improvement after 5 cycles, revert to pure semantic retrieval +- communication step: log the IDs of retrieved context chunks for each execution 
call +- learning capture: add the recency weighting factor as a configurable parameter `RECENCY_WEIGHT` + +### Scenario Playbook 7: Task Creation Agent Generates Duplicate Tasks + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: newly created tasks are semantically identical to tasks already in the queue +- initial hypothesis: the creation agent is not effectively using the incomplete task list in its prompt +- immediate action: add a deduplication step that computes embeddings for new tasks and checks against existing queue +- engineering control: reject any new task with cosine similarity > 0.85 to an existing task in the queue +- verification target: zero near-duplicate tasks in the queue across a 20-cycle run +- rollback trigger: if deduplication rejects too many tasks and the queue empties, lower the similarity threshold +- communication step: log rejected duplicates with their similarity scores for prompt tuning +- learning capture: use duplicate patterns to strengthen the uniqueness constraints in the creation prompt + +### Scenario Playbook 8: Main Loop Exits Without Completing the Objective + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `MAX_ITERATIONS` reached but the objective is clearly incomplete +- initial hypothesis: MAX_ITERATIONS was set too low for the complexity of the objective +- immediate action: export the current task list and all stored results, then restart with a higher iteration limit +- engineering control: add an objective completion check before exiting: use the execution agent to evaluate if the objective is done +- verification target: the completion check correctly identifies 90% of complete vs incomplete runs in a test set +- rollback trigger: if the completion check itself produces errors, fall back to manual review of the result store +- communication step: print a summary of all completed tasks and 
stored results when the loop exits +- learning capture: use the task completion data to calibrate iteration limits for different objective complexity levels + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not implementing the loop, but understanding why three specialized agents outperform one general agent. The task creation agent can ask "what should be done next?" without being distracted by how to do it. The prioritization agent can reason about ordering without being constrained by execution details. The execution agent can focus entirely on completing one task well without worrying about what comes next. + +In practical terms, this chapter helps you avoid three common failures: + +- treating the three agents as interchangeable and collapsing them into one, losing the specialization benefits +- ignoring the task queue as a first-class data structure with its own growth, deduplication, and ordering requirements +- failing to instrument the loop so that debugging requires re-running experiments from scratch + +After working through this chapter, you should be able to reason about the three-agent loop as an operating system for autonomous task execution, with explicit contracts for each agent's inputs, state transitions, and outputs. + +## How it Works Under the Hood + +Under the hood, `Chapter 2: Core Architecture: Task Queue and Agent Loop` follows a repeatable control path: + +1. **Task pop**: the highest-priority task is removed from the front of the task list. +2. **Context retrieval**: the vector store is queried with the task text to retrieve the top-k most semantically similar past results. +3. **Execution call**: the execution agent receives the objective, task, and context; returns a result string. +4. **Result embedding**: the result is embedded using OpenAI embeddings and upserted into the vector store with the task ID as the key. +5. 
**Creation call**: the creation agent receives the objective, last task, last result, and the current incomplete task list; returns new task strings. +6. **Task append**: new tasks are appended to the task list with incrementing IDs. +7. **Prioritization call**: the prioritization agent receives the objective and the full task list; returns a renumbered ordered list. +8. **Queue replacement**: the task list is replaced with the prioritized output. +9. **Loop continuation**: the cycle repeats from step 1 with the new top task. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) + Why it matters: the complete implementation of all three agents and the main loop (github.com). +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) + Why it matters: the author's description of each agent's role and the overall architecture (github.com). 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 1: Getting Started](01-getting-started.md) +- [Next Chapter: Chapter 3: LLM Backend Integration and Configuration](03-llm-backend-integration-and-configuration.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/03-llm-backend-integration-and-configuration.md b/tutorials/babyagi-tutorial/03-llm-backend-integration-and-configuration.md new file mode 100644 index 00000000..2a171817 --- /dev/null +++ b/tutorials/babyagi-tutorial/03-llm-backend-integration-and-configuration.md @@ -0,0 +1,309 @@ +--- +layout: default +title: "Chapter 3: LLM Backend Integration and Configuration" +nav_order: 3 +parent: BabyAGI Tutorial +--- + +# Chapter 3: LLM Backend Integration and Configuration + +Welcome to **Chapter 3: LLM Backend Integration and Configuration**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how BabyAGI integrates with OpenAI, Anthropic, and local LLM backends, and how to configure each for different cost, quality, and latency tradeoffs. + +## Learning Goals + +- understand how BabyAGI makes LLM calls and what parameters matter most +- configure the OpenAI backend with different model tiers +- integrate Anthropic Claude as an alternative backend +- run BabyAGI with local models via Ollama or LM Studio + +## Fast Start Checklist + +1. identify the `openai.ChatCompletion.create` (or `openai.Completion.create`) call sites in `babyagi.py` +2. understand which parameters control model behavior: `model`, `temperature`, `max_tokens` +3. swap the model from `gpt-3.5-turbo` to `gpt-4` and compare output quality +4. 
optionally, set up an Anthropic or local model adapter + +## Source References + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) +- [OpenAI Python SDK](https://github.com/openai/openai-python) +- [Anthropic Python SDK](https://github.com/anthropics/anthropic-sdk-python) +- [Ollama Documentation](https://github.com/ollama/ollama/tree/main/docs) + +## Summary + +You now know how to configure BabyAGI's LLM backend for different providers and model tiers, and can reason about the cost, quality, and latency tradeoffs for each choice. + +Next: [Chapter 4: Task Creation and Prioritization Engine](04-task-creation-and-prioritization-engine.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 3: LLM Backend Integration and Configuration** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 3: LLM Backend Integration and Configuration`. +2. Identify all LLM call sites in `babyagi.py`: execution agent, creation agent, prioritization agent, and the embedding call. +3. Separate the completion API calls from the embedding API calls—these may use different models and backends. +4. Trace the prompt construction pattern for each agent call: system message, user message, context injection. +5. Identify the model parameters that affect output quality: `model`, `temperature`, `max_tokens`, `stop`. +6. Map the abstraction layer: is there a single `llm_call()` helper, or are calls made directly at each agent site? +7. Specify the adapter pattern needed to swap OpenAI for Anthropic or a local model. +8. 
Track cost signals: approximate token usage per cycle and per model tier. + +### LLM Backend Comparison Matrix + +| Backend | Model | Strengths | Cost per 1M tokens | Best For | +|:--------|:------|:----------|:-------------------|:---------| +| OpenAI | gpt-3.5-turbo | fast, cheap, good enough | ~$0.50 input / $1.50 output | rapid prototyping and high-volume runs | +| OpenAI | gpt-4o | strong reasoning, multimodal | ~$5 input / $15 output | complex objectives requiring deep reasoning | +| OpenAI | gpt-4o-mini | balanced cost/quality | ~$0.15 input / $0.60 output | production runs with quality constraints | +| Anthropic | claude-3-haiku | fast, cheap, instruction-following | ~$0.25 input / $1.25 output | cost-sensitive production deployments | +| Anthropic | claude-3-5-sonnet | strong reasoning, long context | ~$3 input / $15 output | high-quality research-grade runs | +| Local (Ollama) | llama3.1:8b | zero API cost, private | compute cost only | air-gapped or privacy-sensitive workloads | +| Local (Ollama) | mistral:7b | fast inference, good task following | compute cost only | development and debugging without API costs | + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Execution agent model | gpt-3.5-turbo | gpt-4o or claude-3-5-sonnet | cost vs output quality | +| Creation agent model | same as execution | dedicated cheaper model (gpt-3.5) | simplicity vs per-agent optimization | +| Prioritization agent model | same as execution | rule-based or embedding-based ranker | API cost vs ordering quality | +| Embedding model | text-embedding-ada-002 | text-embedding-3-large | cost vs retrieval accuracy | +| Temperature setting | 0.5 (balanced) | 0.0 (deterministic) or 0.9 (creative) | reproducibility vs task diversity | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | 
+|:-------------|:-------------|:-------------------|:---------------| +| wrong API base URL | `ConnectionError` or 404 | local model not running or wrong port | verify Ollama server is running on expected port | +| model not found | `model not found` error | model name typo or not pulled | run `ollama pull <model-name>` before starting | +| context window overflow | `InvalidRequestError: max tokens` | model context too small for prompt | switch to a larger context model or truncate input | +| Anthropic authentication | `AuthenticationError` | wrong env var name (`ANTHROPIC_API_KEY` vs `CLAUDE_API_KEY`) | verify exact env var name per SDK version | +| embedding dimension mismatch | vector store insert error | embedding model changed between runs | clear vector store and restart with consistent embedding model | +| local model too slow | task cycles take > 2 minutes | 7B model on CPU only | use GPU-accelerated inference or switch to a smaller model | + +### Implementation Runbook: OpenAI Backend + +1. Ensure `OPENAI_API_KEY` is set in `.env`. +2. Set `LLM_MODEL=gpt-3.5-turbo` (or `gpt-4o`, `gpt-4o-mini`) in `.env`. +3. Verify the `openai` Python package is installed at version >= 1.0.0. +4. In `babyagi.py`, confirm the call pattern uses `openai.chat.completions.create(model=LLM_MODEL, ...)`. +5. Set `OPENAI_TEMPERATURE=0.5` for balanced output quality. +6. Set `OPENAI_MAX_TOKENS=2000` to prevent runaway token usage per call. +7. Run a 3-cycle test and verify token usage is within expected bounds. + +### Implementation Runbook: Anthropic Backend + +1. Install the Anthropic Python SDK: `pip install anthropic`. +2. Set `ANTHROPIC_API_KEY` in `.env`. +3. Create a wrapper function `anthropic_completion(prompt, model, max_tokens, temperature)` that calls `anthropic.Anthropic().messages.create(...)`. +4. Replace the OpenAI chat completion calls in `execution_agent`, `task_creation_agent`, and `prioritization_agent` with calls to this wrapper. +5. 
Note: Anthropic uses `system` and `messages` separately; adjust prompt construction accordingly. +6. Run a 3-cycle test and verify output format matches what the parsing logic expects. +7. Note that the embedding call must remain on OpenAI (or switch to a compatible embedding service) unless you implement a separate embedding adapter. + +### Implementation Runbook: Local Model via Ollama + +1. Install Ollama from `https://ollama.ai`. +2. Pull a model: `ollama pull llama3.1:8b` or `ollama pull mistral:7b`. +3. Start the Ollama server: `ollama serve` (runs on `localhost:11434` by default). +4. Configure the OpenAI client to use the Ollama API base: `openai.base_url = "http://localhost:11434/v1"` and `openai.api_key = "ollama"`. +5. Set `LLM_MODEL=llama3.1:8b` in `.env`. +6. Note: local models do not support OpenAI embeddings. Configure Chroma with a local embedding model (e.g., `sentence-transformers/all-MiniLM-L6-v2`). +7. Run a 3-cycle test expecting slower iteration times (2-5 minutes per task on CPU). 
+ +### Quality Gate Checklist + +- [ ] LLM backend is configurable via environment variables without code changes +- [ ] each of the three agent call sites uses the same backend configuration +- [ ] embedding calls use a consistent model across all runs in a session +- [ ] temperature and max_tokens are configurable via `.env` +- [ ] a fallback model is defined for rate-limit or availability failures +- [ ] token usage is logged per call and per cycle +- [ ] local model setup is verified with a single-turn completion test before running the full loop +- [ ] Anthropic backend output format is verified against the parsing logic before full runs + +### Source Alignment + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [OpenAI Python SDK](https://github.com/openai/openai-python) +- [Anthropic Python SDK](https://github.com/anthropics/anthropic-sdk-python) +- [Ollama API Documentation](https://github.com/ollama/ollama/blob/main/docs/api.md) + +### Cross-Tutorial Connection Map + +- [LiteLLM Tutorial](../litellm-tutorial/) — unified LLM provider abstraction layer +- [Ollama Tutorial](../ollama-tutorial/) — local model serving for BabyAGI +- [OpenAI Python SDK Tutorial](../openai-python-sdk-tutorial/) — deep dive on the OpenAI client +- [Chapter 3: LLM Backend Integration](03-llm-backend-integration-and-configuration.md) + +### Advanced Practice Exercises + +1. Build an `LLMBackend` abstraction class with `complete()` and `embed()` methods; implement it for OpenAI and Anthropic. +2. Add automatic model fallback: if `gpt-4o` returns a rate limit, retry with `gpt-3.5-turbo`. +3. Run BabyAGI for the same objective on three different models and compare task quality and convergence speed. +4. Implement cost tracking by logging estimated token costs per cycle and summing at loop end. +5. Run BabyAGI with a local Ollama model for 5 iterations and compare output quality vs GPT-3.5-turbo. + +### Review Questions + +1. 
Why might you use a different (cheaper) model for the prioritization agent than for the execution agent? +2. What changes are required to replace OpenAI embeddings with a local embedding model? +3. How does `temperature` affect the creation agent's output differently than the execution agent? +4. What is the minimum change needed in `babyagi.py` to switch from GPT-3.5-turbo to Claude-3-Haiku? +5. What are the privacy implications of using a local model vs a cloud API for BabyAGI tasks? + +### Scenario Playbook 1: Switching from GPT-3.5 to GPT-4 Mid-Research + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: GPT-3.5 outputs are too shallow for a complex research objective +- initial hypothesis: model capability is the bottleneck, not the task design or vector context +- immediate action: set `LLM_MODEL=gpt-4o` in `.env` and restart the run +- engineering control: add per-model cost tracking so the GPT-4 premium is monitored in real time +- verification target: execution agent outputs are substantively longer and more specific within 3 cycles +- rollback trigger: if GPT-4 costs exceed $5 for a single run, switch back to GPT-3.5 with a refined objective +- communication step: log the model name alongside each execution result for post-run comparison +- learning capture: document which objective types require GPT-4 vs GPT-3.5 as a cost calibration guide + +### Scenario Playbook 2: Anthropic API Key Authentication Failure + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `anthropic.AuthenticationError` on first API call after switching backends +- initial hypothesis: environment variable name mismatch or key not propagated to the process +- immediate action: verify the exact variable name `ANTHROPIC_API_KEY` and confirm it is in `.env` +- engineering control: add a startup validation that calls the Anthropic API with a minimal test prompt 
before the main loop +- verification target: the startup validation returns a 200 response before the loop begins +- rollback trigger: if the API key is valid but still fails, check network connectivity and API status page +- communication step: print a clear error message distinguishing authentication vs network vs model errors +- learning capture: add Anthropic authentication to the environment validation checklist + +### Scenario Playbook 3: Local Ollama Model Too Slow for Practical Use + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: each task execution cycle takes more than 3 minutes on a local machine +- initial hypothesis: model is running on CPU only without GPU acceleration +- immediate action: check if Ollama is using the GPU with `ollama ps`; if not, switch to a smaller model +- engineering control: benchmark available models with a single-turn completion and choose the fastest model above quality threshold +- verification target: each task cycle completes in under 90 seconds on available hardware +- rollback trigger: if no local model meets the latency threshold, revert to the cloud API for the current run +- communication step: document the hardware requirements for each local model in the README +- learning capture: build a model selection guide based on hardware profile and objective complexity + +### Scenario Playbook 4: Embedding Dimension Mismatch After Model Change + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: vector store insert error after switching from `text-embedding-ada-002` to `text-embedding-3-large` +- initial hypothesis: the existing vector store was initialized with a different embedding dimension (1536 vs 3072) +- immediate action: clear the vector store namespace and reinitialize with the new embedding dimension +- engineering control: store the embedding model name and dimension in the vector store 
metadata; validate on startup +- verification target: no dimension mismatch errors occur after the fix, across 10 consecutive upsert operations +- rollback trigger: if clearing the vector store loses critical research context, export the stored results first +- communication step: log a warning when the embedding model differs from the last session's model +- learning capture: add embedding model consistency to the session startup validation checklist + +### Scenario Playbook 5: Model API Rate Limit During High-Volume Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: 429 rate limit errors appearing every 3-4 task cycles +- initial hypothesis: the three-agent loop makes too many requests in too short a window for the API tier +- immediate action: add a configurable `SLEEP_INTERVAL` between cycles (default 10 seconds) +- engineering control: implement exponential backoff with jitter for all three agent calls +- verification target: zero unhandled 429 errors in a 20-cycle run with backoff enabled +- rollback trigger: if backoff causes total run time to exceed 2 hours, switch to a higher-tier API key +- communication step: log rate limit events with timestamps so patterns can be analyzed +- learning capture: add rate limit frequency as a metric in the run summary output + +### Scenario Playbook 6: Context Window Overflow on Complex Tasks + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `InvalidRequestError: This model's maximum context length is 4096 tokens` on execution agent call +- initial hypothesis: the combined prompt (system + task + retrieved context) exceeds the model's context limit +- immediate action: truncate the retrieved context chunks to a maximum of 500 tokens each before injecting +- engineering control: add a `count_tokens()` function and enforce a total prompt budget per call +- verification target: no context 
overflow errors in a 20-cycle run with context budget enforcement +- rollback trigger: if truncation degrades execution quality, switch to a model with a larger context window (128k) +- communication step: log the prompt token count and budget headroom for each execution call +- learning capture: document the context budget formula as `system_tokens + task_tokens + context_tokens < model_limit - max_output_tokens` + +### Scenario Playbook 7: Inconsistent Output Format from Non-OpenAI Models + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: task creation agent parser fails on Anthropic or local model outputs that use different list formatting +- initial hypothesis: non-OpenAI models use different default formatting conventions in their responses +- immediate action: add a robust parser that handles multiple list formats (numbered, bulleted, newline-separated) +- engineering control: normalize all task creation outputs through a `parse_task_list(response)` function with format detection +- verification target: the parser correctly extracts tasks from all three format variants across 50 test outputs +- rollback trigger: if output format is consistently unparseable for a given model, add a model-specific parser +- communication step: log the raw output format for each creation agent call during a debugging session +- learning capture: build a test suite with representative outputs from each supported model backend + +### Scenario Playbook 8: API Credential Rotation During Long-Running Experiment + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: API key expires or is rotated mid-run during a multi-hour research experiment +- initial hypothesis: key rotation invalidated the active session credential +- immediate action: update `.env` with the new key; the next LLM call will use the new key if the client is re-initialized +- engineering 
control: implement lazy client initialization so a new client is created on each call, picking up updated env vars +- verification target: the run resumes without restart within 60 seconds of key rotation +- rollback trigger: if the new key also fails authentication, pause the run and export current state before investigating +- communication step: set up alerts on `AuthenticationError` to notify the operator of key rotation issues +- learning capture: document the key rotation procedure and add it to the operational runbook + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not swapping model names, but ensuring that the prompt contracts established for one model actually hold for another. GPT-4 and Claude often interpret the same task creation prompt differently, using different list formats, different levels of verbosity, and different interpretations of implicit constraints in the system message. + +In practical terms, this chapter helps you avoid three common failures: + +- assuming that any OpenAI-compatible API will produce output in the format that BabyAGI's parsers expect +- ignoring the embedding backend, which must be changed separately from the completion backend when switching to local models +- underestimating the token cost of running three LLM calls per task cycle across a long autonomous run + +After working through this chapter, you should be able to configure BabyAGI's LLM backend for any supported provider and reason about the cost, quality, and latency tradeoffs for each choice. + +## How it Works Under the Hood + +Under the hood, `Chapter 3: LLM Backend Integration and Configuration` follows a repeatable control path: + +1. **Client initialization**: OpenAI (or compatible) client is initialized with `api_key`, `base_url`, and optional `organization`. +2. **Prompt construction**: each agent constructs a system message and user message specific to its role. +3. 
**Completion call**: the client calls `chat.completions.create(model, messages, temperature, max_tokens)`. +4. **Response extraction**: the response text is extracted from `response.choices[0].message.content`. +5. **Output parsing**: the raw text is parsed into the expected data structure (string for execution, list for creation, ordered list for prioritization). +6. **Embedding call**: the result string is passed to `embeddings.create(model, input)` to generate a vector. +7. **Vector upsert**: the embedding vector is stored in the vector backend with the task ID as the key. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) + Why it matters: shows the exact API call patterns and parameter choices for each agent (github.com). +- [OpenAI Python SDK](https://github.com/openai/openai-python) + Why it matters: reference for the client initialization and completion API shape (github.com). +- [Anthropic Python SDK](https://github.com/anthropics/anthropic-sdk-python) + Why it matters: reference for the Claude message API used in Anthropic adapter implementations (github.com). 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 2: Core Architecture: Task Queue and Agent Loop](02-core-architecture-task-queue-and-agent-loop.md) +- [Next Chapter: Chapter 4: Task Creation and Prioritization Engine](04-task-creation-and-prioritization-engine.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/04-task-creation-and-prioritization-engine.md b/tutorials/babyagi-tutorial/04-task-creation-and-prioritization-engine.md new file mode 100644 index 00000000..9a9ce452 --- /dev/null +++ b/tutorials/babyagi-tutorial/04-task-creation-and-prioritization-engine.md @@ -0,0 +1,315 @@ +--- +layout: default +title: "Chapter 4: Task Creation and Prioritization Engine" +nav_order: 4 +parent: BabyAGI Tutorial +--- + +# Chapter 4: Task Creation and Prioritization Engine + +Welcome to **Chapter 4: Task Creation and Prioritization Engine**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter examines how BabyAGI generates new tasks from execution results, how it ranks them, and how the quality of objective framing determines the quality of the entire task lifecycle. + +## Learning Goals + +- understand the prompt design for the task creation and prioritization agents +- identify what inputs drive task quality and how to improve them +- reason about convergence: when does a task list meaningfully shrink toward a completed objective? +- build a mental model for objective-to-task decomposition quality + +## Fast Start Checklist + +1. read the `task_creation_agent` function and its prompt template +2. read the `prioritization_agent` function and its prompt template +3. 
run BabyAGI for 5 iterations on two different objectives and compare the task lists +4. identify which parts of the creation prompt anchor generated tasks to the objective +5. experiment with adding explicit constraints to the creation prompt + +## Source References + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) + +## Summary + +You now understand how the task creation and prioritization engine generates, deduplicates, and reorders tasks to drive the autonomous loop toward objective completion. + +Next: [Chapter 5: Memory Systems and Vector Store Integration](05-memory-systems-and-vector-store-integration.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 4: Task Creation and Prioritization Engine** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 4: Task Creation and Prioritization Engine`. +2. Identify the full input contract for the creation agent: objective, last task text, last result, incomplete task list. +3. Identify the full output contract: a list of new task strings, each unique and additive. +4. Trace how the incomplete task list is serialized into the creation prompt and how the response is deserialized. +5. Identify the full input contract for the prioritization agent: objective, full task list with IDs. +6. Trace how the prioritized list is parsed back into Python task objects with sequential IDs. +7. Identify extension hooks: pre-creation filters, post-creation deduplication, custom prioritization rules. +8. 
Map the convergence condition: the loop approaches completion when fewer new tasks are created per cycle. + +### Task Creation Prompt Anatomy + +The creation agent prompt typically follows this structure: + +``` +You are a task creation AI that uses the result of an execution agent +to create new tasks with the following objective: {OBJECTIVE}. +The last completed task was: {last_task}. +The result of the last task was: {last_result}. +These are incomplete tasks: {task_list}. +Based on the result, create new tasks to be completed by the AI system that do not overlap +with incomplete tasks. Return the tasks as an array. +``` + +Key levers in this prompt: +- **Objective anchoring**: the objective must be explicit enough to constrain task scope +- **Result injection**: the last result is the primary signal for what to do next +- **Incomplete task list**: prevents duplication but only if the model respects it +- **Output format instruction**: "Return the tasks as an array" — format adherence varies by model + +### Prioritization Prompt Anatomy + +The prioritization agent prompt typically follows this structure: + +``` +You are a task prioritization AI tasked with cleaning the formatting and reprioritizing +the following tasks: {task_list}. +Consider the ultimate objective of your team: {OBJECTIVE}. +Do not remove any tasks. Return the result as a numbered list, like: +#. First task +#. Second task +Start the task list with number {next_task_id}. 
+``` + +Key levers: +- **"Do not remove any tasks"**: prevents the model from silently dropping tasks +- **Numbered format with start ID**: ensures task IDs are sequential and parseable +- **Objective anchoring**: the model reorders based on relevance to the objective + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Creation prompt rigidity | loose format instruction | strict JSON schema output | flexibility vs parse reliability | +| Max tasks per creation | unlimited | cap at 3-5 per cycle | task diversity vs queue control | +| Prioritization frequency | every cycle | every N cycles | ordering freshness vs API cost | +| Deduplication strategy | none (rely on model) | embedding similarity check | simplicity vs duplicate prevention | +| Convergence detection | manual inspection | automated task count threshold | effort vs autonomy | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| task inflation | queue depth grows every cycle | creation adds 5+ tasks when 1-2 would suffice | cap creation output at `MAX_TASKS_PER_CYCLE` | +| priority inversion | most important task is last | prioritization prompt not well anchored to objective | add explicit priority criteria to the prompt | +| silent task removal | task count drops unexpectedly | prioritization model drops tasks despite instruction | add a post-prioritization count check | +| circular task generation | same task resurfaces after completion | creation agent doesn't know which tasks are done | pass completed task IDs to the creation prompt | +| format parse failure | `ValueError` or empty task list | model returns free-form text | add format validation and retry with explicit format example | +| objective-task divergence | tasks stop relating to objective | 
creation prompt doesn't re-assert objective each cycle | verify objective is injected into every creation call | + +### Implementation Runbook + +1. Locate the `task_creation_agent` function and read the full prompt template. +2. Add a `max_tasks_per_cycle` parameter that truncates the creation output list to N items. +3. Add a deduplication step: compute embeddings for new tasks and compare to existing queue; reject if similarity > 0.85. +4. Locate the `prioritization_agent` function and read the full prompt template. +5. Add a post-prioritization count assertion: `assert len(prioritized_tasks) == len(pre_prioritization_tasks)`. +6. Add a format validation wrapper that detects and corrects common output format deviations. +7. Add a `completed_tasks` list that tracks task IDs and text of completed tasks; inject into the creation prompt. +8. Run a 10-cycle test and measure: tasks created per cycle, tasks in queue at each cycle, task relevance to objective. +9. Tune `max_tasks_per_cycle` and deduplication threshold based on the metrics. 
+ +### Quality Gate Checklist + +- [ ] creation prompt explicitly injects the objective, last task, last result, and incomplete task list +- [ ] creation output is parsed into a Python list with a fallback for malformed responses +- [ ] creation output is capped at `MAX_TASKS_PER_CYCLE` to prevent queue inflation +- [ ] deduplication check prevents semantically identical tasks from entering the queue +- [ ] prioritization prompt explicitly instructs the model not to remove tasks +- [ ] post-prioritization count assertion catches silent task removal +- [ ] task IDs are sequential and correctly assigned after prioritization +- [ ] completed task history is injected into the creation prompt to prevent re-generating completed work + +### Source Alignment + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) + +### Cross-Tutorial Connection Map + +- [LangGraph Tutorial](../langgraph-tutorial/) — stateful task graph patterns comparable to BabyAGI's queue +- [CrewAI Tutorial](../crewai-tutorial/) — multi-agent task decomposition patterns +- [DSPy Tutorial](../dspy-tutorial/) — prompt optimization techniques applicable to task creation prompts +- [Chapter 4: Task Creation and Prioritization Engine](04-task-creation-and-prioritization-engine.md) + +### Advanced Practice Exercises + +1. Add a `convergence_detector` function that predicts when the objective is complete based on task count trends. +2. Implement a task quality scorer that rates each newly created task for specificity and relevance using an LLM call. +3. Replace the LLM-based prioritization agent with an embedding-based ranker that uses cosine similarity to the objective. +4. Build a visualization that plots queue depth and task relevance over cycles for a given run. +5. Compare the task lists produced by three different objectives at the same granularity level and measure overlap. + +### Review Questions + +1. 
What is the risk of not passing the incomplete task list to the creation agent? +2. Why does the prioritization prompt explicitly say "do not remove any tasks"? +3. How would you detect when BabyAGI has effectively completed its objective without a human in the loop? +4. What is the tradeoff between running prioritization every cycle vs every 3 cycles? +5. How does the quality of the `last_result` injected into the creation prompt affect task quality? + +### Scenario Playbook 1: Task List Diverging from Objective After 5 Cycles + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: tasks at cycle 5 are clearly not related to the original objective +- initial hypothesis: the creation agent is using the last result as its primary anchor rather than the objective +- immediate action: add a stronger objective restatement at the very beginning of the creation system prompt +- engineering control: add a post-creation relevance filter using embedding similarity to the objective vector +- verification target: 90% of new tasks have cosine similarity > 0.65 to the objective embedding after the fix +- rollback trigger: if the filter rejects all new tasks, lower the threshold and investigate the creation prompt +- communication step: log task-to-objective similarity scores at each creation step +- learning capture: document objective phrasing patterns that produce high-relevance task decompositions + +### Scenario Playbook 2: Silent Task Removal During Prioritization + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: task count decreases after the prioritization step without explicit deletion +- initial hypothesis: the prioritization model is silently dropping tasks it deems irrelevant +- immediate action: add `assert len(new_list) == len(old_list)` after prioritization parsing +- engineering control: if the assertion fails, merge the dropped tasks back 
at the end of the list +- verification target: no task count decrease occurs across 50 prioritization cycles +- rollback trigger: if task merging creates semantic duplicates, add deduplication after the merge +- communication step: log a warning with the dropped task texts whenever a merge is triggered +- learning capture: strengthen the prioritization prompt with "you must return ALL tasks provided to you" + +### Scenario Playbook 3: Task Format Parse Failure + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the creation agent returns a paragraph of text instead of a numbered task list +- initial hypothesis: the model ignored the format instruction and returned free-form output +- immediate action: extract task-like sentences using a regex fallback parser +- engineering control: add a retry with an explicit format example: "Return tasks as: 1. Task one\n2. Task two" +- verification target: zero unrecovered parse failures across 100 creation cycles +- rollback trigger: if retry also fails, inject one fallback task derived from the objective and log the failure +- communication step: log raw creation output for every cycle where the primary parser fails +- learning capture: build a test suite with representative malformed outputs for regression testing + +### Scenario Playbook 4: Circular Task Regeneration + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a task that was completed at cycle 3 reappears in the queue at cycle 8 +- initial hypothesis: the creation agent generates new tasks without awareness of completed work +- immediate action: pass the completed task list to the creation prompt with the instruction to avoid regenerating them +- engineering control: add a post-creation check that rejects any new task with cosine similarity > 0.9 to a completed task +- verification target: no completed task is regenerated within a 20-cycle 
run +- rollback trigger: if the check is too aggressive and blocks legitimate follow-up tasks, lower the threshold +- communication step: log any rejected regeneration with the original completed task for prompt tuning +- learning capture: add completed task injection to the creation prompt template as a permanent feature + +### Scenario Playbook 5: Queue Depth Inflation in Long Runs + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: queue depth grows from 5 to 30+ tasks over a 10-cycle run +- initial hypothesis: the creation agent adds 5-10 new tasks per cycle without converging on completion +- immediate action: set `MAX_TASKS_PER_CYCLE=3` and `MAX_QUEUE_DEPTH=15` +- engineering control: if queue depth exceeds the cap, skip the creation step for that cycle and go directly to execution +- verification target: queue depth stabilizes between 5 and 15 tasks across a 20-cycle run +- rollback trigger: if queue stagnates and the objective stalls, increase `MAX_TASKS_PER_CYCLE` to 5 +- communication step: log queue depth at the start of each cycle as a standard metric +- learning capture: correlate objective complexity with optimal `MAX_TASKS_PER_CYCLE` values + +### Scenario Playbook 6: Task IDs Becoming Inconsistent After Prioritization + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: task ID numbering resets or has gaps after the prioritization step +- initial hypothesis: the prioritization agent starts numbering from 1 instead of the current max ID +- immediate action: pass the correct `next_task_id` value to the prioritization prompt explicitly +- engineering control: add a post-prioritization normalization step that reassigns sequential IDs starting from `next_task_id` +- verification target: task IDs are always sequential and gapless after prioritization across 50 cycles +- rollback trigger: if ID normalization causes parsing issues, 
add a separate ID assignment step after parsing +- communication step: log the task ID range before and after prioritization for each cycle +- learning capture: add task ID consistency to the prioritization quality gate checklist + +### Scenario Playbook 7: Creation Agent Produces Overly Granular Tasks + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: new tasks are extremely fine-grained (e.g., "Search for the word 'Python' on Google") +- initial hypothesis: the creation agent is treating the last result as a micro-task template instead of a step result +- immediate action: add explicit granularity guidance to the creation prompt: "each task should represent a meaningful unit of research or analysis" +- engineering control: add a task length filter that rejects tasks under 10 words as too granular +- verification target: average task text length increases from under 8 words to over 12 words after the fix +- rollback trigger: if the prompt change produces overly vague tasks, revert and tune the wording +- communication step: log average task text length per cycle as a quality proxy metric +- learning capture: document the optimal granularity instruction phrasing in the prompt engineering notes + +### Scenario Playbook 8: Prioritization Agent Reversing Priority Consistently + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: after prioritization, the most foundational tasks appear last instead of first +- initial hypothesis: the prioritization prompt is ambiguous about ordering direction (ascending vs descending priority) +- immediate action: add "List tasks from most important (first) to least important (last)" to the prioritization prompt +- engineering control: add a spot-check that verifies the first task is more relevant to the objective than the last task +- verification target: the first task in the queue has higher cosine similarity 
to the objective than the last task in 90% of cycles +- rollback trigger: if the relevance check detects persistent inversion, log the full output and adjust the prompt +- communication step: log the top task and bottom task relevance scores after each prioritization step +- learning capture: add priority direction language to the canonical prioritization prompt template + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not the loop itself, but making the task creation and prioritization agents produce outputs that are specific enough to be actionable, novel enough to make progress, and anchored enough to stay on objective. The temptation is to treat these agents as magic boxes that reliably decompose objectives—but in practice, the quality of their outputs is highly sensitive to small changes in prompt wording, objective clarity, and the quality of the injected result context. + +In practical terms, this chapter helps you avoid three common failures: + +- leaving the creation prompt without explicit constraints, resulting in runaway task inflation +- trusting the prioritization agent to preserve task count without an explicit assertion +- not tracking completed tasks, causing the loop to regenerate work it has already done + +After working through this chapter, you should be able to reason about the task creation and prioritization engine as a controllable subsystem with explicit quality signals, guard rails, and tuning levers. + +## How it Works Under the Hood + +Under the hood, `Chapter 4: Task Creation and Prioritization Engine` follows a repeatable control path: + +1. **Result collection**: the execution agent's output string is captured as `last_result`. +2. **Creation prompt construction**: objective + last task + last result + serialized incomplete task list are concatenated into the creation prompt. +3. **Creation LLM call**: the model returns a list of new task strings. +4. 
**Output parsing**: the response is split into individual task strings and deduplicated against the existing queue. +5. **Task ID assignment**: new tasks are assigned sequential IDs starting from `max(existing_ids) + 1`. +6. **Queue extension**: new tasks are appended to the task list. +7. **Prioritization prompt construction**: objective + serialized full task list with IDs are concatenated. +8. **Prioritization LLM call**: the model returns a renumbered ordered list. +9. **Output parsing**: the response is split into `(id, task_text)` tuples and the task list is replaced. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) + Why it matters: the exact prompt templates and parsing logic for both agents (github.com). +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) + Why it matters: the author's design intent for the task creation and prioritization roles (github.com). 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 3: LLM Backend Integration and Configuration](03-llm-backend-integration-and-configuration.md) +- [Next Chapter: Chapter 5: Memory Systems and Vector Store Integration](05-memory-systems-and-vector-store-integration.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/05-memory-systems-and-vector-store-integration.md b/tutorials/babyagi-tutorial/05-memory-systems-and-vector-store-integration.md new file mode 100644 index 00000000..2e837f4f --- /dev/null +++ b/tutorials/babyagi-tutorial/05-memory-systems-and-vector-store-integration.md @@ -0,0 +1,313 @@ +--- +layout: default +title: "Chapter 5: Memory Systems and Vector Store Integration" +nav_order: 5 +parent: BabyAGI Tutorial +--- + +# Chapter 5: Memory Systems and Vector Store Integration + +Welcome to **Chapter 5: Memory Systems and Vector Store Integration**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how BabyAGI uses vector stores (originally Pinecone, now also Chroma and Qdrant) as its long-term memory layer, and how the retrieval quality of this memory directly determines the quality of task execution. + +## Learning Goals + +- understand why BabyAGI uses a vector store instead of a simple list for memory +- configure and operate Pinecone, Chroma, and Qdrant as BabyAGI backends +- reason about retrieval quality and how it affects execution agent output +- implement memory hygiene practices for long-running autonomous experiments + +## Fast Start Checklist + +1. identify the vector store initialization, upsert, and query code in `babyagi.py` +2. set up Chroma locally as the simplest backend option +3. 
run a 5-cycle test and inspect the stored embeddings via Chroma's client API +4. run two objectives and compare retrieval results for a sample query +5. measure the impact of `PINECONE_API_KEY` vs `USE_CHROMA` on startup latency + +## Source References + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) +- [Pinecone Documentation](https://docs.pinecone.io/) +- [Chroma Documentation](https://docs.trychroma.com/) +- [Qdrant Documentation](https://qdrant.tech/documentation/) + +## Summary + +You now understand how BabyAGI's vector memory layer works, how to configure different backends, and how retrieval quality shapes the execution agent's output at each cycle. + +Next: [Chapter 6: Extending BabyAGI: Custom Tools and Skills](06-extending-babyagi-custom-tools-and-skills.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 5: Memory Systems and Vector Store Integration** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 5: Memory Systems and Vector Store Integration`. +2. Identify the two vector store operations: upsert (write after execution) and query (read before execution). +3. Trace the embedding generation pipeline: result text → `text-embedding-ada-002` → 1536-dim vector → upsert. +4. Trace the retrieval pipeline: task text → embedding → top-k cosine similarity query → context string injection. +5. Identify the namespace/collection isolation strategy: each run uses a unique `TABLE_NAME` to avoid cross-run contamination. +6. 
Map the backend abstraction: the same interface is used regardless of whether Pinecone, Chroma, or an in-memory store is configured. +7. Specify the memory hygiene practices: clearing stale namespaces, managing storage growth, handling dimension changes. +8. Track observability signals: retrieval latency, vector count per namespace, top-k similarity scores. + +### Vector Store Backend Comparison + +| Backend | Setup Complexity | Storage Model | Retrieval Latency | Best For | +|:--------|:----------------|:--------------|:------------------|:---------| +| In-memory (numpy) | zero | RAM only; lost on restart | < 1ms | quick prototyping; no persistence needed | +| Chroma (local) | low | local SQLite + HNSW index | 1-10ms | single-machine persistent runs | +| Pinecone (managed) | medium | managed cloud index | 10-50ms | production runs requiring persistence and scale | +| Qdrant (local or cloud) | medium | local or cloud vector store | 5-20ms | self-hosted production or privacy-sensitive workloads | +| Weaviate | high | full graph + vector store | 10-40ms | complex multi-modal or cross-object retrieval | + +### Memory Architecture Deep Dive + +BabyAGI's memory system serves one primary purpose: giving the execution agent access to relevant past results when working on a new task. Without memory, every task executes in isolation and the agent cannot build on previous work. With memory, the agent retrieves the 5 most semantically similar past results and uses them as context. + +The memory flow is: +1. Task T is executed → Result R is produced as a string. +2. R is embedded: `embed(R)` → vector V of dimension 1536. +3. V is upserted into the vector store with key `task_{task_id}` and metadata `{"task": task_text, "result": R}`. +4. When task T+1 is about to execute, its text is embedded: `embed(T+1)` → query vector Q. +5. The vector store returns the top-5 most similar past result vectors to Q. +6. The metadata from those 5 results is concatenated into a context string. 
+7. The context string is injected into the execution agent's prompt. + +This creates an implicit knowledge graph where each task benefits from the outputs of semantically related prior tasks—even if those tasks were not directly sequential. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Backend choice | Chroma local | Pinecone managed | simplicity vs production durability | +| Top-k retrieval count | 5 (default) | 10-20 | latency vs context richness | +| Namespace isolation | unique per run | shared across runs | clean isolation vs cross-run knowledge sharing | +| Embedding model | text-embedding-ada-002 | text-embedding-3-large | cost vs retrieval accuracy | +| Memory persistence | local SQLite | cloud-managed index | portability vs durability | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| vector dimension mismatch | upsert error | embedding model changed between sessions | validate embedding dimension at startup; clear namespace if mismatch | +| namespace pollution | irrelevant results retrieved | multiple runs sharing the same `TABLE_NAME` | enforce unique namespace per run via timestamp or UUID suffix | +| Pinecone index not found | `NotFoundException` at startup | index not created or wrong name | add startup check that creates the index if it does not exist | +| Chroma collection not found | empty retrieval | collection not initialized | add `get_or_create_collection()` at startup | +| top-k retrieval too slow | > 500ms per query | large index with no optimization | add an ANN index (HNSW) or reduce the namespace to the current run only | +| stale embeddings degrading retrieval | execution quality drops | old results from unrelated runs contaminate results | clear the namespace before each new 
objective | + +### Implementation Runbook: Chroma Backend + +1. Install Chroma: `pip install chromadb`. +2. Set `USE_CHROMA=True` in `.env` and set `TABLE_NAME` to a unique collection name. +3. In `babyagi.py`, verify the Chroma client is initialized with `chromadb.Client()` for in-memory or `chromadb.PersistentClient(path="./chroma")` for persistent storage. +4. Verify the collection is created with `client.get_or_create_collection(name=TABLE_NAME)`. +5. Verify the upsert pattern uses `collection.upsert(ids=[task_id], embeddings=[vector], metadatas=[metadata])`. +6. Verify the query pattern uses `collection.query(query_embeddings=[query_vector], n_results=RESULTS_COUNT)`. +7. Run a 5-cycle test and inspect the collection with `collection.count()` to verify all results are stored. + +### Implementation Runbook: Pinecone Backend + +1. Install Pinecone: `pip install pinecone-client`. +2. Set `PINECONE_API_KEY` and `PINECONE_ENVIRONMENT` in `.env`. +3. Set `TABLE_NAME` to the Pinecone index name and `PINECONE_DIMENSION=1536`. +4. Add a startup check that creates the index if it does not exist: `pinecone.create_index(name, dimension, metric="cosine")`. +5. Verify the upsert pattern uses `index.upsert([(task_id, vector, metadata)])`. +6. Verify the query pattern uses `index.query(vector=query_vector, top_k=5, include_metadata=True)`. +7. Run a 5-cycle test and verify records in the Pinecone console. 
+ +### Quality Gate Checklist + +- [ ] vector store backend is configured via environment variables without code changes +- [ ] startup initializes the namespace/collection without failing if it already exists +- [ ] every task result is upserted immediately after execution +- [ ] top-k retrieval is called before every execution agent call +- [ ] namespace isolation prevents cross-run contamination +- [ ] embedding model is consistent within a session +- [ ] memory persistence is verified with a restart test (for non-in-memory backends) +- [ ] retrieval latency is logged per query for performance monitoring + +### Source Alignment + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [Pinecone Documentation](https://docs.pinecone.io/) +- [Chroma Documentation](https://docs.trychroma.com/) +- [Qdrant Documentation](https://qdrant.tech/documentation/) + +### Cross-Tutorial Connection Map + +- [Chroma Tutorial](../chroma-tutorial/) — deep dive on Chroma as a vector database +- [LanceDB Tutorial](../lancedb-tutorial/) — alternative local vector store +- [LlamaIndex Tutorial](../llamaindex-tutorial/) — RAG patterns comparable to BabyAGI's retrieval +- [Chapter 5: Memory Systems](05-memory-systems-and-vector-store-integration.md) + +### Advanced Practice Exercises + +1. Implement a memory inspector that visualizes the top-5 retrieved results for each task before execution. +2. Compare retrieval quality across `text-embedding-ada-002` and `text-embedding-3-large` for the same objective. +3. Add a memory export function that serializes the entire vector store to JSON at run end. +4. Implement a cross-run memory feature: import results from a previous run into the current namespace. +5. Build a retrieval quality evaluator that scores each retrieved result for relevance to the current task. + +### Review Questions + +1. Why does BabyAGI use semantic retrieval rather than simply passing all previous results to the execution agent? +2. 
What is the risk of sharing the same `TABLE_NAME` across multiple different objectives? +3. How does the `top_k` parameter in the retrieval call affect the execution agent's prompt length? +4. What is the consequence of an embedding model change between a run's first and later cycles? +5. How would you implement a memory decay mechanism that gradually reduces the weight of older results? + +### Scenario Playbook 1: Pinecone Index Dimension Mismatch + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: upsert fails with dimension mismatch error when embedding model was recently changed +- initial hypothesis: the Pinecone index was created with dimension 1536 but new embeddings are 3072 +- immediate action: delete the old index and recreate with the correct dimension +- engineering control: add a startup dimension validation that reads the index dimension and compares to the embedding model's output +- verification target: startup validation catches dimension mismatches before the first upsert +- rollback trigger: if deleting the index loses critical data, export all vectors first using Pinecone's fetch API +- communication step: log the detected dimension mismatch with both the index dimension and the embedding dimension +- learning capture: add embedding model + dimension as a stored index metadata field for future validation + +### Scenario Playbook 2: Chroma Collection Growing Beyond Practical Query Speed + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: Chroma query latency increases from 5ms to 500ms as the collection grows past 10,000 vectors +- initial hypothesis: the default Chroma index is not optimized for large-scale nearest-neighbor search +- immediate action: configure Chroma with HNSW index parameters optimized for the expected collection size +- engineering control: set `hnsw:ef_construction=200` and `hnsw:M=16` for better 
index quality at scale +- verification target: query latency stays below 50ms for collections up to 50,000 vectors +- rollback trigger: if HNSW index build time is too slow, reduce `ef_construction` to 100 +- communication step: log query latency at each retrieval step as a standard metric +- learning capture: add Chroma HNSW tuning parameters to the configuration guide + +### Scenario Playbook 3: Namespace Pollution from Multiple Runs + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: execution agent retrieves irrelevant results from a previous run on a different objective +- initial hypothesis: multiple runs shared the same `TABLE_NAME` and their results are co-mingled +- immediate action: clear the namespace and restart with a unique `TABLE_NAME` incorporating a timestamp +- engineering control: automatically append `_{datetime.now().strftime("%Y%m%d_%H%M%S")}` to `TABLE_NAME` if not overridden +- verification target: each new run creates and uses its own isolated namespace +- rollback trigger: if automatic namespace creation is not desired (intentional cross-run sharing), add an explicit opt-in flag +- communication step: print the active namespace name at startup for operator awareness +- learning capture: document the namespace naming convention in the configuration guide + +### Scenario Playbook 4: Retrieval Returns No Results on Early Cycles + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: execution agent context is empty for the first 3 cycles because the vector store has no entries yet +- initial hypothesis: the vector store is empty at the start of a run and cannot be queried until at least one result is stored +- immediate action: this is expected behavior; add a guard that skips retrieval if the vector store has fewer than `top_k` entries +- engineering control: if fewer than `top_k` results exist, return all available 
results rather than failing +- verification target: the first cycle runs without retrieval errors even with an empty vector store +- rollback trigger: no rollback needed; this is a startup edge case with a deterministic fix +- communication step: log "vector store empty, skipping retrieval" at cycle 1 for operator clarity +- learning capture: document the cold-start behavior as a known and expected startup condition + +### Scenario Playbook 5: Qdrant Connection Refused + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `qdrant_client.http.exceptions.UnexpectedResponse` or `ConnectionRefused` at startup +- initial hypothesis: the Qdrant server is not running or is listening on a different port +- immediate action: start the Qdrant server with `docker run -p 6333:6333 qdrant/qdrant` +- engineering control: add a startup health check that pings `http://localhost:6333/health` before initializing the client +- verification target: startup health check passes before the main loop begins +- rollback trigger: if Qdrant is unavailable, fall back to Chroma local automatically +- communication step: print a clear error message with the expected Qdrant URL and a startup command hint +- learning capture: add Qdrant server startup to the environment setup checklist + +### Scenario Playbook 6: Memory Persistence Failure After Crash + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a run crashes at cycle 8 and after restart, the Chroma collection appears empty +- initial hypothesis: Chroma was configured with in-memory mode instead of persistent mode +- immediate action: switch to `chromadb.PersistentClient(path="./chroma_db")` for future runs +- engineering control: add a startup configuration check that verifies the persistence path is set and writable +- verification target: a simulated crash-and-restart test shows all pre-crash vectors are available 
after restart +- rollback trigger: if persistent mode has write permission issues, fix directory permissions before restarting +- communication step: log the active storage mode (in-memory vs persistent) at startup +- learning capture: make persistent mode the default and require an explicit opt-in flag for in-memory mode + +### Scenario Playbook 7: Retrieval Degradation Due to Embedding Model API Errors + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the embedding API returns errors causing retrieval to silently fail and the context to be empty +- initial hypothesis: the embedding API has a temporary outage or rate limit affecting the result storage step +- immediate action: add retry logic with exponential backoff to all embedding API calls +- engineering control: cache the last successful embedding for retry within the same cycle +- verification target: no cycle executes with empty retrieval context due to embedding API errors +- rollback trigger: if the embedding API is down for more than 5 minutes, pause the loop until it recovers +- communication step: log embedding API errors separately from completion API errors for targeted alerting +- learning capture: add embedding API monitoring to the operational runbook + +### Scenario Playbook 8: Top-K Retrieval Flooding the Execution Prompt + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: context window overflow because top-10 retrieval results exceed the model's context limit +- initial hypothesis: top_k=10 with verbose past results is too large for the execution agent's context +- immediate action: reduce `top_k` to 5 and add per-result truncation at 300 tokens +- engineering control: add a total context budget: `sum(result_lengths) <= CONTEXT_BUDGET` before injection +- verification target: no context overflow errors in a 20-cycle run with the budget enforced +- rollback trigger: if 
reducing top_k degrades task quality, increase the context budget by switching to a larger context model +- communication step: log the total context length injected into each execution call +- learning capture: document the optimal top_k and result truncation settings for each supported model + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not connecting to the vector store, but understanding that the quality of BabyAGI's memory directly determines the quality of its outputs. Without a well-configured retrieval layer, the execution agent works in isolation, unable to leverage the cumulative knowledge from prior task cycles. With a well-configured retrieval layer, each task execution benefits from semantically relevant prior results—creating a virtuous cycle where early research informs later synthesis. + +In practical terms, this chapter helps you avoid three common failures: + +- using an in-memory backend for long runs and losing all results on a crash or keyboard interrupt +- sharing a namespace across multiple objectives and getting irrelevant results that mislead the execution agent +- not monitoring retrieval latency, which can silently become the bottleneck in a long-running autonomous experiment + +After working through this chapter, you should be able to configure, monitor, and maintain BabyAGI's vector memory layer as a reliable operational component. + +## How it Works Under the Hood + +Under the hood, `Chapter 5: Memory Systems and Vector Store Integration` follows a repeatable control path: + +1. **Backend initialization**: the vector store client is initialized and the namespace/collection is created or confirmed to exist. +2. **Embedding model initialization**: the OpenAI embedding client is configured with the chosen model. +3. **Upsert path**: after execution, the result string is passed to `get_embedding(result)` → vector V → `upsert(task_id, V, metadata)`. +4. 
**Query path**: before execution, the task text is passed to `get_embedding(task_text)` → query vector Q → `query(Q, top_k=5)`. +5. **Context assembly**: the metadata from the top-k results is concatenated into a context string. +6. **Context injection**: the context string is passed to the execution agent's prompt as the `context` parameter. +7. **Persistence**: for non-in-memory backends, results are persisted to disk or cloud on each upsert. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) + Why it matters: shows the exact vector store operations and embedding calls (github.com). +- [Pinecone Documentation](https://docs.pinecone.io/) + Why it matters: reference for index creation, upsert, and query API (pinecone.io). +- [Chroma Documentation](https://docs.trychroma.com/) + Why it matters: reference for collection creation, upsert, and query API (trychroma.com). 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 4: Task Creation and Prioritization Engine](04-task-creation-and-prioritization-engine.md) +- [Next Chapter: Chapter 6: Extending BabyAGI: Custom Tools and Skills](06-extending-babyagi-custom-tools-and-skills.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/06-extending-babyagi-custom-tools-and-skills.md b/tutorials/babyagi-tutorial/06-extending-babyagi-custom-tools-and-skills.md new file mode 100644 index 00000000..a7cbcf06 --- /dev/null +++ b/tutorials/babyagi-tutorial/06-extending-babyagi-custom-tools-and-skills.md @@ -0,0 +1,328 @@ +--- +layout: default +title: "Chapter 6: Extending BabyAGI: Custom Tools and Skills" +nav_order: 6 +parent: BabyAGI Tutorial +--- + +# Chapter 6: Extending BabyAGI: Custom Tools and Skills + +Welcome to **Chapter 6: Extending BabyAGI: Custom Tools and Skills**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how to extend BabyAGI beyond pure LLM reasoning by adding web search, file I/O, code execution, and domain-specific tool integrations into the execution agent's capability set. + +## Learning Goals + +- understand how the execution agent can be extended to call external tools +- implement a web search tool integration using SerpAPI or Tavily +- add file read/write capabilities to enable persistent artifacts +- design a tool routing layer that selects the right tool for each task + +## Fast Start Checklist + +1. identify where the execution agent's output is currently produced (pure LLM text) +2. add a SerpAPI or Tavily web search function that can be called from the execution agent +3. 
modify the execution agent to detect when a task requires web search vs pure reasoning +4. run a 5-cycle test with an objective that explicitly requires current web information +5. verify that search results are stored in the vector store alongside LLM-generated results + +## Source References + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) +- [BabyAGI README Extensions Section](https://github.com/yoheinakajima/babyagi#readme) +- [SerpAPI Documentation](https://serpapi.com/search-api) +- [Tavily Search API](https://tavily.com/docs) + +## Summary + +You now know how to extend BabyAGI with external tools and skills, enabling the execution agent to go beyond pure LLM reasoning and interact with the web, file systems, and domain-specific APIs. + +Next: [Chapter 7: BabyAGI Evolution: 2o and Functionz Framework](07-babyagi-evolution-2o-and-functionz-framework.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 6: Extending BabyAGI: Custom Tools and Skills** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 6: Extending BabyAGI: Custom Tools and Skills`. +2. Identify the extension point in the execution agent: the transition from pure LLM output to tool-augmented output. +3. Design the tool registry: a dict mapping tool names to callable functions. +4. Design the tool routing logic: how the execution agent determines whether to use a tool for a given task. +5. Identify the output contract: tool results must be coercible to a string for vector store storage. +6. 
Map the error boundary: tool failures should not crash the main loop; they should return an error string instead. +7. Specify the security controls: tool calls that interact with external services require credential management. +8. Track observability: tool call frequency, tool latency, tool error rate. + +### Tool Integration Patterns + +**Pattern 1: Inline Tool Call** +The execution agent's LLM call is augmented with a post-processing step that detects a tool invocation pattern in the output (e.g., `[SEARCH: query]`) and executes the tool, then re-calls the LLM with the tool result injected. + +**Pattern 2: Tool-First Routing** +Before calling the LLM, classify the task type (search, compute, file, reasoning-only) using a fast classifier (keyword match or small LLM call), then call the appropriate tool, and pass the tool result to the LLM for synthesis. + +**Pattern 3: Function Calling API** +Use OpenAI's function calling (tools) API to let the model natively decide when and how to call registered tools. The execution agent loop handles tool call responses before producing the final output. + +**Pattern 4: LangChain Tool Wrapping** +Use LangChain's `Tool` abstraction to define tools with schemas, then use a `ReActAgent` or `AgentExecutor` as the execution agent. This trades simplicity for a richer tool ecosystem. 
+ +### Tool Registry Example + +```python +import os +import requests + +def web_search(query: str) -> str: + """Search the web and return top 3 result snippets.""" + url = "https://api.tavily.com/search" + response = requests.post(url, json={ + "api_key": os.getenv("TAVILY_API_KEY"), + "query": query, + "max_results": 3 + }) + results = response.json().get("results", []) + return "\n".join([r["content"] for r in results]) + +def read_file(filename: str) -> str: + """Read a file and return its contents.""" + with open(filename, "r") as f: + return f.read() + +def write_file(filename: str, content: str) -> str: + """Write content to a file and return confirmation.""" + with open(filename, "w") as f: + f.write(content) + return f"Written {len(content)} characters to {filename}" + +TOOL_REGISTRY = { + "search": web_search, + "read_file": read_file, + "write_file": write_file, +} +``` + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | |:--------------|:--------------|:------------------|:---------| | Tool routing | LLM-based routing | keyword-based deterministic routing | flexibility vs reliability | | External API calls | direct calls | rate-limited wrapper with retry | simplicity vs resilience | | File I/O | restricted to a sandbox directory | unrestricted filesystem access | safety vs capability | | Tool error handling | return error string | retry with different tool | simplicity vs robustness | | Tool result storage | store full result | store summary only | completeness vs token budget | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | |:-------------|:-------------|:-------------------|:---------------| | tool API key missing | `KeyError` or `401` on first tool call | tool credential not in `.env` | add tool key to the startup environment validation | | tool returns empty result | empty string stored in vector | search returned no results for query | 
add a fallback to pure LLM reasoning when tool result is empty | +| tool call loops infinitely | task cycle never completes | tool routing enters a retry loop | add a max retry count per tool call | +| file path traversal | unexpected file read/write | unsanitized file paths from LLM | sandbox all file I/O to a designated output directory | +| tool result too large | context window overflow | search returns full page text | truncate tool results to a configurable max token count | +| rate limit on search API | 429 from search provider | too many searches per minute | add rate limiting and caching for duplicate search queries | + +### Implementation Runbook + +1. Install tool dependencies: `pip install requests tavily-python`. +2. Add `TAVILY_API_KEY` to `.env`. +3. Create a `tools.py` module with the tool registry dict and individual tool functions. +4. Import the tool registry in `babyagi.py`. +5. Modify the execution agent to detect tool invocation in the task text: if the task contains "search for" or "find current", route to the search tool. +6. Call the appropriate tool, capture the result string, and inject it into the LLM's user message as `Tool Result: {tool_result}`. +7. The LLM synthesizes the final result using both the task, context, and tool result. +8. Store the synthesized result (not the raw tool output) in the vector store. +9. Add per-tool error handling: wrap each tool call in `try/except` and return `f"Tool error: {str(e)}"` on failure. +10. Run a 5-cycle test with an objective requiring web search and verify search results appear in the vector store. 
+ +### Quality Gate Checklist + +- [ ] tool registry is defined in a separate module and imported cleanly +- [ ] all tool API keys are validated at startup before the loop begins +- [ ] tool routing logic is deterministic and testable +- [ ] each tool has explicit error handling that does not crash the main loop +- [ ] tool results are truncated to a configurable max length before injection +- [ ] tool result storage is verified in the vector store after each tool-augmented cycle +- [ ] file I/O tools are sandboxed to a designated output directory +- [ ] rate limiting is applied to external API tool calls + +### Source Alignment + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [SerpAPI Documentation](https://serpapi.com/search-api) +- [Tavily Documentation](https://tavily.com/docs) + +### Cross-Tutorial Connection Map + +- [LangChain Tutorial](../langchain-tutorial/) — tool integration patterns and agent frameworks +- [CrewAI Tutorial](../crewai-tutorial/) — multi-agent tool sharing patterns +- [Browser Use Tutorial](../browser-use-tutorial/) — browser-based tool integration +- [Chapter 6: Extending BabyAGI](06-extending-babyagi-custom-tools-and-skills.md) + +### Advanced Practice Exercises + +1. Build a `CalculatorTool` that evaluates arithmetic expressions safely by walking an `ast.parse` tree with a whitelist of allowed operators (note: `ast.literal_eval` only accepts literal structures and cannot evaluate general arithmetic). +2. Add a `CodeExecutionTool` that runs Python code snippets in a sandboxed subprocess and returns stdout. +3. Implement a tool router that uses an LLM classifier to select tools from a registry of 5+ options. +4. Add a tool result cache that prevents duplicate search queries within a single run. +5. Build a tool monitoring dashboard that shows call count, error rate, and latency per tool. + +### Review Questions + +1. What is the risk of letting the LLM decide tool routing vs using deterministic keyword matching? +2. Why should tool results be stored in the vector store even when they come from external APIs? +3. 
How would you implement a sandboxed code execution tool safely within BabyAGI? +4. What is the consequence of not truncating tool results before passing them to the execution agent? +5. How does adding tools to BabyAGI change the design requirements for the task creation agent? + +### Scenario Playbook 1: Web Search Tool Returns Empty Results + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the search tool returns an empty list for a valid-seeming query +- initial hypothesis: the query is too specific or the search provider returned no results +- immediate action: add a fallback that broadens the query by removing the most specific terms +- engineering control: implement a two-stage search: first try exact query, then broaden if empty +- verification target: at least 90% of search tool calls return at least one result within two attempts +- rollback trigger: if broadened search also fails, fall back to pure LLM reasoning for that task +- communication step: log the original query, broadened query, and result count for debugging +- learning capture: build a query reformulation heuristic based on observed failure patterns + +### Scenario Playbook 2: File I/O Tool Writing Outside the Sandbox + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the LLM generates a file path like `../../../etc/passwd` for the write_file tool +- initial hypothesis: path traversal attack or accidental LLM output outside the intended sandbox +- immediate action: add `os.path.abspath` and sandbox boundary check before any file write +- engineering control: validate that the resolved path starts with the designated output directory absolute path +- verification target: any path traversal attempt raises a `ValueError` with a clear message before the write occurs +- rollback trigger: no rollback needed; the check is a hard guard that prevents the write +- 
communication step: log any path traversal attempt with the original path and the task that generated it +- learning capture: add path sanitization to the file tool's input validation as a permanent feature + +### Scenario Playbook 3: Search API Rate Limit Mid-Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: Tavily API returns 429 after 10 search tool calls within a minute +- initial hypothesis: the search tool is called on nearly every task without rate limiting +- immediate action: add a search result cache keyed on the query string to avoid duplicate calls +- engineering control: implement a 60-second rolling rate limiter that caps search calls at 8 per minute +- verification target: no 429 errors occur in a 20-cycle run with the cache and rate limiter active +- rollback trigger: if the cache causes stale results, add a 1-hour TTL to cached entries +- communication step: log cache hit rate and rate limit events as standard metrics +- learning capture: add the search rate limit settings to the tool configuration guide + +### Scenario Playbook 4: Tool Result Too Large for Context Window + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: execution agent context overflow after a web search returns a 5,000-word article +- initial hypothesis: the search tool returns full article text without truncation +- immediate action: add a hard truncation at 800 tokens on all tool results before injection +- engineering control: implement a `summarize_tool_result(result, max_tokens=800)` step that uses the LLM to summarize long results +- verification target: no context overflow errors occur with the truncation/summarization active +- rollback trigger: if summarization degrades the quality of the injected context, switch to extractive truncation +- communication step: log original and truncated result lengths for each tool call +- learning 
capture: calibrate the truncation limit based on the specific model's context window size + +### Scenario Playbook 5: Code Execution Tool Running Unsafe Code + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the LLM generates code that attempts to delete files or make network requests +- initial hypothesis: the code execution tool has no safety restrictions on what code it runs +- immediate action: switch to a sandboxed execution environment using `subprocess` with `timeout` and a restricted user +- engineering control: run all code execution in a Docker container with no network access and read-only filesystem except a tmp dir +- verification target: code execution tool blocks network access and file deletion in a security test suite +- rollback trigger: if Docker-based sandboxing is not available, disable code execution tool and log a warning +- communication step: log every code execution call with the code text and execution result for audit +- learning capture: add the code execution sandbox requirements to the tool configuration prerequisites + +### Scenario Playbook 6: Tool Routing Misclassifies Tasks + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: keyword-based routing sends "analyze current market trends" to the pure LLM path instead of the search tool +- initial hypothesis: the keyword list for search tool routing does not include "current" or "trends" +- immediate action: expand the keyword list and add a secondary LLM-based classifier for ambiguous cases +- engineering control: implement a hybrid router: keyword match first, LLM classifier as fallback for ambiguous cases +- verification target: 95% of test tasks in a labeled routing test set are correctly routed +- rollback trigger: if the LLM classifier adds too much latency, revert to keyword-only routing and expand the keyword list +- communication step: log routing 
decisions with the method used (keyword vs classifier) for debugging +- learning capture: build a labeled routing test set from observed misclassifications for regression testing + +### Scenario Playbook 7: Tool API Credential Rotation During Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: Tavily API key is rotated and the search tool starts returning 401 errors mid-run +- initial hypothesis: the API key stored in the client session is now invalid +- immediate action: re-read the API key from the environment variable on each tool call (lazy credential loading) +- engineering control: implement a credential refresh mechanism that re-reads `.env` when an authentication error is detected +- verification target: the run resumes automatically within 60 seconds of a credential rotation +- rollback trigger: if the new key also fails, pause the search tool and fall back to pure LLM reasoning +- communication step: log authentication errors with the tool name and timestamp for security audit +- learning capture: add tool credential rotation to the operational runbook + +### Scenario Playbook 8: Tool Results Not Appearing in Vector Store + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: after a tool-augmented run, retrieval returns only LLM-generated results, not search results +- initial hypothesis: the tool result is used for execution but not stored in the vector store separately +- immediate action: add an explicit upsert step that stores the raw tool result in the vector store with a `tool_result` tag +- engineering control: store both the synthesized execution result and the raw tool result as separate vectors +- verification target: after a 5-cycle run with search tools, the vector store contains both synthesized and raw tool result entries +- rollback trigger: if dual storage causes context confusion, remove the raw tool result 
entries and keep only synthesized results +- communication step: log the vector store entry count after each upsert to verify storage +- learning capture: document the decision to store raw vs synthesized tool results in the design notes + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not adding tools to BabyAGI, but designing the tool routing and error handling so that tool failures do not cascade into task loop failures. The original BabyAGI is deliberately minimal—any extension must be robust enough not to break the fragile autonomy of the three-agent loop. A single unhandled tool exception can halt a multi-hour autonomous experiment. + +In practical terms, this chapter helps you avoid three common failures: + +- adding tools without error boundaries, causing a single API failure to halt the entire autonomous loop +- forgetting to store tool results in the vector store, losing the cumulative knowledge value of external data +- designing tool routing without a fallback, making the loop brittle when a tool is unavailable + +After working through this chapter, you should be able to extend BabyAGI with production-grade tool integrations that are safe, observable, and resilient to failure. + +## How it Works Under the Hood + +Under the hood, `Chapter 6: Extending BabyAGI: Custom Tools and Skills` follows a repeatable control path: + +1. **Task classification**: the task text is analyzed to determine if a tool is needed (keyword match or LLM classifier). +2. **Tool selection**: the appropriate tool is selected from the tool registry based on the classification. +3. **Tool call**: the tool function is called with the task text or extracted parameters; the result is captured as a string. +4. **Result injection**: the tool result is injected into the execution agent's user message as additional context. +5. **LLM synthesis**: the LLM generates a synthesized result using the task, retrieved context, and tool result. +6. 
**Result storage**: the synthesized result is embedded and stored in the vector store. +7. **Error handling**: if the tool call fails, the error string is logged and the execution falls back to pure LLM reasoning. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Main Script](https://github.com/yoheinakajima/babyagi/blob/main/babyagi.py) + Why it matters: shows where the execution agent currently produces its output and where tools would be injected (github.com). +- [SerpAPI Documentation](https://serpapi.com/search-api) + Why it matters: reference for the search API used in many BabyAGI community extensions (serpapi.com). +- [Tavily Documentation](https://tavily.com/docs) + Why it matters: the modern search API designed for LLM agent integrations (tavily.com). + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 5: Memory Systems and Vector Store Integration](05-memory-systems-and-vector-store-integration.md) +- [Next Chapter: Chapter 7: BabyAGI Evolution: 2o and Functionz Framework](07-babyagi-evolution-2o-and-functionz-framework.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/07-babyagi-evolution-2o-and-functionz-framework.md b/tutorials/babyagi-tutorial/07-babyagi-evolution-2o-and-functionz-framework.md new file mode 100644 index 00000000..2574674a --- /dev/null +++ b/tutorials/babyagi-tutorial/07-babyagi-evolution-2o-and-functionz-framework.md @@ -0,0 +1,329 @@ +--- +layout: default +title: "Chapter 7: BabyAGI Evolution: 2o and Functionz Framework" +nav_order: 7 +parent: BabyAGI Tutorial +--- + +# Chapter 7: BabyAGI Evolution: 2o and Functionz Framework + +Welcome to **Chapter 7: BabyAGI Evolution: 2o and 
Functionz Framework**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter traces how BabyAGI evolved from the original single-file script into BabyAGI 2o (a self-building agent) and BabyAGI 3 / Functionz (a natural-language configurable agent framework), and what each evolutionary step means for practitioners. + +## Learning Goals + +- understand what BabyAGI 2o adds over the original: self-building skill acquisition +- understand what BabyAGI 3 / Functionz adds: natural language configuration and persistent function libraries +- identify which version to use for different use cases +- trace the conceptual lineage from the original three-agent loop to the modern BabyAGI variants + +## Fast Start Checklist + +1. read the `babyagi-2o` directory in the repository and identify what is new vs the original +2. read the `babyagi3` or `functionz` directory and identify the configuration model +3. run BabyAGI 2o on a simple objective and observe how it builds its skill library +4. understand the `functionz` framework's approach to persistent function storage +5. identify which evolutionary step is relevant to your use case + +## Source References + +- [BabyAGI 2o Directory](https://github.com/yoheinakajima/babyagi/tree/main/babyagi-2o) +- [BabyAGI 3 / Functionz Directory](https://github.com/yoheinakajima/babyagi/tree/main/babyagi3) +- [Functionz Repository](https://github.com/yoheinakajima/functionz) +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) + +## Summary + +You now understand the evolutionary arc from BabyAGI's original three-agent loop to self-building agents (2o) and natural-language configurable frameworks (BabyAGI 3), and can make an informed choice about which variant fits your needs. 
+ +Next: [Chapter 8: Production Patterns and Research Adaptations](08-production-patterns-and-research-adaptations.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 7: BabyAGI Evolution: 2o and Functionz Framework** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 7: BabyAGI Evolution: 2o and Functionz Framework`. +2. Compare the original BabyAGI's architecture to BabyAGI 2o's architecture: what new components exist? +3. Identify the "self-building" mechanism in BabyAGI 2o: how does it generate and store new skills? +4. Compare BabyAGI 3 / Functionz's configuration model to the original's environment variable model. +5. Trace the `functionz` framework's persistent function library concept. +6. Identify the migration path from the original BabyAGI to BabyAGI 2o and then to BabyAGI 3. +7. Map which use cases are best served by each version. +8. Track observability signals specific to each variant. 
+ +### BabyAGI Version Comparison + +| Aspect | Original BabyAGI | BabyAGI 2o | BabyAGI 3 / Functionz | +|:-------|:----------------|:-----------|:----------------------| +| Release | March 2023 | Late 2023 | 2024 | +| Architecture | 3-agent loop (execute, create, prioritize) | 3-agent loop + self-building skill store | natural language configurable agent with function library | +| Configuration | `.env` file with environment variables | `.env` + skill library | natural language prompts + persistent functions | +| Skill acquisition | none (pure LLM reasoning) | generates and saves Python functions as skills | generates and stores functions in a persistent library | +| Memory | vector store (Pinecone/Chroma) | vector store + skill file store | vector store + SQL-backed function store | +| Tool integration | manual code modification | automatic via skill generation | natural language tool description | +| Best for | learning, prototyping, research | tasks that benefit from reusable skill accumulation | production systems needing configurable autonomy | + +### BabyAGI 2o: Self-Building Agent Architecture + +BabyAGI 2o extends the original by adding a **skill acquisition** layer. When the execution agent completes a task, it also generates a reusable Python function (a "skill") that encapsulates the knowledge needed to repeat that task type. These skills are stored in a skills directory and loaded at the start of future runs. 
+ +Key additions over the original: +- **Skill creation agent**: after task execution, a new agent generates a Python function for the task type +- **Skill store**: a local directory of Python files, each representing a learned skill +- **Skill retrieval**: before execution, the agent checks the skill store for a relevant existing skill +- **Skill execution**: if a relevant skill is found, it is executed rather than calling the LLM from scratch + +This creates a cumulative learning system where the agent becomes more efficient over time at known task types. + +### BabyAGI 3 / Functionz: Natural Language Configuration + +BabyAGI 3, built on the `functionz` framework, replaces the rigid environment variable configuration with a natural language configuration model. Users describe what they want in plain English, and the framework translates this into agent configuration. + +Key additions: +- **Natural language objective parsing**: the framework extracts structured configuration from free-form objective descriptions +- **Persistent function library**: functions generated during runs are stored in a SQL-backed library and reused across runs +- **Declarative skill registry**: skills can be described in natural language and the framework auto-generates their implementations +- **Session management**: the framework maintains conversation history and run state across multiple sessions + +### Operator Decision Matrix + +| Decision Area | Original BabyAGI | BabyAGI 2o | BabyAGI 3 / Functionz | +|:--------------|:----------------|:-----------|:----------------------| +| Use case | research and prototyping | repeated task types that improve over runs | production systems needing flexibility | +| Setup complexity | low | medium | high | +| Skill reuse | none | across runs (file-based) | across sessions (SQL-based) | +| Configuration surface | `.env` variables | `.env` + skill files | natural language | +| Community support | largest (original) | medium | growing | + +### 
Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| broken skill file in 2o | `SyntaxError` on skill load | skill creation agent generated invalid Python | add syntax validation before saving skills | +| skill mismatch in 2o | wrong skill applied to task | skill retrieval uses wrong similarity threshold | tune the similarity threshold for skill retrieval | +| natural language config misparse in 3 | agent pursues wrong objective | ambiguous natural language description | add a confirmation step that restates the parsed config before starting | +| function library corruption in functionz | SQL query errors | concurrent writes without transaction isolation | use proper SQLite transaction handling | +| skill accumulation without pruning | skill store grows without bound | no cleanup mechanism for obsolete skills | add a skill validity checker that tests skills periodically | +| 2o skill not reused when expected | execution agent ignores available skill | retrieval confidence threshold too high | lower the threshold and add logging for retrieval decisions | + +### Implementation Runbook: BabyAGI 2o + +1. Navigate to the `babyagi-2o` directory in the repository. +2. Install any additional dependencies listed in the 2o-specific `requirements.txt`. +3. Configure `.env` with the same variables as the original, plus `SKILLS_DIRECTORY=./skills`. +4. Create the `./skills` directory if it does not exist. +5. Run the 2o script on a simple objective and observe: after each task, a new `.py` file should appear in `./skills`. +6. Inspect a skill file: it should contain a Python function that encapsulates the task's logic. +7. Run the same objective again and observe: the agent should detect and reuse the skill from the previous run. +8. Verify skill reuse by logging which tasks used existing skills vs generated new LLM responses. 
+ +### Implementation Runbook: BabyAGI 3 / Functionz + +1. Navigate to the `babyagi3` or `functionz` directory in the repository. +2. Install the `functionz` package: `pip install functionz` (or from the local directory). +3. Configure the database backend for the function library (default: SQLite at `./functionz.db`). +4. Define your objective in natural language as a string; pass it to the BabyAGI 3 runner function. +5. Observe how the framework parses the objective and configures the agent loop. +6. After a run, inspect the function library database to see stored functions. +7. Run a second objective that overlaps with the first and observe which functions are reused. + +### Quality Gate Checklist + +- [ ] chosen BabyAGI variant is selected based on use case requirements +- [ ] skill files in BabyAGI 2o pass syntax validation before being saved +- [ ] skill retrieval in BabyAGI 2o is tuned and logged for debugging +- [ ] BabyAGI 3 / Functionz natural language objective is confirmed before the loop starts +- [ ] function library database is backed up before long production runs +- [ ] migration path from the original to the chosen variant is documented +- [ ] skill accumulation growth is monitored and pruned periodically +- [ ] the chosen variant is tested on a representative objective before deployment + +### Source Alignment + +- [BabyAGI 2o Directory](https://github.com/yoheinakajima/babyagi/tree/main/babyagi-2o) +- [BabyAGI 3 / Functionz Directory](https://github.com/yoheinakajima/babyagi/tree/main/babyagi3) +- [Functionz Repository](https://github.com/yoheinakajima/functionz) + +### Cross-Tutorial Connection Map + +- [AutoGPT Tutorial](../autogpt-tutorial/) — comparable self-improving agent concept +- [LangChain Tutorial](../langchain-tutorial/) — tool and skill registration patterns +- [CrewAI Tutorial](../crewai-tutorial/) — agent specialization analogous to skill specialization in BabyAGI 2o +- [Chapter 7: BabyAGI
Evolution](07-babyagi-evolution-2o-and-functionz-framework.md) + +### Advanced Practice Exercises + +1. Build a skill validator that runs each stored BabyAGI 2o skill in isolation and marks it as valid or invalid. +2. Implement a skill pruning mechanism that removes skills not used in the last 10 runs. +3. Build a skill similarity browser that shows which skills are semantically closest to a given task query. +4. Compare the efficiency of BabyAGI 2o vs the original on the same objective run 5 times consecutively. +5. Migrate an existing BabyAGI original run's vector store results into a BabyAGI 2o skill library. + +### Review Questions + +1. What is the fundamental architectural difference between BabyAGI original and BabyAGI 2o? +2. Why does BabyAGI 2o need a skill validation step that the original does not? +3. What use case makes BabyAGI 3 / Functionz preferable over BabyAGI 2o? +4. How does the functionz persistent function library differ from BabyAGI 2o's file-based skill store? +5. What is the risk of letting the skill creation agent generate skills without syntax validation? 
+ +### Scenario Playbook 1: Broken Skill File Prevents 2o Startup + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: `SyntaxError` when loading the skills directory at BabyAGI 2o startup +- initial hypothesis: a previously generated skill file contains invalid Python code +- immediate action: identify the offending file from the error traceback and quarantine it to a `./skills_invalid` directory +- engineering control: add a syntax validation step at skill load time: `compile(skill_code, filename, "exec")` +- verification target: invalid skill files are detected and quarantined automatically without crashing startup +- rollback trigger: if more than 10% of skill files are invalid, review the skill creation prompt for systematic errors +- communication step: log the count of valid vs invalid skills at startup +- learning capture: use the invalid skill patterns to add Python syntax validation constraints to the skill creation prompt + +### Scenario Playbook 2: Skill Reuse Not Triggering in 2o + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the agent re-generates LLM responses for tasks where a relevant skill already exists +- initial hypothesis: the skill retrieval similarity threshold is too high and no skill clears the bar +- immediate action: lower the similarity threshold from 0.95 to 0.80 and observe the reuse rate +- engineering control: add a debug logging mode that shows the top-3 retrieved skills and their similarity scores for each task +- verification target: at least 50% of tasks in a repeated-objective run use an existing skill rather than re-generating +- rollback trigger: if the lower threshold causes wrong skills to be applied, raise it to 0.85 +- communication step: log skill reuse rate as a metric at the end of each run +- learning capture: document the optimal similarity threshold for the task types most common in your 
objectives + +### Scenario Playbook 3: BabyAGI 3 Misparses Natural Language Objective + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: after providing a natural language objective, the agent starts working on the wrong goal +- initial hypothesis: the natural language parsing step misinterpreted an ambiguous phrase in the objective +- immediate action: add a confirmation step where the parsed objective is displayed and user approval is required before the loop starts +- engineering control: implement a structured objective schema that the parser must fill in, with required fields: goal, scope, done-when criteria +- verification target: the confirmation step correctly surfaces 100% of objective misparses before the loop starts +- rollback trigger: if the structured schema is too rigid for natural language input, provide an example format in the prompt +- communication step: print the parsed objective structure in human-readable form for confirmation +- learning capture: build a test set of natural language objectives and their expected parsed structures for regression testing + +### Scenario Playbook 4: Function Library Database Corruption in Functionz + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: SQL query errors when reading from the functionz function library mid-run +- initial hypothesis: concurrent writes without proper transaction isolation corrupted the SQLite database +- immediate action: switch to WAL mode in SQLite for better concurrent write handling: `PRAGMA journal_mode=WAL` +- engineering control: take a backup of the function library database before each run starts +- verification target: no database errors occur across 50 concurrent read/write operations in a load test +- rollback trigger: if WAL mode does not resolve the issue, switch to a PostgreSQL backend for production +- communication step: log database operation 
errors with full SQL context for debugging +- learning capture: add database backup and WAL mode configuration to the functionz setup guide + +### Scenario Playbook 5: Skill Store Growing Without Bound in 2o + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the `./skills` directory contains 500+ files after 3 weeks of daily runs +- initial hypothesis: no pruning mechanism exists and every run adds new skills regardless of their utility +- immediate action: implement a skill usage tracker that records the last-used timestamp for each skill file +- engineering control: add a weekly pruning job that deletes skills not used in the last 14 days +- verification target: skill directory size stabilizes below 200 files under the pruning policy +- rollback trigger: if pruned skills are needed again, confirm they can be regenerated from the original objective +- communication step: log the count of skills pruned and the total directory size after each pruning run +- learning capture: add skill lifecycle management to the 2o operational runbook + +### Scenario Playbook 6: Migrating from Original BabyAGI to 2o + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a team running the original BabyAGI wants to migrate to 2o to benefit from skill reuse +- initial hypothesis: the main difference is the addition of the skill creation and retrieval layer +- immediate action: run a parallel experiment: original for 10 cycles and 2o for 10 cycles on the same objective +- engineering control: compare the execution agent outputs to verify that 2o produces equivalent or better results +- verification target: 2o produces qualitatively similar or better outputs with fewer LLM calls after cycle 3 (due to skill reuse) +- rollback trigger: if 2o skills are consistently wrong for the domain, revert to the original and report the failure pattern +- communication step: 
document the migration steps and any objective-specific tuning needed for the skill creation prompt +- learning capture: build a migration guide that covers the config differences and skill store initialization steps + +### Scenario Playbook 7: BabyAGI 3 Objective Scope Expansion Mid-Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the BabyAGI 3 agent expands its function library with functions far outside the original objective scope +- initial hypothesis: the natural language configuration allows the agent to interpret its mandate too broadly +- immediate action: add explicit scope constraints to the BabyAGI 3 configuration: "only generate functions relevant to {domain}" +- engineering control: implement a function relevance checker that evaluates each new function against the original objective before storage +- verification target: at least 90% of stored functions are relevant to the original objective domain +- rollback trigger: if the relevance checker is too restrictive, lower the threshold and widen the domain description +- communication step: log the rejected functions with their similarity scores for configuration tuning +- learning capture: document the scope constraint pattern as a configuration best practice for BabyAGI 3 + +### Scenario Playbook 8: Choosing Between BabyAGI Variants for a New Project + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a new project team needs to choose between the original, 2o, and BabyAGI 3 +- initial hypothesis: the right choice depends on whether the objective type repeats and whether configuration flexibility is needed +- immediate action: apply the decision matrix: one-off research → original; repeated task types → 2o; production with flexible config → BabyAGI 3 +- engineering control: run a 5-cycle proof-of-concept with the candidate variant before committing +- verification 
target: the chosen variant produces acceptable outputs for the first representative objective within 2 hours +- rollback trigger: if the chosen variant's outputs are inadequate, fall back to the original as the most debuggable baseline +- communication step: document the variant selection rationale and the proof-of-concept results +- learning capture: add the variant selection decision matrix to the team's AI tooling runbook + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not understanding that BabyAGI evolved, but understanding when and why to use each generation of the framework. The original is the simplest and most debuggable. BabyAGI 2o is most valuable when you run the same agent repeatedly on similar task types and want to accumulate reusable skills. BabyAGI 3 / Functionz is most valuable when you need a flexible, production-grade framework that multiple non-technical users can configure in natural language. + +In practical terms, this chapter helps you avoid three common failures: + +- defaulting to the original for every use case when 2o would provide meaningful efficiency gains after the first few runs +- adopting BabyAGI 3 without understanding the additional configuration complexity it introduces +- mixing skills from different objectives in BabyAGI 2o, causing skill mismatch errors that are hard to debug + +After working through this chapter, you should be able to select the right BabyAGI variant for a given use case and configure it correctly from the start. + +## How it Works Under the Hood + +Under the hood, `Chapter 7: BabyAGI Evolution: 2o and Functionz Framework` follows a repeatable control path: + +**BabyAGI 2o:** +1. Skill load: at startup, all Python files in the skills directory are loaded into memory. +2. Skill embedding: each skill is embedded and stored in a skill retrieval index. +3. Task cycle: the standard execution-creation-prioritization loop runs as before. +4. 
Skill check: before execution, the task text is queried against the skill index; if similarity > threshold, the skill is executed. +5. Skill creation: after LLM execution, a skill creation agent generates a Python function representing the task type. +6. Skill validation: the generated function is syntax-checked before being saved to the skills directory. + +**BabyAGI 3 / Functionz:** +1. Objective parsing: natural language objective is parsed into a structured configuration. +2. Function library init: the SQL-backed function library is initialized. +3. Task cycle: the agent loop runs, with functions retrieved from the library before LLM calls. +4. Function creation: new functions are generated and stored in the library after each execution. +5. Session persistence: run state and function library are persisted across sessions. + +When debugging, walk these sequences in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI 2o Directory](https://github.com/yoheinakajima/babyagi/tree/main/babyagi-2o) + Why it matters: the complete self-building agent implementation (github.com). +- [BabyAGI 3 / Functionz Directory](https://github.com/yoheinakajima/babyagi/tree/main/babyagi3) + Why it matters: the natural language configurable framework implementation (github.com). +- [Functionz Repository](https://github.com/yoheinakajima/functionz) + Why it matters: the standalone functionz framework that underlies BabyAGI 3 (github.com). 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 6: Extending BabyAGI: Custom Tools and Skills](06-extending-babyagi-custom-tools-and-skills.md) +- [Next Chapter: Chapter 8: Production Patterns and Research Adaptations](08-production-patterns-and-research-adaptations.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/08-production-patterns-and-research-adaptations.md b/tutorials/babyagi-tutorial/08-production-patterns-and-research-adaptations.md new file mode 100644 index 00000000..9ab991a8 --- /dev/null +++ b/tutorials/babyagi-tutorial/08-production-patterns-and-research-adaptations.md @@ -0,0 +1,354 @@ +--- +layout: default +title: "Chapter 8: Production Patterns and Research Adaptations" +nav_order: 8 +parent: BabyAGI Tutorial +--- + +# Chapter 8: Production Patterns and Research Adaptations + +Welcome to **Chapter 8: Production Patterns and Research Adaptations**. In this part of **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how to run BabyAGI reliably in production environments and how to adapt it for research experiments, including cost control, observability, safety controls, and reproducibility practices. + +## Learning Goals + +- design a production-grade BabyAGI deployment with cost controls and observability +- implement safety controls that prevent runaway autonomous loops in shared environments +- apply research-grade reproducibility practices for experiments using BabyAGI +- understand how BabyAGI has been used as a research reference and how to adapt it for your own research + +## Fast Start Checklist + +1. add `MAX_ITERATIONS` and `MAX_COST_USD` controls to the main loop +2. 
implement structured JSON logging for all agent calls +3. add a Slack or webhook notification on loop completion or failure +4. document the objective, model, and configuration for reproducibility +5. run a 10-cycle test with all controls active and verify the run summary + +## Source References + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) +- [BabyAGI Inspired Projects](https://github.com/yoheinakajima/babyagi/blob/main/docs/inspired-projects.md) + +## Summary + +You now have the patterns needed to run BabyAGI safely in production environments and to adapt it for research experiments with full reproducibility, cost control, and observability. + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- tutorial slug: **babyagi-tutorial** +- chapter focus: **Chapter 8: Production Patterns and Research Adaptations** +- system context: **BabyAGI Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 8: Production Patterns and Research Adaptations`. +2. Identify the control-plane additions needed for production: iteration caps, cost caps, timeout controls, failure notifications. +3. Identify the data-plane additions for observability: structured logging, run summaries, task trace exports. +4. Specify the research reproducibility requirements: config snapshots, seed control, output archival. +5. Map the safety controls for shared and cloud environments: API key scoping, resource limits, network isolation. +6. Identify the monitoring signals: iteration count, queue depth, total cost, cycle latency, error rate. +7. 
Specify the alerting conditions: loop exit on error, cost budget exceeded, objective completion detected. +8. Document the operational runbook for starting, monitoring, pausing, and resuming BabyAGI runs. + +### Production Deployment Architecture + +A production-grade BabyAGI deployment typically adds the following layers to the original script: + +**Control Layer:** +- `MAX_ITERATIONS`: hard cap on loop iterations +- `MAX_COST_USD`: estimated cost cap based on token counting +- `EXECUTION_TIMEOUT`: per-task timeout in seconds +- `WATCHDOG_INTERVAL`: external process health check interval + +**Observability Layer:** +- structured JSON logging for every agent call (timestamp, agent name, task ID, input tokens, output tokens, latency, cost estimate) +- run summary exported to a JSON file at loop exit (total iterations, total cost, final task list, all stored results) +- Prometheus metrics endpoint for real-time monitoring (optional) + +**Safety Layer:** +- API key usage scoping (separate key per experiment with per-key spending limits) +- network egress controls (if using local models, disable external network access) +- file I/O sandboxing (restrict all writes to a designated output directory) +- human-in-the-loop checkpoint: pause every N iterations for human review + +**Notification Layer:** +- Slack webhook notification on loop completion, error, or cost budget exceeded +- email alert if the loop runs longer than expected +- PagerDuty alert if the loop crashes with an unhandled exception + +### Cost Estimation Model + +For GPT-4o (as of early 2026): +- Each execution cycle makes 3 LLM calls (execution, creation, prioritization) + 2 embedding calls (task + result) +- Approximate token counts per cycle: 2000 input tokens (LLM total) + 500 output tokens (LLM total) + 200 embedding tokens +- Approximate cost per cycle: `(2000 * $0.0000025) + (500 * $0.000010) + (200 * $0.0000001)` ≈ $0.01 per cycle +- 100-cycle run on GPT-4o ≈ $1.00 +- 100-cycle run on GPT-3.5-turbo 
≈ $0.18 + +Budget enforcement logic: +```python +def estimate_cycle_cost(input_tokens, output_tokens, model): + costs = { + "gpt-4o": (0.0000025, 0.000010), + "gpt-3.5-turbo": (0.0000005, 0.0000015), + } + in_rate, out_rate = costs.get(model, (0.000001, 0.000002)) + return (input_tokens * in_rate) + (output_tokens * out_rate) + +total_cost = 0.0 +MAX_COST_USD = float(os.getenv("MAX_COST_USD", "5.0")) +# After each cycle: +total_cost += estimate_cycle_cost(input_tokens, output_tokens, model) +if total_cost >= MAX_COST_USD: + print(f"Cost budget of ${MAX_COST_USD} reached. Stopping.") + break +``` + +### Research Reproducibility Checklist + +For research experiments using BabyAGI as a framework: + +1. **Config snapshot**: save a JSON snapshot of all `.env` variables (excluding secrets) at run start +2. **Model version pinning**: record the exact model version string used (e.g., `gpt-4o-2024-11-20`) +3. **Seed control**: set `temperature=0` for deterministic outputs where reproducibility is critical +4. **Run ID**: assign a UUID to each run and include it in all log entries +5. **Input archival**: save the exact objective text and initial task text +6. **Output archival**: save the full task execution log and all stored vector store results +7. **Environment pinning**: record `pip freeze` output alongside the run config +8. 
**Comparison baseline**: run the same objective on the original BabyAGI and the variant under study for comparison + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Iteration cap | MAX_ITERATIONS=20 | dynamic completion detection | simplicity vs thoroughness | +| Cost control | manual monitoring | automated budget cap | effort vs financial safety | +| Observability | stdout logging | structured JSON logs + metrics | simplicity vs debuggability | +| Safety controls | MAX_ITERATIONS only | multi-layer: cost + timeout + human checkpoint | simplicity vs operational safety | +| Research reproducibility | note model and objective | full config + env snapshot + output archival | effort vs scientific rigor | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| runaway cost | unexpected API charges | MAX_COST_USD not set or too high | enforce per-run cost cap with automatic loop exit | +| loop crash with no state saved | lost run results | unhandled exception with no checkpoint | add exception handler that saves current state before exiting | +| resource leak on cloud | running VM/container charges accumulate | loop does not self-terminate | add watchdog process that kills the loop after MAX_WALL_TIME | +| non-reproducible results | research paper results differ from re-run | model was updated or temperature > 0 | pin exact model version and set temperature=0 for research | +| missing run logs | cannot debug a failed run | logging was not configured before the run | add logging setup as the first operation after `.env` load | +| credential exposure in logs | API key appears in log output | f-string includes the full API key in an error message | sanitize log output by redacting all known secret patterns | + +### 
Implementation Runbook: Production Controls + +1. Add `MAX_ITERATIONS`, `MAX_COST_USD`, `EXECUTION_TIMEOUT`, and `SLEEP_INTERVAL` to `.env`. +2. Implement a `RunState` dataclass that tracks: iteration count, total cost, queue depth, cycle latencies. +3. Add cost estimation logic using `tiktoken` to count input and output tokens per call. +4. Wrap the main loop in a `try/except/finally` block: save `RunState` to a JSON file in the `finally` block. +5. Add a Slack webhook notification call in the `finally` block: post run summary on loop exit. +6. Implement a wall-time watchdog: a secondary thread that kills the process if the loop runs longer than `MAX_WALL_TIME`. +7. Add structured JSON logging: use Python's `logging` module with a `JSONFormatter` to write all agent calls to a log file. +8. Run a 10-iteration test and verify: cost is within budget, logs are present, run summary JSON is written at exit. + +### Implementation Runbook: Research Reproducibility + +1. At run start, call `snapshot_config()` which saves all non-secret env vars and `pip freeze` output to a JSON file. +2. Generate a `RUN_ID` with `uuid.uuid4()` and include it in all log entries. +3. Set `temperature=0` for all LLM calls to maximize determinism. +4. Set the exact model version string: `model="gpt-4o-2024-11-20"` (not `gpt-4o` which resolves to different versions over time). +5. Save the objective and initial task to the config snapshot. +6. At run end, export all vector store entries to a JSON file: `{task_id, task_text, result_text, embedding_vector}`. +7. If comparing across runs or variants, use the same objective text, model version, and config for both. +8. Include the config snapshot, log file, and results export in any publication or internal report. 
+ +### Quality Gate Checklist + +- [ ] `MAX_ITERATIONS` is set before every automated run +- [ ] cost estimation is implemented and `MAX_COST_USD` is enforced +- [ ] exception handler saves run state before exiting +- [ ] Slack or webhook notification is sent on loop exit +- [ ] structured JSON logs are written for all agent calls +- [ ] run summary JSON is exported at loop exit +- [ ] research runs include a config snapshot with model version and `pip freeze` +- [ ] credential exposure in logs is prevented by a sanitization step + +### Source Alignment + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [BabyAGI Inspired Projects](https://github.com/yoheinakajima/babyagi/blob/main/docs/inspired-projects.md) +- [tiktoken for token counting](https://github.com/openai/tiktoken) + +### Cross-Tutorial Connection Map + +- [LangFuse Tutorial](../langfuse-tutorial/) — observability and tracing for LLM applications +- [PostHog Tutorial](../posthog-tutorial/) — product analytics applicable to agent run monitoring +- [SuperAGI Tutorial](../superagi-tutorial/) — production-ready autonomous agent with built-in controls +- [Chapter 8: Production Patterns](08-production-patterns-and-research-adaptations.md) + +### Advanced Practice Exercises + +1. Build a full `RunState` dataclass and verify it captures all required metrics across a 20-cycle run. +2. Implement a cost estimation function using `tiktoken` and validate it against the actual OpenAI billing dashboard. +3. Set up a Prometheus metrics endpoint for BabyAGI and build a Grafana dashboard for real-time run monitoring. +4. Implement a human-in-the-loop checkpoint that pauses every 5 iterations and waits for a `y/n` confirmation. +5. Write a reproducibility test: run the same objective twice with temperature=0 and compare task lists for differences. + +### Review Questions + +1. What is the minimum set of production controls needed before running BabyAGI on a shared API key? +2. 
Why is setting `temperature=0` not sufficient for full reproducibility in LLM-based research experiments? +3. How would you implement a cost cap without using `tiktoken` (a simpler approximation)? +4. What is the risk of not saving run state in a `finally` block? +5. How would you adapt BabyAGI as a controlled experiment platform for comparing different task decomposition strategies? + +### Scenario Playbook 1: Runaway API Cost in Unattended Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: an overnight BabyAGI run accumulates $50 in API charges due to missing iteration cap +- initial hypothesis: `MAX_ITERATIONS` and `MAX_COST_USD` were not configured before the run +- immediate action: stop the loop immediately, export current state, and review the OpenAI billing dashboard +- engineering control: make `MAX_ITERATIONS` and `MAX_COST_USD` required configuration checks at startup +- verification target: startup fails with a clear error message if either control is not set +- rollback trigger: no rollback; this is a prevention pattern +- communication step: send a post-mortem to the team with the cost breakdown and the new control requirements +- learning capture: add cost control configuration to the pre-run checklist as a hard requirement + +### Scenario Playbook 2: Loop Crash Loses All Run State + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: BabyAGI crashes at iteration 15 due to an unhandled API error, losing all task history +- initial hypothesis: the main loop has no exception handler and no checkpoint mechanism +- immediate action: add a `try/finally` block that saves `RunState` to a JSON file on any exit +- engineering control: implement incremental checkpointing every 5 iterations that saves the current task list and vector store entries +- verification target: a simulated crash test at iteration 10 shows the checkpoint from 
iteration 5 is recoverable +- rollback trigger: if checkpointing overhead is too high, reduce checkpoint frequency to every 10 iterations +- communication step: log checkpoint events with the iteration number and checkpoint file path +- learning capture: add crash recovery steps to the operational runbook + +### Scenario Playbook 3: Research Results Not Reproducible Across Re-runs + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: re-running the same objective produces substantially different task lists and results +- initial hypothesis: `temperature > 0` and non-pinned model version are causing non-determinism +- immediate action: set `temperature=0` and pin the exact model version string for both runs +- engineering control: add a reproducibility config that locks: `temperature=0`, `model=specific_version`, `top_p=1.0`, `seed=42` (if supported) +- verification target: two runs with the identical config produce task lists with > 80% overlap +- rollback trigger: if temperature=0 produces overly rigid task decompositions that miss creative solutions, use temperature=0.1 +- communication step: document the reproducibility config in the research notes for each experiment +- learning capture: add model version pinning to the research configuration guide as a required practice + +### Scenario Playbook 4: Resource Leak in Cloud Deployment + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a cloud-deployed BabyAGI instance continues running after the intended experiment is complete +- initial hypothesis: the loop was started without a self-termination mechanism and no watchdog is running +- immediate action: terminate the cloud instance manually and calculate the unnecessary resource cost +- engineering control: add a `MAX_WALL_TIME` (e.g., 4 hours) watchdog that kills the process after the time limit +- verification target: the watchdog 
terminates the process within 60 seconds of the wall time limit +- rollback trigger: if the watchdog triggers prematurely on legitimate long runs, increase `MAX_WALL_TIME` for specific experiments +- communication step: send a notification when the watchdog terminates a run, including the run state at termination +- learning capture: add watchdog configuration to the cloud deployment guide as a required component + +### Scenario Playbook 5: API Credential Exposed in Log Files + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: the OpenAI API key appears in structured log output when an authentication error is logged +- initial hypothesis: the error logging includes the full request headers which contain the Authorization header +- immediate action: review all log files for credential exposure and rotate any exposed keys immediately +- engineering control: add a log sanitizer that replaces any known secret patterns with `[REDACTED]` before writing to disk +- verification target: a log scan finds zero instances of API key patterns across all log files after the sanitizer is added +- rollback trigger: if the sanitizer incorrectly redacts non-sensitive content, tune the regex pattern +- communication step: notify the security team of the exposure and document the keys rotated and the time window of exposure +- learning capture: add credential-safe logging as a required pattern in the development standards + +### Scenario Playbook 6: Monitoring Gap During Extended Production Run + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a 6-hour BabyAGI run completes with no notification, and the team does not know it finished +- initial hypothesis: no notification mechanism was configured and the team was not watching stdout +- immediate action: add a Slack webhook notification that fires on loop exit with a run summary +- engineering control: 
implement periodic heartbeat notifications every 30 minutes with: current iteration, queue depth, total cost +- verification target: the team receives a run-complete notification within 60 seconds of loop exit +- rollback trigger: if notification failures are common (webhook is flaky), add email as a secondary notification channel +- communication step: configure the Slack notification to include a direct link to the run log file +- learning capture: add notification configuration to the production deployment checklist + +### Scenario Playbook 7: BabyAGI as a Research Evaluation Platform + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: a research team wants to use BabyAGI to compare two task decomposition strategies +- initial hypothesis: BabyAGI can serve as the experimental platform if the task creation agent is the controlled variable +- immediate action: create two BabyAGI instances with identical configs except for the task creation prompt template +- engineering control: run 5 trials for each variant on 3 different objectives; use the same `temperature=0` and model version +- verification target: the evaluation produces statistically significant differences in task quality metrics across the two variants +- rollback trigger: if results are not statistically significant, increase the number of trials to 10 +- communication step: export the full task logs and config snapshots for both variants to a shared research directory +- learning capture: publish the task quality evaluation methodology as a reproducible research benchmark + +### Scenario Playbook 8: Human-in-the-Loop Checkpoint for High-Stakes Objectives + +- tutorial context: **BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework** +- trigger condition: BabyAGI is running on a sensitive business objective and the team needs human review every 5 iterations +- initial hypothesis: the default autonomous loop does not pause for 
human review at any point +- immediate action: add a `HUMAN_CHECKPOINT_EVERY=5` config that pauses the loop every N iterations and displays the current task list +- engineering control: implement a web UI or CLI interface that shows the task list and allows the operator to approve, modify, or stop the loop +- verification target: the checkpoint correctly pauses the loop and waits for input on every 5th iteration +- rollback trigger: if the checkpoint interrupts a time-sensitive run, add a `--skip-checkpoints` flag for emergency use +- communication step: send a Slack notification at each checkpoint with the current task list for async review +- learning capture: document the checkpoint configuration and the review criteria used for the specific objective + +## What Problem Does This Solve? + +Most teams struggle here because the hard part is not running BabyAGI, but keeping it safe and observable when it runs autonomously for hours in a production or research context. The original BabyAGI is a bare loop with no built-in cost controls, no error checkpointing, and no observability. These are acceptable for a five-minute demo but dangerous for a multi-hour autonomous experiment on a shared API key. + +In practical terms, this chapter helps you avoid three common failures: + +- running BabyAGI on a shared API key without a cost cap, accumulating unexpected charges overnight +- losing all run state when an unhandled exception crashes the loop after hours of work +- failing to reproduce research results because the exact model version and temperature were not recorded + +After working through this chapter, you should be able to deploy BabyAGI in production and research contexts with full confidence in cost control, observability, crash recovery, and scientific reproducibility. + +## How it Works Under the Hood + +Under the hood, `Chapter 8: Production Patterns and Research Adaptations` follows a repeatable control path: + +1. 
**Pre-run validation**: all required controls (MAX_ITERATIONS, MAX_COST_USD) are checked; missing values cause a startup error. +2. **Config snapshot**: a JSON config snapshot is saved to the run output directory, including all environment variables (with secret values redacted) and the output of `pip freeze`. +3. **RunState initialization**: a `RunState` object is initialized to track iteration count, cost, queue depth, and latency. +4. **Main loop with controls**: the loop checks `MAX_ITERATIONS` and `MAX_COST_USD` at the top of each cycle. +5. **Structured logging**: every agent call writes a JSON log entry to the run log file. +6. **Incremental checkpointing**: every N iterations, the current task list and RunState are saved to a checkpoint file. +7. **Exception handler**: any unhandled exception triggers the `finally` block, which saves the final RunState and sends a failure notification. +8. **Run summary**: at loop exit, a comprehensive JSON summary is written with all tasks, results, and run metrics. +9. **Notification**: a Slack webhook notification is sent with the run summary. + +When debugging, walk this sequence in order and confirm each stage has explicit success/failure conditions. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) + Why it matters: the codebase that all production patterns extend and wrap (github.com). +- [BabyAGI Inspired Projects](https://github.com/yoheinakajima/babyagi/blob/main/docs/inspired-projects.md) + Why it matters: shows how the research and production community has extended BabyAGI (github.com). +- [tiktoken](https://github.com/openai/tiktoken) + Why it matters: required for accurate cost estimation by counting tokens before API calls (github.com). 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 7: BabyAGI Evolution: 2o and Functionz Framework](07-babyagi-evolution-2o-and-functionz-framework.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/babyagi-tutorial/index.md b/tutorials/babyagi-tutorial/index.md new file mode 100644 index 00000000..0a62203b --- /dev/null +++ b/tutorials/babyagi-tutorial/index.md @@ -0,0 +1,111 @@ +--- +layout: default +title: "BabyAGI Tutorial" +nav_order: 191 +has_children: true +format_version: v2 +--- + +# BabyAGI Tutorial: The Original Autonomous AI Task Agent Framework + +> Learn how to use `yoheinakajima/babyagi` for autonomous task generation, execution, and prioritization—the foundational agent loop that started the autonomous AI agent wave. + +[![GitHub Repo](https://img.shields.io/badge/GitHub-yoheinakajima%2Fbabyagi-black?logo=github)](https://github.com/yoheinakajima/babyagi) +[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/yoheinakajima/babyagi/blob/main/LICENSE) +[![Docs](https://img.shields.io/badge/docs-babyagi-blue)](https://github.com/yoheinakajima/babyagi#readme) + +## Why This Track Matters + +BabyAGI, released in March 2023 by Yohei Nakajima, is the original viral autonomous AI agent that introduced the three-agent loop pattern—task execution, task creation, and task prioritization—that underpins nearly every modern agentic framework. Understanding BabyAGI is understanding the DNA of autonomous AI systems: how agents decompose goals into tasks, maintain memory through vector stores, and continuously self-direct without human prompting between steps. 
+ +This track focuses on: + +- understanding the three-agent loop at the core of autonomous task execution +- configuring and running BabyAGI with different LLM backends and vector stores +- extending BabyAGI with custom skills and tool integrations +- tracing the evolutionary arc from the original script to BabyAGI 2o and BabyAGI 3 + +## Current Snapshot (auto-updated) + +- repository: [`yoheinakajima/babyagi`](https://github.com/yoheinakajima/babyagi) +- stars: about **18k** +- original release: **March 2023** +- author: Yohei Nakajima +- license: MIT +- recent activity: ongoing evolution via babyagi-2o and babyagi3 branches +- project positioning: foundational reference implementation for autonomous task-based AI agents + +## Mental Model + +```mermaid +flowchart LR + A[Objective] --> B[Task Queue] + B --> C[Execution Agent] + C --> D[Result Store / Vector DB] + D --> E[Creation Agent] + E --> F[New Tasks] + F --> G[Prioritization Agent] + G --> B +``` + +## Chapter Guide + +| Chapter | Key Question | Outcome | +|:--------|:-------------|:--------| +| [01 - Getting Started](01-getting-started.md) | How do I run BabyAGI on a first objective? | Working baseline | +| [02 - Core Architecture: Task Queue and Agent Loop](02-core-architecture-task-queue-and-agent-loop.md) | How does the three-agent loop actually work? | Architecture clarity | +| [03 - LLM Backend Integration and Configuration](03-llm-backend-integration-and-configuration.md) | How do I configure OpenAI, Anthropic, or local models? | Provider flexibility | +| [04 - Task Creation and Prioritization Engine](04-task-creation-and-prioritization-engine.md) | How are tasks generated, ranked, and managed? | Task loop mastery | +| [05 - Memory Systems and Vector Store Integration](05-memory-systems-and-vector-store-integration.md) | How does BabyAGI use Pinecone, Chroma, and Qdrant? 
| Memory architecture | +| [06 - Extending BabyAGI: Custom Tools and Skills](06-extending-babyagi-custom-tools-and-skills.md) | How do I add custom skills and tool integrations? | Extension patterns | +| [07 - BabyAGI Evolution: 2o and Functionz Framework](07-babyagi-evolution-2o-and-functionz-framework.md) | How has BabyAGI evolved to 2o and BabyAGI 3? | Evolutionary context | +| [08 - Production Patterns and Research Adaptations](08-production-patterns-and-research-adaptations.md) | How do teams run BabyAGI in production and research? | Operational readiness | + +## What You Will Learn + +- how the three-agent loop creates, executes, and prioritizes tasks autonomously +- how to configure vector memory backends for persistent context across task cycles +- how to extend BabyAGI with custom tools and domain-specific skills +- how to adapt BabyAGI patterns for production systems and research experiments + +## Source References + +- [BabyAGI Repository](https://github.com/yoheinakajima/babyagi) +- [BabyAGI README](https://github.com/yoheinakajima/babyagi/blob/main/README.md) +- [Original Twitter Announcement](https://twitter.com/yoheinakajima/status/1640934493489070080) +- [BabyAGI Inspired Projects](https://github.com/yoheinakajima/babyagi/blob/main/docs/inspired-projects.md) +- [BabyAGI 2o (babyagi-2o)](https://github.com/yoheinakajima/babyagi/tree/main/babyagi-2o) +- [BabyAGI 3 (babyagi3)](https://github.com/yoheinakajima/babyagi/tree/main/babyagi3) + +## Related Tutorials + +- [AutoGPT Tutorial](../autogpt-tutorial/) +- [SuperAGI Tutorial](../superagi-tutorial/) +- [LangChain Tutorial](../langchain-tutorial/) +- [LangGraph Tutorial](../langgraph-tutorial/) +- [CrewAI Tutorial](../crewai-tutorial/) + +--- + +Start with [Chapter 1: Getting Started](01-getting-started.md). 
+ +## Navigation & Backlinks + +- [Start Here: Chapter 1: Getting Started](01-getting-started.md) +- [Back to Main Catalog](../../README.md#-tutorial-catalog) +- [Browse A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) +- [Search by Intent](../../discoverability/query-hub.md) +- [Explore Category Hubs](../../README.md#category-hubs) + +## Full Chapter Map + +1. [Chapter 1: Getting Started](01-getting-started.md) +2. [Chapter 2: Core Architecture: Task Queue and Agent Loop](02-core-architecture-task-queue-and-agent-loop.md) +3. [Chapter 3: LLM Backend Integration and Configuration](03-llm-backend-integration-and-configuration.md) +4. [Chapter 4: Task Creation and Prioritization Engine](04-task-creation-and-prioritization-engine.md) +5. [Chapter 5: Memory Systems and Vector Store Integration](05-memory-systems-and-vector-store-integration.md) +6. [Chapter 6: Extending BabyAGI: Custom Tools and Skills](06-extending-babyagi-custom-tools-and-skills.md) +7. [Chapter 7: BabyAGI Evolution: 2o and Functionz Framework](07-babyagi-evolution-2o-and-functionz-framework.md) +8. [Chapter 8: Production Patterns and Research Adaptations](08-production-patterns-and-research-adaptations.md) + +*Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)* diff --git a/tutorials/devika-tutorial/01-getting-started.md b/tutorials/devika-tutorial/01-getting-started.md new file mode 100644 index 00000000..dceb7abc --- /dev/null +++ b/tutorials/devika-tutorial/01-getting-started.md @@ -0,0 +1,227 @@ +--- +layout: default +title: "Chapter 1: Getting Started" +nav_order: 1 +parent: Devika Tutorial +--- + +# Chapter 1: Getting Started + +Welcome to **Chapter 1: Getting Started**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. 
+ +This chapter walks through installing Devika, configuring API keys, and running a first autonomous coding task end-to-end. + +## Learning Goals + +- clone and install Devika with all required system dependencies +- configure LLM provider credentials and environment variables +- launch the Devika web UI and backend services +- submit a first task and verify agent output in the workspace + +## Fast Start Checklist + +1. clone the repository and install Python and Node.js dependencies +2. install Playwright browsers and Qdrant vector store +3. set API keys in `config.toml` for at least one LLM provider +4. start the backend and frontend servers, then submit a hello-world task + +## Source References + +- [Devika README - Getting Started](https://github.com/stitionai/devika#getting-started) +- [Devika Installation](https://github.com/stitionai/devika#installation) +- [Devika Configuration](https://github.com/stitionai/devika#configuration) +- [Devika Repository](https://github.com/stitionai/devika) + +## Summary + +You now have a working Devika installation and have executed your first autonomous software engineering task from prompt to generated code. + +Next: [Chapter 2: Architecture and Agent Pipeline](02-architecture-and-agent-pipeline.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 1: Getting Started** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. The Devika stack has three runtime layers: a Python FastAPI backend, a Svelte/Vite frontend, and a Qdrant vector store that persists agent memory across sessions. +2. 
The backend entry point is `devika.py` which starts the FastAPI server; the frontend communicates with it over a REST API on port 1337 by default. +3. On first launch, Devika initializes the SQLite database for project and session metadata and connects to the Qdrant instance (local or remote) for semantic memory. +4. Every LLM provider is configured through `config.toml`; the active model is selected per-project at task submission time. +5. Playwright is invoked by the browser agent sub-process; it requires Chromium to be installed via `playwright install`. +6. The workspace directory (`/home/user/projects` by default) is where all generated files, git repos, and project artifacts are written. +7. API key security depends entirely on `config.toml` permissions; the file must never be committed to version control. +8. The startup sequence is: Qdrant → FastAPI backend → Vite dev server; all three must be healthy before task submission. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| LLM provider | Claude 3 Haiku for low cost | Claude 3 Opus or GPT-4 for max quality | cost vs output quality | +| Qdrant mode | local in-memory Qdrant | hosted Qdrant Cloud with persistent storage | simplicity vs durability | +| Workspace storage | local filesystem | mounted network volume or S3 bucket | portability vs operational overhead | +| Frontend access | localhost only | reverse proxy with auth | dev convenience vs exposure risk | +| Python env management | system Python + pip | pyenv + virtualenv + requirements.txt pin | speed vs reproducibility | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Backend fails to start | port 1337 connection refused | missing Python dependency or wrong Python version | run `pip install -r 
requirements.txt` and check Python >=3.10 | +| Playwright browser missing | `BrowserType.launch: Executable doesn't exist` | `playwright install` not run | run `playwright install chromium` explicitly | +| Qdrant not reachable | `Connection refused` on vector store calls | Qdrant not started or wrong port in config | start Qdrant Docker container or set correct host in config.toml | +| API key invalid | `AuthenticationError` from LLM provider | wrong key or stale key in config.toml | verify key in provider dashboard and re-paste into config.toml | +| Frontend can't reach backend | API calls return CORS errors | frontend and backend on mismatched ports | ensure VITE_API_BASE_URL matches actual backend port | +| config.toml not found | `FileNotFoundError` on startup | missing config.toml or wrong working directory | copy `config.example.toml` to `config.toml` before first run | + +### Implementation Runbook + +1. Clone the repository: `git clone https://github.com/stitionai/devika.git && cd devika`. +2. Create a Python virtual environment: `python -m venv venv && source venv/bin/activate`. +3. Install Python dependencies: `pip install -r requirements.txt`. +4. Install Node.js dependencies for the frontend: `cd ui && npm install && cd ..`. +5. Install Playwright browsers: `playwright install chromium`. +6. Copy the example config: `cp config.example.toml config.toml` and fill in at least one API key. +7. Start Qdrant: `docker run -p 6333:6333 qdrant/qdrant` or configure a remote Qdrant URL in config.toml. +8. Start the backend: `python devika.py` (confirm the API server is listening on port 1337). +9. Start the frontend: `cd ui && npm run dev` and open `http://localhost:3000` in a browser; create a project and submit a first task. 
+ +### Quality Gate Checklist + +- [ ] Python version is 3.10 or higher and the virtualenv is activated +- [ ] all `pip install -r requirements.txt` packages resolve without errors +- [ ] Playwright Chromium browser is installed and `playwright install` exits cleanly +- [ ] Qdrant is reachable on the configured port before backend start +- [ ] `config.toml` contains at least one valid API key and is not committed to git +- [ ] backend `python devika.py` starts without tracebacks and logs "Uvicorn running" +- [ ] frontend dev server compiles without errors and the UI loads at localhost:3000 +- [ ] first task submission returns agent output in the workspace within expected time + +### Source Alignment + +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Getting Started](https://github.com/stitionai/devika#getting-started) +- [Devika Installation Section](https://github.com/stitionai/devika#installation) +- [Devika Configuration Section](https://github.com/stitionai/devika#configuration) +- [Devika Repository Root](https://github.com/stitionai/devika) + +### Cross-Tutorial Connection Map + +- [OpenHands Tutorial](../openhands-tutorial/) — comparable autonomous coding agent with Docker-based setup +- [SWE-agent Tutorial](../swe-agent-tutorial/) — alternative CLI-driven autonomous agent +- [Aider Tutorial](../aider-tutorial/) — simpler AI coding assistant for quick comparisons +- [Cline Tutorial](../cline-tutorial/) — VS Code extension approach to AI coding +- [Ollama Tutorial](../ollama-tutorial/) — required for local LLM backend with Devika + +### Advanced Practice Exercises + +1. Install Devika from scratch in a fresh Docker container and document every step that diverges from the README. +2. Configure two different LLM providers in `config.toml` and measure cold-start time for each on the same task. +3. Set up Qdrant Cloud as a remote vector store and verify that agent memory persists across server restarts. +4. 
Write a systemd service file or Docker Compose definition that starts the backend, frontend, and Qdrant together. +5. Submit a task that generates a multi-file Python project and inspect the workspace directory structure produced by the agents. + +### Review Questions + +1. What is the minimum set of services that must be running before Devika can accept a task? +2. Where does Devika write generated files, and how is the workspace path configured? +3. What happens if `playwright install` is skipped before the first task that requires web research? +4. Why should `config.toml` never be committed to git, and how do you prevent it? +5. How do you verify that the Qdrant vector store is healthy before submitting the first task? + +### Scenario Playbook 1: Fresh Install on macOS + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: developer clones Devika for the first time on macOS with Homebrew Python +- initial hypothesis: Homebrew Python may conflict with virtualenv path resolution +- immediate action: verify `python3 --version` returns >=3.10 before creating the virtualenv +- engineering control: use `python3 -m venv venv` explicitly and activate before any pip commands +- verification target: `pip list` inside venv shows all requirements installed without version conflicts +- rollback trigger: any import error in `python devika.py` startup traceback +- communication step: document the exact Python version and venv activation command in team setup notes +- learning capture: add macOS-specific install notes to the project wiki and pin the Python version in CI + +### Scenario Playbook 2: Qdrant Docker Not Started + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: backend starts but every task submission fails immediately with a connection error +- initial hypothesis: Qdrant container is not running or is using the wrong port +- immediate action: run 
`docker ps` to check if the Qdrant container is active +- engineering control: add a startup health check script that pings Qdrant on port 6333 before launching the backend +- verification target: backend logs show successful Qdrant connection on startup +- rollback trigger: health check fails after 10 seconds of waiting +- communication step: update the README with a clear "start Qdrant first" step order note +- learning capture: encode the Qdrant startup check into the Docker Compose `depends_on` condition + +### Scenario Playbook 3: Invalid API Key at Task Submission + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task is submitted but the agent immediately reports an authentication error +- initial hypothesis: the API key in config.toml is either wrong, expired, or for the wrong organization +- immediate action: copy the key directly from the provider dashboard and paste into config.toml, then restart the backend +- engineering control: add a startup validator that tests LLM provider connectivity with a minimal ping request +- verification target: backend startup log shows "LLM provider authenticated" before accepting task requests +- rollback trigger: authentication error persists after key replacement +- communication step: check provider dashboard for key status, quota limits, and billing validity +- learning capture: document key rotation procedure and add config.toml validation to the pre-launch checklist + +### Scenario Playbook 4: Frontend Cannot Reach Backend + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: UI loads but all task submissions return network errors in the browser console +- initial hypothesis: VITE_API_BASE_URL environment variable is set to a different port than the backend +- immediate action: check `ui/.env` or `ui/vite.config.js` for the API base URL and compare with the actual backend port +- engineering control: 
standardize backend port to 1337 and frontend proxy setting to match; document in .env.example +- verification target: browser network tab shows successful POST to `/api/execute-agent` with 200 response +- rollback trigger: CORS errors persist after port correction +- communication step: add port configuration to the onboarding checklist for new contributors +- learning capture: create a single `.env` file at the repo root that sources both backend and frontend port settings + +### Scenario Playbook 5: Playwright Browser Crash on Task + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: research-heavy task fails mid-run with a Playwright executable not found error +- initial hypothesis: Playwright was installed in a different virtualenv or system Python than the one running the backend +- immediate action: activate the correct virtualenv and run `playwright install chromium` inside it +- engineering control: add `playwright install chromium` as a post-install step in the project Makefile or setup script +- verification target: `playwright --version` resolves without error inside the active virtualenv +- rollback trigger: Playwright crashes even after reinstall; check for sandboxing issues in CI or Docker environments +- communication step: update the CI pipeline YAML to include `playwright install chromium` as a named setup step +- learning capture: add explicit Playwright version pinning in requirements.txt to prevent future environment drift + +### What Problem Does This Solve? + +Devika's installation complexity stems from having three distinct runtimes (Python backend, Node.js frontend, Qdrant vector store) that must all be healthy simultaneously before the agent pipeline can function. Without a structured setup sequence, engineers frequently encounter cascading failures where one missing dependency causes misleading errors in a different layer. 
This chapter establishes the correct startup order, dependency inventory, and verification checkpoints so that every team member reaches a working baseline without guesswork. + +### How it Works Under the Hood + +1. The Python backend reads `config.toml` on startup and initializes provider clients for every API key present. +2. FastAPI registers all agent API endpoints and starts Uvicorn on port 1337. +3. The Qdrant client connects to the configured vector store URL and verifies the collection schema for agent memory. +4. The SQLite database is initialized (or reopened) for project, session, and task metadata storage. +5. When the frontend submits a task, FastAPI dispatches it to the agent orchestrator which spawns the multi-agent pipeline. +6. Generated files are written to the workspace path defined in config.toml under the project name subdirectory. + +### Source Walkthrough + +- [Devika README Getting Started](https://github.com/stitionai/devika#getting-started) — Why it matters: the canonical install sequence that defines the correct dependency order. +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) — Why it matters: the authoritative reference for all configuration keys and their default values. +- [Devika requirements.txt](https://github.com/stitionai/devika/blob/main/requirements.txt) — Why it matters: the pinned Python dependency list that determines runtime compatibility. +- [Devika devika.py](https://github.com/stitionai/devika/blob/main/devika.py) — Why it matters: the backend entry point that wires all services together on startup. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Next Chapter: Chapter 2: Architecture and Agent Pipeline](02-architecture-and-agent-pipeline.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/02-architecture-and-agent-pipeline.md b/tutorials/devika-tutorial/02-architecture-and-agent-pipeline.md new file mode 100644 index 00000000..927e3e59 --- /dev/null +++ b/tutorials/devika-tutorial/02-architecture-and-agent-pipeline.md @@ -0,0 +1,228 @@ +--- +layout: default +title: "Chapter 2: Architecture and Agent Pipeline" +nav_order: 2 +parent: Devika Tutorial +--- + +# Chapter 2: Architecture and Agent Pipeline + +Welcome to **Chapter 2: Architecture and Agent Pipeline**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter explains how Devika's five specialized agents — planner, researcher, coder, action, and internal monologue — coordinate to transform a single user prompt into working code. + +## Learning Goals + +- understand the roles and responsibilities of each specialized agent in the Devika pipeline +- trace the data and control flow from task submission through to workspace output +- identify how the internal monologue loop drives iterative self-correction +- reason about the architectural boundaries between agents for debugging and extension + +## Fast Start Checklist + +1. read the architecture overview in the Devika README and docs directory +2. identify the five agent types and their input/output contracts +3. trace a single task through the pipeline by reading the orchestrator source +4. 
inspect agent log output for a real task to observe the coordination sequence + +## Source References + +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika How It Works](https://github.com/stitionai/devika#how-it-works) +- [Devika Agent Source](https://github.com/stitionai/devika/tree/main/src/agents) +- [Devika Repository](https://github.com/stitionai/devika) + +## Summary + +You now understand how Devika's multi-agent architecture decomposes a high-level task into research, planning, coding, and self-reflection steps that loop until the task is complete. + +Next: [Chapter 3: LLM Provider Configuration](03-llm-provider-configuration.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 2: Architecture and Agent Pipeline** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. The **Planner Agent** receives the raw user prompt and decomposes it into a numbered step-by-step execution plan; this plan is the primary control signal for all downstream agents. +2. The **Researcher Agent** takes each planning step and formulates search queries, invokes Playwright to browse the web, and stores retrieved knowledge in Qdrant for semantic recall. +3. The **Coder Agent** receives the plan plus all research context from Qdrant and generates code for each step, writing files to the project workspace. +4. The **Action Agent** executes generated code in the workspace environment, captures stdout/stderr, and returns execution results back into the agent loop. +5. 
The **Internal Monologue Agent** receives the full context — plan, code, execution result — and produces a self-reflection decision: proceed to the next step, revise the current step, or mark the task complete. +6. The orchestrator in `devika.py` manages the loop state machine, routing between agents based on internal monologue output until a terminal condition is reached. +7. All inter-agent communication passes through structured JSON payloads with defined schemas; the LLM prompt templates are versioned in the `prompts/` directory. +8. Qdrant stores embeddings of all research artifacts and previously generated code snippets, enabling the coder agent to reference earlier findings without re-searching. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Max loop iterations | default unbounded loop | explicit max_iterations cap in config | simplicity vs runaway cost | +| Research depth | shallow single-page research | deep multi-page crawl with Playwright | speed vs research completeness | +| Code execution sandbox | run in local workspace | Docker-isolated execution environment | setup simplicity vs blast radius | +| Internal monologue model | same model as coder | cheaper fast model for monologue only | cost savings vs reflection quality | +| Step plan granularity | LLM-default decomposition | inject custom plan prefix to control step count | flexibility vs predictability | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Infinite planning loop | task never reaches coder agent | planner keeps revising plan without progressing | add max_iterations guard and force-advance after threshold | +| Research context overflow | coder agent receives truncated context | Qdrant retrieval returns too many chunks | tune top-k 
retrieval parameter and add context window budget | +| Coder ignores research | generated code doesn't use fetched libraries | researcher output not injected into coder prompt | verify prompt template includes `{research_context}` variable | +| Action agent silent failure | task completes but workspace is empty | code execution error swallowed without logging | add explicit error capture in action agent and surface in UI | +| Internal monologue loop | agent cycles without progress | monologue model hallucinates "not done" indefinitely | inject step counter into monologue prompt; enforce done after N retries | +| Cross-agent state corruption | later agent uses stale data from previous task | session context not cleared between tasks | enforce session isolation and clear Qdrant session namespace per task | + +### Implementation Runbook + +1. Read `src/agents/planner/planner.py` to understand how the plan is constructed from the user prompt. +2. Read `src/agents/researcher/researcher.py` to trace how search queries are generated and how Playwright is invoked. +3. Read `src/agents/coder/coder.py` to see how research context is retrieved from Qdrant and injected into the code generation prompt. +4. Read `src/agents/action/action.py` to understand how generated code is executed and results are captured. +5. Read `src/agents/internal_monologue/internal_monologue.py` to see the self-reflection decision logic. +6. Trace the orchestrator loop in `devika.py` to map the state transitions between agents. +7. Enable debug logging and submit a simple task; compare the logged agent sequences with your architecture map. +8. Identify the prompt templates in `prompts/` that correspond to each agent and note how context variables are injected. +9. Add a custom logging hook at the orchestrator level to emit per-step timing metrics for performance analysis. 
+ +### Quality Gate Checklist + +- [ ] all five agent types are identified with their input and output contracts documented +- [ ] the orchestrator state machine transitions are mapped for both success and failure paths +- [ ] Qdrant retrieval parameters (top-k, score threshold) are explicitly configured +- [ ] prompt templates for each agent are reviewed and `{variable}` injection points are validated +- [ ] max_iterations or equivalent loop guard is set to prevent runaway execution +- [ ] inter-agent JSON schema is validated against actual message payloads in logs +- [ ] code execution in the action agent has explicit error capture and surface-to-UI reporting +- [ ] session isolation ensures no cross-task context bleed in Qdrant namespaces + +### Source Alignment + +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika How It Works Section](https://github.com/stitionai/devika#how-it-works) +- [Devika Agents Source Directory](https://github.com/stitionai/devika/tree/main/src/agents) +- [Devika Prompts Directory](https://github.com/stitionai/devika/tree/main/prompts) +- [Devika Main Orchestrator](https://github.com/stitionai/devika/blob/main/devika.py) + +### Cross-Tutorial Connection Map + +- [OpenHands Tutorial](../openhands-tutorial/) — comparable multi-agent architecture with different agent role definitions +- [LangGraph Tutorial](../langgraph-tutorial/) — graph-based orchestration model for understanding state machine design +- [CrewAI Tutorial](../crewai-tutorial/) — crew-based multi-agent coordination patterns +- [AutoGen Tutorial](../autogen-tutorial/) — conversational multi-agent framework for comparison +- [SWE-agent Tutorial](../swe-agent-tutorial/) — single-agent loop architecture for contrast with Devika's multi-agent design + +### Advanced Practice Exercises + +1. Draw the complete state machine for the Devika orchestrator loop including all terminal conditions and retry paths. +2. 
Add a custom agent role (e.g., a "reviewer" agent) to the pipeline by extending the orchestrator loop and creating a new prompt template. +3. Instrument each agent with a timing decorator and produce a per-step latency breakdown for a representative task. +4. Replace the Qdrant retriever in the researcher agent with a different vector store and verify the coder agent still receives correct context. +5. Write a unit test for the internal monologue agent that injects known context and asserts the correct "proceed/revise/complete" decision. + +### Review Questions + +1. In what order are the five agents invoked for a typical task, and what triggers each transition? +2. How does the internal monologue agent decide whether to mark a task complete or loop back to revision? +3. What role does Qdrant play in the pipeline, and which agents read from and write to it? +4. How are research artifacts from the researcher agent made available to the coder agent? +5. What is the mechanism that prevents the orchestrator loop from running indefinitely on an ambiguous task? 
+ +### Scenario Playbook 1: Planner Produces Overly Long Step List + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task with a broad prompt generates 20+ steps causing excessive token usage +- initial hypothesis: planner prompt does not constrain step count for the given task scope +- immediate action: inspect the planner prompt template and add a max-steps constraint instruction +- engineering control: inject "produce at most 8 steps" into the planner system prompt for standard tasks +- verification target: planner output for comparable prompts stays at 5-8 steps consistently +- rollback trigger: step count reduction causes coder agent to skip critical implementation details +- communication step: document the step count tuning parameter in the operator configuration guide +- learning capture: add step count as an observable metric and alert when it exceeds configured threshold + +### Scenario Playbook 2: Coder Agent Ignores Researcher Output + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: generated code does not use any of the libraries or APIs found by the researcher +- initial hypothesis: the research context variable is not being injected into the coder prompt template +- immediate action: print the fully assembled coder prompt to logs and verify the research block is present +- engineering control: add an assertion in the coder agent that raises if research_context is empty when researcher ran +- verification target: coder agent logs show "using N research chunks" for every task where researcher was invoked +- rollback trigger: assertion fails on tasks where research legitimately produced no useful results +- communication step: add a debug flag to the UI that displays which research chunks the coder used +- learning capture: add a researcher-to-coder context injection integration test to the test suite + +### Scenario Playbook 3: 
Internal Monologue Loop Cycles Without Progress + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task stays in revision loop for more than 10 iterations without reaching completion +- initial hypothesis: monologue model is hallucinating "not done" even when all steps are complete +- immediate action: inspect monologue logs to see the specific reason it keeps returning "revise" +- engineering control: inject a step counter and iteration count into the monologue prompt; add "if iteration > 8, mark done" instruction +- verification target: tasks complete within 8 loop iterations for representative benchmarks +- rollback trigger: forced completion causes the coder to produce incomplete output on genuinely multi-step tasks +- communication step: surface the iteration count in the UI so users can see agent progress +- learning capture: capture monologue decision reasons in structured logs for offline pattern analysis + +### Scenario Playbook 4: Action Agent Swallows Execution Error + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task reports success but workspace files are incomplete or malformed +- initial hypothesis: the action agent executed code that raised an exception but did not propagate the error to the orchestrator +- immediate action: review action agent error handling code and check if exceptions are caught and discarded +- engineering control: add explicit try/except with structured error return in the action agent; surface stderr in the UI +- verification target: any code execution failure causes the orchestrator to retry or escalate rather than silently proceeding +- rollback trigger: surfacing all errors causes false-positive failure reports on harmless warnings +- communication step: add stderr output to the task detail view in the UI for operator visibility +- learning capture: add a test that injects a known-bad code snippet and 
asserts the error is reported to the orchestrator + +### Scenario Playbook 5: Cross-Task Context Bleed in Qdrant + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: a new task retrieves research artifacts from a previous unrelated task +- initial hypothesis: Qdrant collection is shared across all tasks without namespace isolation +- immediate action: review how the researcher agent writes to and reads from Qdrant and identify the collection or namespace key +- engineering control: prefix all Qdrant operations with the task_id or project_id to create logical namespace isolation +- verification target: researcher agent only retrieves chunks tagged with the current task_id +- rollback trigger: namespace isolation causes performance regression due to smaller retrieval sets +- communication step: document the Qdrant namespace convention in the architecture guide +- learning capture: add a Qdrant isolation test that asserts zero cross-task retrieval after namespace fix + +### What Problem Does This Solve? + +Devika's multi-agent architecture solves the single-agent context window and capability ceiling problem. A single LLM asked to research, plan, code, execute, and self-reflect within one prompt quickly runs out of context or produces shallow work in each dimension. By separating these concerns into specialized agents, each with a focused prompt and defined input/output contract, Devika achieves higher quality research, more structured planning, and more reliable code generation than a monolithic approach. The internal monologue loop adds iterative self-correction without requiring human intervention at each step. + +### How it Works Under the Hood + +1. The user submits a task prompt through the frontend; it is stored in SQLite and dispatched to the orchestrator. +2. The orchestrator invokes the planner agent with the raw prompt; the planner returns a structured JSON step list. +3. 
For each step, the orchestrator invokes the researcher agent with the step description; Playwright fetches web content and Qdrant stores embeddings. +4. The orchestrator invokes the coder agent with the step description plus retrieved Qdrant context; the coder writes files to the workspace. +5. The action agent executes any runnable code and returns stdout/stderr to the orchestrator. +6. The internal monologue agent receives the full context and returns a decision JSON; the orchestrator advances, retries, or terminates based on this decision. + +### Source Walkthrough + +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) — Why it matters: the official architecture diagram and agent role descriptions. +- [Devika How It Works](https://github.com/stitionai/devika#how-it-works) — Why it matters: the high-level narrative of the full pipeline in the README. +- [Devika Agents Directory](https://github.com/stitionai/devika/tree/main/src/agents) — Why it matters: the source of truth for each agent's implementation and prompt assembly. +- [Devika Prompts Directory](https://github.com/stitionai/devika/tree/main/prompts) — Why it matters: the prompt templates that define each agent's behavior and context injection points. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 1: Getting Started](01-getting-started.md) +- [Next Chapter: Chapter 3: LLM Provider Configuration](03-llm-provider-configuration.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/03-llm-provider-configuration.md b/tutorials/devika-tutorial/03-llm-provider-configuration.md new file mode 100644 index 00000000..1da67db7 --- /dev/null +++ b/tutorials/devika-tutorial/03-llm-provider-configuration.md @@ -0,0 +1,228 @@ +--- +layout: default +title: "Chapter 3: LLM Provider Configuration" +nav_order: 3 +parent: Devika Tutorial +--- + +# Chapter 3: LLM Provider Configuration + +Welcome to **Chapter 3: LLM Provider Configuration**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how to configure Claude 3, GPT-4, Gemini, Mistral, Groq, and local Ollama models in Devika's `config.toml` and how to select the right provider for each agent role. + +## Learning Goals + +- configure API keys and model identifiers for every supported LLM provider +- understand Devika's model selection mechanism and how to switch providers per project +- evaluate the cost, latency, and quality tradeoffs across providers for autonomous coding tasks +- configure Ollama for fully offline, local LLM operation without external API keys + +## Fast Start Checklist + +1. open `config.toml` and locate the `[API_KEYS]` and `[API_MODELS]` sections +2. add your API key for at least one cloud provider (Claude, OpenAI, Google, Mistral, or Groq) +3. set the model name for each provider section to a currently available model identifier +4. 
optionally install and start Ollama with a code-capable model for local operation + +## Source References + +- [Devika Configuration Section](https://github.com/stitionai/devika#configuration) +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) +- [Devika LLM Provider Source](https://github.com/stitionai/devika/tree/main/src/llm) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) + +## Summary + +You now know how to configure any of Devika's supported LLM providers, select the right model for each use case, and operate Devika in fully local mode using Ollama. + +Next: [Chapter 4: Task Planning and Code Generation](04-task-planning-and-code-generation.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 3: LLM Provider Configuration** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. All LLM provider credentials are stored in the `[API_KEYS]` section of `config.toml`; the key names match the provider identifiers used in the `src/llm/` abstraction layer. +2. The `[API_MODELS]` section maps each provider to a specific model string; changing this value affects all agents that use that provider without requiring code changes. +3. Devika's LLM abstraction layer in `src/llm/` wraps each provider SDK (Anthropic, OpenAI, Google GenAI, Mistral, Groq, Ollama) behind a uniform `inference()` interface. +4. Claude 3 models (Haiku, Sonnet, Opus) are accessed via the Anthropic Python SDK; the model string format is `claude-3-haiku-20240307`, `claude-3-sonnet-20240229`, or `claude-3-opus-20240229`. +5. 
GPT-4 and GPT-4-turbo models are accessed via the OpenAI Python SDK; model strings follow the `gpt-4-turbo-preview` format. +6. Gemini models (Gemini Pro, Gemini Ultra) are accessed via the Google GenerativeAI SDK; the model string is `gemini-pro` or `gemini-ultra`. +7. Groq provides access to open-weight models (LLaMA, Mistral, Mixtral) via a fast inference API; model strings are provider-specific like `mixtral-8x7b-32768`. +8. Ollama runs local models such as `codellama`, `deepseek-coder`, or `mistral` on the developer's machine; the Ollama base URL in config.toml must point to the running Ollama server (default: `http://localhost:11434`). + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Primary provider | Claude 3 Sonnet (balanced quality/cost) | Claude 3 Opus (maximum reasoning quality) | cost vs output quality | +| Local vs cloud | cloud provider for reliability | Ollama for full offline/private operation | uptime vs data privacy | +| Model per agent | same model for all agents | different model per agent role | simplicity vs cost optimization | +| Context window | standard 4k-8k context models | 32k-100k context models for large codebases | cost vs completeness | +| Fallback strategy | no fallback | secondary provider fallback on rate limit | simplicity vs availability | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Rate limit hit mid-task | `RateLimitError` or 429 response | sustained high-frequency API calls | add exponential backoff retry in the LLM abstraction layer | +| Model string not recognized | `InvalidModelError` or 404 from provider | deprecated or mistyped model identifier | verify model ID in provider documentation and update config.toml | +| Ollama model not pulled | connection refused 
or 404 from Ollama | model not downloaded to local Ollama instance | run `ollama pull <model-name>` before starting Devika | +| Context window exceeded | truncated output or provider error | generated context exceeds model's max tokens | switch to a larger context window model or reduce research chunk size | +| API quota exhausted | 429 with quota message | free tier or daily limit reached | upgrade provider plan or switch to an alternative provider temporarily | +| Provider SDK version mismatch | import error on startup | requirements.txt pinned to older SDK version than API supports | update SDK version in requirements.txt and re-run pip install | + +### Implementation Runbook + +1. Open `config.toml` and locate the `[API_KEYS]` section. +2. For Anthropic Claude: set `ANTHROPIC` to your Anthropic API key from console.anthropic.com. +3. For OpenAI GPT-4: set `OPENAI` to your OpenAI API key from platform.openai.com. +4. For Google Gemini: set `GOOGLE` to your Google AI API key from aistudio.google.com. +5. For Mistral: set `MISTRAL` to your Mistral API key from console.mistral.ai. +6. For Groq: set `GROQ` to your Groq API key from console.groq.com. +7. For Ollama: ensure `OLLAMA_API_BASE` is set to `http://localhost:11434` and run `ollama pull codellama` on your local machine. +8. Set the `[API_MODELS]` values to the specific model identifiers you want to use (e.g., `CLAUDE_3_MODEL = "claude-3-sonnet-20240229"`). +9. Restart the backend after any config.toml change and verify the provider is selected in the project creation UI dropdown. 
+ +### Quality Gate Checklist + +- [ ] all configured API keys are valid and tested with a minimal ping request before task submission +- [ ] model identifiers in `[API_MODELS]` match currently available models from each provider's documentation +- [ ] Ollama server is running and the target model is pulled before using local LLM mode +- [ ] rate limit handling (retry with backoff) is implemented in the LLM abstraction layer +- [ ] config.toml is excluded from git via `.gitignore` and team secrets are managed via a secrets manager +- [ ] context window limits for each configured model are documented and input budgets are sized accordingly +- [ ] fallback provider logic is defined in the runbook if the primary provider is unavailable +- [ ] provider cost estimates per task are tracked to prevent unexpected billing surprises + +### Source Alignment + +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) +- [Devika Configuration Section](https://github.com/stitionai/devika#configuration) +- [Devika LLM Source Directory](https://github.com/stitionai/devika/tree/main/src/llm) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Supported Models List](https://github.com/stitionai/devika#supported-models) + +### Cross-Tutorial Connection Map + +- [LiteLLM Tutorial](../litellm-tutorial/) — unified LLM proxy that can sit in front of Devika's provider calls +- [Ollama Tutorial](../ollama-tutorial/) — deep dive on running local models that Devika can consume +- [OpenAI Python SDK Tutorial](../openai-python-sdk-tutorial/) — understanding the SDK Devika uses for GPT-4 calls +- [SWE-agent Tutorial](../swe-agent-tutorial/) — comparable model configuration patterns in another autonomous coding agent +- [Aider Tutorial](../aider-tutorial/) — single-agent coding tool with similar provider configuration surface + +### Advanced Practice Exercises + +1. 
Configure Devika with three different providers simultaneously and benchmark the same task across all three, measuring token cost, latency, and output quality. +2. Set up a local Ollama instance with `deepseek-coder:33b` and run a complete coding task end-to-end without any external API calls. +3. Implement a provider fallback mechanism in the LLM abstraction layer that switches from Claude to GPT-4 on a rate limit error. +4. Write a config validation script that reads config.toml and tests each configured provider with a minimal API call before Devika starts. +5. Set up a per-agent model configuration where the planner uses Claude 3 Opus for quality, the coder uses Claude 3 Sonnet for balance, and the internal monologue uses Claude 3 Haiku for speed. + +### Review Questions + +1. Where in config.toml are provider API keys stored and what section contains model name identifiers? +2. What is the Ollama base URL and what command must be run before using a local model with Devika? +3. How does Devika's LLM abstraction layer allow switching providers without changing agent code? +4. What happens if the configured model identifier is deprecated and no longer available from the provider? +5. Why is context window size a critical consideration when choosing models for Devika's coder agent on large codebases? 
+ +### Scenario Playbook 1: Switching From Claude to GPT-4 Mid-Project + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: Anthropic API has elevated latency; team needs to continue task execution with minimal disruption +- initial hypothesis: switching provider in config.toml and restarting the backend should redirect all subsequent calls to GPT-4 +- immediate action: leave the `ANTHROPIC` key intact but set `CLAUDE_3_MODEL` blank; ensure `OPENAI` key and `GPT4_MODEL` are set +- engineering control: select GPT-4 in the project settings dropdown in the UI before resubmitting tasks +- verification target: backend logs show calls routing to OpenAI endpoint instead of Anthropic +- rollback trigger: GPT-4 output quality diverges significantly from Claude baseline; revert provider selection +- communication step: notify team of temporary provider switch and expected quality differences in Slack +- learning capture: document provider switching procedure in the operations runbook with timing and quality notes + +### Scenario Playbook 2: Ollama Local Model Has Wrong Context Window + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: local Ollama task fails or produces truncated code on projects with many files +- initial hypothesis: the pulled Ollama model has a smaller context window than the research and plan context requires +- immediate action: check the model card for the Ollama model's context window size and compare with actual context usage in logs +- engineering control: switch to `codellama:34b` or a model with explicit 16k+ context; set `num_ctx` parameter in Ollama model options +- verification target: coder agent receives full research context without truncation for benchmark tasks +- rollback trigger: larger context model is too slow for interactive use; fall back to cloud provider +- communication step: update the Ollama model 
recommendations in the setup guide with context window requirements +- learning capture: add a startup check that warns if the configured Ollama model's context window is below the minimum recommended size + +### Scenario Playbook 3: Groq Rate Limit on Batch Tasks + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: multiple parallel tasks hit Groq's tokens-per-minute limit simultaneously +- initial hypothesis: Groq's free tier TPM limit is much lower than cloud providers; concurrent tasks exceed the quota +- immediate action: add sequential queuing for tasks using Groq to prevent concurrent execution +- engineering control: implement exponential backoff with jitter in the Groq LLM client; log rate limit events +- verification target: task queue processes all items without errors when TPM limit is respected +- rollback trigger: sequential queuing makes task throughput unacceptably slow for the team +- communication step: inform team of Groq rate limits and recommend upgrading to a paid plan for higher throughput +- learning capture: add TPM budget tracking per provider and alert when 80% of limit is consumed per minute + +### Scenario Playbook 4: Google Gemini API Key Invalid After Rotation + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: tasks using Gemini fail with authentication error after a scheduled key rotation +- initial hypothesis: config.toml still contains the old Gemini API key that was revoked +- immediate action: generate a new key in Google AI Studio, update config.toml, and restart the backend +- engineering control: integrate config.toml secrets with a secrets manager (e.g., AWS Secrets Manager or HashiCorp Vault) so key rotation updates config automatically +- verification target: Gemini provider returns successful response on the next task submission after backend restart +- rollback trigger: new key has restricted scopes that 
don't cover the Generative Language API +- communication step: notify team of key rotation and estimated downtime window; update rotation schedule documentation +- learning capture: automate config.toml secret injection from the secrets manager in the deployment pipeline + +### Scenario Playbook 5: Mistral Model Identifier Deprecated + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task fails with a 404 model-not-found error from Mistral's API +- initial hypothesis: the model string in config.toml references a model name that Mistral has removed or renamed +- immediate action: check the Mistral model availability endpoint and update the model string to the current identifier +- engineering control: add a model availability check to the startup sequence that validates each configured model against the provider's model list API +- verification target: provider model validation passes for all configured providers on each backend startup +- rollback trigger: model list API call itself fails due to network issues; fall back to documentation-based verification +- communication step: post a note to the engineering channel when model identifiers are updated and why +- learning capture: pin model identifier versions with a comment on the expected deprecation date in config.toml + +### What Problem Does This Solve? + +Devika's multi-provider configuration model solves the vendor lock-in and cost optimization problem for autonomous coding teams. Different LLM providers excel at different tasks — Claude 3 Opus produces superior reasoning for complex planning, Groq provides ultrafast inference for lightweight monologue steps, and Ollama enables fully private operation without any data leaving the local machine. Without a clean provider abstraction and a single config file, teams would need to modify agent code to switch providers, making experimentation and cost optimization impractical. 
+ +### How it Works Under the Hood + +1. On backend startup, Devika reads `config.toml` and initializes a provider client for each section where an API key is present. +2. The `src/llm/` abstraction layer wraps each provider SDK with a uniform `LLM.inference(prompt, model)` interface. +3. When an agent invokes the LLM, it passes the selected model identifier; the abstraction layer routes to the correct provider client based on the model prefix. +4. Provider-specific error handling (rate limits, authentication errors, context overflow) is caught in the abstraction layer and either retried or surfaced to the orchestrator. +5. For Ollama, the Ollama Python client sends requests to the local server URL configured in `OLLAMA_API_BASE`. +6. The project creation UI reads the available configured providers from the backend and presents them as a model selection dropdown. + +### Source Walkthrough + +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) — Why it matters: the authoritative template showing every provider key and model configuration option. +- [Devika LLM Directory](https://github.com/stitionai/devika/tree/main/src/llm) — Why it matters: the provider abstraction layer source showing how each SDK is wrapped uniformly. +- [Devika README Configuration](https://github.com/stitionai/devika#configuration) — Why it matters: the quickstart guide to filling in config.toml for a first working setup. +- [Devika README Supported Models](https://github.com/stitionai/devika#supported-models) — Why it matters: the official list of tested and supported model identifiers per provider. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 2: Architecture and Agent Pipeline](02-architecture-and-agent-pipeline.md) +- [Next Chapter: Chapter 4: Task Planning and Code Generation](04-task-planning-and-code-generation.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/04-task-planning-and-code-generation.md b/tutorials/devika-tutorial/04-task-planning-and-code-generation.md new file mode 100644 index 00000000..8e42d7bc --- /dev/null +++ b/tutorials/devika-tutorial/04-task-planning-and-code-generation.md @@ -0,0 +1,228 @@ +--- +layout: default +title: "Chapter 4: Task Planning and Code Generation" +nav_order: 4 +parent: Devika Tutorial +--- + +# Chapter 4: Task Planning and Code Generation + +Welcome to **Chapter 4: Task Planning and Code Generation**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter explains how Devika's planner agent decomposes a user prompt into an executable step plan, and how the coder agent transforms each step plus research context into production-ready code files. + +## Learning Goals + +- understand how the planner agent structures a task into numbered steps with dependencies +- trace how each plan step becomes a coder agent invocation with a bounded context +- identify prompt engineering patterns that improve planning quality and code generation accuracy +- recognize failure modes in task decomposition and apply countermeasures + +## Fast Start Checklist + +1. submit a small, well-scoped coding task and observe the plan output in the agent log +2. examine the coder prompt template to see how plan steps and research context are assembled +3. 
review the generated workspace files to verify step-to-file correspondence +4. experiment with prompt phrasing to observe its effect on step count and code quality + +## Source References + +- [Devika Planner Agent Source](https://github.com/stitionai/devika/tree/main/src/agents/planner) +- [Devika Coder Agent Source](https://github.com/stitionai/devika/tree/main/src/agents/coder) +- [Devika How It Works](https://github.com/stitionai/devika#how-it-works) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) + +## Summary + +You now understand how Devika converts a natural language task into a structured execution plan and how each plan step drives a focused code generation call with research-enriched context. + +Next: [Chapter 5: Web Research and Browser Integration](05-web-research-and-browser-integration.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 4: Task Planning and Code Generation** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. The planner agent receives the raw user prompt and the project context (existing files, previous steps) and returns a JSON array of step objects, each with a `step_number`, `task`, and optional `search_query` field. +2. Steps with a `search_query` field are routed to the researcher agent before the coder agent is invoked for that step; steps without search queries go directly to the coder. +3. The coder agent prompt assembles three context blocks: (a) the current step description, (b) Qdrant-retrieved research chunks for this step, and (c) a file tree snapshot of the current workspace. +4. 
The coder agent returns a JSON object with a `code` block, a `file_name`, and an optional `terminal_command` for execution by the action agent. +5. File names returned by the coder agent are relative to the project workspace root; the orchestrator writes each file using the returned path. +6. When a step requires modifying an existing file, the coder receives the current file content in its context window; the returned code replaces the entire file. +7. The planner can emit a `done` flag on the last step to signal the orchestrator to terminate rather than entering another monologue revision loop. +8. Prompt clarity in the user's task description directly controls planner step granularity; vague prompts produce vague steps and lower-quality code generation. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Task prompt granularity | single high-level prompt | multi-sentence prompt with tech stack and constraints | ease of use vs output precision | +| Plan step count | LLM-default decomposition | instruct model to limit to N steps | flexibility vs token budget | +| Code file granularity | let coder decide file structure | specify expected file names in the task prompt | autonomy vs predictability | +| Existing code context | no existing code provided | paste existing code snippets into the task prompt | speed vs contextual accuracy | +| Iterative refinement | submit new task per revision | use internal monologue loop for in-session revision | simplicity vs session continuity | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Planner produces duplicate steps | two steps with identical task descriptions | ambiguous prompt allows LLM to repeat itself | add deduplication validation on planner output before dispatching 
| Coder generates code for wrong language | Python output for a JavaScript task | task prompt did not specify language; researcher found Python examples | add explicit language constraint in task prompt and planner system prompt | +| Generated code has syntax errors | action agent execution fails with SyntaxError | coder truncated output due to context window limit | reduce context chunk size or switch to a larger context window model | +| Missing import statements | code runs but NameError at runtime | coder generated function bodies without import headers | add an "always include all imports" instruction to the coder system prompt | +| Wrong file path from coder | file written to wrong workspace location | coder returns absolute path instead of relative path | validate and normalize returned file_name to be workspace-relative before writing | +| Step dependencies not respected | step N uses variable defined in step N+2 | planner did not model data dependencies between steps | instruct planner to annotate each step with its dependencies in the step JSON | + +### Implementation Runbook + +1. Write a clear task prompt that specifies language, framework, expected output structure, and any constraints. +2. Submit the task and observe the planner output in the backend logs to verify step decomposition quality. +3. For each step in the plan, verify whether a `search_query` is present and whether it accurately captures what research is needed. +4. After each coder invocation, verify the workspace file matches the expected step output before the next step proceeds. +5. If code quality is low, experiment with adding explicit constraints to the user prompt (e.g., "write type-annotated Python 3.10 with pytest unit tests"). +6. For complex multi-file projects, seed the task prompt with the expected project structure to guide coder file naming. +7. Monitor the internal monologue logs to verify the agent correctly identifies when each step is complete. +8.
After task completion, review all workspace files for completeness, correct imports, and integration consistency across files. +9. Run the generated code locally or in the action agent's execution environment to validate correctness before using in production. + +### Quality Gate Checklist + +- [ ] task prompts include language, framework, and constraint specifications to guide the planner +- [ ] planner output is validated for step completeness and absence of duplicate or contradictory steps +- [ ] each coder invocation receives the correct research context chunks for its step +- [ ] generated code files are written to correct workspace-relative paths +- [ ] all generated files include necessary imports and dependency declarations +- [ ] action agent execution results are surfaced to the orchestrator for error detection +- [ ] internal monologue correctly identifies task completion vs. continued revision need +- [ ] final workspace is reviewed for cross-file integration consistency + +### Source Alignment + +- [Devika Planner Agent](https://github.com/stitionai/devika/tree/main/src/agents/planner) +- [Devika Coder Agent](https://github.com/stitionai/devika/tree/main/src/agents/coder) +- [Devika How It Works](https://github.com/stitionai/devika#how-it-works) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika Prompts Directory](https://github.com/stitionai/devika/tree/main/prompts) + +### Cross-Tutorial Connection Map + +- [DSPy Tutorial](../dspy-tutorial/) — systematic prompt optimization techniques applicable to Devika's planner prompts +- [LangGraph Tutorial](../langgraph-tutorial/) — graph-based task decomposition for comparison with Devika's linear plan +- [CrewAI Tutorial](../crewai-tutorial/) — role-based task assignment patterns in multi-agent coding systems +- [Aider Tutorial](../aider-tutorial/) — simpler code generation workflow for baseline quality comparison +- [OpenHands 
Tutorial](../openhands-tutorial/) — alternative multi-agent coding system with different planning approach + +### Advanced Practice Exercises + +1. Write five different task prompts for the same coding problem at different specificity levels and compare the planner step outputs and final code quality. +2. Modify the planner prompt template to add a `dependencies` field to each step and verify the orchestrator respects the dependency ordering. +3. Add a plan validation function that checks for duplicate step descriptions and orphaned `search_query` fields before dispatching to the researcher. +4. Instrument the coder agent to log the exact assembled prompt (with research context) for each step and analyze token usage per step. +5. Build a post-processing script that runs `pylint` or `eslint` on all coder-generated files and reports quality scores per task. + +### Review Questions + +1. What fields does a planner step JSON object contain and which field controls whether the researcher agent is invoked for that step? +2. What three context blocks are assembled into the coder agent prompt and what is the source of each? +3. How does the coder agent return file content to the orchestrator and how does the orchestrator determine where to write each file? +4. What prompt engineering techniques most reliably improve planner step granularity and code quality? +5. What failure mode occurs when the coder agent's output is truncated due to a context window limit and how do you detect it? 
+ +### Scenario Playbook 1: Task Prompt Too Vague Produces Poor Plan + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: user submits "build me a web app" and the planner returns only 2 generic steps +- initial hypothesis: the prompt lacks specificity about language, framework, features, and output structure +- immediate action: revise the prompt to include technology stack, feature list, and expected file structure +- engineering control: add a prompt quality guide to the UI that prompts users to specify language, framework, and constraints before submission +- verification target: revised prompt produces a plan with 5-8 specific, actionable steps with accurate search queries +- rollback trigger: detailed prompt still produces generic plan; investigate planner prompt template for missing instruction +- communication step: publish a task prompt writing guide with good and bad examples in the team knowledge base +- learning capture: add prompt quality scoring to the pre-submission flow that warns when key specificity signals are absent + +### Scenario Playbook 2: Coder Generates Python Instead of TypeScript + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task requested a TypeScript Node.js API but coder produces Python Flask code +- initial hypothesis: researcher found Python examples that dominated the Qdrant context, biasing the coder +- immediate action: re-submit with explicit "TypeScript" and "Node.js" in both the task prompt and as a constraint +- engineering control: inject a language constraint into the planner system prompt so each plan step carries the target language +- verification target: coder generates `.ts` files with TypeScript syntax for all subsequent tasks specifying TypeScript +- rollback trigger: language constraint causes coder to ignore research context that only has Python examples +- communication step: add language 
specification to the task prompt template shown in the UI +- learning capture: add a file extension validation post-step that alerts if generated files don't match the specified language + +### Scenario Playbook 3: Generated Code Has Missing Imports + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: action agent execution fails with `NameError: name 'requests' is not defined` on generated code +- initial hypothesis: coder generated function bodies without including the `import requests` header +- immediate action: manually add the import and re-execute; inspect the coder prompt template for import instructions +- engineering control: add "always include all import statements at the top of every generated file" to the coder system prompt +- verification target: subsequent code generation includes complete import blocks for all symbols used in the function body +- rollback trigger: import instruction causes hallucinated imports for libraries not actually used +- communication step: document the import completeness requirement in the code generation quality standards +- learning capture: add a static import checker as a post-step validation that flags missing imports before action agent execution + +### Scenario Playbook 4: Planner Steps Exceed Context Budget + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: a complex task generates 15 steps; mid-task coder calls fail with context overflow +- initial hypothesis: accumulated plan + research context + workspace snapshot exceeds the model's context window +- immediate action: limit the workspace snapshot to the most recently modified files rather than the full file tree +- engineering control: add a context budget manager that trims the workspace snapshot to fit within the available token budget +- verification target: no context overflow errors for tasks up to 15 steps with moderate research context +- 
rollback trigger: workspace snapshot trimming causes the coder to generate conflicting code that overwrites earlier work +- communication step: document the context window budget model and recommended max step count per model size +- learning capture: add per-step token usage logging to identify which context blocks consume the most budget + +### Scenario Playbook 5: Multi-File Project Has Integration Inconsistencies + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task generates 5 files but function names in `main.py` don't match those defined in `utils.py` +- initial hypothesis: each coder invocation is independent and does not read previously generated files +- immediate action: verify whether the workspace file tree snapshot is injected into the coder context for each step +- engineering control: ensure the full content of all previously generated files is included in the coder context for each subsequent step +- verification target: coder references correct function signatures from earlier-generated files in all integration points +- rollback trigger: including all file contents exceeds context window; use summarized file interfaces instead of full content +- communication step: document the cross-file context injection strategy in the architecture guide +- learning capture: add an integration consistency test that checks import/export symbol names across all generated files + +### What Problem Does This Solve? + +Devika's task planning and code generation pipeline solves the coherence problem in autonomous code generation. Without a structured plan, an LLM asked to build a multi-file application tends to generate incomplete or internally inconsistent code in a single shot. 
By decomposing the task into sequential steps with bounded context at each step, Devika produces code that is incrementally verifiable, traceable to specific plan steps, and enriched with web-researched context that the LLM would not have had in its training data. + +### How it Works Under the Hood + +1. The user prompt is sent to the planner agent which assembles a system prompt from `prompts/planner/` and produces a JSON step array. +2. The orchestrator iterates over each step; if a `search_query` is present, it dispatches to the researcher agent first. +3. Research results are stored in Qdrant with the task and step as metadata; the coder retrieves them via semantic search. +4. The coder prompt is assembled from the step description, retrieved Qdrant chunks, and the current workspace file tree. +5. The coder returns a JSON object with `file_name`, `code`, and optionally `terminal_command`. +6. The orchestrator writes the file to disk and optionally invokes the action agent to execute the terminal command. + +### Source Walkthrough + +- [Devika Planner Source](https://github.com/stitionai/devika/tree/main/src/agents/planner) — Why it matters: the implementation of step decomposition and the prompt template that drives plan quality. +- [Devika Coder Source](https://github.com/stitionai/devika/tree/main/src/agents/coder) — Why it matters: the code generation logic including context assembly and file output formatting. +- [Devika Prompts Directory](https://github.com/stitionai/devika/tree/main/prompts) — Why it matters: all prompt templates that can be tuned to improve planning and code generation quality. +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) — Why it matters: the canonical description of how plan steps drive the coder invocation sequence. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 3: LLM Provider Configuration](03-llm-provider-configuration.md) +- [Next Chapter: Chapter 5: Web Research and Browser Integration](05-web-research-and-browser-integration.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/05-web-research-and-browser-integration.md b/tutorials/devika-tutorial/05-web-research-and-browser-integration.md new file mode 100644 index 00000000..08c731ee --- /dev/null +++ b/tutorials/devika-tutorial/05-web-research-and-browser-integration.md @@ -0,0 +1,228 @@ +--- +layout: default +title: "Chapter 5: Web Research and Browser Integration" +nav_order: 5 +parent: Devika Tutorial +--- + +# Chapter 5: Web Research and Browser Integration + +Welcome to **Chapter 5: Web Research and Browser Integration**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how Devika's researcher agent uses Playwright to autonomously browse the web, extract relevant content, and store it in Qdrant for use by the coder agent. + +## Learning Goals + +- understand how the researcher agent generates and executes Playwright-driven web searches +- configure Playwright browser options for headless operation and rate-limiting compliance +- trace the research artifact lifecycle from web fetch to Qdrant storage to coder retrieval +- identify failure modes in browser automation and apply targeted countermeasures + +## Fast Start Checklist + +1. verify Playwright Chromium is installed and the researcher agent can launch a browser +2. submit a task with a clear technology context and observe the researcher's search queries in logs +3. 
inspect the Qdrant collection to confirm research artifacts are stored with correct metadata +4. verify the coder agent retrieves relevant chunks in subsequent steps + +## Source References + +- [Devika Researcher Agent Source](https://github.com/stitionai/devika/tree/main/src/agents/researcher) +- [Devika Browser Agent Source](https://github.com/stitionai/devika/tree/main/src/browser) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Playwright Python Documentation](https://playwright.dev/python/) + +## Summary + +You now understand how Devika's browser automation layer fetches, extracts, and stores web research that enriches code generation with up-to-date documentation and examples. + +Next: [Chapter 6: Project Management and Workspaces](06-project-management-and-workspaces.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 5: Web Research and Browser Integration** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. The researcher agent receives a step description and generates one or more search query strings using the LLM; queries are optimized for technical documentation and code examples. +2. The browser module launches a Playwright Chromium instance in headless mode and navigates to a search engine (default: Bing) to retrieve a results page. +3. For each result URL, Playwright navigates to the page and extracts the visible text content using a content-stripping function that removes navigation, ads, and boilerplate. +4. 
Extracted text is chunked into segments of approximately 500-1000 tokens each and sent to an embedding model (configured in config.toml) to produce vector representations. +5. Embeddings are upserted into Qdrant under a collection keyed to the project and task; each chunk is stored with metadata including the source URL, step number, and task ID. +6. When the coder agent runs for a given step, it queries Qdrant with the step description as the query vector and retrieves the top-k most similar chunks. +7. Retrieved chunks are injected into the coder prompt in a structured "Research Context" block; the coder is instructed to prefer research-provided APIs and patterns. +8. The browser session is closed after each research invocation to prevent resource leaks; Playwright is re-launched per research step rather than maintained as a persistent session. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Browser mode | headless Chromium | headful Chromium for debugging research steps | invisibility vs observability | +| Search engine | default Bing search | custom search API (SerpAPI, Brave Search) | zero config vs rate limit control | +| Pages per query | single result page | crawl top 3-5 results per query | speed vs research breadth | +| Chunk size | default 500-token chunks | smaller 200-token chunks for precision | retrieval recall vs precision | +| Research scope | open internet | whitelist of trusted documentation sites only | breadth vs security | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Playwright browser fails to launch | `BrowserType.launch` error on research step | missing Chromium binary or sandbox restriction | run `playwright install chromium` and check sandbox flags for Docker environments | +| 
Search engine blocks automated requests | empty results or captcha page | search engine detects bot-like request patterns | add request delay, rotate user-agent, or use a search API key | +| Web page content extraction returns empty | empty research context in coder prompt | site uses heavy JavaScript rendering that Playwright can't extract | add `page.wait_for_load_state('networkidle')` before content extraction | +| Qdrant upsert fails silently | coder receives empty research chunks | Qdrant collection write error not propagated | add explicit error handling on Qdrant upsert and log chunk count per step | +| Research content is irrelevant | coder generates code for wrong API | search query too generic; retrieves unrelated documentation | improve query generation prompt to include library version and specific API names | +| Memory leak from unclosed browsers | process memory grows over multiple tasks | Playwright context not closed after research step | explicitly call `browser.close()` and `playwright.stop()` in a finally block | + +### Implementation Runbook + +1. Verify Playwright is installed: run `playwright install chromium` in the active virtualenv. +2. Submit a task that requires looking up a library API and observe the researcher's generated search queries in backend logs. +3. Inspect the Qdrant collection via the Qdrant web UI at `http://localhost:6333/dashboard` to confirm chunks are stored with correct metadata. +4. Add a debug log line in the coder agent that prints the number of Qdrant chunks retrieved for each step. +5. For Docker deployments, add `--no-sandbox` and `--disable-setuid-sandbox` Playwright launch arguments to handle Linux container restrictions. +6. Configure a search API (e.g., Bing Search API or SerpAPI) in config.toml to replace raw browser-based search for more reliable query handling. +7. Tune the `top_k` retrieval parameter in config.toml to balance research context richness against coder prompt size. +8. 
Add a URL allowlist configuration to restrict researcher browsing to documentation sites like `docs.python.org`, `developer.mozilla.org`, and package-specific docs. +9. Monitor Qdrant collection size over time and implement a TTL-based cleanup policy to prevent unbounded storage growth. + +### Quality Gate Checklist + +- [ ] Playwright Chromium is installed and launches without errors in headless mode +- [ ] researcher generates specific, targeted search queries rather than generic keyword searches +- [ ] Qdrant chunk count per step is logged and non-zero for all steps with `search_query` fields +- [ ] content extraction handles JavaScript-heavy sites by waiting for network idle state +- [ ] browser sessions are explicitly closed after each research step to prevent memory leaks +- [ ] coder agent logs confirm research context is retrieved and injected for research-enabled steps +- [ ] Qdrant storage is bounded by task/session TTL to prevent unbounded growth +- [ ] research scope is limited to trusted domains in production deployments + +### Source Alignment + +- [Devika Researcher Agent Source](https://github.com/stitionai/devika/tree/main/src/agents/researcher) +- [Devika Browser Module Source](https://github.com/stitionai/devika/tree/main/src/browser) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Playwright Python Docs](https://playwright.dev/python/) +- [Qdrant Python Client Docs](https://python-client.qdrant.tech/) + +### Cross-Tutorial Connection Map + +- [Playwright MCP Tutorial](../playwright-mcp-tutorial/) — deep dive on Playwright automation patterns directly applicable to Devika's browser module +- [Browser Use Tutorial](../browser-use-tutorial/) — alternative browser automation agent for comparison with Devika's approach +- [Firecrawl Tutorial](../firecrawl-tutorial/) — managed web crawling service that can replace Devika's direct Playwright scraping +- [Chroma Tutorial](../chroma-tutorial/) — 
alternative vector store for understanding Qdrant's role in the research pipeline +- [LanceDB Tutorial](../lancedb-tutorial/) — embedded vector database as an alternative to Qdrant for simpler deployments + +### Advanced Practice Exercises + +1. Replace Devika's built-in search engine scraping with a call to the Brave Search API and measure query result quality improvement. +2. Add a domain allowlist filter to the browser module that restricts research to official documentation sites for a given language ecosystem. +3. Implement a content quality filter that scores extracted chunks by relevance before upserting into Qdrant, discarding low-relevance content. +4. Run Devika's researcher agent in headed mode (non-headless) and screen-record a full research session to observe browsing behavior. +5. Build a Qdrant TTL cleanup job that purges research chunks older than 24 hours to keep the vector store bounded in size. + +### Review Questions + +1. How does the researcher agent decide what search queries to generate for a given plan step? +2. What Playwright event or state should be awaited before extracting content from a JavaScript-heavy page? +3. How are research chunks stored in Qdrant and what metadata fields enable per-task and per-step filtering? +4. What Playwright launch argument is required to run Chromium inside a Linux Docker container? +5. How does the coder agent retrieve the research chunks that are most relevant to its current step? 
+ +### Scenario Playbook 1: Playwright Fails in Docker Container + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: research steps fail with "No usable sandbox" error when Devika is deployed in Docker +- initial hypothesis: Chromium's sandbox mode requires kernel capabilities not available in the container +- immediate action: add `--no-sandbox` and `--disable-setuid-sandbox` to the Playwright launch args in the browser module +- engineering control: document the required Docker run flags (`--cap-add SYS_ADMIN`) or use the `--no-sandbox` approach in the Playwright config +- verification target: researcher agent completes a web fetch without sandbox errors in the Docker environment +- rollback trigger: running without sandbox in a multi-tenant environment creates security exposure; switch to isolated containers per task +- communication step: update Docker deployment docs with the sandbox configuration requirement +- learning capture: add a Docker-specific Playwright configuration preset to the config.toml example + +### Scenario Playbook 2: Search Engine Returns Captcha Page + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: researcher agent returns empty research context; browser log shows captcha HTML in the response +- initial hypothesis: Bing or Google detects automated browsing and serves a bot verification page +- immediate action: configure a search API key (Bing Search API or SerpAPI) in config.toml to replace direct browser-based search +- engineering control: add a content validation check that detects captcha patterns in extracted text and raises an alert +- verification target: researcher completes 10 consecutive search queries via API without triggering bot detection +- rollback trigger: search API quota is insufficient for task volume; implement request queuing with rate limiting +- communication step: document the search API 
configuration option in the README and explain why it is recommended for production +- learning capture: add search API as the recommended configuration in the Docker Compose example + +### Scenario Playbook 3: Qdrant Stores Empty Chunks + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: coder receives empty research context even though researcher agent ran without errors +- initial hypothesis: the web page content extraction returned empty string due to JavaScript-rendered content +- immediate action: add `page.wait_for_load_state('networkidle')` before content extraction in the browser module +- engineering control: add minimum content length validation: if extracted text is under 100 characters, retry with a different result URL +- verification target: Qdrant collection shows non-zero chunk count after each researcher invocation in logs +- rollback trigger: `networkidle` wait causes Playwright to hang on pages that never reach idle state; add a timeout +- communication step: document the JavaScript rendering issue and the `networkidle` solution in the browser module README +- learning capture: add a content extraction test with a known JavaScript-heavy documentation site + +### Scenario Playbook 4: Research Content Pollutes Coder Context With Irrelevant Chunks + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: coder generates code using a completely wrong API because retrieved chunks were from unrelated documentation +- initial hypothesis: search queries are too generic and Qdrant's semantic similarity returns tangentially related content +- immediate action: inspect the search queries in logs and add more specific terms: library name, version, and exact API being used +- engineering control: add a minimum relevance score threshold to the Qdrant retrieval call to filter out low-similarity chunks +- verification target: coder context only contains 
chunks with similarity score above 0.75 for representative tasks +- rollback trigger: strict score threshold causes empty context on niche topics where all documentation has lower similarity +- communication step: document the relevance threshold tuning parameter in the configuration guide +- learning capture: build an offline evaluation dataset of task-research pairs to measure retrieval precision over time + +### Scenario Playbook 5: Qdrant Storage Grows Unbounded + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: Qdrant disk usage grows continuously as teams run many tasks over days and weeks +- initial hypothesis: research chunks are never deleted after task completion +- immediate action: implement a post-task cleanup job that deletes Qdrant chunks tagged with completed task IDs +- engineering control: add a TTL metadata field to each chunk at upsert time and schedule a nightly cleanup job that deletes expired chunks +- verification target: Qdrant collection size remains bounded over a week of normal team usage +- rollback trigger: aggressive cleanup deletes chunks that are still needed for in-progress tasks +- communication step: document the storage cleanup policy and the TTL configuration parameter +- learning capture: add Qdrant collection size as a monitored metric with an alert threshold in the observability dashboard + +### What Problem Does This Solve? + +Devika's browser research integration solves the knowledge cutoff and documentation freshness problem in LLM-based code generation. A model trained months or years ago has no knowledge of newly released library versions, breaking API changes, or new framework patterns. By autonomously researching the web before generating code, Devika produces code that uses current APIs and is aligned with the latest documentation, dramatically reducing the incidence of deprecated API usage that plagues static LLM code generation. 
+ +### How it Works Under the Hood + +1. The researcher agent generates search query strings from the plan step description using an LLM prompt that emphasizes specificity. +2. The browser module launches a headless Playwright Chromium instance and navigates to the configured search engine. +3. Result URLs are extracted from the search results page; Playwright navigates to each URL and waits for network idle state. +4. Page text content is extracted using a custom extractor that removes navigation, headers, footers, and script tags. +5. Text is chunked and sent to the configured embedding model; resulting vectors are upserted into Qdrant with step and task metadata. +6. The coder agent queries Qdrant using the step description as the query text and retrieves the top-k chunks above a relevance threshold. + +### Source Walkthrough + +- [Devika Researcher Agent](https://github.com/stitionai/devika/tree/main/src/agents/researcher) — Why it matters: the query generation and Qdrant storage logic for the research pipeline. +- [Devika Browser Module](https://github.com/stitionai/devika/tree/main/src/browser) — Why it matters: the Playwright automation code for web navigation and content extraction. +- [Playwright Python Installation](https://playwright.dev/python/docs/intro) — Why it matters: official Playwright setup guide for installing browsers and understanding launch options. +- [Qdrant Python Client](https://python-client.qdrant.tech/) — Why it matters: the client library Devika uses for vector upsert and similarity search operations. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 4: Task Planning and Code Generation](04-task-planning-and-code-generation.md) +- [Next Chapter: Chapter 6: Project Management and Workspaces](06-project-management-and-workspaces.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/06-project-management-and-workspaces.md b/tutorials/devika-tutorial/06-project-management-and-workspaces.md new file mode 100644 index 00000000..ddceaffd --- /dev/null +++ b/tutorials/devika-tutorial/06-project-management-and-workspaces.md @@ -0,0 +1,228 @@ +--- +layout: default +title: "Chapter 6: Project Management and Workspaces" +nav_order: 6 +parent: Devika Tutorial +--- + +# Chapter 6: Project Management and Workspaces + +Welcome to **Chapter 6: Project Management and Workspaces**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter explains how Devika organizes projects, manages the workspace file system, integrates with git, and enables teams to structure and review autonomous coding sessions. + +## Learning Goals + +- understand the Devika project model: how projects are created, named, and isolated in the workspace +- trace how generated files are written, updated, and organized within a project workspace +- configure and use Devika's git integration for committing and reviewing agent-generated code +- manage multiple concurrent projects and maintain workspace hygiene over time + +## Fast Start Checklist + +1. create a new project in the Devika UI and observe the workspace directory created on disk +2. submit a task and verify generated files appear under the correct project subdirectory +3. 
initialize git in the project workspace and review the first commit of agent-generated code +4. explore the project list API and SQLite database to understand project metadata storage + +## Source References + +- [Devika Project Management Source](https://github.com/stitionai/devika/tree/main/src/project) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika Repository](https://github.com/stitionai/devika) + +## Summary + +You now know how to create and manage Devika projects, navigate the workspace file structure, and use git to review, version, and share agent-generated code safely. + +Next: [Chapter 7: Debugging and Troubleshooting](07-debugging-and-troubleshooting.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 6: Project Management and Workspaces** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Each Devika project is represented by a record in the SQLite database with fields for project ID, name, creation timestamp, and the LLM model selected at creation time. +2. The workspace root directory is configured in `config.toml` under `WORKSPACE_PATH`; every project gets a subdirectory named after the project name. +3. Generated files from the coder agent are written to paths relative to the project's workspace subdirectory; the orchestrator resolves absolute paths before writing. +4. 
Devika does not automatically initialize git in project workspaces; operators must run `git init` in the project directory and commit agent-generated code manually or via a post-task hook. +5. The frontend project list view reads project records from the SQLite API; selecting a project loads its task history and workspace file tree. +6. Task history for each project is stored as a sequence of agent interaction records in SQLite, enabling replay and audit of the full agent session. +7. Multiple projects can exist concurrently; the orchestrator routes task submissions to the correct workspace by project ID, preventing cross-project file contamination. +8. Workspace cleanup (deleting old projects) must be done manually by removing the workspace subdirectory and the SQLite project record; there is no built-in project deletion UI in early versions. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Workspace location | default local path in config.toml | mounted network volume for team sharing | simplicity vs collaboration | +| Git workflow | manual commit after task review | automated pre-commit hook that stages all changes | control vs speed | +| Project naming | free-form names in UI | enforce naming convention (e.g., JIRA-123-feature-name) | flexibility vs traceability | +| Multi-project concurrency | sequential task submission | parallel projects with separate Devika instances | simplicity vs throughput | +| Workspace cleanup | manual deletion | scheduled TTL-based archival script | control vs operational overhead | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Files written to wrong project directory | task output appears in sibling project folder | orchestrator uses project name instead of project ID for 
path resolution | enforce project ID as the workspace subdirectory name to avoid name collision | +| Workspace path not found | `FileNotFoundError` on task submission | WORKSPACE_PATH in config.toml points to a non-existent directory | validate WORKSPACE_PATH exists on startup and create it if absent | +| Git repo contains sensitive generated secrets | API keys appear in committed code | coder generated example code with hardcoded credentials | add a pre-commit hook with secret scanning (e.g., `git-secrets` or `truffleHog`) | +| SQLite database locked | task submission fails with database lock error | multiple concurrent writes to SQLite from parallel tasks | upgrade to PostgreSQL for concurrent deployments or serialize task writes | +| Project workspace disk full | task fails with `OSError: No space left` | accumulated workspace files from many projects exhaust disk | add disk usage monitoring and implement TTL-based workspace archival | +| Lost task history after SQLite corruption | project history unavailable | no SQLite backup policy | implement daily SQLite backup to a separate location | + +### Implementation Runbook + +1. Verify `WORKSPACE_PATH` in config.toml exists and is writable by the Devika process user. +2. Create a new project in the UI; verify the project subdirectory is created at `WORKSPACE_PATH//`. +3. Submit a task and verify generated files appear in the correct project subdirectory. +4. Navigate to the project workspace in a terminal and run `git init && git add . && git commit -m "Initial agent output"`. +5. Configure a `.gitignore` in the project workspace to exclude any secrets, `__pycache__`, and `.env` files before committing. +6. Add a secret scanning pre-commit hook using `git-secrets` or `detect-secrets` to the project workspace git configuration. +7. For team collaboration, configure the workspace path to point to a shared network volume and ensure all team members have write access. +8. 
Set up a weekly archival cron job that compresses and moves project workspaces older than 30 days to an archive directory. +9. Back up the SQLite database daily using `sqlite3 devika.db ".backup devika-backup-$(date +%Y%m%d).db"`. + +### Quality Gate Checklist + +- [ ] WORKSPACE_PATH is validated on startup and created automatically if absent +- [ ] each project workspace is isolated under a directory keyed by project ID (not just name) +- [ ] git is initialized in each new project workspace and a `.gitignore` is templated automatically +- [ ] secret scanning pre-commit hook is active in all project workspace git repos +- [ ] SQLite database is backed up daily and the backup is tested for restore validity +- [ ] disk usage for WORKSPACE_PATH is monitored with an alert threshold at 80% capacity +- [ ] project naming convention is documented and enforced through the UI or API validation +- [ ] workspace archival policy is documented and implemented as a scheduled job + +### Source Alignment + +- [Devika Project Module](https://github.com/stitionai/devika/tree/main/src/project) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) +- [Devika Repository](https://github.com/stitionai/devika) + +### Cross-Tutorial Connection Map + +- [OpenHands Tutorial](../openhands-tutorial/) — workspace and project management patterns in a comparable autonomous coding agent +- [Aider Tutorial](../aider-tutorial/) — git-native coding assistant for comparison on version control integration +- [Daytona Tutorial](../daytona-tutorial/) — managed development workspace service for standardizing Devika's workspace environments +- [Supabase Tutorial](../supabase-tutorial/) — PostgreSQL replacement for SQLite as Devika's project metadata store at scale +- [SWE-agent 
Tutorial](../swe-agent-tutorial/) — how SWE-agent manages workspace isolation for benchmark task sets + +### Advanced Practice Exercises + +1. Write a Devika project initialization script that creates the workspace directory, runs `git init`, adds a `.gitignore`, and installs `detect-secrets` as a pre-commit hook automatically. +2. Replace the SQLite database backend with PostgreSQL and verify that concurrent task submissions from multiple users work without lock errors. +3. Build a workspace explorer API endpoint that returns the file tree and git log for a given project ID in a format the frontend can render. +4. Implement a project archival script that compresses old workspace directories to a `.tar.gz`, verifies the archive, and removes the original directory. +5. Create a project template system that seeds new project workspaces with a standard `README.md`, `.gitignore`, and directory structure before the first task runs. + +### Review Questions + +1. Where does Devika store project metadata and task history, and what are the durability implications of this choice? +2. How is the project workspace directory path determined for a new project and what configuration option controls the root? +3. Why is it important to use project ID rather than project name as the workspace subdirectory name? +4. What steps are needed to add git version control to a Devika project workspace and what files should always be in `.gitignore`? +5. How does Devika ensure that files generated for one project are not written to another project's workspace? 
+ +### Scenario Playbook 1: Two Projects With the Same Name Collide + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: two users create projects named "api-backend" and files from both tasks appear in the same workspace directory +- initial hypothesis: workspace subdirectory is named by project name which is not guaranteed to be unique +- immediate action: inspect the project creation code to see if unique IDs or names are used for directory paths +- engineering control: modify the project module to use `project_id` (UUID) as the workspace directory name; store the human name only in SQLite +- verification target: two projects with the same name have distinct workspace directories and no file cross-contamination +- rollback trigger: UUID-based directories make manual navigation confusing; add a symlink from project name to UUID directory +- communication step: document the project naming policy and UUID directory convention in the operator guide +- learning capture: add a uniqueness constraint on project names in the SQLite schema to prevent duplicate names at the database level + +### Scenario Playbook 2: Generated Code Contains Hardcoded API Key + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: code review reveals the coder agent generated example code with a hardcoded API key string +- initial hypothesis: the coder agent generated realistic-looking example code that included a placeholder that looks like a real key +- immediate action: revoke any potentially real keys, run `git-secrets --scan` on the workspace, and remove the offending commit +- engineering control: install `detect-secrets` as a pre-commit hook in all project workspaces so commits with secret patterns are blocked automatically +- verification target: `detect-secrets scan` on the full workspace returns zero findings after remediation +- rollback trigger: secret scanner produces 
too many false positives on example code patterns; tune the scanner's allowlist +- communication step: notify the security team of the incident and document the finding in the security log +- learning capture: add "never use real API key values in example code" to the coder agent system prompt + +### Scenario Playbook 3: Workspace Disk Usage Grows Without Bound + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: server disk usage alert fires; investigation shows WORKSPACE_PATH consumes 200GB from 6 months of accumulated projects +- initial hypothesis: no archival or cleanup policy is in place; all projects accumulate indefinitely +- immediate action: identify projects older than 60 days, compress them to `.tar.gz` archives, and move them to cheaper storage +- engineering control: implement a weekly archival cron job that archives and removes projects inactive for 30 days; log all archival operations +- verification target: WORKSPACE_PATH size stays below 50GB after implementing the archival policy +- rollback trigger: archived project is needed by a user; implement a self-service restore procedure from archive +- communication step: communicate the archival policy to all Devika users with the 30-day retention window clearly stated +- learning capture: add disk usage as a monitored metric with alert at 80% and critical at 95% capacity + +### Scenario Playbook 4: SQLite Database Locked During Parallel Task Submission + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: two users submit tasks simultaneously and one returns a database locked error +- initial hypothesis: SQLite's file-level locking prevents concurrent writes from two simultaneous task submissions +- immediate action: serialize task write operations using a Python asyncio lock around SQLite write calls +- engineering control: migrate from SQLite to PostgreSQL for concurrent team deployments; 
update the database URL in config.toml +- verification target: 10 concurrent task submissions all succeed without lock errors after serialization +- rollback trigger: asyncio lock causes task submission queuing that is unacceptably slow; increase worker count instead +- communication step: document SQLite concurrency limitations and the PostgreSQL migration path in the deployment guide +- learning capture: add a concurrency stress test to the CI suite that submits 5 simultaneous tasks and asserts zero lock errors + +### Scenario Playbook 5: Task History Lost After Server Migration + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: team migrates Devika to a new server but the SQLite database is not copied; all project history is lost +- initial hypothesis: SQLite database file was stored in the application directory which was not included in the migration plan +- immediate action: restore from the most recent backup; if no backup exists, document the data loss and establish a backup procedure +- engineering control: implement daily SQLite backup to an S3 bucket or shared network volume with 30-day retention +- verification target: daily backup job runs successfully and restore test confirms the backup is valid +- rollback trigger: backup job fails; alert immediately and investigate before the next day's backup window +- communication step: document the backup and restore procedure in the operations runbook and train the team on it +- learning capture: add backup job status as a monitored metric; include backup validation in the weekly operations review + +### What Problem Does This Solve? + +Devika's project and workspace management layer solves the isolation and traceability problem in autonomous code generation sessions. Without project isolation, code generated for different features or clients would intermingle in a single directory, making it impossible to track which code belongs to which task. 
The SQLite project record and per-project workspace directory provide the minimal structure needed to run multiple autonomous coding sessions concurrently while keeping their outputs separate and auditable. + +### How it Works Under the Hood + +1. When a user creates a project in the UI, the backend inserts a project record into SQLite with a UUID, name, and selected model. +2. The orchestrator creates a subdirectory under WORKSPACE_PATH using the project identifier when the first task is submitted. +3. All coder agent file writes are prefixed with the project workspace path before being passed to the file writer function. +4. Task invocations and agent interactions are logged as records in the SQLite task history table, keyed by project ID. +5. The frontend project view queries the backend for the project's task history and file tree, which is built by scanning the workspace directory. +6. Git operations (if configured) are run as subprocess calls within the project workspace directory. + +### Source Walkthrough + +- [Devika Project Module](https://github.com/stitionai/devika/tree/main/src/project) — Why it matters: the project creation, workspace initialization, and task history storage logic. +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) — Why it matters: the WORKSPACE_PATH and project configuration options. +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) — Why it matters: the workspace and project isolation design rationale. +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) — Why it matters: the user-facing description of project creation and workspace management. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 5: Web Research and Browser Integration](05-web-research-and-browser-integration.md) +- [Next Chapter: Chapter 7: Debugging and Troubleshooting](07-debugging-and-troubleshooting.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/07-debugging-and-troubleshooting.md b/tutorials/devika-tutorial/07-debugging-and-troubleshooting.md new file mode 100644 index 00000000..4b5279ec --- /dev/null +++ b/tutorials/devika-tutorial/07-debugging-and-troubleshooting.md @@ -0,0 +1,228 @@ +--- +layout: default +title: "Chapter 7: Debugging and Troubleshooting" +nav_order: 7 +parent: Devika Tutorial +--- + +# Chapter 7: Debugging and Troubleshooting + +Welcome to **Chapter 7: Debugging and Troubleshooting**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers how to diagnose and resolve failures in Devika's agent pipeline, from startup errors to mid-task agent loops, using logs, the self-reflection mechanism, and targeted countermeasures. + +## Learning Goals + +- identify the log sources and log levels that expose agent pipeline state during task execution +- diagnose the most common failure patterns across planner, researcher, coder, and action agents +- understand how the internal monologue self-reflection loop can be leveraged as a debugging signal +- apply systematic countermeasures for each failure category without restarting the entire pipeline + +## Fast Start Checklist + +1. enable DEBUG log level in config.toml and observe the agent interaction log during a task run +2. submit a task that deliberately requires web research and trace the full researcher log output +3. 
identify the log line that indicates a coder agent invocation and the line that confirms file write +4. simulate a deliberate error (bad API key) and trace it from the request to the error log entry + +## Source References + +- [Devika Logs and Debugging](https://github.com/stitionai/devika#debugging) +- [Devika Agent Source](https://github.com/stitionai/devika/tree/main/src/agents) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Repository](https://github.com/stitionai/devika) + +## Summary + +You now have a systematic debugging playbook for Devika that covers log interpretation, agent failure diagnosis, and targeted countermeasures for every major failure category in the pipeline. + +Next: [Chapter 8: Production Operations and Governance](08-production-operations-and-governance.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 7: Debugging and Troubleshooting** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Devika's Python backend emits structured logs via the standard Python `logging` module; log level is configured in `config.toml` under `LOG_LEVEL` and defaults to `INFO`. +2. Each agent logs its invocation with the model used, step context, and a timestamp; setting `LOG_LEVEL=DEBUG` adds full prompt and response payloads to the logs. +3. The orchestrator logs state transitions between agents (planner → researcher → coder → action → monologue) allowing pipeline tracing without code instrumentation. +4. 
The internal monologue agent logs its decision (`proceed`, `revise`, or `done`) along with the reasoning text; this is the primary signal for diagnosing loop problems. +5. Playwright browser errors are captured by the browser module and logged with the URL, HTTP status, and error message; headless mode errors may require switching to headful for visual debugging. +6. Qdrant connection errors surface as Python `grpc` or `httpx` exceptions in the backend log; they indicate the vector store is unreachable before or during task execution. +7. LLM provider errors (rate limits, auth failures, context overflow) are caught in the `src/llm/` abstraction and logged with the provider name, error code, and request metadata. +8. The action agent logs stdout and stderr from executed code; these logs are the primary signal for diagnosing code correctness issues in generated programs. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Log verbosity | INFO for production | DEBUG for active debugging sessions | disk usage vs diagnostic detail | +| Log storage | stdout only | structured JSON logs to file + log aggregation | simplicity vs searchability | +| Error alerting | manual log review | alert on error patterns via log aggregation tool | operational overhead vs response time | +| Agent replay | re-submit entire task | patch intermediate state and replay from specific step | simplicity vs efficiency | +| Debugging browser issues | headless mode + log analysis | headful Playwright with screenshots on failure | invisibility vs visual clarity | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Backend silent failure | task submitted but no agent log output | unhandled exception in orchestrator before first log line | add a 
try/except at the orchestrator entry point and log all exceptions |
+| Researcher hangs indefinitely | no researcher completion log after 60 seconds | Playwright navigation hangs on a slow-loading page | add Playwright navigation timeout (30 seconds) and fall through to next result URL |
+| Coder produces truncated JSON | JSON parse error in orchestrator | coder LLM response was cut off due to context limit | reduce context chunk size or switch to larger context window model |
+| Action agent not invoked | task completes but no code was executed | coder returned no `terminal_command` field | verify coder prompt includes instruction to add `terminal_command` when code is executable |
+| Internal monologue returns null | orchestrator crashes with an `AttributeError` on a `None` result | monologue LLM returned malformed JSON | add JSON validation with fallback decision (default to `proceed`) on monologue parse failure |
+| Qdrant retrieval returns 0 chunks | coder receives empty research context | Qdrant collection empty or wrong namespace queried | verify researcher upsert succeeded by checking Qdrant dashboard after researcher step |
+
+### Implementation Runbook
+
+1. Set `LOG_LEVEL=DEBUG` in config.toml and restart the backend to enable full prompt/response logging.
+2. Submit a failing task and capture the full backend log output to a file: `python devika.py 2>&1 | tee debug.log`.
+3. Search the log for the specific agent that last logged before the failure: `grep "AGENT_STEP" debug.log`.
+4. For researcher failures, search for `PLAYWRIGHT` and `QDRANT` log lines to trace browser and storage operations.
+5. For coder failures, search for `CODER_INVOCATION` and `FILE_WRITE` log lines to verify code generation and file output.
+6. For monologue loop issues, search for `MONOLOGUE_DECISION` log lines and count iterations to detect infinite loops.
+7. For action agent failures, search for `EXECUTION_STDOUT` and `EXECUTION_STDERR` log lines to read generated code output.
+8. 
For provider errors, search for the provider name and `ERROR` in the log to identify authentication or rate limit failures. +9. After identifying the root cause, apply the targeted countermeasure from the failure modes table and re-submit the task. + +### Quality Gate Checklist + +- [ ] LOG_LEVEL is configurable at runtime without code changes +- [ ] all agent invocations emit structured log entries with agent name, step number, and model used +- [ ] orchestrator state transitions are logged at INFO level for production tracing +- [ ] Playwright navigation has an explicit timeout configured to prevent indefinite hangs +- [ ] coder JSON response is validated before parsing and malformed responses are logged and retried +- [ ] internal monologue parse failures have a safe fallback decision to prevent orchestrator crash +- [ ] action agent stdout and stderr are captured and logged for every code execution +- [ ] Qdrant retrieval result count is logged per step to detect empty context early + +### Source Alignment + +- [Devika Agent Source Directory](https://github.com/stitionai/devika/tree/main/src/agents) +- [Devika README Debugging Section](https://github.com/stitionai/devika#debugging) +- [Devika Browser Module](https://github.com/stitionai/devika/tree/main/src/browser) +- [Devika LLM Abstraction](https://github.com/stitionai/devika/tree/main/src/llm) +- [Devika Repository](https://github.com/stitionai/devika) + +### Cross-Tutorial Connection Map + +- [OpenHands Tutorial](../openhands-tutorial/) — debugging patterns for a comparable autonomous coding agent +- [LangFuse Tutorial](../langfuse-tutorial/) — LLM observability platform applicable to tracing Devika agent calls +- [SWE-agent Tutorial](../swe-agent-tutorial/) — debugging autonomous agent loops in a single-agent architecture +- [Playwright MCP Tutorial](../playwright-mcp-tutorial/) — Playwright-specific debugging and error handling techniques +- [LiteLLM Tutorial](../litellm-tutorial/) — LLM proxy debugging 
for identifying provider-level issues + +### Advanced Practice Exercises + +1. Instrument the Devika orchestrator with OpenTelemetry spans for each agent invocation and export traces to Jaeger for visual pipeline tracing. +2. Write a log parser script that reads a Devika debug log and produces a timeline table showing agent invocations, durations, and decisions. +3. Build a "dry run" mode that runs the planner and researcher but skips the coder and action agents, enabling research validation without code generation. +4. Add structured JSON logging to all agent invocations and configure log aggregation in Grafana Loki or similar for searchable multi-task log correlation. +5. Implement an automatic retry-with-backoff mechanism in the orchestrator that re-invokes a failed agent step up to three times before escalating to a task failure. + +### Review Questions + +1. What log level must be set in config.toml to see full prompt and response payloads in the backend log? +2. What is the primary log signal that indicates the internal monologue agent has entered an infinite revision loop? +3. How do you diagnose whether a Qdrant retrieval is returning empty results versus the researcher failing to upsert chunks? +4. What Playwright configuration change allows you to visually observe browser behavior during a debugging session? +5. What is the safe fallback decision for the orchestrator to take if the internal monologue returns malformed JSON? 
+ +### Scenario Playbook 1: Task Submitted But No Agent Log Output + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task is submitted through the UI but the backend log shows no agent invocation lines +- initial hypothesis: unhandled exception in the orchestrator before the first agent is invoked +- immediate action: set LOG_LEVEL=DEBUG and re-submit; look for a stack trace immediately after the task submission log line +- engineering control: wrap the orchestrator entry point in a top-level try/except that logs any exception with full traceback before re-raising +- verification target: any exception in the orchestrator path is logged with full context before the task is marked as failed +- rollback trigger: broad exception catching masks specific errors that need different handling paths +- communication step: document the "silent task failure" symptom and the LOG_LEVEL=DEBUG diagnostic step in the troubleshooting guide +- learning capture: add an integration test that injects a known bad input and asserts that an error log entry is produced + +### Scenario Playbook 2: Researcher Hangs on Slow-Loading Page + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task progress stops for 5+ minutes; last log line is from the researcher agent navigating to a URL +- initial hypothesis: Playwright is waiting indefinitely for a slow or unresponsive web page to finish loading +- immediate action: add a 30-second navigation timeout to Playwright's `page.goto()` call and configure `wait_for_load_state` to use `'domcontentloaded'` instead of `'networkidle'` +- engineering control: wrap all Playwright navigation in a try/except for `TimeoutError` that falls through to the next result URL +- verification target: researcher completes all steps within 120 seconds even when individual page loads time out +- rollback trigger: 30-second timeout causes legitimate slow 
documentation sites to be skipped; increase to 60 seconds +- communication step: document the navigation timeout configuration in the browser module README +- learning capture: add a researcher timeout test that mocks a slow URL and asserts the fallback behavior + +### Scenario Playbook 3: Coder Returns Malformed JSON + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: orchestrator crashes with a JSON parse error after the coder agent step +- initial hypothesis: the coder LLM response was truncated due to the context window limit, cutting off the JSON payload +- immediate action: check the coder log for the raw LLM response and identify where the JSON is truncated +- engineering control: add JSON validation with error logging before parsing; implement a retry with a shorter context if parse fails +- verification target: truncated JSON responses trigger a logged retry rather than an orchestrator crash +- rollback trigger: context reduction retry produces lower quality code; flag as a warning and surface to the operator +- communication step: document the JSON truncation failure mode and the context window limit recommendation in the troubleshooting guide +- learning capture: add a coder JSON validation unit test with a known-truncated response fixture + +### Scenario Playbook 4: Internal Monologue Loops 15 Times Without Completing + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: agent task runs for 30 minutes with repeated "revise" decisions in the monologue log +- initial hypothesis: the monologue model is hallucinating "not done" because the completion criteria are not clear +- immediate action: inject the iteration count into the monologue prompt with an explicit instruction: "if iteration > 8, return done" +- engineering control: add a hard cap on monologue iterations in the orchestrator; after 10 iterations, force the `done` decision and log a 
warning +- verification target: no task runs more than 10 monologue iterations on benchmark tasks; tasks complete in under 15 minutes +- rollback trigger: forced completion causes incomplete workspace output on legitimately complex multi-step tasks +- communication step: surface the iteration count in the UI task progress view so users can monitor loop depth +- learning capture: add per-task monologue iteration count as a metric and set an alert at 8 iterations + +### Scenario Playbook 5: Action Agent Code Execution Silently Fails + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: task completes successfully but running the generated code manually produces errors +- initial hypothesis: the action agent executed the code but swallowed the non-zero exit code without logging it +- immediate action: add explicit return code checking in the action agent: log stderr and raise if exit code is non-zero +- engineering control: surface action agent exit codes and stderr in the UI task result view for operator review +- verification target: any non-zero exit code from code execution appears in the backend log and the task result +- rollback trigger: strict exit code checking causes task failure on warnings that don't affect functionality +- communication step: document the code execution error visibility feature in the user guide +- learning capture: add an action agent test that runs code with a known error and asserts the exit code is logged + +### What Problem Does This Solve? + +Devika's multi-agent pipeline creates multiple potential failure points that are invisible without structured debugging. A task that "completes" may have silently skipped research, generated truncated code, or looped without producing real output. Without systematic log analysis and targeted countermeasures for each agent boundary, engineers waste significant time re-submitting tasks and guessing at root causes. 
This chapter provides the diagnostic toolset that converts opaque pipeline failures into actionable, repeatable debugging workflows. + +### How it Works Under the Hood + +1. The Python backend uses the `logging` module with configurable handlers; at DEBUG level, each agent logs its full prompt and LLM response. +2. The orchestrator logs a state transition entry at each agent handoff, creating a breadcrumb trail through the pipeline. +3. Playwright errors are caught as Python exceptions in the browser module and logged with URL, status, and error type. +4. LLM provider errors are caught in the `src/llm/` abstraction and re-raised as typed exceptions that the orchestrator can handle distinctly. +5. The internal monologue decision is logged as a structured JSON entry with the decision type and the reasoning text. +6. Action agent execution uses Python's `subprocess` module; stdout and stderr are captured and logged regardless of exit code. + +### Source Walkthrough + +- [Devika Agent Source Directory](https://github.com/stitionai/devika/tree/main/src/agents) — Why it matters: the implementation of each agent including logging, error handling, and LLM invocation. +- [Devika LLM Abstraction](https://github.com/stitionai/devika/tree/main/src/llm) — Why it matters: the provider error handling layer where rate limits, auth failures, and truncation errors are caught. +- [Devika Browser Module](https://github.com/stitionai/devika/tree/main/src/browser) — Why it matters: the Playwright integration including navigation, content extraction, and error handling code. +- [Devika README Debugging](https://github.com/stitionai/devika#debugging) — Why it matters: the official guidance on log level configuration and debugging procedures. 
+ +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 6: Project Management and Workspaces](06-project-management-and-workspaces.md) +- [Next Chapter: Chapter 8: Production Operations and Governance](08-production-operations-and-governance.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/08-production-operations-and-governance.md b/tutorials/devika-tutorial/08-production-operations-and-governance.md new file mode 100644 index 00000000..073a7330 --- /dev/null +++ b/tutorials/devika-tutorial/08-production-operations-and-governance.md @@ -0,0 +1,227 @@ +--- +layout: default +title: "Chapter 8: Production Operations and Governance" +nav_order: 8 +parent: Devika Tutorial +--- + +# Chapter 8: Production Operations and Governance + +Welcome to **Chapter 8: Production Operations and Governance**. In this part of **Devika Tutorial: Open-Source Autonomous AI Software Engineer**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + +This chapter covers team deployment strategies, security hardening, API cost governance, code review requirements for agent-generated code, and the operational runbooks needed to run Devika safely at scale. 
+ +## Learning Goals + +- design a team deployment architecture for Devika that enforces access control and audit logging +- implement API cost governance controls that prevent runaway spend from autonomous agent tasks +- define code review and merge policies that are appropriate for agent-generated code +- build operational runbooks for incident response, key rotation, and capacity management + +## Governance Checklist + +- all LLM API keys are stored in a secrets manager, not in config.toml on disk +- agent-generated code requires human review before merging to protected branches +- API spend is tracked per project with per-day and per-task budget caps +- audit logs capture every task submission, agent invocation, and workspace file write + +## Source References + +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Security Policy](https://github.com/stitionai/devika/blob/main/SECURITY.md) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika Repository](https://github.com/stitionai/devika) + +## Summary + +You now have a complete production governance framework for Devika covering security, cost controls, code review policies, and operational runbooks for safe team-scale autonomous coding. + +Return to: [Tutorial Index](index.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- tutorial slug: **devika-tutorial** +- chapter focus: **Chapter 8: Production Operations and Governance** +- system context: **Devika Agentic Software Engineer** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. 
Production Devika deployments replace the local `config.toml` API key storage with a secrets manager (AWS Secrets Manager, HashiCorp Vault, or GCP Secret Manager) and inject secrets at runtime via environment variables. +2. The FastAPI backend should be deployed behind a reverse proxy (Nginx or Caddy) with TLS termination and authentication middleware (JWT, SSO, or IP allowlist) to prevent unauthorized task submission. +3. LLM API cost governance is implemented by wrapping the `src/llm/` abstraction with a budget-tracking decorator that queries a spend ledger before each LLM call and blocks calls that would exceed the configured budget. +4. Agent-generated code in the project workspace should be staged in a review branch and require a human-approved pull request before merging to the main branch of any production repository. +5. All task submissions, agent invocations, and workspace writes must be captured in an immutable audit log (separate from application logs) stored in append-only storage for compliance and incident investigation. +6. Qdrant in production should run with authentication enabled and use a dedicated persistent volume with daily snapshots; the Qdrant API key must be stored in the secrets manager alongside LLM keys. +7. Capacity planning covers three resource axes: LLM API token throughput (controlled by provider rate limits and budget caps), Qdrant storage growth (controlled by TTL-based cleanup), and workspace disk (controlled by archival policy). +8. Incident response for autonomous agent systems requires a distinct runbook category: "autonomous action rollback" covering workspace file reversion, git branch deletion, and Qdrant collection cleanup. 
+ +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Secret management | config.toml with strict file permissions | secrets manager with automatic rotation | setup simplicity vs security posture | +| API budget controls | manual spend monitoring | per-task budget cap with automatic kill switch | operational overhead vs cost protection | +| Code review policy | post-task human review | PR-based review with required approvers before any merge | friction vs safety | +| Access control | single shared login | per-user auth with role-based task submission limits | setup simplicity vs accountability | +| Audit logging | application log review | immutable append-only audit log with retention policy | cost vs compliance | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| Runaway API spend | billing alert fires; spend 10x above baseline | no per-task budget cap; expensive model on long task | implement per-task token budget; alert at 80% and kill at 100% | +| Unauthorized task submission | unknown project in audit log | Devika UI exposed without authentication | add authentication middleware before frontend and backend API | +| Agent-generated code merged without review | production bug traced to unreviewed AI code | no merge protection on review branch | enforce branch protection rules requiring human approval on all AI-generated code PRs | +| LLM API key leaked in logs | key pattern appears in log aggregation search | key was logged in debug mode prompt payload | add secret scrubbing to log formatters; exclude key material from DEBUG logs | +| Qdrant data loss | tasks return empty research context after server restart | Qdrant running without persistent volume | mount a persistent volume and enable Qdrant snapshots | +| 
Compliance audit fails due to missing task records | auditor requests task history; records are incomplete | no immutable audit log; only application logs retained | implement append-only audit log with tamper-evident storage | + +### Implementation Runbook + +1. Deploy Devika with Docker Compose including the backend, frontend, and Qdrant services with named persistent volumes. +2. Configure a reverse proxy (Nginx) with TLS and an authentication middleware (e.g., Authelia or OAuth2 Proxy) in front of both the API and frontend. +3. Migrate all secrets from config.toml to a secrets manager and inject them as environment variables in the Docker Compose service definition. +4. Add a budget-tracking middleware to the `src/llm/` abstraction that records token usage per task and blocks calls exceeding the configured per-task cap. +5. Set up an immutable audit log sink (e.g., AWS CloudWatch Logs with object lock, or an append-only PostgreSQL audit table) for all task and agent events. +6. Enforce branch protection rules on all repositories that Devika writes to: require pull request review and status checks before merge. +7. Add secret scanning pre-commit hooks to all project workspaces using `detect-secrets` or `git-secrets`. +8. Configure Qdrant with API key authentication and enable nightly snapshot exports to S3 or equivalent durable storage. +9. Schedule weekly operational reviews covering API spend, task success rate, workspace disk usage, and Qdrant collection size metrics. 
+ +### Quality Gate Checklist + +- [ ] all LLM API keys are stored in a secrets manager and injected at runtime; config.toml contains no production keys +- [ ] FastAPI backend is behind an authenticated reverse proxy and not directly accessible from the public internet +- [ ] per-task token budget cap is implemented and alerts fire at 80% consumption with hard stop at 100% +- [ ] all agent-generated code requires human review in a PR before merging to any protected branch +- [ ] immutable audit log captures all task submissions, agent invocations, and workspace writes with timestamps and user identifiers +- [ ] Qdrant is running with API key authentication and a persistent volume with daily snapshot exports +- [ ] secret scanning pre-commit hooks are active in all project workspaces +- [ ] incident response runbooks for autonomous action rollback are documented and tested quarterly + +### Source Alignment + +- [Devika Security Policy](https://github.com/stitionai/devika/blob/main/SECURITY.md) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika config.example.toml](https://github.com/stitionai/devika/blob/main/config.example.toml) +- [Devika Repository](https://github.com/stitionai/devika) + +### Cross-Tutorial Connection Map + +- [OpenHands Tutorial](../openhands-tutorial/) — production governance patterns for a comparable autonomous coding agent +- [LangFuse Tutorial](../langfuse-tutorial/) — LLM observability and cost tracking applicable to Devika's API spend governance +- [SWE-agent Tutorial](../swe-agent-tutorial/) — governance and security considerations for autonomous coding agents +- [Supabase Tutorial](../supabase-tutorial/) — PostgreSQL backend that can replace Devika's SQLite for audit log durability +- [PostHog Tutorial](../posthog-tutorial/) — product analytics for tracking Devika usage patterns and task success rates + 
+### Advanced Practice Exercises + +1. Build a Docker Compose deployment for Devika that includes Nginx, Authelia, Devika backend, Devika frontend, Qdrant with persistent volume, and a Prometheus metrics exporter. +2. Implement a per-task token budget middleware in `src/llm/` that reads the budget from config.toml, tracks usage in a Redis counter, and kills the task if the budget is exceeded. +3. Create a GitHub Actions workflow that automatically opens a pull request with agent-generated code from a Devika workspace and assigns the configured reviewers. +4. Build an audit log exporter that reads the Devika SQLite task history and writes structured JSON audit records to an S3 bucket with server-side encryption. +5. Write an incident response runbook for the scenario where Devika commits malicious or security-violating code to a repository and document the full rollback procedure. + +### Review Questions + +1. What are the three resource axes that capacity planning must cover for a production Devika deployment? +2. Why is immutable append-only audit logging more important for autonomous coding agents than for traditional software tools? +3. What is the minimum code review policy that should be enforced before any Devika-generated code reaches a protected branch? +4. How does a per-task token budget cap prevent runaway API spend without requiring manual monitoring? +5. What incident response steps are unique to autonomous agent systems compared to traditional software incident response? 
+ +### Scenario Playbook 1: Runaway API Spend From a Long-Running Task + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: monthly billing alert fires at 3x the expected budget; investigation shows one task consumed 2M tokens +- initial hypothesis: a task with a broad prompt entered a long monologue loop with an expensive model, consuming tokens without producing useful output +- immediate action: identify the offending task in the audit log and terminate it if still running; review the generated output for value +- engineering control: implement per-task token budget middleware that hard-stops any task exceeding 100k tokens and sends an alert +- verification target: subsequent tasks on the same model and prompt type stay within 50k tokens per task +- rollback trigger: token cap is too low for legitimate large codebase tasks; increase cap selectively per project type +- communication step: notify the team of the new per-task token cap and the rationale; publish token usage data per project in the weekly ops review +- learning capture: add token usage as a first-class metric in the monitoring dashboard with per-project and per-model breakdowns + +### Scenario Playbook 2: Devika API Exposed Without Authentication + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: security scan finds Devika's port 1337 accessible from the public internet without authentication +- initial hypothesis: the Docker Compose deployment exposed the backend port directly without a reverse proxy or firewall rule +- immediate action: immediately block port 1337 at the firewall and redeploy with Nginx reverse proxy and authentication middleware +- engineering control: standardize deployment with Docker Compose that places Nginx in front of the backend; bind backend to localhost only +- verification target: external port scan confirms port 1337 is not accessible; all requests 
require authentication via the proxy +- rollback trigger: authentication middleware blocks legitimate team members; verify user provisioning and auth configuration +- communication step: notify the security team of the exposure window, affected data, and remediation timeline +- learning capture: add a security deployment checklist that includes port exposure verification and authentication confirmation as required steps + +### Scenario Playbook 3: Agent-Generated Code Merged Without Review + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: production incident traced to a bug in code that was directly committed from a Devika workspace without human review +- initial hypothesis: the engineer copied code directly from the workspace to the main branch without a pull request review step +- immediate action: revert the offending commit, restore the previous working state, and conduct a root cause analysis +- engineering control: enforce GitHub branch protection rules requiring at least one human reviewer on all PRs; add a CI status check that detects AI-generated file markers +- verification target: no direct commits to the main branch from Devika workspace paths; all changes go through PRs +- rollback trigger: PR requirement slows emergency fixes; add an emergency bypass process with mandatory post-hoc review +- communication step: communicate the code review policy to all team members; add it to the onboarding checklist +- learning capture: add the incident to the security postmortem log and include it in the quarterly security review + +### Scenario Playbook 4: LLM API Key Leaked in Debug Logs + +- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer** +- trigger condition: log aggregation system surfaces an alert with a pattern matching an Anthropic API key format in the backend debug log +- initial hypothesis: DEBUG log level is active in production and the full prompt payload 
(together with request metadata that carries the key loaded from config.toml) is being logged
+- immediate action: immediately rotate the exposed API key in the provider dashboard; lower the log level to INFO
+- engineering control: add a log formatter that scrubs patterns matching known API key formats before writing log entries; verify at DEBUG level too
+- verification target: grep for the new API key in all logs confirms zero occurrences after the scrubber is in place
+- rollback trigger: log scrubbing introduces performance overhead on high-throughput systems; benchmark and optimize
+- communication step: notify the security team of the key exposure event and the rotation; update the key inventory
+- learning capture: add API key pattern scrubbing to the standard logging configuration and include it in the security baseline
+
+### Scenario Playbook 5: Qdrant Data Loss After Server Restart
+
+- tutorial context: **Devika Tutorial: Open-Source Autonomous AI Software Engineer**
+- trigger condition: after a server restart, all tasks return empty research context; Qdrant dashboard shows an empty collection
+- initial hypothesis: Qdrant was running without a mounted volume (the `qdrant/qdrant` Docker image stores data inside the container filesystem by default) and lost its data when the container was recreated
+- immediate action: restore from the most recent Qdrant snapshot if available; otherwise accept data loss and establish the backup procedure
+- engineering control: redeploy Qdrant with a named Docker volume; enable scheduled snapshot exports to S3 via Qdrant's snapshot API
+- verification target: Qdrant data persists across container restarts and is recoverable from S3 snapshot within 15 minutes
+- rollback trigger: snapshot restore fails due to version incompatibility; pin the Qdrant Docker image version
+- communication step: document the Qdrant persistence requirement prominently in the deployment guide with the required Docker volume configuration
+- learning capture: add Qdrant snapshot success as a monitored metric; alert if nightly snapshot has not
completed by 02:00 UTC + +### What Problem Does This Solve? + +Devika's production governance framework solves the accountability and blast-radius problem for autonomous AI coding agents operating in team environments. Without governance controls, a single misconfigured task can consume thousands of dollars in API spend, write security-violating code to a production repository, or expose sensitive project context to unauthorized users. The controls in this chapter create the human oversight checkpoints, cost guardrails, and audit trails that transform Devika from a powerful but risky autonomous tool into a responsibly operated engineering capability. + +### How it Works Under the Hood + +1. Secrets manager integration injects API keys as environment variables at container startup; the LLM abstraction layer reads from environment variables rather than config.toml in production mode. +2. The budget-tracking middleware intercepts every `LLM.inference()` call, queries a Redis counter for current task token usage, and raises a `BudgetExceededError` before the LLM call if the cap is reached. +3. The reverse proxy authenticates requests using JWT or session cookies before forwarding to the FastAPI backend; unauthenticated requests receive a 401 response. +4. The audit log sink receives structured event records via a Python logging handler and writes them to append-only storage; the handler is registered in the application startup sequence. +5. Qdrant snapshot exports are triggered via the Qdrant HTTP API on a cron schedule; snapshots are uploaded to S3 with server-side encryption. +6. Branch protection rules in GitHub (or GitLab) prevent direct pushes to protected branches; all Devika workspace code must go through a PR workflow with required reviewers. + +### Source Walkthrough + +- [Devika Security Policy](https://github.com/stitionai/devika/blob/main/SECURITY.md) — Why it matters: the official security reporting and hardening guidance for the Devika project. 
+- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) — Why it matters: the deployment and configuration baseline from which production hardening starts. +- [Devika LLM Abstraction](https://github.com/stitionai/devika/tree/main/src/llm) — Why it matters: the layer where budget controls and secret injection are implemented. +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) — Why it matters: the architectural boundaries that inform where governance controls are most effective. + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 7: Debugging and Troubleshooting](07-debugging-and-troubleshooting.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/devika-tutorial/index.md b/tutorials/devika-tutorial/index.md new file mode 100644 index 00000000..24cdb785 --- /dev/null +++ b/tutorials/devika-tutorial/index.md @@ -0,0 +1,109 @@ +--- +layout: default +title: "Devika Tutorial" +nav_order: 190 +has_children: true +format_version: v2 +--- + +# Devika Tutorial: Open-Source Autonomous AI Software Engineer + +> Learn how to deploy and operate `stitionai/devika` — a multi-agent autonomous coding system that plans, researches, writes, and debugs code end-to-end. 
+ +[![GitHub Repo](https://img.shields.io/badge/GitHub-stitionai%2Fdevika-black?logo=github)](https://github.com/stitionai/devika) +[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/stitionai/devika/blob/main/LICENSE) +[![Docs](https://img.shields.io/badge/docs-devika-blue)](https://github.com/stitionai/devika#readme) + +## Why This Track Matters + +Devika represents one of the most complete open-source implementations of an autonomous software engineering agent, combining multi-agent coordination, live web research, browser automation, and polyglot code generation in a single self-hosted stack. As teams evaluate autonomous coding systems for internal use, understanding how Devika's agent pipeline is structured, how it coordinates specialized roles, and how to govern it safely becomes a critical engineering competency. This track takes you from first install to production-grade team deployment, covering every architectural layer in depth. + +This track focuses on: + +- deploying and configuring Devika with any major LLM provider including Claude 3, GPT-4, Gemini, Mistral, Groq, and Ollama +- understanding the multi-agent pipeline: planner, researcher, coder, action, and internal monologue agents +- operating browser automation and web research capabilities safely and effectively +- governing autonomous code generation at team scale with cost controls and audit discipline + +## Current Snapshot (auto-updated) + +- repository: [`stitionai/devika`](https://github.com/stitionai/devika) +- stars: about **19.5k** +- latest release: latest main branch +- recent activity: updates in **2025** +- project positioning: open-source autonomous AI software engineer, alternative to Devin by Cognition AI + +## Mental Model + +```mermaid +flowchart LR + A[User Task Prompt] --> B[Planner Agent] + B --> C[Researcher Agent] + C --> D[Browser Automation / Playwright] + D --> E[Coder Agent] + E --> F[Action Agent] + F --> G[Internal Monologue / 
Self-Reflection] + G -->|next step| B + G --> H[Workspace Output + Git] +``` + +## Chapter Guide + +| Chapter | Key Question | Outcome | +|:--------|:-------------|:--------| +| [01 - Getting Started](01-getting-started.md) | How do I install Devika and run a first task? | Working baseline | +| [02 - Architecture and Agent Pipeline](02-architecture-and-agent-pipeline.md) | How do Devika's specialized agents coordinate? | Architecture clarity | +| [03 - LLM Provider Configuration](03-llm-provider-configuration.md) | How do I connect Claude, GPT-4, Gemini, Ollama, and others? | Provider flexibility | +| [04 - Task Planning and Code Generation](04-task-planning-and-code-generation.md) | How does Devika decompose tasks and generate code? | Reliable code output | +| [05 - Web Research and Browser Integration](05-web-research-and-browser-integration.md) | How does Devika research the web with Playwright? | Research agent control | +| [06 - Project Management and Workspaces](06-project-management-and-workspaces.md) | How do I manage projects, files, and git integration? | Workspace discipline | +| [07 - Debugging and Troubleshooting](07-debugging-and-troubleshooting.md) | How do I diagnose failures in the agent pipeline? | Operational resilience | +| [08 - Production Operations and Governance](08-production-operations-and-governance.md) | How do teams deploy Devika safely at scale? 
| Governance runbook | + +## What You Will Learn + +- how to configure and run Devika across multiple LLM providers for different cost and capability tradeoffs +- how to reason about multi-agent coordination, context flow, and internal monologue loops +- how to operate browser automation and research pipelines responsibly +- how to govern autonomous code generation workflows in team environments with audit and rollback controls + +## Source References + +- [Devika Repository](https://github.com/stitionai/devika) +- [Devika README](https://github.com/stitionai/devika/blob/main/README.md) +- [Devika Architecture Docs](https://github.com/stitionai/devika/blob/main/docs/architecture.md) +- [Devika How It Works](https://github.com/stitionai/devika#how-it-works) +- [Devika Setup Guide](https://github.com/stitionai/devika#getting-started) + +## Related Tutorials + +- [OpenHands Tutorial](../openhands-tutorial/) +- [SWE-agent Tutorial](../swe-agent-tutorial/) +- [Cline Tutorial](../cline-tutorial/) +- [Aider Tutorial](../aider-tutorial/) +- [Open SWE Tutorial](../open-swe-tutorial/) + +--- + +Start with [Chapter 1: Getting Started](01-getting-started.md). + +## Navigation & Backlinks + +- [Start Here: Chapter 1: Getting Started](01-getting-started.md) +- [Back to Main Catalog](../../README.md#-tutorial-catalog) +- [Browse A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) +- [Search by Intent](../../discoverability/query-hub.md) +- [Explore Category Hubs](../../README.md#category-hubs) + +## Full Chapter Map + +1. [Chapter 1: Getting Started](01-getting-started.md) +2. [Chapter 2: Architecture and Agent Pipeline](02-architecture-and-agent-pipeline.md) +3. [Chapter 3: LLM Provider Configuration](03-llm-provider-configuration.md) +4. [Chapter 4: Task Planning and Code Generation](04-task-planning-and-code-generation.md) +5. [Chapter 5: Web Research and Browser Integration](05-web-research-and-browser-integration.md) +6. 
[Chapter 6: Project Management and Workspaces](06-project-management-and-workspaces.md) +7. [Chapter 7: Debugging and Troubleshooting](07-debugging-and-troubleshooting.md) +8. [Chapter 8: Production Operations and Governance](08-production-operations-and-governance.md) + +*Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)* diff --git a/tutorials/kiro-tutorial/01-getting-started.md b/tutorials/kiro-tutorial/01-getting-started.md new file mode 100644 index 00000000..6aea7d16 --- /dev/null +++ b/tutorials/kiro-tutorial/01-getting-started.md @@ -0,0 +1,316 @@ +--- +layout: default +title: "Chapter 1: Getting Started" +nav_order: 1 +parent: Kiro Tutorial +--- + +# Chapter 1: Getting Started + +Welcome to **Chapter 1: Getting Started**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +This chapter gets you from zero to a running Kiro workspace so you can move into spec-driven workflows without setup drift. + +## Learning Goals + +- download and install Kiro on Mac, Windows, or Linux +- authenticate using GitHub, Google, or AWS Builder ID +- open or create your first project +- understand the Kiro workspace layout and panel structure +- run your first AI-assisted interaction in the chat panel + +## Fast Start Checklist + +1. download Kiro from [kiro.dev](https://kiro.dev) +2. launch the installer for your platform +3. authenticate with GitHub, Google, or AWS Builder ID +4. open a local folder or clone a repository +5. 
open the Kiro chat panel and send a first message + +## Installation Paths + +| Platform | Method | Notes | +|:---------|:-------|:------| +| macOS | `.dmg` download from kiro.dev | drag to Applications, allow Gatekeeper | +| Windows | `.exe` installer from kiro.dev | run as administrator if needed | +| Linux | `.deb` or `.AppImage` from kiro.dev | mark AppImage executable before launch | + +## Authentication Methods + +Kiro supports three authentication providers at launch. All grant access to the same base capabilities. + +| Method | Best For | Notes | +|:-------|:---------|:------| +| GitHub | developers with existing GitHub accounts | one-click OAuth flow | +| Google | teams using Google Workspace | standard OAuth redirect | +| AWS Builder ID | teams already using AWS services | connects to AWS identity layer | + +```bash +# After launch, Kiro presents an authentication screen. +# No manual token setup is required for GitHub or Google. +# For AWS Builder ID, sign in at https://profile.aws.amazon.com +# and complete the device authorization flow shown in Kiro. +``` + +## First Project Flow + +``` +1. Launch Kiro +2. Select "Open Folder" or "Clone Repository" +3. For a new project: File > New Folder, then open it in Kiro +4. Kiro indexes the workspace automatically +5. Open the Chat panel (View > Kiro Chat or the sidebar icon) +6. Type: "Summarize this project structure" +``` + +## Workspace Layout + +| Panel | Purpose | +|:------|:--------| +| Explorer | file tree with .kiro/ directory visible | +| Editor | multi-tab code editor (VS Code-compatible) | +| Chat | AI conversation panel with spec and agent controls | +| Terminal | integrated terminal for build and run commands | +| Specs | shortcut panel to requirements, design, and tasks files | + +## First Interaction + +``` +# In the Chat panel, start simple: +> Summarize the top-level directory structure of this project. + +# Kiro reads the workspace and responds with a structured overview. 
+# This confirms authentication and workspace indexing are working. +``` + +## Early Failure Triage + +| Symptom | Likely Cause | First Fix | +|:--------|:-------------|:----------| +| blank chat panel after login | auth token not saved | sign out and re-authenticate | +| project files not indexed | large repo or excluded paths | check .gitignore and Kiro workspace settings | +| model response not appearing | network proxy blocking Kiro endpoints | configure proxy in Kiro settings | +| AWS Builder ID flow hangs | device code expired | restart the sign-in flow in Kiro | + +## Source References + +- [Kiro Website](https://kiro.dev) +- [Kiro Docs: Getting Started](https://kiro.dev/docs/getting-started) +- [Kiro Docs: Authentication](https://kiro.dev/docs/authentication) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now have Kiro installed, authenticated, and connected to a project workspace. + +Next: [Chapter 2: Spec-Driven Development Workflow](02-spec-driven-development-workflow.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 1: Getting Started** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 1: Getting Started` — Kiro process, auth layer, workspace indexer, and model API connection. +2. Separate control-plane decisions (auth provider choice, workspace configuration) from data-plane execution (model inference, file reads). +3. Capture input contracts: local filesystem path, user credentials, and workspace settings; output: indexed workspace and live chat session. +4. 
Trace state transitions: unauthenticated → authenticated → workspace open → indexed → chat ready. +5. Identify extension hooks: custom workspace settings, proxy configuration, and excluded-path policies. +6. Map ownership boundaries: individual developer owns auth tokens; team owns shared workspace config and .kiro/ directory. +7. Specify rollback paths: sign out and re-authenticate; reopen workspace to trigger re-indexing. +8. Track observability signals: auth success/failure logs, indexing completion time, first-message latency. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Auth provider | GitHub OAuth | AWS Builder ID with IAM | simplicity vs AWS identity integration | +| Workspace size | small repo under 10k files | large monorepo with exclusion rules | speed vs completeness | +| Network config | direct connection | proxy with allowlist for kiro.dev | ease vs enterprise security | +| Rollout method | individual install | managed deploy via MDM or package manager | velocity vs governance | +| Incident response | user self-service | IT helpdesk runbook + Kiro logs | cost vs reliability | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| auth token expiry | 401 on chat requests | long-idle session without refresh | re-authenticate; check session TTL settings | +| workspace index failure | empty context responses | large or excluded files | add explicit include patterns; reduce workspace scope | +| proxy interference | connection timeout on model calls | corporate firewall blocking kiro.dev | add kiro.dev to proxy allowlist | +| OS permission denial | Gatekeeper block on macOS | unsigned binary or quarantine flag | clear quarantine attribute: `xattr -d com.apple.quarantine Kiro.app` | +| stale credentials | 
silent auth failures | AWS Builder ID token not refreshed | trigger manual re-auth from Kiro settings | +| network latency spike | slow first-message response | CDN routing or model endpoint cold start | retry with smaller prompt; check Kiro status page | + +### Implementation Runbook + +1. Verify platform prerequisites: OS version meets Kiro minimum requirements. +2. Download Kiro from the official kiro.dev release page and verify the checksum. +3. Run the platform installer and complete any OS-level permission prompts. +4. Launch Kiro and select an authentication provider. +5. Complete the OAuth or device authorization flow and confirm the success screen. +6. Open a local project folder with at least one source file to confirm workspace indexing. +7. Send a test message in the Chat panel and verify a model response is returned. +8. Check the Explorer panel for the `.kiro/` directory (created automatically on first use). +9. Record the installed version and authentication provider for team onboarding documentation. 
+ +### Quality Gate Checklist + +- [ ] Kiro launches without OS-level errors on the target platform +- [ ] authentication flow completes and the Chat panel shows the user identity +- [ ] workspace indexing completes within acceptable time for the repo size +- [ ] first chat message returns a model response without timeout +- [ ] `.kiro/` directory is visible in the Explorer panel +- [ ] proxy and network configuration is documented for team members +- [ ] rollback path (sign-out and re-authenticate) is verified and documented +- [ ] installed version is recorded for future upgrade planning + +### Source Alignment + +- [Kiro Website](https://kiro.dev) +- [Kiro Docs: Getting Started](https://kiro.dev/docs/getting-started) +- [Kiro Docs: Authentication](https://kiro.dev/docs/authentication) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [Cline Tutorial](../cline-tutorial/) +- [Roo Code Tutorial](../roo-code-tutorial/) +- [Claude Code Tutorial](../claude-code-tutorial/) +- [OpenCode Tutorial](../opencode-tutorial/) +- [Chapter 2: Spec-Driven Development Workflow](02-spec-driven-development-workflow.md) + +### Advanced Practice Exercises + +1. Install Kiro on a second platform (if available) and compare the authentication flow differences. +2. Configure a large repository with `.kiro/` exclusion settings and measure indexing time before and after. +3. Simulate an auth token expiry by signing out mid-session and document the re-authentication steps. +4. Set up a proxy environment and verify Kiro model calls route correctly through it. +5. Create an onboarding runbook for a five-person team covering install, auth, and first-session steps. + +### Review Questions + +1. Which authentication method integrates most naturally with your team's existing identity provider? +2. What signal confirms that workspace indexing completed successfully before sending the first chat message? +3. 
What tradeoff did you make between workspace scope and indexing speed? +4. How would you recover if the AWS Builder ID device code expired during authentication? +5. What must be documented before scaling Kiro installation to a full engineering team? + +### Scenario Playbook 1: Getting Started - Auth Flow Spike + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: authentication provider OAuth endpoint is slow or intermittently unavailable +- initial hypothesis: identify the smallest reproducible failure boundary in the auth redirect chain +- immediate action: protect developer productivity by switching to an alternative auth provider temporarily +- engineering control: document both GitHub and AWS Builder ID flows so teams can pivot without delay +- verification target: authentication completes within 30 seconds on a standard corporate network +- rollback trigger: if auth fails three consecutive times, escalate to IT for network proxy review +- communication step: notify team channel with auth status and estimated resolution time +- learning capture: add auth fallback procedure to onboarding runbook and automate network pre-check + +### Scenario Playbook 2: Getting Started - Large Repo Indexing Failure + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: workspace indexing hangs or produces incomplete context for a monorepo over 50k files +- initial hypothesis: identify which file patterns or directories are causing indexer stalls +- immediate action: add exclusion rules for build artifacts, `node_modules`, and generated files in workspace settings +- engineering control: define a canonical `.kiro/` exclusion list for the monorepo and commit it to version control +- verification target: indexing completes in under two minutes for the scoped workspace +- rollback trigger: if context responses remain incomplete after exclusion rules, reduce workspace to a single module +- 
communication step: document the exclusion list decision in the team's Kiro setup guide +- learning capture: convert the exclusion list into a reusable workspace template for new team members + +### Scenario Playbook 3: Getting Started - Proxy Interference + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: corporate proxy blocks model API calls from Kiro, resulting in silent timeouts +- initial hypothesis: identify the specific endpoint being blocked by running a direct curl test to kiro.dev +- immediate action: submit an IT ticket to allowlist kiro.dev and the underlying model API endpoints +- engineering control: configure Kiro's proxy settings with the corporate proxy URL and credentials +- verification target: first chat message returns a response within five seconds after proxy configuration +- rollback trigger: if proxy config causes other network issues, revert and use a personal hotspot for temporary access +- communication step: share proxy configuration steps with the team and add to the network setup section of onboarding docs +- learning capture: add a pre-install network check script that tests kiro.dev connectivity before the install begins + +### Scenario Playbook 4: Getting Started - OS Permission Denial + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: macOS Gatekeeper blocks Kiro launch due to quarantine attribute on the downloaded binary +- initial hypothesis: confirm the quarantine attribute is present using `xattr -l Kiro.app` +- immediate action: clear the quarantine attribute with `xattr -d com.apple.quarantine Kiro.app` and relaunch +- engineering control: add a note in the install guide to clear quarantine after download on macOS +- verification target: Kiro launches without security dialogs after the quarantine clear +- rollback trigger: if Gatekeeper continues to block after clearing quarantine, escalate to IT for MDM policy review +- 
communication step: add the quarantine-clear step to the macOS section of the team install guide +- learning capture: investigate whether an enterprise-signed distribution eliminates this step for managed machines + +### Scenario Playbook 5: Getting Started - Version Mismatch on Upgrade + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a Kiro update breaks an existing workspace configuration or .kiro/ directory format +- initial hypothesis: compare the `.kiro/` directory schema between the old and new version release notes +- immediate action: back up the `.kiro/` directory before applying any upgrade +- engineering control: pin the Kiro version in team documentation until a new version is validated on the target repo +- verification target: all spec files and steering configurations load correctly after the upgrade +- rollback trigger: if the upgrade breaks existing specs, restore from backup and roll back to the previous version +- communication step: announce the upgrade validation status to the team before rolling out to all workstations +- learning capture: add a version pin and upgrade validation checklist to the team's Kiro governance document + +## What Problem Does This Solve? + +Most teams struggle with agentic IDE adoption because setup friction causes inconsistent baselines across developer machines. Kiro solves this by providing a single downloadable package with a guided authentication flow, auto-indexing workspace setup, and a visible `.kiro/` directory that anchors all AI configuration in version control from day one. 
+ +In practical terms, this chapter helps you avoid three common failures: + +- inconsistent authentication states that cause intermittent model failures mid-session +- oversized workspace indexing that produces irrelevant context and slow responses +- undocumented network or OS requirements that block adoption for entire teams + +After working through this chapter, you should be able to reason about Kiro's setup as a deterministic onboarding sequence with explicit checkpoints: installed, authenticated, workspace open, indexed, and chat ready. + +## How it Works Under the Hood + +Under the hood, `Chapter 1: Getting Started` follows a repeatable control path: + +1. **Binary bootstrap**: Kiro launches a VS Code-based electron process and initializes the extension host. +2. **Auth token acquisition**: the selected OAuth provider issues a token that Kiro stores in the OS credential store. +3. **Workspace indexing**: Kiro scans the open folder, applies exclusion rules, and builds a local context index. +4. **Model connection**: Kiro establishes a secure connection to the model API endpoint using the stored auth token. +5. **Chat session initialization**: the Chat panel registers the workspace context and prepares the first-message prompt template. +6. **Operational telemetry**: Kiro emits anonymized usage signals for session start, indexing duration, and first-message latency. + +When debugging setup failures, walk this sequence in order and confirm each stage completes before moving to the next. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Website](https://kiro.dev) + Why it matters: the primary distribution point for all platform installers and release notes. +- [Kiro Docs: Getting Started](https://kiro.dev/docs/getting-started) + Why it matters: official step-by-step guide for first-time setup across all supported platforms. 
+- [Kiro Docs: Authentication](https://kiro.dev/docs/authentication) + Why it matters: documents each auth provider's flow, token lifecycle, and re-authentication steps. +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + Why it matters: source of truth for open-source components, release tags, and community issue tracking. + +Suggested trace strategy: +- check the GitHub releases page for the latest version tag before installing +- compare the kiro.dev docs auth section against your team's identity provider to confirm compatibility before deploying widely + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Next Chapter: Chapter 2: Spec-Driven Development Workflow](02-spec-driven-development-workflow.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/02-spec-driven-development-workflow.md b/tutorials/kiro-tutorial/02-spec-driven-development-workflow.md new file mode 100644 index 00000000..8b81f5ab --- /dev/null +++ b/tutorials/kiro-tutorial/02-spec-driven-development-workflow.md @@ -0,0 +1,390 @@ +--- +layout: default +title: "Chapter 2: Spec-Driven Development Workflow" +nav_order: 2 +parent: Kiro Tutorial +--- + +# Chapter 2: Spec-Driven Development Workflow + +Welcome to **Chapter 2: Spec-Driven Development Workflow**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Kiro's defining innovation is that AI assistance is organized around three structured documents rather than freeform chat. This chapter teaches you how to create, iterate, and execute against specs. 
+ +## Learning Goals + +- understand the three-file spec structure: requirements.md, design.md, tasks.md +- write requirements using EARS (Easy Approach to Requirements Syntax) +- generate a design document from requirements using Kiro's spec agent +- break design into actionable tasks that Kiro agents can execute +- iterate specs as requirements change without losing design traceability + +## Fast Start Checklist + +1. open the Kiro Specs panel or navigate to `.kiro/specs/` +2. create a new spec with a feature name (e.g., `user-authentication`) +3. write at least three requirements in EARS format in `requirements.md` +4. ask Kiro to generate `design.md` from the requirements +5. ask Kiro to generate `tasks.md` from the design +6. execute the first task + +## The Three-File Spec Structure + +``` +.kiro/ + specs/ + user-authentication/ + requirements.md ← what the feature must do (EARS syntax) + design.md ← how to build it (architecture, data models, APIs) + tasks.md ← numbered implementation steps for the agent +``` + +Each spec lives in its own named folder under `.kiro/specs/`. Committing these files to version control gives your team a living record of AI-assisted design decisions. + +## EARS Syntax for Requirements + +EARS (Easy Approach to Requirements Syntax) is a structured natural-language format for writing unambiguous requirements. Kiro expects requirements in this format to generate high-quality design and task documents. + +| EARS Pattern | Template | Example | +|:-------------|:---------|:--------| +| Ubiquitous | The `<system>` shall `<response>`. | The system shall hash passwords using bcrypt. | +| Event-driven | When `<trigger>`, the `<system>` shall `<response>`. | When a user submits a login form, the system shall validate credentials against the database. | +| Unwanted behavior | If `<condition>`, then the `<system>` shall `<response>`. | If credentials are invalid, then the system shall return a 401 response with an error message. | +| State-driven | While `<state>`, the `<system>` shall `<response>`. 
| While a session is active, the system shall refresh the JWT token every 15 minutes. | +| Optional feature | Where `<feature>` is supported, the `<system>` shall `<response>`. | Where MFA is enabled, the system shall require a TOTP code at login. | + +## Writing requirements.md + +```markdown +# Requirements: User Authentication + +## Functional Requirements + +- The system shall store user credentials with bcrypt-hashed passwords at cost factor 12. +- When a user submits valid credentials, the system shall issue a signed JWT with a 1-hour expiry. +- If credentials are invalid, then the system shall return HTTP 401 with a generic error message. +- While a session is active, the system shall refresh the JWT automatically 5 minutes before expiry. +- When a user requests logout, the system shall invalidate the session token immediately. + +## Non-Functional Requirements + +- The system shall complete credential validation in under 200ms at p95. +- If the authentication service is unavailable, then the system shall return HTTP 503 within 5 seconds. 
+``` + +## Generating design.md + +Once requirements are written, ask Kiro to generate the design: + +``` +# In the Chat panel: +> Generate a design document for the user-authentication spec based on requirements.md + +# Kiro reads requirements.md and produces design.md covering: +# - component architecture (auth service, token store, session manager) +# - data models (User, Session, RefreshToken) +# - API contracts (POST /auth/login, POST /auth/logout, POST /auth/refresh) +# - error handling strategy +# - security considerations +``` + +A sample `design.md` excerpt: + +```markdown +# Design: User Authentication + +## Architecture + +The authentication feature uses a three-layer model: +- API layer: Express routes for /auth/login, /auth/logout, /auth/refresh +- Service layer: AuthService with validateCredentials(), issueToken(), revokeToken() +- Data layer: PostgreSQL users table, Redis session store for token revocation + +## Data Models + +### User +| Field | Type | Constraints | +|:------|:-----|:-----------| +| id | UUID | primary key | +| email | VARCHAR(255) | unique, not null | +| password_hash | VARCHAR(60) | bcrypt, not null | +| created_at | TIMESTAMP | not null | + +## API Contracts + +POST /auth/login + Body: { email: string, password: string } + Success: 200 { token: string, expires_at: ISO8601 } + Failure: 401 { error: "invalid_credentials" } +``` + +## Generating tasks.md + +After the design is approved, ask Kiro to generate the task list: + +``` +> Generate a tasks.md implementation plan from design.md + +# Kiro produces a numbered task list such as: +``` + +```markdown +# Tasks: User Authentication + +- [ ] 1. Create PostgreSQL migration for users table with id, email, password_hash, created_at +- [ ] 2. Implement AuthService.validateCredentials() with bcrypt comparison +- [ ] 3. Implement AuthService.issueToken() using jsonwebtoken with 1h expiry +- [ ] 4. Implement AuthService.revokeToken() using Redis SET with TTL +- [ ] 5. 
Create POST /auth/login route with input validation and AuthService calls +- [ ] 6. Create POST /auth/logout route that calls revokeToken() +- [ ] 7. Create POST /auth/refresh route with automatic token renewal +- [ ] 8. Add middleware to verify JWT on protected routes +- [ ] 9. Write unit tests for AuthService methods +- [ ] 10. Write integration tests for all /auth routes +``` + +## Executing Tasks + +You can execute tasks one by one or delegate them to the autonomous agent: + +``` +# Execute a single task: +> Complete task 1: create the PostgreSQL migration for the users table + +# Delegate all tasks to the agent: +> Execute all tasks in tasks.md for the user-authentication spec +``` + +## Iterating Specs + +When requirements change, update `requirements.md` first, then regenerate downstream documents: + +``` +> requirements.md has been updated to add MFA support. Regenerate design.md to include TOTP handling. + +# After confirming design.md: +> Regenerate tasks.md to include the new MFA tasks from design.md +``` + +## Source References + +- [Kiro Docs: Specs](https://kiro.dev/docs/specs) +- [Kiro Docs: EARS Syntax](https://kiro.dev/docs/specs/ears) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now understand how to create, generate, and execute three-file specs in Kiro using EARS requirements syntax. + +Next: [Chapter 3: Agent Steering and Rules Configuration](03-agent-steering-and-rules-configuration.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 2: Spec-Driven Development Workflow** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. 
Define the runtime boundary for `Chapter 2: Spec-Driven Development Workflow` — the `.kiro/specs/` directory as the source of truth, the spec agent as transformer, and the chat panel as the control interface. +2. Separate control-plane decisions (which requirements to include, design approval gates) from data-plane execution (file writes, task execution). +3. Capture input contracts: EARS-formatted `requirements.md`; output contracts: approved `design.md` and executable `tasks.md`. +4. Trace state transitions: empty spec folder → requirements written → design generated → design approved → tasks generated → tasks executing → tasks complete. +5. Identify extension hooks: custom EARS templates, design document templates, task numbering conventions. +6. Map ownership boundaries: product/engineer owns `requirements.md`; architect reviews `design.md`; agent executes `tasks.md`. +7. Specify rollback paths: revert `design.md` to a previous git commit; regenerate `tasks.md` from the prior design. +8. Track observability signals: spec generation latency, task completion rate, requirement traceability coverage. 
+ +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Requirements granularity | 5-10 high-level EARS statements | 20+ detailed acceptance criteria | speed vs precision | +| Design approval gate | developer self-approves | architect review before task generation | velocity vs quality | +| Task delegation | manual task-by-task execution | full autonomous delegation | control vs efficiency | +| Spec versioning | file in .kiro/ only | committed to git with PR review | simplicity vs auditability | +| Iteration strategy | regenerate full design on change | diff-patch specific sections | speed vs traceability | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| ambiguous requirements | design doc misses intent | vague EARS statements | add acceptance criteria and examples to each requirement | +| design drift | tasks diverge from design | design.md edited without regenerating tasks | treat design.md as source of truth; always regenerate tasks after edits | +| task scope creep | tasks grow beyond spec | underconstrained task generation | add a "scope boundary" section to design.md | +| stale spec | code diverges from requirements | no enforcement of spec-first updates | add a CI check that alerts when code changes lack a corresponding spec update | +| overgenerated tasks | too many micro-tasks slow progress | fine-grained design decomposition | set a max-tasks constraint in the spec generation prompt | +| spec format violations | agent rejects or misreads spec | non-EARS requirements | validate requirements.md against EARS patterns before generation | + +### Implementation Runbook + +1. Create the spec directory: `.kiro/specs/<feature-name>/`. +2.
Write `requirements.md` using EARS syntax with at least three functional and one non-functional requirement. +3. Ask Kiro to generate `design.md` from `requirements.md` and review the output for completeness. +4. Identify any gaps in the design and add clarifying context to `requirements.md`, then regenerate. +5. Approve `design.md` by committing it to version control with a design-review tag. +6. Ask Kiro to generate `tasks.md` from `design.md` and verify task ordering and dependencies. +7. Execute the first two tasks manually to validate the spec-to-code translation quality. +8. Promote remaining tasks to autonomous agent execution after manual validation. +9. Mark completed tasks in `tasks.md` and commit the updated spec after each task group completes. + +### Quality Gate Checklist + +- [ ] all requirements are written in valid EARS syntax with no ambiguous "should" language +- [ ] `design.md` covers component architecture, data models, API contracts, and error handling +- [ ] `tasks.md` is numbered, ordered by dependency, and each task references the design section it implements +- [ ] spec files are committed to version control before task execution begins +- [ ] at least two tasks are manually validated before autonomous delegation +- [ ] a rollback path (git revert of spec files) is documented and tested +- [ ] spec generation latency is within acceptable bounds for the team's workflow +- [ ] requirement traceability is confirmed: every task maps to at least one requirement + +### Source Alignment + +- [Kiro Docs: Specs](https://kiro.dev/docs/specs) +- [Kiro Docs: EARS Syntax](https://kiro.dev/docs/specs/ears) +- [Kiro Docs: Task Execution](https://kiro.dev/docs/specs/tasks) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [Claude Code Tutorial](../claude-code-tutorial/) +- [Cline Tutorial](../cline-tutorial/) +- [OpenHands Tutorial](../openhands-tutorial/) +- [Plandex Tutorial](../plandex-tutorial/) +- 
[Chapter 3: Agent Steering and Rules Configuration](03-agent-steering-and-rules-configuration.md) + +### Advanced Practice Exercises + +1. Write a complete `requirements.md` for a payment processing feature using all five EARS patterns. +2. Generate `design.md` and identify one gap; update requirements and regenerate to confirm the gap is filled. +3. Simulate a requirement change mid-execution and practice updating only the affected tasks in `tasks.md`. +4. Add a CI check that lints `requirements.md` for non-EARS language like "should" or "might". +5. Compare the task output from two different levels of design granularity and measure execution accuracy. + +### Review Questions + +1. What is the purpose of EARS syntax and why does Kiro require it for high-quality spec generation? +2. Which approval gate prevents design drift from propagating into task execution? +3. What tradeoff did you make between task granularity and autonomous delegation speed? +4. How would you recover if `design.md` was edited manually and `tasks.md` is now inconsistent? +5. What must be in version control before autonomous task execution begins? 
+ +### Scenario Playbook 1: Spec Generation - Ambiguous Requirements + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: design.md misses key business logic because requirements.md used vague language +- initial hypothesis: identify which EARS statements lack acceptance criteria or measurable conditions +- immediate action: add concrete examples and edge cases to the failing requirements before regenerating +- engineering control: require peer review of requirements.md before submitting to the spec agent +- verification target: every requirement in the regenerated design.md maps to a specific, testable implementation +- rollback trigger: if two regeneration attempts still miss key logic, escalate to a design workshop with the team +- communication step: document the ambiguous requirements and their clarified versions in the spec PR description +- learning capture: add the clarified examples to the team's EARS writing guide for future features + +### Scenario Playbook 2: Spec Generation - Design Drift + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: tasks.md references components or APIs that no longer match design.md after manual edits +- initial hypothesis: diff design.md against its last committed version to identify manual changes +- immediate action: revert design.md to the last approved commit and regenerate tasks.md +- engineering control: treat design.md as an append-only document; add new sections rather than editing existing ones +- verification target: every task in tasks.md references a section that exists in the current design.md +- rollback trigger: if task regeneration continues to produce drift, split the spec into two separate feature specs +- communication step: notify the team that design.md has a new version and tasks.md has been regenerated +- learning capture: add a git hook that warns when design.md is modified without a corresponding tasks.md 
regeneration + +### Scenario Playbook 3: Spec Execution - Task Scope Creep + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: autonomous agent adds files or changes outside the defined task scope during execution +- initial hypothesis: review the task description for missing scope boundaries or implicit dependencies +- immediate action: halt agent execution, review changes, and revert any out-of-scope modifications +- engineering control: add an explicit "out of scope" section to tasks.md listing what the agent must not change +- verification target: agent changes are confined to the files and directories listed in the task description +- rollback trigger: if out-of-scope changes recur on the next task, switch to manual task-by-task execution +- communication step: document the out-of-scope incident in the spec's revision history +- learning capture: update the task generation prompt template to always include a scope boundary constraint + +### Scenario Playbook 4: Spec Iteration - Mid-Sprint Requirement Change + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a product decision changes one requirement after task execution has already begun +- initial hypothesis: identify which completed tasks are affected by the changed requirement +- immediate action: mark affected completed tasks as "needs-revision" in tasks.md and halt further execution +- engineering control: update requirements.md first, then regenerate only the affected design.md sections and tasks +- verification target: updated tasks are re-executed and produce output consistent with the new requirement +- rollback trigger: if the change invalidates more than 50% of completed tasks, create a new spec branch +- communication step: update the PR description with the requirement change and its impact on the task list +- learning capture: add a "change impact" section to the spec template for documenting mid-sprint 
pivots + +### Scenario Playbook 5: Spec Quality - Stale Spec After Code Refactor + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: code has been refactored but the spec files still reference the old architecture +- initial hypothesis: compare the current codebase structure against design.md component references +- immediate action: flag the spec as "stale" and schedule a spec refresh session before the next feature build +- engineering control: add a quarterly spec audit to the team's engineering calendar +- verification target: refreshed design.md accurately describes the current architecture and data models +- rollback trigger: if the spec refresh reveals architectural inconsistencies, escalate to an architecture review +- communication step: announce the spec refresh in the team channel and request review from senior engineers +- learning capture: add a "last verified" timestamp field to each spec and enforce it in the PR template + +## What Problem Does This Solve? + +Most agentic coding tools suffer from the "chat amnesia" problem: each conversation starts fresh, there is no persistent record of design decisions, and AI-generated code accumulates without traceability back to requirements. Kiro's spec-driven workflow solves this by making the design artifact — not the conversation — the primary interface for AI assistance. + +In practical terms, this chapter helps you avoid three common failures: + +- generating code that satisfies the immediate prompt but misses the broader system design +- losing context across sessions when working on a multi-day feature +- having no audit trail of why specific implementation choices were made + +After working through this chapter, you should be able to reason about Kiro specs as a contract layer between product intent, system design, and agent execution — with explicit traceability from requirement to code. 
+ +## How it Works Under the Hood + +Under the hood, `Chapter 2: Spec-Driven Development Workflow` follows a repeatable control path: + +1. **Spec directory initialization**: Kiro creates `.kiro/specs/<feature-name>/` and registers the spec in the workspace index. +2. **Requirements parsing**: the spec agent reads `requirements.md` and classifies each statement by EARS pattern type. +3. **Design generation**: the agent maps requirements to components, data models, and APIs and writes `design.md`. +4. **Design approval gate**: the developer reviews and commits `design.md`; Kiro treats the committed version as canonical. +5. **Task decomposition**: the agent reads `design.md` and generates ordered, dependency-aware tasks in `tasks.md`. +6. **Task execution loop**: each task is dispatched to the appropriate execution agent with the design as grounding context. + +When debugging spec quality issues, walk this sequence in order and check the output at each stage before moving forward. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: Specs](https://kiro.dev/docs/specs) + Why it matters: the authoritative reference for the three-file spec format and generation workflow. +- [Kiro Docs: EARS Syntax](https://kiro.dev/docs/specs/ears) + Why it matters: defines the exact EARS patterns Kiro uses to parse and classify requirements. +- [Kiro Docs: Task Execution](https://kiro.dev/docs/specs/tasks) + Why it matters: documents how tasks.md items are dispatched to agents and how completion is tracked. +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + Why it matters: source of truth for spec agent implementation and community-contributed spec templates.
+ +Suggested trace strategy: +- search the Kiro docs for each EARS pattern keyword before writing your first requirements.md +- compare generated design.md sections against the design template in the docs to confirm coverage + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 1: Getting Started](01-getting-started.md) +- [Next Chapter: Chapter 3: Agent Steering and Rules Configuration](03-agent-steering-and-rules-configuration.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/03-agent-steering-and-rules-configuration.md b/tutorials/kiro-tutorial/03-agent-steering-and-rules-configuration.md new file mode 100644 index 00000000..962893c1 --- /dev/null +++ b/tutorials/kiro-tutorial/03-agent-steering-and-rules-configuration.md @@ -0,0 +1,387 @@ +--- +layout: default +title: "Chapter 3: Agent Steering and Rules Configuration" +nav_order: 3 +parent: Kiro Tutorial +--- + +# Chapter 3: Agent Steering and Rules Configuration + +Welcome to **Chapter 3: Agent Steering and Rules Configuration**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Kiro's steering system lets you encode persistent, project-scoped rules that guide AI behavior without repeating them in every prompt. This chapter teaches you how to build and manage the `.kiro/steering/` directory. 
+ +## Learning Goals + +- understand the purpose and structure of the `.kiro/steering/` directory +- create steering files that encode technology choices, coding conventions, and project context +- use inclusion and exclusion patterns to scope steering rules to specific file types or directories +- combine multiple steering files for layered, composable rule sets +- troubleshoot steering conflicts and priority ordering + +## Fast Start Checklist + +1. create `.kiro/steering/` in your project root +2. create `project.md` with your stack, conventions, and key decisions +3. create `coding-style.md` with language-specific style rules +4. verify the steering files are loaded by asking Kiro a question that requires the rules +5. commit `.kiro/steering/` to version control for team sharing + +## The Steering Directory Structure + +``` +.kiro/ + steering/ + project.md ← always-active project context and technology decisions + coding-style.md ← language and framework style conventions + testing.md ← testing strategy and framework preferences + security.md ← security policies and forbidden patterns + api-contracts.md ← API design rules and backward compatibility requirements +``` + +Steering files are plain markdown. Kiro reads all files in `.kiro/steering/` and injects their content as persistent context for every agent interaction in the workspace. + +## Steering File Format + +```markdown +# Project Context + +## Technology Stack +- Runtime: Node.js 20 with TypeScript strict mode +- Framework: Express 4 with class-validator for input validation +- Database: PostgreSQL 15 with Prisma ORM +- Testing: Jest with ts-jest, supertest for integration tests +- Deployment: AWS Lambda with the Serverless Framework + +## Key Decisions +- All new API routes must follow RESTful conventions with plural resource names. +- Use async/await throughout; no raw Promise chains. +- All database queries must go through Prisma; no raw SQL. 
+- Error responses must use the standard { error: string, code: string } shape. + +## Forbidden Patterns +- Never use `any` type in TypeScript. +- Never commit secrets or API keys; use AWS Secrets Manager references. +- Never use synchronous file I/O in request handlers. +``` + +## Scoped Steering with Inclusion Patterns + +You can scope a steering file to apply only when working on specific directories or file types: + +```markdown +--- +applies_to: + - "src/api/**" + - "*.route.ts" +--- + +# API Route Conventions + +- All routes must use express-validator for request body validation. +- Route handlers must be thin: delegate business logic to service classes. +- Return 201 for resource creation, 200 for reads and updates, 204 for deletions. +- Never return raw database error messages to clients. +``` + +## Example: Security Steering File + +```markdown +# Security Policy + +## Authentication +- All endpoints except /auth/login and /health must require a valid JWT. +- JWTs must be verified using the RS256 algorithm. +- Never log full JWT tokens; log only the token's jti claim. + +## Input Handling +- All user inputs must be validated and sanitized before use. +- Use parameterized queries for all database operations. +- Reject requests with payloads over 1MB with HTTP 413. + +## Dependency Policy +- Audit new npm packages with `npm audit` before adding to package.json. +- Pin all production dependency versions; use ranges only for devDependencies. +``` + +## Example: Testing Steering File + +```markdown +# Testing Conventions + +## Unit Tests +- Use describe/it blocks with descriptive names that read like sentences. +- Mock all external dependencies (database, HTTP calls) in unit tests. +- Target 80% branch coverage for all service classes. + +## Integration Tests +- Use a dedicated test database seeded from fixtures. +- Test the full HTTP stack with supertest; do not mock Express. +- Reset the database state between test suites using beforeEach hooks. 
+ +## Test Naming +- Unit test files: `<name>.test.ts` next to the source file. +- Integration test files: `tests/integration/<name>.integration.test.ts`. +``` + +## Combining Steering Files + +Kiro merges all active steering files into a single context block. The order of injection follows alphabetical filename order. To control priority, prefix files with numbers: + +``` +.kiro/steering/ + 00-project.md ← highest priority, always active + 01-coding-style.md + 02-testing.md + 03-security.md + 10-api-contracts.md +``` + +## Verifying Steering is Active + +``` +# In the Chat panel: +> What testing framework should I use for this project? + +# Expected response (with testing.md loaded): +# Based on the project steering, you should use Jest with ts-jest for unit tests +# and supertest for integration tests. + +# If Kiro responds with a generic answer, check: +# 1. .kiro/steering/ exists and contains markdown files +# 2. The files have valid markdown content (no syntax errors) +# 3. The workspace was reopened after adding steering files +``` + +## Steering vs. Chat Prompts + +| Aspect | Steering Files | Chat Prompts | +|:-------|:---------------|:-------------| +| Persistence | permanent, loaded every session | session-only | +| Scope | project-wide or file-scoped | per-conversation | +| Version controlled | yes, committed to git | no | +| Shared with team | yes | no | +| Use for | technology decisions, conventions, policies | specific tasks and one-off instructions | + +## Source References + +- [Kiro Docs: Steering](https://kiro.dev/docs/steering) +- [Kiro Docs: Steering Files](https://kiro.dev/docs/steering/files) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now know how to create, scope, and combine steering files that encode persistent project rules for Kiro agents.
+ +Next: [Chapter 4: Autonomous Agent Mode](04-autonomous-agent-mode.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 3: Agent Steering and Rules Configuration** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 3: Agent Steering and Rules Configuration` — the `.kiro/steering/` directory as a persistent context source, the Kiro context injector as the delivery mechanism, and the agent as the consumer. +2. Separate control-plane decisions (which steering files to create, scoping rules) from data-plane execution (file reads and context injection at session start). +3. Capture input contracts: markdown files in `.kiro/steering/`; output: augmented system prompt for every agent interaction. +4. Trace state transitions: no steering → steering files created → steering loaded at session start → agent behavior reflects rules. +5. Identify extension hooks: inclusion patterns for file-scoped rules, numeric prefixes for priority ordering. +6. Map ownership boundaries: team leads own `00-project.md` and `03-security.md`; individual developers own feature-specific steering files. +7. Specify rollback paths: remove or rename a steering file to exclude it from context; use git revert for team-wide rollback. +8. Track observability signals: verify agent responses reflect steering rules by testing with rule-specific questions. 
+ +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Steering granularity | one general project.md | multiple scoped files per concern | simplicity vs precision | +| Rule enforcement | informational guidance | explicit forbidden patterns | flexibility vs compliance | +| Versioning | committed to git | PR review required for changes | speed vs governance | +| Scoping | global rules only | file-pattern scoped rules | ease vs relevance | +| Team ownership | any developer edits | designated maintainer with review | velocity vs consistency | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| steering not loaded | agent ignores known rules | session not restarted after adding files | reopen workspace to trigger re-load | +| conflicting rules | inconsistent agent output | two steering files with contradicting guidance | audit files for conflicts; use numeric prefix to set explicit priority | +| overly broad rules | agent refuses valid patterns | steering file uses absolute prohibition on useful patterns | rewrite rules as guidance with explicit exceptions | +| stale steering | agent applies outdated tech stack choices | steering not updated after refactor | add a quarterly steering review to the team's engineering calendar | +| rule explosion | too many steering files slow context loading | fine-grained file-per-rule authoring | consolidate related rules into thematic files | +| secret leakage in steering | sensitive values committed to git | developer pasted credentials into steering file | scan steering files with secret detection in CI | + +### Implementation Runbook + +1. Create the `.kiro/steering/` directory and add it to the git-tracked files. +2. 
Write `00-project.md` with the technology stack, key decisions, and forbidden patterns. +3. Write `01-coding-style.md` with language-specific style conventions for the primary language. +4. Write `02-testing.md` with the testing framework, naming conventions, and coverage targets. +5. Write `03-security.md` with authentication requirements, input validation policies, and dependency rules. +6. Reopen the workspace to trigger steering file loading. +7. Verify each steering file by asking a targeted question that requires knowledge of that file's rules. +8. Commit all steering files to version control with a PR description explaining each file's purpose. +9. Add a CI lint step to check steering files for secret patterns and markdown syntax errors. + +### Quality Gate Checklist + +- [ ] steering files cover the four core domains: project context, coding style, testing, and security +- [ ] all steering files use plain markdown with no embedded secrets or credentials +- [ ] file-scoped rules use valid inclusion patterns tested against actual file paths +- [ ] priority ordering is explicit via numeric prefixes on filenames +- [ ] steering rules are verified by targeted agent questions before committing +- [ ] steering files are committed to version control with clear PR descriptions +- [ ] a CI step checks steering files for secret patterns +- [ ] a review process is defined for who can approve changes to security.md and project.md + +### Source Alignment + +- [Kiro Docs: Steering](https://kiro.dev/docs/steering) +- [Kiro Docs: Steering Files](https://kiro.dev/docs/steering/files) +- [Kiro Docs: Steering Scoping](https://kiro.dev/docs/steering/scoping) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [Cline Tutorial](../cline-tutorial/) +- [Roo Code Tutorial](../roo-code-tutorial/) +- [Claude Code Tutorial](../claude-code-tutorial/) +- [Agents MD Tutorial](../agents-md-tutorial/) +- [Chapter 4: Autonomous Agent 
Mode](04-autonomous-agent-mode.md) + +### Advanced Practice Exercises + +1. Write a complete four-file steering setup (project, style, testing, security) for a real project and verify each file's rules with targeted agent questions. +2. Create a file-scoped steering file for the `src/api/` directory and confirm it does not affect agent behavior in `src/models/`. +3. Simulate a steering conflict by writing two files with contradicting rules and observe the agent's behavior; then resolve the conflict with explicit priority ordering. +4. Add a GitHub Actions step that runs `gitleaks` or `trufflehog` against the `.kiro/steering/` directory on every PR. +5. Write a steering update proposal PR that changes a security rule and practice the review and approval workflow. + +### Review Questions + +1. What is the difference between a steering file and a chat prompt, and when should you use each? +2. How does Kiro determine the priority order when two steering files have conflicting rules? +3. What tradeoff did you make between steering granularity and context loading performance? +4. How would you recover if a steering file was accidentally committed with an API key? +5. What governance process should control changes to the security steering file in a team environment? 
+ +### Scenario Playbook 1: Steering - Rules Not Loaded After Adding Files + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent ignores steering rules despite `.kiro/steering/` containing valid markdown files +- initial hypothesis: the workspace session was not restarted after adding the steering files +- immediate action: close and reopen the Kiro workspace to trigger steering file re-loading +- engineering control: add a note to the team onboarding guide that workspace restart is required after steering changes +- verification target: agent responds with steering-aligned content when asked a targeted rule question +- rollback trigger: if restarting does not load steering, check for markdown syntax errors in the steering files +- communication step: document the restart requirement in the project's Kiro setup README section +- learning capture: request a Kiro feature for hot-reloading steering files without workspace restart + +### Scenario Playbook 2: Steering - Conflicting Rules Between Files + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent produces inconsistent output because two steering files have contradicting guidance +- initial hypothesis: identify the specific rule conflict by reviewing all steering files for overlapping topics +- immediate action: temporarily disable the lower-priority file by renaming it with a `.disabled` extension +- engineering control: audit all steering files for topic overlap and consolidate conflicting rules into a single file +- verification target: agent consistently applies the intended rule with the conflict file disabled +- rollback trigger: if the conflict resolution introduces new inconsistencies, split into more narrowly scoped files +- communication step: document the conflict resolution decision in the PR that updates the steering files +- learning capture: add a steering file review checklist that flags topic overlap 
during PR review + +### Scenario Playbook 3: Steering - Secret Committed to Steering File + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a developer pasted a real API key or database password into a steering file and committed it +- initial hypothesis: confirm the secret is present by running gitleaks against the repository history +- immediate action: immediately revoke the exposed credential at the issuing service; do not wait for git history cleanup +- engineering control: use `git filter-branch` or BFG Repo Cleaner to remove the secret from git history, then force-push +- verification target: gitleaks scan reports zero secrets in `.kiro/steering/` after history cleanup +- rollback trigger: if history rewrite fails, mark the repository as compromised and rotate all project credentials +- communication step: notify the security team and affected service owners of the exposure within one hour +- learning capture: add a pre-commit hook that runs secret detection on `.kiro/steering/` before allowing commits + +### Scenario Playbook 4: Steering - Stale Technology Stack After Refactor + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent recommends patterns from the old tech stack because project.md was not updated after a framework migration +- initial hypothesis: compare the current package.json and import statements against the technology stack in project.md +- immediate action: update project.md with the new framework and remove all references to the deprecated stack +- engineering control: add a steering review to the definition of done for major refactoring tasks +- verification target: agent recommends only the new framework's patterns after project.md is updated +- rollback trigger: if the update causes agent confusion, create a migration note section in project.md explaining the transition +- communication step: announce the project.md update in the team channel 
and ask members to restart their Kiro workspaces +- learning capture: add "update project.md" as a required step in the refactoring PR checklist + +### Scenario Playbook 5: Steering - Overly Broad Security Rules Breaking Valid Patterns + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent refuses to generate valid code patterns because security.md uses absolute prohibitions that are too broad +- initial hypothesis: identify the specific rule that is blocking valid patterns and test with a targeted prompt +- immediate action: rewrite the prohibition as a conditional rule with explicit exceptions for the valid patterns +- engineering control: review all absolute prohibitions in security.md and add exception clauses where appropriate +- verification target: agent generates valid code patterns while still respecting the underlying security intent +- rollback trigger: if rule rewriting introduces security gaps, escalate to a security team review before committing +- communication step: document the rule refinement and its rationale in the security.md commit message +- learning capture: add a rule-testing protocol to the steering governance process: test each new rule with both valid and invalid code examples + +## What Problem Does This Solve? + +Without persistent project context, every Kiro session starts from scratch. Developers repeat the same stack choices, style preferences, and policy constraints in every chat prompt, and new team members have no way to discover what the AI has been told to do. Kiro's steering system solves this by storing project rules in version-controlled markdown files that are automatically injected into every agent interaction. 
+ +In practical terms, this chapter helps you avoid three common failures: + +- agents generating code that contradicts established team conventions because the rules were never encoded +- inconsistent AI behavior across team members because each person prompts differently +- policy drift where security rules agreed upon in a meeting never make it into the AI's working context + +After working through this chapter, you should be able to treat the `.kiro/steering/` directory as the authoritative source of your team's AI operating rules, reviewed and version-controlled like any other engineering artifact. + +## How It Works Under the Hood + +Under the hood, `Chapter 3: Agent Steering and Rules Configuration` follows a repeatable control path: + +1. **Directory scan**: at workspace open, Kiro scans `.kiro/steering/` and loads all `.md` files in alphabetical order. +2. **Scoping evaluation**: for each file, Kiro checks the `applies_to` frontmatter against the current file context. +3. **Context injection**: active steering file content is prepended to the system prompt for every agent interaction. +4. **Priority resolution**: files with lower numeric prefixes take precedence when content conflicts. +5. **Session persistence**: steering context persists for the entire workspace session without reloading on each message. +6. **Operational telemetry**: Kiro logs which steering files were loaded and their total character count for debugging. + +When debugging steering issues, verify each stage: files exist, scoping matches, context is injected, and agent responses reflect the rules. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: Steering](https://kiro.dev/docs/steering) + Why it matters: the authoritative reference for the steering directory structure and file format.
+- [Kiro Docs: Steering Files](https://kiro.dev/docs/steering/files) + Why it matters: documents the frontmatter options including `applies_to` scoping patterns. +- [Kiro Docs: Steering Scoping](https://kiro.dev/docs/steering/scoping) + Why it matters: explains how Kiro matches file-pattern rules against the current active file in the editor. +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + Why it matters: source of community-contributed steering file examples and issue tracking for steering bugs. + +Suggested trace strategy: +- check the steering docs for the exact frontmatter schema before writing `applies_to` patterns +- test each steering file with a targeted question immediately after creation to confirm loading + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 2: Spec-Driven Development Workflow](02-spec-driven-development-workflow.md) +- [Next Chapter: Chapter 4: Autonomous Agent Mode](04-autonomous-agent-mode.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/04-autonomous-agent-mode.md b/tutorials/kiro-tutorial/04-autonomous-agent-mode.md new file mode 100644 index 00000000..da320b1e --- /dev/null +++ b/tutorials/kiro-tutorial/04-autonomous-agent-mode.md @@ -0,0 +1,388 @@ +--- +layout: default +title: "Chapter 4: Autonomous Agent Mode" +nav_order: 4 +parent: Kiro Tutorial +--- + +# Chapter 4: Autonomous Agent Mode + +Welcome to **Chapter 4: Autonomous Agent Mode**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Kiro's autonomous agent mode delegates multi-step execution to an AI agent that can read files, write code, run terminal commands, and iterate — all without manual approval of each step. 
This chapter teaches you how to delegate safely and monitor effectively. + +## Learning Goals + +- understand the difference between interactive chat and autonomous agent execution +- configure autonomy levels and approval gates for different task types +- delegate complete tasks from tasks.md to the autonomous agent +- monitor agent progress and intervene when needed +- define safe boundaries for autonomous execution in production-adjacent environments + +## Fast Start Checklist + +1. open a tasks.md with at least one uncompleted task +2. select the task and choose "Execute with Agent" in the Kiro interface +3. observe the agent's step-by-step execution in the Agent Activity panel +4. verify the output files and test results after completion +5. mark the task as complete in tasks.md + +## Autonomous vs. Interactive Mode + +| Mode | Agent Behavior | Approval Required | Best For | +|:-----|:---------------|:-----------------|:---------| +| Interactive Chat | responds to each message, waits for next | every step | exploratory work, design discussion | +| Supervised Agent | executes steps, pauses at risk boundaries | pre-configured risk points | complex tasks with clear specs | +| Autonomous Agent | executes full task end-to-end | none during execution | well-specified tasks from tasks.md | + +## Autonomy Levels + +Kiro supports three autonomy levels that control how much the agent does before stopping for human input: + +| Level | Description | Stops When | +|:------|:------------|:-----------| +| Cautious | executes one sub-step at a time | after every file write or command | +| Balanced | executes logical task groups | before destructive commands or large rewrites | +| Full | executes the complete task | only on error or completion | + +Configure the autonomy level in `.kiro/settings.json`: + +```json +{ + "agent": { + "autonomyLevel": "balanced", + "allowedCommands": ["npm test", "npm run build", "npx prisma migrate dev"], + "forbiddenCommands": ["rm -rf", "git 
push --force", "npm publish"], + "maxFileEditsPerTask": 20, + "requireApprovalForNewFiles": false + } +} +``` + +## Delegating a Task + +``` +# In the Chat panel with tasks.md open: +> Execute task 3: implement AuthService.issueToken() using jsonwebtoken with 1h expiry + +# The agent will: +# 1. Read design.md for the AuthService interface contract +# 2. Read the current src/auth/auth.service.ts file +# 3. Write the issueToken() implementation +# 4. Run the relevant unit tests +# 5. Report completion or errors +``` + +## Agent Activity Panel + +During autonomous execution, the Agent Activity panel shows the agent's step-by-step reasoning: + +``` +[Agent] Reading design.md for AuthService interface... +[Agent] Reading src/auth/auth.service.ts... +[Agent] Writing issueToken() implementation... + + Added import { sign } from 'jsonwebtoken' + + Added issueToken(userId: string): string method +[Agent] Running: npm test -- --testPathPattern=auth.service + PASS src/auth/auth.service.test.ts (3 tests passed) +[Agent] Task 3 complete. 1 file modified, 3 tests passing. +``` + +## Interrupting Agent Execution + +You can interrupt the agent at any point: + +``` +# Press Escape or click "Stop Agent" in the activity panel + +# After stopping: +> What did you complete before I interrupted you? 
+# Agent summarizes completed sub-steps + +# Review and decide whether to: +# - Resume from the last checkpoint +# - Discard changes and restart +# - Complete the remaining steps manually +``` + +## Safe Boundaries for Autonomous Execution + +Define boundaries before delegating autonomous tasks, especially in shared or production-adjacent environments: + +```markdown +# Safe for autonomous execution: +- Writing new source files in src/ +- Modifying test files +- Running unit and integration tests +- Installing dev dependencies +- Running database migrations in test environments + +# Requires human approval: +- Modifying .env files or secrets +- Pushing to remote branches +- Running database migrations in staging or production +- Publishing packages +- Deleting files (except test artifacts) +``` + +## Multi-Step Task Delegation + +For complex features, delegate multiple sequential tasks: + +``` +> Execute tasks 1 through 5 in tasks.md for the user-authentication spec. + Stop after each task group and show me what was completed before proceeding. + +# Agent executes: +# Task 1: database migration → reports completion +# Task 2: AuthService.validateCredentials() → reports completion +# Task 3: AuthService.issueToken() → reports completion +# [pauses for review] +# Task 4: AuthService.revokeToken() → reports completion +# Task 5: POST /auth/login route → reports completion +# [pauses for review] +``` + +## Error Recovery in Autonomous Mode + +When the agent encounters an error, it attempts self-correction: + +``` +[Agent] Running: npm test -- --testPathPattern=auth.service + FAIL src/auth/auth.service.test.ts + ● AuthService › issueToken › should return a valid JWT + Expected: string matching /^ey/ + Received: undefined + +[Agent] Analyzing test failure... +[Agent] Issue found: JWT_SECRET environment variable not set in test environment +[Agent] Fixing: adding JWT_SECRET to jest.config.ts testEnvironment setup... +[Agent] Running tests again... 
+ PASS src/auth/auth.service.test.ts (3 tests passed) +[Agent] Task 3 complete after one self-correction. +``` + +## Source References + +- [Kiro Docs: Autonomous Agent](https://kiro.dev/docs/agent) +- [Kiro Docs: Autonomy Levels](https://kiro.dev/docs/agent/autonomy) +- [Kiro Docs: Agent Activity](https://kiro.dev/docs/agent/activity) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now know how to delegate tasks to Kiro's autonomous agent, configure autonomy levels, monitor execution, and define safe operational boundaries. + +Next: [Chapter 5: MCP Integration and External Tools](05-mcp-integration-and-external-tools.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 4: Autonomous Agent Mode** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 4: Autonomous Agent Mode` — the agent executor, the tool dispatch layer, and the approval gate controller. +2. Separate control-plane decisions (autonomy level, allowed commands, approval gates) from data-plane execution (file writes, test runs, self-correction loops). +3. Capture input contracts: a task description from tasks.md with spec context; output: completed code changes and passing tests. +4. Trace state transitions: task selected → agent plan generated → sub-steps executing → error or completion → human review. +5. Identify extension hooks: `allowedCommands`, `forbiddenCommands`, `maxFileEditsPerTask`, and custom approval triggers. +6. Map ownership boundaries: developer owns task delegation decisions; team leads own autonomy level configuration; security team approves `allowedCommands` list. +7. 
Specify rollback paths: `git checkout` to revert agent changes; restore from task checkpoint if execution was interrupted. +8. Track observability signals: agent activity log, test pass/fail rates, self-correction frequency, task completion time. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Autonomy level | cautious (one step at a time) | full autonomy for well-specified tasks | control vs efficiency | +| Command allowlist | no shell commands | narrow allowlist of known-safe commands | safety vs capability | +| Task scope | single task delegation | multi-task sequential delegation | simplicity vs throughput | +| Error recovery | human intervention on first error | agent self-correction with logging | oversight vs speed | +| Rollback strategy | manual git checkout | automated checkpoint and revert | effort vs recovery speed | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| agent scope creep | unexpected file edits outside task scope | underconstrained task description | add explicit file scope to task description | +| runaway command execution | agent runs commands not in allowlist | missing `forbiddenCommands` config | maintain an explicit `forbiddenCommands` list in settings | +| self-correction loop | agent retries same failing pattern | root cause not identified before retry | set max self-correction attempts; escalate to human on limit | +| context window overflow | agent loses task context mid-execution | very long task with many sub-steps | split long tasks into smaller independent tasks | +| environment mismatch | tests pass locally but agent tests fail | agent uses different Node or env vars | standardize the test environment in jest.config.ts | +| partial completion without reporting | agent stops 
mid-task without clear status | error swallowed by recovery logic | require agent to log completion status for every sub-step | + +### Implementation Runbook + +1. Define the autonomy level for the current task profile in `.kiro/settings.json`. +2. Review and set the `allowedCommands` list to only include commands safe for automated execution. +3. Confirm the task in tasks.md has a clear, bounded scope with references to specific files or modules. +4. Delegate the task via the Chat panel or the Specs task list interface. +5. Monitor the Agent Activity panel during execution and verify each sub-step output. +6. If the agent self-corrects, review the correction log to confirm the fix is sound. +7. After task completion, run the full test suite manually to verify no regressions were introduced. +8. Review the git diff to confirm changes are scoped to the expected files. +9. Mark the task as complete in tasks.md and commit the updated spec. + +### Quality Gate Checklist + +- [ ] autonomy level is configured appropriately for the task type before delegation +- [ ] `allowedCommands` and `forbiddenCommands` are explicitly set in `.kiro/settings.json` +- [ ] task description includes explicit file scope to prevent agent scope creep +- [ ] agent activity log is reviewed after each task for unexpected behavior +- [ ] self-correction events are counted and investigated for root cause +- [ ] full test suite passes after autonomous task completion +- [ ] git diff is reviewed to confirm no out-of-scope changes +- [ ] task completion is marked in tasks.md and committed to version control + +### Source Alignment + +- [Kiro Docs: Autonomous Agent](https://kiro.dev/docs/agent) +- [Kiro Docs: Autonomy Levels](https://kiro.dev/docs/agent/autonomy) +- [Kiro Docs: Agent Activity](https://kiro.dev/docs/agent/activity) +- [Kiro Docs: Settings](https://kiro.dev/docs/settings) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [OpenHands 
Tutorial](../openhands-tutorial/) +- [SWE-Agent Tutorial](../swe-agent-tutorial/) +- [AutoGen Tutorial](../autogen-tutorial/) +- [CrewAI Tutorial](../crewai-tutorial/) +- [Chapter 5: MCP Integration and External Tools](05-mcp-integration-and-external-tools.md) + +### Advanced Practice Exercises + +1. Configure three different autonomy levels in `.kiro/settings.json` and test each with the same task to compare output quality and speed. +2. Deliberately give the agent an ambiguous task and observe where it gets confused; then rewrite the task with explicit scope and compare outcomes. +3. Simulate an agent self-correction scenario by introducing a test environment variable that is missing; confirm the agent detects and fixes the issue. +4. Set up a post-task git diff review workflow and practice identifying out-of-scope changes from three different autonomous executions. +5. Build a multi-task delegation sequence for a five-task feature and practice the pause-and-review pattern between task groups. + +### Review Questions + +1. What determines whether the "balanced" or "full" autonomy level is appropriate for a given task? +2. How do `allowedCommands` and `forbiddenCommands` interact, and what happens if a command appears in neither list? +3. What tradeoff did you make between autonomous efficiency and oversight granularity? +4. How would you recover if the agent completed tasks 1-3 but made an error in task 2 that was only discovered after task 3 completed? +5. What conditions must be true before delegating a task to full autonomous mode without per-step review? 
+ +### Scenario Playbook 1: Autonomous Agent - Scope Creep + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: autonomous agent modifies files outside the task's defined scope +- initial hypothesis: task description lacked explicit file boundaries causing the agent to infer additional scope +- immediate action: interrupt the agent and run `git checkout` to revert out-of-scope changes +- engineering control: add explicit file path constraints to the task description before re-delegating +- verification target: re-run the task and confirm only expected files appear in the git diff +- rollback trigger: if scope creep recurs, switch to cautious autonomy level for this task type +- communication step: document the scope creep incident in the task's completion notes +- learning capture: update the task generation prompt to always include an explicit "files to modify" constraint + +### Scenario Playbook 2: Autonomous Agent - Runaway Command Execution + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent attempts to run a command not on the allowedCommands list +- initial hypothesis: the command was implied by the task but not explicitly listed in the allowed list +- immediate action: the agent should be blocked by the forbiddenCommands check; if not, stop execution immediately +- engineering control: audit the allowedCommands list and add the missing command if it is safe; update forbiddenCommands otherwise +- verification target: agent is blocked from the command on the next execution attempt +- rollback trigger: if the agent bypassed the command check, report as a security incident to the Kiro team +- communication step: update the team's Kiro security policy with the new command classification +- learning capture: add the new command to the appropriate list and commit the settings.json update with a PR review + +### Scenario Playbook 3: Autonomous Agent - Infinite 
Self-Correction Loop + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent retries the same failing test pattern more than three times without progress +- initial hypothesis: agent is not identifying the true root cause and is applying surface-level fixes +- immediate action: interrupt the agent and manually diagnose the test failure +- engineering control: set a maximum self-correction attempt count in the agent settings +- verification target: after manual fix, re-delegate the task and confirm completion on first attempt +- rollback trigger: if the failure pattern is systemic, escalate to a debugging session with the full team +- communication step: document the root cause and manual fix in the task completion notes +- learning capture: add the root cause pattern to the project steering file's troubleshooting section + +### Scenario Playbook 4: Autonomous Agent - Context Window Overflow + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent loses coherence mid-task on a large feature with many sub-steps +- initial hypothesis: the task description plus design context exceeds the model's effective context window +- immediate action: interrupt the agent and split the task into two independent smaller tasks in tasks.md +- engineering control: set a `maxFileEditsPerTask` limit in settings.json to force task splitting at design time +- verification target: each smaller task completes independently without context loss +- rollback trigger: if splitting creates unresolvable dependencies, escalate to manual implementation of the transition step +- communication step: update tasks.md to document the split and the dependency between the two new tasks +- learning capture: add a task complexity guideline to the team's spec writing standards + +### Scenario Playbook 5: Autonomous Agent - Partial Completion Without Status + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic 
IDE from AWS** +- trigger condition: agent stops executing mid-task without reporting completion or error status +- initial hypothesis: an unhandled exception in the agent execution loop caused a silent exit +- immediate action: check the Kiro logs for the last recorded sub-step and manually continue from that point +- engineering control: require the agent to write a checkpoint file after each sub-step for crash recovery +- verification target: after the fix, re-running the task resumes from the last checkpoint without duplicating work +- rollback trigger: if checkpoint recovery produces duplicate code, revert to the pre-task git state and restart +- communication step: report the silent exit as a bug to the Kiro issue tracker with the relevant log snippet +- learning capture: add a post-task verification step that confirms the agent reported a terminal status before marking the task complete + +## What Problem Does This Solve? + +The fundamental bottleneck in AI-assisted development is not model quality — it is the human-in-the-loop approval rate. When every file edit requires a manual "yes", developers spend more time approving than designing. Kiro's autonomous agent mode removes this bottleneck for well-specified tasks by delegating end-to-end execution while preserving human control at the configuration level. + +In practical terms, this chapter helps you avoid three common failures: + +- treating the AI as a typing accelerator rather than a true task delegate +- delegating tasks without clear boundaries and getting unexpected side effects +- losing confidence in autonomous mode because errors are not surfaced or recoverable + +After working through this chapter, you should be able to reason about autonomous delegation as a spectrum with explicit configuration knobs — not a binary "trust everything" or "approve everything" choice. + +## How It Works Under the Hood + +Under the hood, `Chapter 4: Autonomous Agent Mode` follows a repeatable control path: + +1.
**Task ingestion**: the agent reads the task description and fetches the spec context from `design.md`. +2. **Plan generation**: the agent decomposes the task into ordered sub-steps with explicit tool calls. +3. **Tool dispatch**: each sub-step invokes a Kiro tool: file read, file write, shell command, or test runner. +4. **Output validation**: the agent checks each tool output against its expected result before proceeding. +5. **Self-correction loop**: on unexpected output, the agent applies a fix hypothesis and retries up to the configured limit. +6. **Completion reporting**: the agent writes a structured completion report to the Agent Activity log. + +When debugging autonomous failures, check each stage in sequence and look for the first sub-step where the output diverged from expectation. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: Autonomous Agent](https://kiro.dev/docs/agent) + Why it matters: the primary reference for agent capabilities, tool dispatch, and execution lifecycle. +- [Kiro Docs: Autonomy Levels](https://kiro.dev/docs/agent/autonomy) + Why it matters: documents the exact behavior of each autonomy level and its approval gate logic. +- [Kiro Docs: Agent Activity](https://kiro.dev/docs/agent/activity) + Why it matters: explains the activity panel format and how to read the sub-step execution log. +- [Kiro Docs: Settings](https://kiro.dev/docs/settings) + Why it matters: reference for all configurable agent parameters including `allowedCommands` and `maxFileEditsPerTask`. + +Suggested trace strategy: +- check the autonomy levels docs before each new task type to select the right configuration +- review the agent activity log after every autonomous execution to build intuition for normal vs. 
anomalous behavior + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 3: Agent Steering and Rules Configuration](03-agent-steering-and-rules-configuration.md) +- [Next Chapter: Chapter 5: MCP Integration and External Tools](05-mcp-integration-and-external-tools.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/05-mcp-integration-and-external-tools.md b/tutorials/kiro-tutorial/05-mcp-integration-and-external-tools.md new file mode 100644 index 00000000..374f6922 --- /dev/null +++ b/tutorials/kiro-tutorial/05-mcp-integration-and-external-tools.md @@ -0,0 +1,433 @@ +--- +layout: default +title: "Chapter 5: MCP Integration and External Tools" +nav_order: 5 +parent: Kiro Tutorial +--- + +# Chapter 5: MCP Integration and External Tools + +Welcome to **Chapter 5: MCP Integration and External Tools**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Kiro supports the Model Context Protocol (MCP) to connect external data sources, APIs, and tools to the AI agent. This chapter teaches you how to configure MCP servers and use them within specs and autonomous tasks. + +## Learning Goals + +- understand the MCP protocol and how Kiro uses it to connect external tools +- configure local and remote MCP servers in `.kiro/mcp.json` +- use connected MCP tools within chat and autonomous agent tasks +- build a custom MCP server for project-specific data sources +- manage MCP server authentication and security boundaries + +## Fast Start Checklist + +1. create `.kiro/mcp.json` with at least one MCP server entry +2. restart the Kiro workspace to load the MCP configuration +3. verify the MCP server is listed as active in Kiro settings +4. 
invoke a tool from the connected server in the chat panel +5. delegate an agent task that uses the MCP tool + +## What is MCP? + +MCP (Model Context Protocol) is an open protocol developed by Anthropic that defines how AI models connect to external tools and data sources. Kiro implements MCP as its primary extension mechanism, allowing agents to: + +- query external APIs (GitHub, Jira, Confluence, Slack) +- access databases and internal documentation systems +- call custom business logic via local servers +- retrieve real-time data that is not available in the codebase + +## MCP Server Configuration + +MCP servers are configured in `.kiro/mcp.json`: + +```json +{ + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "${GITHUB_TOKEN}" + } + }, + "postgres": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-postgres", "postgresql://localhost/myapp"], + "env": {} + }, + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/path/to/docs"], + "env": {} + } + } +} +``` + +## Commonly Used MCP Servers + +| Server | Package | Use Case | +|:-------|:--------|:---------| +| GitHub | `@modelcontextprotocol/server-github` | read issues, PRs, and code across repos | +| PostgreSQL | `@modelcontextprotocol/server-postgres` | query and inspect database schema and data | +| Filesystem | `@modelcontextprotocol/server-filesystem` | access documents outside the workspace | +| Brave Search | `@modelcontextprotocol/server-brave-search` | web search for documentation and APIs | +| Slack | `@modelcontextprotocol/server-slack` | read channel messages and user context | +| AWS Docs | custom or community server | query AWS service documentation | + +## Using MCP Tools in Chat + +Once configured, MCP tools are available in every chat interaction: + +``` +# Query GitHub issues: +> List all open issues labeled "bug" in the 
kirodotdev/Kiro repository + +# Query the database: +> Show me the schema of the users table in the PostgreSQL database + +# Search documentation: +> Find the Confluence page describing our API versioning policy + +# The agent calls the appropriate MCP tool automatically and includes +# the results in its response context. +``` + +## Using MCP Tools in Autonomous Agent Tasks + +MCP tools extend the autonomous agent's capabilities for tasks that require external data: + +```markdown +# In tasks.md: +- [ ] 7. Query the GitHub issues API to identify all bugs tagged "auth-related" + and generate a bug summary section in docs/auth-bugs.md +``` + +``` +# Agent execution: +[Agent] Calling MCP tool: github.listIssues(labels=["bug", "auth-related"]) +[Agent] Retrieved 12 issues from kirodotdev/Kiro +[Agent] Generating summary... +[Agent] Writing docs/auth-bugs.md... +[Agent] Task 7 complete. +``` + +## Remote MCP Servers + +For team-shared MCP servers that are not installed locally, use the SSE or HTTP transport: + +```json +{ + "mcpServers": { + "internal-api": { + "url": "https://mcp.internal.company.com/api", + "headers": { + "Authorization": "Bearer ${INTERNAL_API_TOKEN}" + } + }, + "confluence": { + "url": "https://mcp.internal.company.com/confluence", + "headers": { + "Authorization": "Bearer ${CONFLUENCE_TOKEN}" + } + } + } +} +``` + +## Building a Custom MCP Server + +For project-specific data sources, build a custom MCP server using the MCP SDK: + +```typescript +// custom-mcp-server.ts +import { Server } from "@modelcontextprotocol/sdk/server/index.js"; +import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; + +const server = new Server({ + name: "project-data", + version: "1.0.0" +}, { + capabilities: { tools: {} } +}); + +server.setRequestHandler("tools/list", async () => ({ + tools: [{ + name: "get_feature_flags", + description: "Get the current feature flag configuration from the internal config service", + inputSchema: { + type: 
"object", + properties: { + environment: { type: "string", enum: ["dev", "staging", "prod"] } + }, + required: ["environment"] + } + }] +})); + +server.setRequestHandler("tools/call", async (request) => { + if (request.params.name === "get_feature_flags") { + const env = request.params.arguments.environment; + // fetch from internal config service + const flags = await fetchFeatureFlags(env); + return { content: [{ type: "text", text: JSON.stringify(flags) }] }; + } +}); + +const transport = new StdioServerTransport(); +await server.connect(transport); +``` + +Register the custom server in `.kiro/mcp.json`: + +```json +{ + "mcpServers": { + "project-data": { + "command": "npx", + "args": ["ts-node", "./tools/custom-mcp-server.ts"], + "env": { + "CONFIG_SERVICE_URL": "${CONFIG_SERVICE_URL}" + } + } + } +} +``` + +## MCP Security Boundaries + +| Security Concern | Recommended Practice | +|:----------------|:---------------------| +| Credential storage | use environment variable references like `${VAR_NAME}` in mcp.json; never hardcode tokens | +| Network access | restrict MCP servers to read-only access for data sources when possible | +| Tool scoping | list only the tools the agent needs; disable unused tools to reduce attack surface | +| Audit logging | log all MCP tool invocations with arguments for security audit trails | +| Server isolation | run untrusted MCP servers in sandboxed environments (Docker, subprocess isolation) | + +## Source References + +- [Kiro Docs: MCP](https://kiro.dev/docs/mcp) +- [Kiro Docs: MCP Configuration](https://kiro.dev/docs/mcp/configuration) +- [MCP Specification](https://spec.modelcontextprotocol.io) +- [MCP TypeScript SDK](https://github.com/modelcontextprotocol/typescript-sdk) +- [MCP Server Registry](https://github.com/modelcontextprotocol/servers) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now know how to configure MCP servers, use external tools in chat and autonomous tasks, and build custom 
MCP servers for project-specific data sources. + +Next: [Chapter 6: Hooks and Automation](06-hooks-and-automation.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 5: MCP Integration and External Tools** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 5: MCP Integration and External Tools` — Kiro as the MCP client, external MCP servers as tool providers, and the agent as the tool consumer. +2. Separate control-plane decisions (which servers to connect, tool scoping, auth configuration) from data-plane execution (tool invocations, response parsing). +3. Capture input contracts: `.kiro/mcp.json` server definitions; output: tool results injected into agent context. +4. Trace state transitions: config written → workspace restart → server process started → tools registered → agent invokes tools → results returned. +5. Identify extension hooks: custom MCP server implementations, remote SSE/HTTP transports, per-tool access controls. +6. Map ownership boundaries: platform team owns shared remote MCP servers; individual developers own local server configs; security team approves tool scopes. +7. Specify rollback paths: remove server entry from `mcp.json` and restart workspace; revert to previous `mcp.json` via git. +8. Track observability signals: tool invocation logs, latency per tool call, error rates per MCP server, credential rotation alerts. 
+ +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Server type | well-known community MCP packages | custom internal MCP server | ease vs specificity | +| Auth method | env var references in mcp.json | secret manager integration | simplicity vs security posture | +| Tool scope | all tools from a server enabled | explicit tool allowlist per server | ease vs least-privilege | +| Deployment | local npx-based servers | containerized remote servers | zero-setup vs isolation | +| Audit logging | none | full tool invocation audit log | performance vs compliance | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| MCP server not found | agent reports tool unavailable | server not started or misconfigured | verify server entry in mcp.json and restart workspace | +| credential exposure | hardcoded token in mcp.json committed to git | developer bypassed env var pattern | scan mcp.json in CI; enforce env var references | +| tool call timeout | agent stalls waiting for tool response | remote MCP server unavailable or slow | set timeout in mcp.json; add health check for remote servers | +| overprivileged tool | agent queries production data unintentionally | read-write access granted to data source | restrict MCP server to read-only role for non-production use | +| schema mismatch | tool returns unexpected response format | upstream API changed without updating server | add schema validation to custom MCP server response handler | +| context overflow from large tool responses | agent loses context after tool call | tool returns unfiltered large dataset | add response size limits and pagination to custom MCP servers | + +### Implementation Runbook + +1. 
Identify the external data sources or APIs needed for the current feature spec. +2. Select or build an MCP server for each data source. +3. Configure each server in `.kiro/mcp.json` using environment variable references for all credentials. +4. Restart the Kiro workspace to load the new MCP configuration. +5. Verify each server is listed as active in Kiro settings and test one tool call per server. +6. Update the relevant steering file (`project.md` or a new `mcp.md`) to document available MCP tools. +7. Add MCP tool invocations to relevant tasks in `tasks.md` where external data is needed. +8. Monitor tool invocation logs during the first autonomous task execution that uses MCP tools. +9. Commit `mcp.json` to version control with a note listing which environment variables must be set by each developer. + +### Quality Gate Checklist + +- [ ] all credentials in mcp.json use environment variable references, not hardcoded values +- [ ] each MCP server is verified active in Kiro settings before task delegation +- [ ] tool scopes are restricted to the minimum access required for each server +- [ ] a `.env.example` file documents the required environment variables for MCP servers +- [ ] remote MCP servers have a health check endpoint and a timeout configured +- [ ] tool invocation logging is enabled and accessible for audit review +- [ ] mcp.json is committed to version control with a clear setup README +- [ ] CI scans mcp.json and related files for hardcoded credentials on every PR + +### Source Alignment + +- [Kiro Docs: MCP](https://kiro.dev/docs/mcp) +- [Kiro Docs: MCP Configuration](https://kiro.dev/docs/mcp/configuration) +- [MCP Specification](https://spec.modelcontextprotocol.io) +- [MCP TypeScript SDK](https://github.com/modelcontextprotocol/typescript-sdk) +- [MCP Python SDK](https://github.com/modelcontextprotocol/python-sdk) +- [MCP Server Registry](https://github.com/modelcontextprotocol/servers) + +### Cross-Tutorial Connection Map + +- [MCP TypeScript SDK 
Tutorial](../mcp-typescript-sdk-tutorial/) +- [MCP Python SDK Tutorial](../mcp-python-sdk-tutorial/) +- [Awesome MCP Servers Tutorial](../awesome-mcp-servers-tutorial/) +- [Claude Code Tutorial](../claude-code-tutorial/) +- [Chapter 6: Hooks and Automation](06-hooks-and-automation.md) + +### Advanced Practice Exercises + +1. Configure three different MCP servers (GitHub, PostgreSQL, and a custom one) and verify each with a targeted tool call. +2. Build a minimal custom MCP server that exposes one tool reading from a local JSON config file and register it in Kiro. +3. Simulate a credential exposure incident by hardcoding a test token in mcp.json, then fix it with env var references and add a CI scan. +4. Create a Kiro task that requires data from two different MCP servers and confirm the agent orchestrates both tool calls correctly. +5. Set up a remote MCP server with an HTTP transport and configure a timeout; test the timeout behavior by intentionally delaying the server response. + +### Review Questions + +1. What is the difference between a local stdio-based MCP server and a remote HTTP-based MCP server, and when should you use each? +2. Why should all credentials in mcp.json use environment variable references rather than hardcoded values? +3. What tradeoff did you make between enabling all tools from a server and restricting to an explicit allowlist? +4. How would you recover if a custom MCP server returned a schema-breaking response that corrupted an in-progress autonomous task? +5. What must be in the project's README before team members can use a shared MCP configuration? 
+ +### Scenario Playbook 1: MCP - Server Not Launching + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent reports a tool is unavailable after mcp.json was updated with a new server +- initial hypothesis: the MCP server process failed to start due to a missing npm package or wrong command path +- immediate action: run the server command manually in the terminal to see the startup error +- engineering control: add a startup health check to the mcp.json server entry and verify it passes after workspace restart +- verification target: the server appears as active in Kiro settings and a test tool call succeeds +- rollback trigger: if the server cannot start after three attempts, remove the entry from mcp.json and use a fallback approach +- communication step: document the startup error and fix in the project's MCP setup README +- learning capture: add a pre-installation step to the MCP onboarding guide that verifies the required npm packages are installed + +### Scenario Playbook 2: MCP - Credential Hardcoded in mcp.json + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a code review catches a real API token hardcoded in mcp.json before it is merged +- initial hypothesis: the developer copied a working token from a local test rather than using an env var reference +- immediate action: immediately revoke the exposed token and issue a new one before merging the PR +- engineering control: replace the hardcoded value with `${ENV_VAR_NAME}` and add the variable to `.env.example` +- verification target: gitleaks scan on the PR shows zero secrets in mcp.json after the fix +- rollback trigger: if the token was already merged to main, treat it as a confirmed secret exposure and escalate +- communication step: notify the security team and the token owner of the exposure within one hour +- learning capture: add a required gitleaks check to the PR pipeline targeting the `.kiro/` 
directory + +### Scenario Playbook 3: MCP - Tool Call Timeout During Autonomous Task + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: autonomous agent stalls waiting for a response from a remote MCP server during task execution +- initial hypothesis: the remote MCP server is unavailable or experiencing high latency +- immediate action: interrupt the agent and check the remote server's health endpoint +- engineering control: add a `timeout` field to the mcp.json server entry and implement a fallback behavior in the task +- verification target: re-run the task with timeout configured; agent fails gracefully within the timeout window +- rollback trigger: if the remote server is consistently unavailable, switch to a local MCP server for the same data source +- communication step: file an incident report for the remote MCP server team with the timeout details and impact +- learning capture: add timeout configuration as a required field in the team's MCP server onboarding template + +### Scenario Playbook 4: MCP - Overprivileged Database Access + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent accidentally modifies production database records through an MCP server with write access +- initial hypothesis: the PostgreSQL MCP server was configured with a read-write database role +- immediate action: revoke the write permissions from the MCP server's database role immediately +- engineering control: create a dedicated read-only database user for MCP server connections and update mcp.json +- verification target: confirm the agent cannot execute INSERT, UPDATE, or DELETE statements through the MCP server +- rollback trigger: if the database modification caused data corruption, initiate the database recovery runbook +- communication step: notify the DBA team and affected data owners of the unauthorized modification within 30 minutes +- learning capture: add a mandatory read-only 
access requirement to the security steering file for all MCP database servers + +### Scenario Playbook 5: MCP - Large Tool Response Causes Context Overflow + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: agent loses coherence after receiving a large unfiltered response from an MCP tool (e.g., thousands of GitHub issues) +- initial hypothesis: the tool response exceeds the agent's effective context window, pushing out earlier task context +- immediate action: interrupt the agent and redesign the tool call to return only the top 10 most relevant results +- engineering control: add response size limits and filtering parameters to the custom MCP server's tool schema +- verification target: re-run the agent task with the filtered tool call; agent maintains context through task completion +- rollback trigger: if filtering removes critical data, implement pagination and chain two agent calls instead of one +- communication step: update the tool's documentation in the MCP server README with the recommended query parameters +- learning capture: add a maximum response size guideline to the team's custom MCP server development standards + +## What Problem Does This Solve? + +Agentic coding IDEs are limited to what they can see in the local workspace. Kiro's MCP integration breaks this boundary by connecting agents to the full context of an engineering organization: issue trackers, documentation wikis, internal APIs, database schemas, and feature flag systems. This means agents can generate code that references the actual current state of external systems, not just what is hardcoded in the repo. 
+ +In practical terms, this chapter helps you avoid three common failures: + +- generating code against stale or assumed API contracts because the agent cannot see the live API schema +- writing tasks that require human lookups from external systems, breaking the autonomous execution flow +- integrating with external tools through ad-hoc prompt stuffing instead of structured, auditable tool calls + +After working through this chapter, you should be able to treat MCP servers as the API boundary between Kiro agents and your organization's full data ecosystem. + +## How it Works Under the Hood + +Under the hood, `Chapter 5: MCP Integration and External Tools` follows a repeatable control path: + +1. **Server registration**: at workspace load, Kiro reads `mcp.json` and starts each server process using the configured command. +2. **Tool discovery**: Kiro sends a `tools/list` request to each server and registers the returned tool schemas. +3. **Context injection**: available tool names and schemas are injected into the agent's system prompt. +4. **Tool dispatch**: when the agent decides to use a tool, Kiro sends a `tools/call` request to the appropriate server process. +5. **Response integration**: the server's response is formatted and injected into the agent's next context block. +6. **Audit logging**: each tool invocation with its arguments and response size is logged for security and debugging. + +When debugging MCP issues, trace this sequence from server startup through tool registration before investigating individual tool calls. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: MCP](https://kiro.dev/docs/mcp) + Why it matters: the primary reference for how Kiro implements MCP client behavior and mcp.json format. +- [MCP Specification](https://spec.modelcontextprotocol.io) + Why it matters: the canonical protocol definition for tools/list and tools/call message formats. 
+- [MCP TypeScript SDK](https://github.com/modelcontextprotocol/typescript-sdk) + Why it matters: the official SDK for building custom MCP servers in TypeScript. +- [MCP Server Registry](https://github.com/modelcontextprotocol/servers) + Why it matters: the community catalog of ready-to-use MCP servers for common data sources. + +Suggested trace strategy: +- check the MCP server registry before building a custom server to avoid duplicating existing work +- test each new MCP server with a direct stdio call before registering it in mcp.json to isolate startup issues + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 4: Autonomous Agent Mode](04-autonomous-agent-mode.md) +- [Next Chapter: Chapter 6: Hooks and Automation](06-hooks-and-automation.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/06-hooks-and-automation.md b/tutorials/kiro-tutorial/06-hooks-and-automation.md new file mode 100644 index 00000000..530b72f4 --- /dev/null +++ b/tutorials/kiro-tutorial/06-hooks-and-automation.md @@ -0,0 +1,421 @@ +--- +layout: default +title: "Chapter 6: Hooks and Automation" +nav_order: 6 +parent: Kiro Tutorial +--- + +# Chapter 6: Hooks and Automation + +Welcome to **Chapter 6: Hooks and Automation**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Kiro hooks are event-driven triggers that invoke agent actions automatically when specific events occur in the workspace. This chapter teaches you how to build hooks that eliminate repetitive manual workflows. 
+ +## Learning Goals + +- understand the Kiro hook model: events, conditions, and agent actions +- create hooks for common events: file save, test completion, and spec changes +- configure hook conditions to avoid unnecessary agent activations +- combine hooks with steering files for governed automation +- avoid common hook pitfalls like infinite loops and excessive token consumption + +## Fast Start Checklist + +1. create `.kiro/hooks/` directory in your project root +2. create your first hook file (e.g., `on-save-lint.md`) +3. define the event trigger, condition, and agent action in the hook +4. save a file to trigger the hook and observe the agent response +5. review the hook's agent activity log and refine the condition + +## The Hook Model + +Each Kiro hook is a markdown file in `.kiro/hooks/` with three components: + +| Component | Purpose | Example | +|:----------|:--------|:--------| +| event | what triggers the hook | `file:save`, `test:complete`, `spec:updated` | +| condition | when to activate (optional filter) | `file matches "src/**/*.ts"` | +| action | what the agent does when triggered | "run the linter on the saved file and fix any warnings" | + +## Hook File Format + +```markdown +--- +event: file:save +condition: file matches "src/**/*.ts" +--- + +# On TypeScript File Save: Run Lint and Format + +When a TypeScript file in `src/` is saved, run ESLint with the `--fix` flag on the saved file +and apply Prettier formatting. Report any errors that cannot be auto-fixed. 
+``` + +## Built-in Event Types + +| Event | Trigger Condition | +|:------|:-----------------| +| `file:save` | any file is saved in the workspace | +| `file:create` | a new file is created | +| `file:delete` | a file is deleted | +| `test:complete` | a test run finishes (pass or fail) | +| `spec:updated` | a file in `.kiro/specs/` is changed | +| `task:complete` | an autonomous agent task completes | +| `git:commit` | a git commit is made in the workspace | +| `chat:response` | the agent produces a chat response | + +## Example Hooks + +### Auto-Lint on Save + +```markdown +--- +event: file:save +condition: file matches "src/**/*.{ts,tsx}" +--- + +# Auto-Lint TypeScript on Save + +Run ESLint with `--fix` on the saved file. If there are unfixable errors, open the Problems +panel and highlight the first error. Do not modify files other than the one that was saved. +``` + +### Test Failure Analysis + +```markdown +--- +event: test:complete +condition: test_result == "fail" +--- + +# Analyze Test Failures + +When the test run completes with failures, analyze the failing test output and provide: +1. A one-line root cause summary for each failing test +2. The most likely file to fix +3. A suggested code change (do not apply automatically; show in chat) +``` + +### Spec Update Propagation + +```markdown +--- +event: spec:updated +condition: file matches ".kiro/specs/**/requirements.md" +--- + +# Requirements Changed: Check Design Alignment + +When requirements.md is updated, review the current design.md for the same spec and +identify any requirements that are not covered by the existing design. List the gaps +in the chat panel without modifying design.md automatically. +``` + +### Post-Commit Documentation Update + +```markdown +--- +event: git:commit +condition: commit_files include "src/api/**" +--- + +# Update API Documentation After API Commit + +When a commit modifies files in `src/api/`, check whether `docs/api.md` needs to be +updated to reflect the changes. 
If documentation is stale, list the specific sections +that need updating in the chat panel. +``` + +### Task Completion Summary + +```markdown +--- +event: task:complete +--- + +# Task Completion: Generate Summary + +When an autonomous agent task completes, generate a two-sentence summary of what was +changed, which files were modified, and whether all tests are passing. Log the summary +in `.kiro/task-log.md`. +``` + +## Condition Syntax + +Hook conditions filter when the hook activates. Supported condition expressions: + +``` +# File pattern matching +file matches "src/**/*.ts" +file matches "*.test.ts" +file not matches "node_modules/**" + +# Test result conditions +test_result == "fail" +test_result == "pass" +test_count > 0 + +# Git conditions +commit_files include "src/api/**" +commit_message contains "feat:" + +# Logical operators +file matches "src/**/*.ts" AND file not matches "**/*.test.ts" +test_result == "fail" OR test_count == 0 +``` + +## Avoiding Hook Pitfalls + +| Pitfall | Description | Prevention | +|:--------|:------------|:-----------| +| Infinite loop | hook triggers on a file it modifies | add `file not matches` for agent output files | +| Token waste | hook activates on every keystroke or frequent event | add specific conditions to reduce activation frequency | +| Noisy chat | hook produces chat output on common events | direct output to a log file or suppress low-value notifications | +| Unexpected edits | hook agent modifies files beyond its scope | add explicit scope constraints in the hook action | +| Slow workspace | too many hooks activate simultaneously | use `condition` to serialize activation; avoid overlapping triggers | + +## Hook Execution Order + +When multiple hooks activate for the same event, Kiro executes them in alphabetical filename order. 
To control execution order, prefix hook files with numbers: + +``` +.kiro/hooks/ + 00-lint-on-save.md + 01-format-on-save.md + 02-test-on-save.md +``` + +## Disabling Hooks Temporarily + +To disable a hook without deleting it, rename it with a `.disabled` extension: + +```bash +mv .kiro/hooks/on-save-lint.md .kiro/hooks/on-save-lint.md.disabled +``` + +Re-enable by removing the `.disabled` extension and reopening the workspace. + +## Source References + +- [Kiro Docs: Hooks](https://kiro.dev/docs/hooks) +- [Kiro Docs: Hook Events](https://kiro.dev/docs/hooks/events) +- [Kiro Docs: Hook Conditions](https://kiro.dev/docs/hooks/conditions) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now know how to create event-driven hooks that automate repetitive agent actions, configure conditions to avoid noise, and prevent common hook pitfalls. + +Next: [Chapter 7: Multi-Model Strategy and Providers](07-multi-model-strategy-and-providers.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 6: Hooks and Automation** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 6: Hooks and Automation` — the `.kiro/hooks/` directory as the event rule store, the Kiro event bus as the trigger dispatcher, and the agent as the action executor. +2. Separate control-plane decisions (which events to hook, condition design) from data-plane execution (agent action invocation, file modification, chat output). +3. Capture input contracts: hook file with event type, condition expression, and action description; output: agent-executed action on trigger. +4. 
Trace state transitions: hook file written → workspace restart → event bus registers hook → event fires → condition evaluated → agent action invoked → output produced. +5. Identify extension hooks: custom event types via MCP, condition expression extensions, action scope constraints. +6. Map ownership boundaries: developers own feature-specific hooks; team leads own shared hooks in the repository; security team approves hooks that trigger git or publish operations. +7. Specify rollback paths: disable hook by adding `.disabled` extension; revert hook file via git; restart workspace to clear in-flight hook executions. +8. Track observability signals: hook activation frequency, agent action token usage per hook, false-positive activation rate, hook-induced test failures. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Hook scope | narrow file-pattern conditions | broad event hooks with explicit exclusions | simplicity vs coverage | +| Agent action type | read-only analysis and reporting | write operations on source files | safety vs automation level | +| Activation frequency | save-level hooks with debounce | commit-level or task-complete hooks | responsiveness vs cost | +| Output channel | chat panel notifications | log file writes for audit | visibility vs noise | +| Hook governance | individual developer hooks | team-reviewed hooks committed to git | velocity vs consistency | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| infinite loop | hook activates repeatedly on same file | hook modifies the file that triggered it | add exclusion for agent output files in condition | +| token cost spike | unexpectedly high daily token usage | hook activates on high-frequency events without conditions | add specific 
conditions to reduce activation rate | +| noisy chat | chat panel fills with hook notifications | hook outputs to chat on common events | redirect output to a log file for low-priority hooks | +| unexpected file edit | agent modifies unintended files during hook | underconstrained action description | add explicit "do not modify files other than X" constraint in action | +| hook ordering conflict | two hooks produce conflicting changes to the same file | overlapping hook triggers | use numeric prefix to serialize execution; add mutual exclusion conditions | +| slow workspace | every save triggers multiple concurrent agent invocations | too many broad hooks active simultaneously | audit hook conditions and consolidate overlapping triggers | + +### Implementation Runbook + +1. Create `.kiro/hooks/` in the project root. +2. Identify the three highest-value repetitive manual workflows in your daily development cycle. +3. Write one hook file per workflow using the event/condition/action format. +4. Save a test file to trigger the first `file:save` hook and verify the agent's action output. +5. Check the agent activity log for the hook invocation and confirm the output is correct. +6. Add numeric prefixes to hooks that share the same event to control execution order. +7. Test the full hook set after a real coding session and identify any noise or false activations. +8. Refine condition expressions to reduce false activations and commit the final hook set. +9. Document each hook's purpose and expected behavior in a `.kiro/hooks/README.md`. 
+ +### Quality Gate Checklist + +- [ ] all hooks have a condition expression to prevent broad activation +- [ ] no hook modifies a file that could re-trigger the same event (infinite loop prevention) +- [ ] hook action descriptions include explicit scope constraints on file modifications +- [ ] hooks are tested with a real event before committing to version control +- [ ] high-frequency event hooks (file:save) use specific file pattern conditions +- [ ] a hooks README documents each hook's purpose and expected activation rate +- [ ] token usage is monitored after adding new hooks for the first week +- [ ] disabled hooks use the `.disabled` extension naming convention for easy re-enabling + +### Source Alignment + +- [Kiro Docs: Hooks](https://kiro.dev/docs/hooks) +- [Kiro Docs: Hook Events](https://kiro.dev/docs/hooks/events) +- [Kiro Docs: Hook Conditions](https://kiro.dev/docs/hooks/conditions) +- [Kiro Docs: Hook Action Constraints](https://kiro.dev/docs/hooks/actions) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [Claude Code Tutorial](../claude-code-tutorial/) +- [N8N AI Tutorial](../n8n-ai-tutorial/) +- [Activepieces Tutorial](../activepieces-tutorial/) +- [GitHub MCP Server Tutorial](../github-mcp-server-tutorial/) +- [Chapter 7: Multi-Model Strategy and Providers](07-multi-model-strategy-and-providers.md) + +### Advanced Practice Exercises + +1. Build a hook that triggers on test failure, analyzes the failing test, and writes a diagnostic summary to a log file. +2. Create a hook that checks documentation freshness when API files are committed and lists stale doc sections. +3. Simulate an infinite loop scenario by creating a hook that modifies the file it watches; then fix it with an exclusion condition. +4. Monitor token usage for one week with three active hooks and calculate the cost per activation for each hook type. +5. 
Design a hook governance proposal: define which hooks require team review before merging to main and which can be individual developer hooks. + +### Review Questions + +1. What is the difference between a `file:save` hook and a `git:commit` hook, and when is each more appropriate? +2. How do you prevent an infinite loop when a hook agent modifies a source file? +3. What tradeoff did you make between hook responsiveness (save-level) and token efficiency (commit-level)? +4. How would you recover if a hook introduced a test failure by auto-applying a lint fix that broke logic? +5. What governance process should control hooks that trigger write operations on shared source files? + +### Scenario Playbook 1: Hooks - Infinite Loop + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a file:save hook that applies lint fixes re-triggers itself every time it saves the fixed file +- initial hypothesis: the hook condition does not exclude the files the agent modifies after applying fixes +- immediate action: disable the hook immediately by adding the `.disabled` extension to stop the loop +- engineering control: add `file not matches ".kiro/agent-output/**"` or a similar exclusion to the hook condition +- verification target: save a TypeScript file and confirm the hook activates only once per developer save +- rollback trigger: if the exclusion condition is too broad and blocks legitimate activations, narrow the exclusion pattern +- communication step: document the infinite loop incident and fix in the hooks README +- learning capture: add "check for self-triggering loops" as a required review step in the hook PR checklist + +### Scenario Playbook 2: Hooks - Token Cost Spike + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: daily token usage spikes after adding a file:save hook without a file pattern condition +- initial hypothesis: the hook is activating on every file save including 
node_modules and build artifacts +- immediate action: disable the hook and check the activation log for unexpected trigger files +- engineering control: add a specific file pattern condition: `file matches "src/**/*.ts" AND file not matches "node_modules/**"` +- verification target: token usage returns to baseline levels after the condition is applied +- rollback trigger: if token cost remains high after condition refinement, switch the event to `git:commit` instead of `file:save` +- communication step: share the token cost findings with the team and add a token budget guideline to the hook governance doc +- learning capture: add token cost estimation to the hook design process before activating a new hook in production + +### Scenario Playbook 3: Hooks - Noisy Chat Panel + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: the chat panel is filled with low-value hook notifications every time a file is saved +- initial hypothesis: the hook is configured to output its findings to the chat panel for events that occur too frequently +- immediate action: redirect the hook's output from chat to a log file: `.kiro/hook-log.md` +- engineering control: use chat output only for high-priority hooks (test failure analysis, security warnings); use log files for routine hooks +- verification target: chat panel shows only actionable notifications; routine logs are in `.kiro/hook-log.md` +- rollback trigger: if log file grows too large, add a rotation mechanism or summarize logs daily +- communication step: update the hooks README with the output channel conventions for the team +- learning capture: add output channel selection as a required design decision in the hook template + +### Scenario Playbook 4: Hooks - Unexpected File Edit + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a hook agent modifies files beyond the intended scope during a file:save trigger +- initial hypothesis: the 
hook action description was underspecified and allowed the agent to infer additional scope +- immediate action: revert the unintended file modifications using `git checkout` +- engineering control: add explicit "do not modify files other than the saved file" constraint to the hook action description +- verification target: re-trigger the hook and confirm only the specified file is modified +- rollback trigger: if the constraint causes the hook to produce incomplete output, split into two hooks with different scopes +- communication step: document the out-of-scope modification in the hook's revision history +- learning capture: add scope constraint as a mandatory field in the hook file template + +### Scenario Playbook 5: Hooks - Hook Ordering Conflict + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: two hooks triggered by the same file:save event produce conflicting changes to the same file +- initial hypothesis: both hooks modify the same file without coordination, and their execution order is non-deterministic +- immediate action: disable the conflicting hook and manually merge the intended changes +- engineering control: add numeric prefixes to both hooks to enforce serial execution and add a mutual exclusion condition to the second hook +- verification target: save a test file and confirm hook 1 completes before hook 2 activates, with no conflicting changes +- rollback trigger: if serial execution still produces conflicts, merge the two hooks into one combined hook +- communication step: document the merge decision and the new combined hook in the team's hooks change log +- learning capture: add a "check for file overlap with existing hooks" step to the hook PR review checklist + +## What Problem Does This Solve? + +Repetitive manual workflows are the silent tax on engineering productivity. 
Every time a developer saves a file and then manually runs lint, checks test failures, and updates documentation, they are doing work that follows a predictable pattern. Kiro hooks eliminate this tax by encoding the "what happens next" logic as event-driven agents that run automatically. + +In practical terms, this chapter helps you avoid three common failures: + +- letting lint errors accumulate because running the linter is a separate manual step that gets skipped under deadline pressure +- discovering test failures hours after they were introduced because no automated analysis ran at the point of change +- letting documentation drift because doc updates are always "the next task" that never gets done + +After working through this chapter, you should be able to treat `.kiro/hooks/` as a team-owned library of automation patterns that encode the team's quality practices as first-class workspace behavior. + +## How it Works Under the Hood + +Under the hood, `Chapter 6: Hooks and Automation` follows a repeatable control path: + +1. **Hook registration**: at workspace open, Kiro scans `.kiro/hooks/` and registers each hook with the event bus. +2. **Event detection**: the Kiro event bus monitors workspace state for registered event types. +3. **Condition evaluation**: when an event fires, Kiro evaluates the hook's condition expression against the event context. +4. **Agent dispatch**: if the condition passes, Kiro dispatches the hook action to an agent with the event context as input. +5. **Action execution**: the agent executes the action, potentially reading files, writing output, or running commands. +6. **Result routing**: the agent's output is routed to the configured channel (chat panel or log file). + +When debugging hook issues, verify each stage: hook registered, event fired, condition evaluated, agent dispatched, action completed, output routed. 
+ +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: Hooks](https://kiro.dev/docs/hooks) + Why it matters: the primary reference for hook file format, event types, and condition syntax. +- [Kiro Docs: Hook Events](https://kiro.dev/docs/hooks/events) + Why it matters: documents all available event types and the context data available for condition evaluation. +- [Kiro Docs: Hook Conditions](https://kiro.dev/docs/hooks/conditions) + Why it matters: defines the condition expression language and supported operators. +- [Kiro Docs: Hook Action Constraints](https://kiro.dev/docs/hooks/actions) + Why it matters: explains how to scope hook agent actions to prevent unintended side effects. + +Suggested trace strategy: +- check the hook events docs for the exact context variables available before writing condition expressions +- test each hook with the minimal possible condition before expanding to broader file pattern matching + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 5: MCP Integration and External Tools](05-mcp-integration-and-external-tools.md) +- [Next Chapter: Chapter 7: Multi-Model Strategy and Providers](07-multi-model-strategy-and-providers.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/07-multi-model-strategy-and-providers.md b/tutorials/kiro-tutorial/07-multi-model-strategy-and-providers.md new file mode 100644 index 00000000..d7aa97b2 --- /dev/null +++ b/tutorials/kiro-tutorial/07-multi-model-strategy-and-providers.md @@ -0,0 +1,390 @@ +--- +layout: default +title: "Chapter 7: Multi-Model Strategy and Providers" +nav_order: 7 +parent: Kiro Tutorial +--- + +# Chapter 7: Multi-Model Strategy and Providers + +Welcome to **Chapter 7: Multi-Model Strategy and Providers**. 
In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Kiro uses Claude Sonnet 4.0 and 3.7 by default and routes different task types to different model configurations. This chapter teaches you how to configure the model strategy for your team's workload profile. + +## Learning Goals + +- understand Kiro's default model routing between Claude Sonnet 4.0 and 3.7 +- configure model preferences for different task categories +- understand the cost and latency tradeoffs between model tiers +- set up budget controls and usage monitoring +- plan model upgrades as new Claude versions become available + +## Fast Start Checklist + +1. open Kiro settings and navigate to the Model section +2. confirm the default model is Claude Sonnet 4.0 +3. optionally override to Claude Sonnet 3.7 for faster or lower-cost interactive chat +4. set a daily token budget for cost control +5. review the model usage dashboard after a full session + +## Default Model Configuration + +Kiro ships with two default model profiles: + +| Profile | Model | Best For | +|:--------|:------|:---------| +| Primary | Claude Sonnet 4.0 | autonomous agent tasks, spec generation, complex code synthesis | +| Fast | Claude Sonnet 3.7 | interactive chat, quick edits, explanation and Q&A | + +Kiro automatically selects the appropriate model based on the interaction type. You can override this selection for specific use cases. 
 + +## Model Configuration in Settings + +```json +{ +  "models": { +    "primary": { +      "provider": "anthropic", +      "model": "claude-sonnet-4-0", +      "maxTokens": 8192, +      "temperature": 0.1 +    }, +    "fast": { +      "provider": "anthropic", +      "model": "claude-sonnet-3-7", +      "maxTokens": 4096, +      "temperature": 0.2 +    }, +    "routing": { +      "specGeneration": "primary", +      "taskExecution": "primary", +      "interactiveChat": "fast", +      "hookActions": "fast", +      "codeExplanation": "fast" +    } +  } +} +``` + +## Claude Sonnet 4.0 vs. 3.7 + +| Capability | Claude Sonnet 4.0 | Claude Sonnet 3.7 | +|:-----------|:-----------------|:-----------------| +| Code synthesis quality | higher | good | +| Multi-step reasoning | stronger | capable | +| Response latency | moderate | faster | +| Cost per token | higher | lower | +| Context window | 200k tokens | 200k tokens | +| Best use case | spec generation, complex tasks | chat, quick edits | + +## Task-to-Model Routing + +Map task types to model profiles based on your team's cost and quality priorities: + +```jsonc +{ +  "models": { +    "routing": { +      "specGeneration": "primary",      // requirements → design → tasks: quality matters most +      "taskExecution": "primary",       // autonomous agent: complex multi-step reasoning +      "codeReview": "primary",          // security and correctness review: quality matters +      "interactiveChat": "fast",        // quick Q&A and exploration: speed matters +      "hookActions": "fast",            // frequent event-driven actions: cost matters +      "codeExplanation": "fast",        // explaining existing code: speed and cost +      "documentationUpdate": "fast"     // doc updates: lower complexity +    } +  } +} +``` + +## Budget Controls + +Set daily and monthly token budgets to prevent unexpected cost spikes: + +```json +{ +  "budget": { +    "daily": { +      "inputTokens": 500000, +      "outputTokens": 200000, +      "alertThreshold": 0.8, +      "action": "notify" +    }, +    "monthly": { +      "inputTokens": 10000000, +      "outputTokens": 4000000, +      "alertThreshold": 0.9, +      "action": "restrict" +    } +  } +} +``` + +Budget 
actions: +- `notify`: send an alert to the chat panel when the threshold is reached +- `restrict`: switch all routing to the `fast` (lower-cost) model when the threshold is reached +- `pause`: stop all agent activity and require manual reset when the limit is reached + +## Usage Monitoring + +Track model usage in the Kiro dashboard: + +``` +# In the Chat panel: +> /usage + +# Output: +Session token usage: + Input: 47,832 tokens (Claude Sonnet 4.0: 31,200 | Claude Sonnet 3.7: 16,632) + Output: 12,441 tokens (Claude Sonnet 4.0: 9,800 | Claude Sonnet 3.7: 2,641) + Estimated cost: $0.43 + +Daily usage: 182,341 input / 48,902 output tokens (36% of daily budget) +``` + +## Cost Optimization Patterns + +| Pattern | Description | Token Savings | +|:--------|:------------|:-------------| +| Route chat to fast model | use Sonnet 3.7 for all interactive chat | 30-50% reduction on chat costs | +| Scope task context | pass only relevant spec sections to agents | 20-40% reduction per task | +| Compress steering files | remove redundant rules from steering files | 5-15% reduction on base context | +| Limit hook frequency | use commit-level hooks instead of save-level | 60-80% reduction on hook costs | +| Batch spec generation | generate all spec documents in one call | 10-20% reduction vs. sequential calls | + +## Preparing for Model Upgrades + +When AWS releases a new Claude version in Kiro, follow this upgrade protocol: + +1. review the release notes for the new model version +2. test spec generation on a sample feature spec with the new model +3. compare output quality against the previous model on the same spec +4. if quality is equal or better, update the `primary` routing to the new model +5. run the full test suite on an autonomous agent task using the new model +6. monitor token usage for the first week on the new model +7. 
update the model configuration in version control and notify the team + +## Source References + +- [Kiro Docs: Model Configuration](https://kiro.dev/docs/models) +- [Kiro Docs: Budget Controls](https://kiro.dev/docs/models/budget) +- [Anthropic Models Overview](https://docs.anthropic.com/en/docs/models-overview) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now know how to configure Kiro's model routing, set budget controls, monitor usage, and plan for model upgrades. + +Next: [Chapter 8: Team Operations and Governance](08-team-operations-and-governance.md) + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 7: Multi-Model Strategy and Providers** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. Define the runtime boundary for `Chapter 7: Multi-Model Strategy and Providers` — the model routing layer, the budget controller, and the provider API gateway. +2. Separate control-plane decisions (model selection, routing policy, budget limits) from data-plane execution (token generation, inference calls). +3. Capture input contracts: task type classification from interaction context; output: model-routed inference request and response. +4. Trace state transitions: task initiated → type classified → routing rule applied → model selected → request sent → response received → cost tracked. +5. Identify extension hooks: custom routing rules per task type, budget action policies, provider failover paths. +6. Map ownership boundaries: developers choose fast/primary preference; team leads set routing policy; finance owns budget limits. +7. 
Specify rollback paths: switch routing back to previous model; restore budget settings from version control. +8. Track observability signals: token consumption per model per task type, cost per session, budget threshold alerts, model latency distribution. + +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Model selection | Kiro defaults (Sonnet 4.0 primary) | explicit routing per task type | ease vs cost optimization | +| Budget controls | monthly soft cap with notification | daily hard cap with auto-restrict | flexibility vs cost certainty | +| Upgrade cadence | upgrade immediately on release | validation protocol before upgrade | speed vs quality assurance | +| Usage monitoring | check manually via /usage | automated daily usage reports | effort vs visibility | +| Cost allocation | project-level budget | per-developer or per-team budgets | simplicity vs granularity | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| budget overrun | unexpected high token usage | hooks or autonomous tasks using primary model at high frequency | audit routing config and redirect high-frequency actions to fast model | +| model quality regression | lower spec generation quality after upgrade | new model performs differently on the team's task profile | run quality benchmark before upgrading primary model | +| provider outage | 503 errors on model API calls | Anthropic service disruption | configure fallback model or degrade to interactive-only mode | +| token waste on large contexts | high input token counts for simple tasks | full codebase context sent for small tasks | scope context explicitly in task descriptions | +| routing misconfiguration | wrong model used for expensive tasks | misconfigured routing JSON | audit routing 
config and verify with /usage after changes | +| cost spike from hook frequency | daily budget hits threshold early | save-level hooks using primary model | switch hook routing to fast model and add conditions to reduce frequency | + +### Implementation Runbook + +1. Review the Kiro model documentation to understand the current Claude Sonnet 4.0 and 3.7 capability profiles. +2. Map your team's top five task types to the appropriate model tier based on quality vs. cost priority. +3. Configure the routing policy in Kiro settings or `.kiro/settings.json`. +4. Set a daily token budget with a notify action at 80% of the limit. +5. Run a full one-day session with the new configuration and review the `/usage` output. +6. Identify the three highest-cost task types and optimize their routing or context scope. +7. Set the monthly budget with a restrict action at 90% of the limit. +8. Document the model routing rationale in `.kiro/settings.json` comments for team transparency. +9. Schedule a quarterly model upgrade review to assess whether new Claude versions improve quality or reduce cost. 
+ +### Quality Gate Checklist + +- [ ] routing policy is explicitly configured for at least five task types in settings +- [ ] daily and monthly token budgets are set with appropriate alert thresholds +- [ ] budget action for monthly limit is set to `restrict` or `pause` to prevent overruns +- [ ] `/usage` is reviewed after the first full day with the new routing configuration +- [ ] high-frequency hook actions are routed to the fast model +- [ ] a model upgrade validation protocol is documented before the first upgrade +- [ ] routing configuration is committed to version control with clear comments +- [ ] team members are informed of the routing policy and budget limits + +### Source Alignment + +- [Kiro Docs: Model Configuration](https://kiro.dev/docs/models) +- [Kiro Docs: Budget Controls](https://kiro.dev/docs/models/budget) +- [Kiro Docs: Usage Dashboard](https://kiro.dev/docs/models/usage) +- [Anthropic Models Overview](https://docs.anthropic.com/en/docs/models-overview) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [LiteLLM Tutorial](../litellm-tutorial/) +- [Claude Code Tutorial](../claude-code-tutorial/) +- [OpenCode Tutorial](../opencode-tutorial/) +- [Cline Tutorial](../cline-tutorial/) +- [Chapter 8: Team Operations and Governance](08-team-operations-and-governance.md) + +### Advanced Practice Exercises + +1. Configure a complete routing policy for six task types and document the quality vs. cost rationale for each routing decision. +2. Run identical spec generation tasks with Sonnet 4.0 and Sonnet 3.7 and compare output quality in a structured evaluation table. +3. Simulate a budget overrun by setting a very low daily limit and observe the restrict action behavior; then restore the correct limit. +4. Build a model upgrade validation checklist for your team's specific task profile and run it against a hypothetical new Claude version. +5. 
Analyze one week of `/usage` output and identify the top three opportunities to reduce token consumption without reducing quality. + +### Review Questions + +1. Why does Kiro route spec generation to the primary (Sonnet 4.0) model rather than the fast model by default? +2. What is the difference between the `restrict` and `pause` budget actions, and when should you use each? +3. What tradeoff did you make between model quality and cost when routing hook actions to the fast model? +4. How would you validate that a new Claude model version is safe to use as the primary routing target for your team's spec generation tasks? +5. What conditions trigger an automatic routing switch in Kiro's budget control system? + +### Scenario Playbook 1: Model Strategy - Budget Overrun from Hook Frequency + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: daily token budget alert fires at 9am because file:save hooks are consuming primary model tokens at high frequency +- initial hypothesis: hooks are routing to the primary model and activating on every TypeScript file save in a large codebase +- immediate action: switch all hook routing to the fast model and add file-pattern conditions to reduce activation rate +- engineering control: update the routing config to explicitly map `hookActions` to `fast` model +- verification target: token usage at end of day stays below 60% of the daily budget after routing change +- rollback trigger: if fast model produces lower-quality hook outputs that are actionable, add a flag for critical hooks to use primary +- communication step: notify the team of the routing change and explain the cost rationale +- learning capture: add hook routing as a required configuration step in the team's Kiro onboarding checklist + +### Scenario Playbook 2: Model Strategy - Quality Regression After Upgrade + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: spec generation 
quality drops noticeably after the team upgraded to a new Claude version +- initial hypothesis: the new model has different default behaviors for EARS requirement parsing and design generation +- immediate action: revert the primary model routing to the previous version while the quality issue is investigated +- engineering control: run the quality benchmark suite on the new model version and document the delta +- verification target: benchmark scores for spec generation match or exceed the previous model version +- rollback trigger: if the new model cannot match previous quality after prompt adjustments, remain on the previous version +- communication step: share the benchmark results with the team and the model upgrade status +- learning capture: add a quality benchmark run as a mandatory step before any future model version upgrade + +### Scenario Playbook 3: Model Strategy - Provider Outage + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: Anthropic API returns 503 errors causing all Kiro model calls to fail +- initial hypothesis: the Anthropic service is experiencing an outage affecting the Claude Sonnet endpoints +- immediate action: check the Anthropic status page and switch Kiro to interactive-only mode for in-flight autonomous tasks +- engineering control: configure a fallback model in Kiro settings pointing to an alternative provider if available +- verification target: team can continue interactive chat in degraded mode while the outage is active +- rollback trigger: restore full model routing once Anthropic reports the incident resolved +- communication step: notify the team of the outage status and expected recovery time from the Anthropic status page +- learning capture: add provider outage response steps to the team's Kiro incident runbook + +### Scenario Playbook 4: Model Strategy - Token Waste on Large Contexts + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger 
condition: `/usage` shows extremely high input token counts for tasks that should be simple +- initial hypothesis: the agent is loading the full codebase context for tasks that only require a single file or module +- immediate action: add explicit context constraints to the task descriptions in tasks.md: "only read files in src/auth/" +- engineering control: update the spec generation prompt template to include a "context scope" field for each task +- verification target: input token count per task decreases by at least 30% after scope constraints are applied +- rollback trigger: if scope constraints cause the agent to miss necessary context, expand the scope incrementally +- communication step: share the context scoping pattern with the team as a best practice in the Kiro usage guide +- learning capture: add a context scope field to the tasks.md template and document the expected files per task type + +### Scenario Playbook 5: Model Strategy - Routing Misconfiguration + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: interactive chat is using the primary (Sonnet 4.0) model despite routing being configured for fast model +- initial hypothesis: the routing configuration in settings.json has a syntax error or the key name does not match Kiro's expected format +- immediate action: validate the settings.json against the Kiro settings schema and fix any key name mismatches +- engineering control: add a JSON schema validation step to the CI pipeline for `.kiro/settings.json` +- verification target: `/usage` confirms interactive chat is routed to Sonnet 3.7 after the configuration fix +- rollback trigger: if schema validation is not feasible, revert settings.json to the last known good commit +- communication step: share the corrected settings.json format with the team and update the configuration docs +- learning capture: add a settings.json validation step to the Kiro onboarding checklist + +## What Problem Does This Solve? 
+ +Most agentic coding tools treat model selection as a binary choice. Kiro's multi-model routing strategy recognizes that different task types have fundamentally different quality and cost requirements. Spec generation demands the highest-quality reasoning; interactive chat demands the lowest latency. Routing these to the same model either wastes money on fast interactions or underserves the tasks that matter most. + +In practical terms, this chapter helps you avoid three common failures: + +- paying primary-model prices for every lint check, code explanation, and quick question +- using a fast model for spec generation and getting design documents that miss key architectural considerations +- running out of daily token budget before the high-value autonomous tasks run + +After working through this chapter, you should be able to treat model routing as a cost-quality optimization policy that is explicit, versioned, and tuned to your team's actual workload distribution. + +## How it Works Under the Hood + +Under the hood, `Chapter 7: Multi-Model Strategy and Providers` follows a repeatable control path: + +1. **Task type classification**: Kiro inspects the interaction type (chat, spec generation, hook action, etc.) to classify the task. +2. **Routing rule lookup**: the routing policy in settings is consulted to select the model profile for the task type. +3. **Budget check**: before dispatching, Kiro checks the current usage against the configured budget limits. +4. **Model API call**: Kiro sends the inference request to the Anthropic API endpoint for the selected model. +5. **Response tracking**: the token counts from the API response are recorded against the session and daily budgets. +6. **Usage aggregation**: the dashboard aggregates usage by model, task type, and time window for monitoring. 
+ +When debugging cost or quality issues, trace this sequence from task classification through budget tracking to identify where the routing or consumption is diverging from expectations. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: Model Configuration](https://kiro.dev/docs/models) + Why it matters: the primary reference for routing configuration format and available model identifiers. +- [Kiro Docs: Budget Controls](https://kiro.dev/docs/models/budget) + Why it matters: documents the exact budget action behaviors and threshold configuration options. +- [Anthropic Models Overview](https://docs.anthropic.com/en/docs/models-overview) + Why it matters: the canonical reference for Claude model capabilities, context windows, and pricing tiers. +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + Why it matters: source for model configuration schema and community discussions on routing strategies. 
+ +Suggested trace strategy: +- check the Anthropic models page before configuring routing to confirm the current model identifier strings +- run `/usage` after each configuration change to confirm routing is working as intended + +## Chapter Connections + +- [Tutorial Index](index.md) +- [Previous Chapter: Chapter 6: Hooks and Automation](06-hooks-and-automation.md) +- [Next Chapter: Chapter 8: Team Operations and Governance](08-team-operations-and-governance.md) +- [Main Catalog](../../README.md#-tutorial-catalog) +- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) diff --git a/tutorials/kiro-tutorial/08-team-operations-and-governance.md b/tutorials/kiro-tutorial/08-team-operations-and-governance.md new file mode 100644 index 00000000..80e0f131 --- /dev/null +++ b/tutorials/kiro-tutorial/08-team-operations-and-governance.md @@ -0,0 +1,431 @@ +--- +layout: default +title: "Chapter 8: Team Operations and Governance" +nav_order: 8 +parent: Kiro Tutorial +--- + +# Chapter 8: Team Operations and Governance + +Welcome to **Chapter 8: Team Operations and Governance**. In this part of **Kiro Tutorial: Spec-Driven Agentic IDE from AWS**, you will build an intuitive mental model first, then move into concrete implementation details and practical production tradeoffs. + + +Running Kiro at team scale requires deliberate governance around spec ownership, steering file reviews, autonomous delegation boundaries, and AWS-native identity integration. This chapter provides the operational playbook for production team deployments. + +## Learning Goals + +- design a team-scale Kiro configuration repository structure +- establish governance workflows for steering file and spec changes +- configure AWS IAM Autopilot and other Kiro Powers for enterprise environments +- set up shared MCP servers and hook libraries for team consistency +- define escalation and incident response procedures for autonomous agent failures + +## Fast Start Checklist + +1. 
create a shared `.kiro/` configuration repository or add governance files to your existing monorepo +2. define ownership rules for steering files (who approves security.md, project.md) +3. configure AWS IAM Autopilot if your team uses AWS services +4. establish a PR review policy for changes to `.kiro/specs/`, `.kiro/steering/`, and `.kiro/hooks/` +5. run a team onboarding session using the governance checklist + +## Team Configuration Repository Structure + +For large teams, maintain `.kiro/` as a shared configuration source committed to version control: + +``` +.kiro/ + specs/ ← feature specs (PR review required for tasks.md changes) + user-authentication/ + requirements.md + design.md + tasks.md + steering/ ← AI behavior rules (architect + security review required) + 00-project.md + 01-coding-style.md + 02-testing.md + 03-security.md + hooks/ ← automation rules (team lead review required) + 00-lint-on-save.md + 01-test-on-fail.md + mcp.json ← MCP server config (security review required for new servers) + settings.json ← model routing and budget config (team lead approval) + task-log.md ← auto-updated by hooks; read-only for humans +``` + +## PR Review Policy for Kiro Configuration + +| File/Directory | Required Reviewers | Review Criteria | +|:---------------|:-------------------|:----------------| +| `.kiro/steering/00-project.md` | architecture lead | technology decisions aligned with roadmap | +| `.kiro/steering/03-security.md` | security engineer | no security policy downgrades; OWASP coverage | +| `.kiro/specs/*/requirements.md` | product owner | EARS syntax compliance; acceptance criteria present | +| `.kiro/specs/*/design.md` | senior engineer | architecture coherence; data model correctness | +| `.kiro/specs/*/tasks.md` | tech lead | task scope bounded; order correct; no rogue tasks | +| `.kiro/hooks/` | team lead | no infinite-loop risk; conditions present; token efficiency | +| `.kiro/mcp.json` | security engineer | no hardcoded credentials; read-only 
scopes verified | +| `.kiro/settings.json` | engineering manager | budget limits set; routing policy documented | + +## Kiro Powers: AWS IAM Autopilot + +Kiro Powers are extensible capability modules that integrate Kiro with external systems. The first Power is **AWS IAM Autopilot**, which enables Kiro agents to interact with AWS IAM for automated permission analysis and remediation. + +### What AWS IAM Autopilot Does + +- analyzes IAM policies for over-permissioned roles and unused permissions +- generates least-privilege IAM policy recommendations based on actual CloudTrail usage +- creates GitHub PRs with suggested policy changes for human review and approval +- monitors new IAM policy changes and alerts on permission escalation patterns + +### Enabling AWS IAM Autopilot + +```json +// .kiro/settings.json +{ + "powers": { + "awsIamAutopilot": { + "enabled": true, + "awsRegion": "us-east-1", + "awsAccountId": "${AWS_ACCOUNT_ID}", + "cloudtrailLogGroup": "${CLOUDTRAIL_LOG_GROUP}", + "prRepository": "org/infrastructure", + "alertOnEscalation": true, + "escalationAlertChannel": "#security-alerts" + } + } +} +``` + +### IAM Autopilot Workflow + +``` +# In the Chat panel: +> Analyze IAM permissions for the ECS task role used by the auth service + +[Agent] Querying CloudTrail logs for role: ecs-auth-service-role (last 90 days) +[Agent] Identified 12 permissions used, 31 permissions granted but never used +[Agent] Generating least-privilege policy recommendation... 
+[Agent] Creating PR in org/infrastructure: "iam: reduce auth-service-role to least privilege" +[Agent] PR #847 created: https://github.com/org/infrastructure/pull/847 +``` + +### IAM Autopilot Safety Controls + +| Control | Configuration | Purpose | +|:--------|:--------------|:--------| +| PR-only mode | `"mode": "pr-only"` | agent creates PRs but never applies changes directly | +| CloudTrail lookback window | `"lookbackDays": 90` | controls the analysis window for permission usage | +| Escalation alerts | `"alertOnEscalation": true` | notifies security team when new policy grants exceed baseline | +| Scope restriction | `"targetRoles": ["ecs-*", "lambda-*"]` | limits analysis to specific IAM role name patterns | + +## Team Onboarding Workflow + +```markdown +# Kiro Team Onboarding Checklist + +## Installation (each developer) +- [ ] Download Kiro from kiro.dev for their platform +- [ ] Authenticate with the team's preferred provider (GitHub/AWS Builder ID) +- [ ] Clone the project repository and open in Kiro +- [ ] Verify the .kiro/ directory is loaded and steering files are active + +## Environment Setup (each developer) +- [ ] Copy .env.example to .env and fill in MCP server credentials +- [ ] Verify each MCP server is active in Kiro settings +- [ ] Run /usage and confirm the model routing is correct +- [ ] Read all steering files in .kiro/steering/ to understand team conventions + +## Spec Workflow Training (each developer) +- [ ] Read an existing completed spec (requirements.md → design.md → tasks.md) +- [ ] Create a practice spec for a small personal task +- [ ] Run one autonomous agent task and review the activity log +- [ ] Participate in one spec review PR as a reviewer + +## Governance Training (tech leads and senior engineers) +- [ ] Review the PR review policy for .kiro/ changes +- [ ] Complete the security steering file review checklist +- [ ] Understand the escalation path for autonomous agent incidents +- [ ] Configure budget alerts and test the 
notification flow +``` + +## Autonomous Agent Incident Response + +When an autonomous agent causes an unexpected outcome in a shared environment: + +```markdown +# Kiro Autonomous Agent Incident Runbook + +## Immediate Response (< 5 minutes) +1. Interrupt the agent execution (Escape or Stop Agent button) +2. Run `git status` to identify all modified files +3. Run `git stash` or `git checkout -- .` to revert unintended changes +4. Record the task description and agent activity log for investigation + +## Investigation (< 30 minutes) +5. Review the agent activity log for the last 20 steps before the incident +6. Identify the specific decision point that led to the unintended outcome +7. Determine whether the root cause is: task underspecification, missing steering rule, or agent reasoning failure + +## Remediation (< 2 hours) +8. Update the task description or steering file to prevent recurrence +9. Re-run the task in supervised mode to verify the fix +10. Commit the updated task/steering with a clear incident reference in the commit message + +## Communication +11. Notify the team in the incident channel with: what happened, what was reverted, and what was changed +12. Add the incident to the team's Kiro incident log +13. 
Schedule a 15-minute retrospective if the incident involved production-adjacent changes +``` + +## Shared Hook Library + +Establish a shared hook library that all team members use: + +``` +.kiro/hooks/ + 00-lint-on-save.md ← maintained by: team (any member) + 01-test-failure-analysis.md ← maintained by: quality lead + 02-spec-freshness-check.md ← maintained by: architecture lead + 03-security-scan-on-commit.md ← maintained by: security engineer + 04-doc-update-reminder.md ← maintained by: tech writer +``` + +Each hook file includes a header comment identifying its owner and last review date: + +```markdown +--- +event: file:save +condition: file matches "src/**/*.ts" +owner: team +last_reviewed: 2025-10-15 +--- + +# Auto-Lint TypeScript on Save +... +``` + +## Source References + +- [Kiro Docs: Team Setup](https://kiro.dev/docs/team) +- [Kiro Docs: Powers](https://kiro.dev/docs/powers) +- [Kiro Docs: AWS IAM Autopilot](https://kiro.dev/docs/powers/iam-autopilot) +- [Kiro Docs: Governance](https://kiro.dev/docs/governance) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +## Summary + +You now have the operational playbook for team-scale Kiro deployment: governance structure, PR review policies, AWS IAM Autopilot configuration, team onboarding workflow, and autonomous agent incident response. + +--- + +**You have completed the Kiro Tutorial.** Return to the [Tutorial Index](index.md) to explore related tutorials. + +## Depth Expansion Playbook + + + +This chapter is expanded to v1-style depth for production-grade learning and implementation quality. + +### Strategic Context + +- tutorial: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- tutorial slug: **kiro-tutorial** +- chapter focus: **Chapter 8: Team Operations and Governance** +- system context: **Kiro Tutorial** +- objective: move from surface-level usage to repeatable engineering operation + +### Architecture Decomposition + +1. 
Define the runtime boundary for `Chapter 8: Team Operations and Governance` — the `.kiro/` directory as the shared configuration contract, the PR review process as the governance gate, and AWS IAM Autopilot as the infrastructure automation layer. +2. Separate control-plane decisions (PR review policies, ownership assignments, Power configurations) from data-plane execution (agent task runs, IAM analysis, hook executions). +3. Capture input contracts: team configuration in `.kiro/`, developer workstations running Kiro, AWS account ID and CloudTrail access for IAM Autopilot. +4. Trace state transitions: individual use → team config shared → governance review established → onboarding complete → incident response tested. +5. Identify extension hooks: additional Kiro Powers as they are released, custom shared hook libraries, organization-level steering templates. +6. Map ownership boundaries: security engineer owns `mcp.json` and `security.md` reviews; architecture lead owns `project.md` and `design.md` reviews; engineering manager owns budget configuration. +7. Specify rollback paths: revert `.kiro/` configuration via git; disable Powers individually in settings; use PR-only mode for IAM Autopilot to prevent direct changes. +8. Track observability signals: PR cycle time for `.kiro/` changes, autonomous agent incident rate, IAM Autopilot PR merge rate, team onboarding completion rate. 
+ +### Operator Decision Matrix + +| Decision Area | Low-Risk Path | High-Control Path | Tradeoff | +|:--------------|:--------------|:------------------|:---------| +| Spec ownership | developer-owned specs | product owner sign-off on requirements.md | velocity vs alignment | +| Steering governance | any developer edits | architect + security sign-off | speed vs policy integrity | +| IAM Autopilot mode | PR-only, never direct apply | pr-only with security alert on escalation | automation vs safety | +| Onboarding approach | self-service with docs | guided session with tech lead | scale vs quality | +| Incident response | informal revert + fix | structured runbook with postmortem | effort vs learning | + +### Failure Modes and Countermeasures + +| Failure Mode | Early Signal | Root Cause Pattern | Countermeasure | +|:-------------|:-------------|:-------------------|:---------------| +| steering drift | agent behavior inconsistency across developers | no review process for steering changes | require PR review for all steering file changes | +| spec sprawl | many incomplete specs with no active tasks | specs created without commitment to execution | add a "spec status" field (draft/active/complete) and review in weekly planning | +| IAM over-automation | IAM Autopilot applies changes without approval | PR-only mode not configured | enforce `"mode": "pr-only"` before enabling IAM Autopilot | +| onboarding failure | new developers cannot get Kiro working in first session | incomplete env setup docs | add `.env.example` and a setup verification checklist to the onboarding guide | +| incident escalation | autonomous agent incident affects shared staging environment | no incident response protocol | implement the 13-step runbook before enabling autonomous mode in shared environments | +| Power scope creep | IAM Autopilot analyzes roles outside the defined scope | missing `targetRoles` configuration | always configure `targetRoles` to restrict analysis to known role 
patterns | + +### Implementation Runbook + +1. Commit the team's `.kiro/` configuration directory to the shared repository with a clear README. +2. Define the PR review policy for each `.kiro/` subdirectory and add it to the contributing guide. +3. Assign named owners for `security.md`, `project.md`, and `mcp.json` with documented approval authority. +4. Configure AWS IAM Autopilot in `settings.json` with PR-only mode and `targetRoles` restrictions. +5. Run the team onboarding checklist with the first cohort of developers and collect feedback. +6. Test the autonomous agent incident runbook with a controlled test scenario before enabling full autonomy in shared environments. +7. Establish a weekly `.kiro/` configuration review as part of the team's engineering meeting. +8. Set up a Slack or Teams channel for Kiro-related alerts from budget thresholds and IAM Autopilot escalations. +9. Schedule a quarterly Kiro governance review to assess the effectiveness of the PR policies and onboarding process. 
+ +### Quality Gate Checklist + +- [ ] `.kiro/` directory is committed to version control with a README and owner assignments +- [ ] PR review policy is documented in the contributing guide for all `.kiro/` subdirectories +- [ ] named owners are assigned for security.md, project.md, and mcp.json reviews +- [ ] AWS IAM Autopilot is configured with `"mode": "pr-only"` and `targetRoles` restrictions +- [ ] team onboarding checklist is complete for all active developers +- [ ] autonomous agent incident runbook is tested with a controlled scenario +- [ ] budget alerts are configured and the notification channel is verified +- [ ] quarterly governance review is scheduled on the team engineering calendar + +### Source Alignment + +- [Kiro Docs: Team Setup](https://kiro.dev/docs/team) +- [Kiro Docs: Powers](https://kiro.dev/docs/powers) +- [Kiro Docs: AWS IAM Autopilot](https://kiro.dev/docs/powers/iam-autopilot) +- [Kiro Docs: Governance](https://kiro.dev/docs/governance) +- [Kiro Repository](https://github.com/kirodotdev/Kiro) + +### Cross-Tutorial Connection Map + +- [Claude Code Tutorial](../claude-code-tutorial/) +- [Goose Tutorial](../goose-tutorial/) +- [OpenHands Tutorial](../openhands-tutorial/) +- [HumanLayer Tutorial](../humanlayer-tutorial/) +- [Chapter 1: Getting Started](01-getting-started.md) + +### Advanced Practice Exercises + +1. Design a complete governance model for a 10-person team: ownership assignments, PR review policy, and escalation path for each `.kiro/` subdirectory. +2. Run the autonomous agent incident runbook as a tabletop exercise with the team: simulate an agent that modifies the wrong database migration and practice the recovery steps. +3. Configure AWS IAM Autopilot for a test AWS account with a narrow `targetRoles` filter and validate that it generates a PR without applying changes directly. +4. 
Write a team onboarding guide that covers install, auth, env setup, and first spec creation in under 45 minutes for a developer who has never used Kiro. +5. Propose and document a Kiro Powers roadmap for your team: which future Powers (beyond IAM Autopilot) would provide the highest value for your AWS-based infrastructure? + +### Review Questions + +1. Why should `.kiro/settings.json` and `.kiro/mcp.json` require security engineer approval in the PR review policy? +2. What is the most important safety configuration for AWS IAM Autopilot before enabling it in a production AWS account? +3. What tradeoff did you make between autonomous agent efficiency and oversight in shared staging environments? +4. How would you recover if a steering file change was merged without the required security review and the change weakened an authentication policy? +5. What must be tested before enabling full autonomous mode for a team's shared feature work environment? + +### Scenario Playbook 1: Team Operations - Steering Drift + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: different developers receive inconsistent agent behavior because steering files were edited without review +- initial hypothesis: steering file changes are being merged without the required architecture and security reviews +- immediate action: audit the git log for recent steering file changes and identify any that bypassed the review policy +- engineering control: add a CODEOWNERS file that requires specific reviewers for `.kiro/steering/` changes +- verification target: the next steering file PR is blocked until the designated reviewers approve +- rollback trigger: if inconsistent agent behavior affects a production feature, revert the steering file to the last reviewed commit +- communication step: notify the team of the CODEOWNERS addition and update the contributing guide +- learning capture: add steering file governance to the team's quarterly engineering 
review agenda + +### Scenario Playbook 2: Team Operations - IAM Autopilot Scope Creep + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: IAM Autopilot generates PRs targeting IAM roles outside the configured `targetRoles` filter +- initial hypothesis: the `targetRoles` pattern is too broad or a new role was created that matches the pattern unexpectedly +- immediate action: review the generated PRs and close any that target out-of-scope roles without merging +- engineering control: narrow the `targetRoles` pattern and add a specific exclusion list for known out-of-scope roles +- verification target: IAM Autopilot generates PRs only for roles matching the intended pattern after the config update +- rollback trigger: if scope creep continues, disable IAM Autopilot and conduct a manual configuration audit +- communication step: notify the security team of the scope creep and the config change made to remediate it +- learning capture: add a quarterly review of the `targetRoles` filter to the team's IAM governance calendar + +### Scenario Playbook 3: Team Operations - Autonomous Agent Incident in Staging + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: autonomous agent modifies a shared database migration in the staging environment causing test failures for other developers +- initial hypothesis: the agent executed a task with scope that included the shared migrations directory +- immediate action: interrupt the agent, revert the migration change using git, and notify the team +- engineering control: add an explicit scope exclusion for shared migration directories in the task description template +- verification target: re-run the task with the scope exclusion and confirm no shared files are modified +- rollback trigger: if the migration was already applied to the staging database, run the down migration and restore from backup +- communication step: follow the 13-step incident 
runbook; notify the team in the incident channel within 5 minutes +- learning capture: add "never modify shared migration files autonomously" as a rule in the task generation guidelines + +### Scenario Playbook 4: Team Operations - Onboarding Failure + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: a new developer cannot get Kiro working after 2 hours because the onboarding guide is incomplete +- initial hypothesis: the `.env.example` file is missing or the MCP server setup steps are not documented +- immediate action: pair the new developer with a senior team member to complete the setup and document each missing step +- engineering control: update the onboarding guide with the missing steps and add a setup verification checklist +- verification target: the next new developer completes onboarding independently in under 45 minutes +- rollback trigger: if the onboarding guide update does not resolve the issue, schedule a group onboarding session for the next cohort +- communication step: announce the onboarding guide update in the team channel and ask the new developer to confirm it worked +- learning capture: add onboarding guide review to the pre-release checklist for every Kiro version upgrade + +### Scenario Playbook 5: Team Operations - Spec Sprawl + +- tutorial context: **Kiro Tutorial: Spec-Driven Agentic IDE from AWS** +- trigger condition: `.kiro/specs/` contains 20+ spec directories with no active task execution, indicating abandoned or stale specs +- initial hypothesis: specs are being created as planning artifacts but never reaching the task execution phase +- immediate action: conduct a spec audit: categorize each spec as active, on-hold, or abandoned and add status labels +- engineering control: add a "spec status" field to the spec README template and require status updates in weekly planning +- verification target: after one sprint, each spec has a clear status and a responsible owner +- rollback 
trigger: if spec debt continues to grow, implement a spec age limit: any spec older than 30 days without task activity is automatically archived
+- communication step: present the spec audit results in the next team planning session
+- learning capture: add spec lifecycle management to the team's Kiro governance document
+
+## What Problem Does This Solve?
+
+Agentic tools that work brilliantly for individual developers often fail catastrophically at team scale. Without governance, steering files drift, specs accumulate without execution, autonomous agents operate without safety boundaries, and costs grow without visibility. Kiro's team operations model solves this by making governance artifacts first-class citizens: version-controlled, reviewer-assigned, and incident-runbook-backed.
+
+In practical terms, this chapter helps you avoid three common failures:
+
+- an autonomous agent modifying shared infrastructure because nobody defined the scope boundary for team environments
+- security policy regressions when a well-meaning developer edits `security.md` without a security review
+- AWS IAM Autopilot applying changes directly to a production account because PR-only mode was not configured before enabling the Power
+
+After working through this chapter, you should be able to operate Kiro as a governed team tool, applying the same rigor to AI configuration artifacts that you apply to production infrastructure code.
+
+## How It Works Under the Hood
+
+Under the hood, `Chapter 8: Team Operations and Governance` follows a repeatable control path:
+
+1. **Configuration sharing**: the `.kiro/` directory is committed to version control and distributed to all developer workstations via git pull.
+2. **CODEOWNERS enforcement**: GitHub or GitLab CODEOWNERS rules block merges to `.kiro/` subdirectories without designated reviewer approval.
+3. 
**Power activation**: when a Power like IAM Autopilot is enabled in `settings.json`, Kiro connects to the corresponding AWS service using the configured credentials. +4. **IAM analysis**: IAM Autopilot queries CloudTrail logs to identify permission usage patterns and generates a least-privilege policy recommendation. +5. **PR creation**: instead of applying changes, IAM Autopilot creates a PR in the configured infrastructure repository for human review. +6. **Incident response**: when an agent incident occurs, the runbook provides a structured 13-step recovery and learning process. + +When debugging governance issues, trace this sequence from configuration sharing through CODEOWNERS enforcement to identify where the policy gap occurred. + +## Source Walkthrough + +Use the following upstream sources to verify implementation details while reading this chapter: + +- [Kiro Docs: Team Setup](https://kiro.dev/docs/team) + Why it matters: the official guide for team-scale Kiro configuration and shared workspace setup. +- [Kiro Docs: Powers](https://kiro.dev/docs/powers) + Why it matters: the primary reference for the Powers extension model and available Power configurations. +- [Kiro Docs: AWS IAM Autopilot](https://kiro.dev/docs/powers/iam-autopilot) + Why it matters: the detailed reference for IAM Autopilot configuration, safety controls, and CloudTrail integration. +- [Kiro Docs: Governance](https://kiro.dev/docs/governance) + Why it matters: documents Kiro's recommended governance practices for enterprise team deployments. 
+
+Suggested trace strategy:
+- review the Powers docs before enabling any Power to understand the exact AWS permissions required by the Power
+- test IAM Autopilot in a sandbox AWS account before enabling it in a production account to verify PR-only mode works as expected
+
+## Chapter Connections
+
+- [Tutorial Index](index.md)
+- [Previous Chapter: Chapter 7: Multi-Model Strategy and Providers](07-multi-model-strategy-and-providers.md)
+- [First Chapter: Getting Started](01-getting-started.md)
+- [Main Catalog](../../README.md#-tutorial-catalog)
+- [A-Z Tutorial Directory](../../discoverability/tutorial-directory.md)
diff --git a/tutorials/kiro-tutorial/index.md b/tutorials/kiro-tutorial/index.md
new file mode 100644
index 00000000..8c977e05
--- /dev/null
+++ b/tutorials/kiro-tutorial/index.md
@@ -0,0 +1,113 @@
+---
+layout: default
+title: "Kiro Tutorial"
+nav_order: 192
+has_children: true
+format_version: v2
+---
+
+# Kiro Tutorial: Spec-Driven Agentic IDE from AWS
+
+> Learn how to use `kirodotdev/Kiro` for structured AI-powered development with spec-driven workflows, agent steering, event-driven automation, and AWS-native integrations.
+
+[![GitHub Repo](https://img.shields.io/badge/GitHub-kirodotdev%2FKiro-black?logo=github)](https://github.com/kirodotdev/Kiro)
+[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/kirodotdev/Kiro/blob/main/LICENSE)
+[![Docs](https://img.shields.io/badge/docs-kiro.dev-blue)](https://kiro.dev)
+
+## Why This Track Matters
+
+Kiro is AWS's agentic IDE built on VS Code and launched in July 2025. It introduces a structured, spec-driven development model where AI assistance is organized around requirements, design documents, and task lists rather than freeform chat.
+ +This track focuses on: + +- setting up Kiro and authenticating with GitHub, Google, or AWS Builder ID +- mastering the spec-driven development workflow using EARS syntax +- configuring agent steering and custom rules for repeatable AI behavior +- running autonomous agents for multi-step task delegation +- connecting external data sources with MCP and managing event-driven hooks +- governing multi-model strategy and team-scale deployment on AWS + +## Current Snapshot (auto-updated) + +- repository: [`kirodotdev/Kiro`](https://github.com/kirodotdev/Kiro) +- stars: about **1.8k** +- launched: **July 2025** by AWS +- project positioning: AWS-backed spec-driven agentic IDE for structured AI engineering + +## Mental Model + +```mermaid +flowchart LR + A[Developer Intent] --> B[Spec Files: requirements, design, tasks] + B --> C[Agent Steering: .kiro/steering/] + C --> D[Autonomous Agent Execution] + D --> E[Hooks and Automation Triggers] + E --> F[MCP External Tool Integration] + F --> G[AWS-Native Governance and Powers] +``` + +## Chapter Guide + +| Chapter | Key Question | Outcome | +|:--------|:-------------|:--------| +| [01 - Getting Started](01-getting-started.md) | How do I download, authenticate, and start my first project? | Working Kiro baseline | +| [02 - Spec-Driven Development Workflow](02-spec-driven-development-workflow.md) | How do I use requirements, design, and task files effectively? | Repeatable spec workflow | +| [03 - Agent Steering and Rules Configuration](03-agent-steering-and-rules-configuration.md) | How do I guide AI behavior with steering files and rules? | Consistent agent behavior | +| [04 - Autonomous Agent Mode](04-autonomous-agent-mode.md) | How do I delegate multi-step tasks to autonomous agents? | Safe full-autonomy delegation | +| [05 - MCP Integration and External Tools](05-mcp-integration-and-external-tools.md) | How do I connect external data sources with MCP servers? 
| Extended tool capabilities | +| [06 - Hooks and Automation](06-hooks-and-automation.md) | How do I trigger automation on file events and task completions? | Event-driven workflow automation | +| [07 - Multi-Model Strategy and Providers](07-multi-model-strategy-and-providers.md) | How do I configure Claude Sonnet and manage model routing? | Reliable model configuration | +| [08 - Team Operations and Governance](08-team-operations-and-governance.md) | How do I deploy Kiro at team scale with AWS governance? | Production-ready team operations | + +## What You Will Learn + +- how to install and authenticate Kiro on Mac, Windows, or Linux +- how the spec-driven workflow organizes AI assistance into structured documents +- how agent steering files shape repeatable, predictable AI behavior +- how to delegate multi-step tasks to autonomous agents with clear safety boundaries +- how MCP servers connect Kiro to external data sources and APIs +- how hooks enable event-driven automation on file saves, test results, and custom triggers +- how to configure Claude Sonnet 4.0 and 3.7 for different task profiles +- how to govern Kiro deployments at team scale using AWS IAM Autopilot and related powers + +## Source References + +- [Kiro Repository](https://github.com/kirodotdev/Kiro) +- [Kiro Website](https://kiro.dev) +- [Kiro Docs](https://kiro.dev/docs) +- [Kiro Specs Guide](https://kiro.dev/docs/specs) +- [Kiro Steering Guide](https://kiro.dev/docs/steering) +- [Kiro Hooks Guide](https://kiro.dev/docs/hooks) +- [Kiro MCP Guide](https://kiro.dev/docs/mcp) + +## Related Tutorials + +- [Cline Tutorial](../cline-tutorial/) +- [Roo Code Tutorial](../roo-code-tutorial/) +- [Claude Code Tutorial](../claude-code-tutorial/) +- [OpenCode Tutorial](../opencode-tutorial/) + +--- + +Start with [Chapter 1: Getting Started](01-getting-started.md). 
+ +## Navigation & Backlinks + +- [Start Here: Chapter 1: Getting Started](01-getting-started.md) +- [Back to Main Catalog](../../README.md#-tutorial-catalog) +- [Browse A-Z Tutorial Directory](../../discoverability/tutorial-directory.md) +- [Search by Intent](../../discoverability/query-hub.md) +- [Explore Category Hubs](../../README.md#category-hubs) + +## Full Chapter Map + +1. [Chapter 1: Getting Started](01-getting-started.md) +2. [Chapter 2: Spec-Driven Development Workflow](02-spec-driven-development-workflow.md) +3. [Chapter 3: Agent Steering and Rules Configuration](03-agent-steering-and-rules-configuration.md) +4. [Chapter 4: Autonomous Agent Mode](04-autonomous-agent-mode.md) +5. [Chapter 5: MCP Integration and External Tools](05-mcp-integration-and-external-tools.md) +6. [Chapter 6: Hooks and Automation](06-hooks-and-automation.md) +7. [Chapter 7: Multi-Model Strategy and Providers](07-multi-model-strategy-and-providers.md) +8. [Chapter 8: Team Operations and Governance](08-team-operations-and-governance.md) + +*Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)* From 9e795f552fba67729e9b45413c38f57894c13126 Mon Sep 17 00:00:00 2001 From: johnxie Date: Tue, 24 Feb 2026 14:26:24 -0800 Subject: [PATCH 2/3] docs: refresh tutorial manifest after pack-by import (191 tutorials) Co-Authored-By: Claude Sonnet 4.6 --- tutorials/tutorial-manifest.json | 61 ++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/tutorials/tutorial-manifest.json b/tutorials/tutorial-manifest.json index abcae126..b108a5d7 100644 --- a/tutorials/tutorial-manifest.json +++ b/tutorials/tutorial-manifest.json @@ -3,9 +3,9 @@ "docs_only": 0, "index_only": 0, "mixed": 0, - "root_only": 188 + "root_only": 191 }, - "tutorial_count": 188, + "tutorial_count": 191, "tutorials": [ { "chapter_numbers": [ @@ -350,6 +350,25 @@ "top_level_chapter_count": 8, "total_numbered_chapter_count": 8 }, + { + 
"chapter_numbers": [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08" + ], + "docs_chapter_count": 0, + "has_index": true, + "name": "babyagi-tutorial", + "path": "tutorials/babyagi-tutorial", + "structure": "root_only", + "top_level_chapter_count": 8, + "total_numbered_chapter_count": 8 + }, { "chapter_numbers": [ "01", @@ -1034,6 +1053,25 @@ "top_level_chapter_count": 8, "total_numbered_chapter_count": 8 }, + { + "chapter_numbers": [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08" + ], + "docs_chapter_count": 0, + "has_index": true, + "name": "devika-tutorial", + "path": "tutorials/devika-tutorial", + "structure": "root_only", + "top_level_chapter_count": 8, + "total_numbered_chapter_count": 8 + }, { "chapter_numbers": [ "01", @@ -1528,6 +1566,25 @@ "top_level_chapter_count": 8, "total_numbered_chapter_count": 8 }, + { + "chapter_numbers": [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08" + ], + "docs_chapter_count": 0, + "has_index": true, + "name": "kiro-tutorial", + "path": "tutorials/kiro-tutorial", + "structure": "root_only", + "top_level_chapter_count": 8, + "total_numbered_chapter_count": 8 + }, { "chapter_numbers": [ "01", From c5c7153cbf4339c904f9b2afdfe3e5ddb96c3e3d Mon Sep 17 00:00:00 2001 From: johnxie Date: Tue, 24 Feb 2026 14:27:57 -0800 Subject: [PATCH 3/3] docs: refresh repo status docs after pack-by import Co-Authored-By: Claude Sonnet 4.6 --- CONTENT_GAPS_ANALYSIS.md | 4 ++-- TUTORIAL_STRUCTURE.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTENT_GAPS_ANALYSIS.md b/CONTENT_GAPS_ANALYSIS.md index e08618b6..57b98796 100644 --- a/CONTENT_GAPS_ANALYSIS.md +++ b/CONTENT_GAPS_ANALYSIS.md @@ -6,8 +6,8 @@ This document tracks structural and quality gaps that impact completeness and di | Metric | Value | |:-------|:------| -| Tutorial directories | 188 | -| Tutorials with exactly 8 numbered chapters | 185 | +| Tutorial directories | 191 | +| Tutorials with exactly 8 numbered chapters 
| 188 | | Tutorials with >8 numbered chapters | 3 | | Tutorials with 0 numbered chapters | 0 | | Tutorials with partial chapter coverage (1-7) | 0 | diff --git a/TUTORIAL_STRUCTURE.md b/TUTORIAL_STRUCTURE.md index 994b16e8..de76880a 100644 --- a/TUTORIAL_STRUCTURE.md +++ b/TUTORIAL_STRUCTURE.md @@ -17,7 +17,7 @@ tutorials// | Pattern | Count | |:--------|:------| -| `root_only` | 188 | +| `root_only` | 191 | | `docs_only` | 0 | | `index_only` | 0 | | `mixed` | 0 |