Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions pkg/create/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const (
TemplateOpenAGIComputerUse = "openagi-computer-use"
TemplateClaudeAgentSDK = "claude-agent-sdk"
TemplateYutoriComputerUse = "yutori"
TemplateTzafonComputerUse = "tzafon"
)

type TemplateInfo struct {
Expand Down Expand Up @@ -90,6 +91,11 @@ var Templates = map[string]TemplateInfo{
Description: "Implements a Yutori n1 computer use agent",
Languages: []string{LanguageTypeScript, LanguagePython},
},
TemplateTzafonComputerUse: {
Name: "Tzafon Northstar Computer Use",
Description: "Implements a Tzafon Northstar CUA Fast computer use agent",
Languages: []string{LanguageTypeScript, LanguagePython},
},
}

// GetSupportedTemplatesForLanguage returns a list of all supported template names for a given language
Expand All @@ -116,6 +122,8 @@ func GetSupportedTemplatesForLanguage(language string) TemplateKeyValues {
return 2
case TemplateYutoriComputerUse:
return 3
case TemplateTzafonComputerUse:
return 4
default:
return 10
}
Expand Down Expand Up @@ -213,6 +221,11 @@ var Commands = map[string]map[string]DeployConfig{
NeedsEnvFile: true,
InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`,
},
TemplateTzafonComputerUse: {
EntryPoint: "index.ts",
NeedsEnvFile: true,
InvokeCommand: `kernel invoke ts-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'`,
},
},
LanguagePython: {
TemplateSampleApp: {
Expand Down Expand Up @@ -260,6 +273,11 @@ var Commands = map[string]map[string]DeployConfig{
NeedsEnvFile: true,
InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`,
},
TemplateTzafonComputerUse: {
EntryPoint: "main.py",
NeedsEnvFile: true,
InvokeCommand: `kernel invoke python-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'`,
},
},
}

Expand Down
1 change: 1 addition & 0 deletions pkg/templates/python/tzafon/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
TZAFON_API_KEY=your-tzafon-api-key
57 changes: 57 additions & 0 deletions pkg/templates/python/tzafon/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Kernel Python Sample App - Tzafon Northstar Computer Use

This is a Kernel application that implements a CUA (computer use agent) loop using Tzafon's Northstar CUA Fast model with Kernel's Computer Controls API. The model is accessed via Tzafon's [Lightcone](https://docs.lightcone.ai) API platform.

[Northstar CUA Fast](https://docs.lightcone.ai) is a vision language model trained with reinforcement learning for computer use tasks.

## Setup

1. Get your API keys:
- **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com)
- **Tzafon**: [tzafon.ai](https://www.tzafon.ai)

2. Deploy the app:
```bash
kernel login
cp .env.example .env # Add your TZAFON_API_KEY
kernel deploy main.py --env-file .env
```

## Usage

```bash
kernel invoke python-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'
```

## Recording Replays

> **Note:** Replay recording is only available to Kernel users on paid plans.

Add `"record_replay": true` to your payload to capture a video of the browser session:

```bash
kernel invoke python-tzafon-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}'
```

When enabled, the response will include a `replay_url` field with a link to view the recorded session.

## Viewport Configuration

Northstar CUA Fast works well with a **1280x800** viewport, which is the default.

## Supported Actions

| Action | Description |
|--------|-------------|
| `click` | Left or right mouse click at coordinates |
| `double_click` | Double-click at coordinates |
| `point_and_type` | Click at coordinates then type text (with optional Enter) |
| `key` | Press key combo (e.g. `Enter`, `ctrl+a`) |
| `scroll` | Scroll at coordinates |
| `drag` | Click-and-drag from start to end coordinates |
| `done` | Signal task completion with a result summary |

## Resources

- [Lightcone API Documentation](https://docs.lightcone.ai)
- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
7 changes: 7 additions & 0 deletions pkg/templates/python/tzafon/_gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
__pycache__/
*.py[cod]
*$py.class
.env
*.log
.venv/
venv/
230 changes: 230 additions & 0 deletions pkg/templates/python/tzafon/loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
"""
Tzafon Northstar Sampling Loop

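Runs the Northstar CUA model via the Lightcone Responses API using explicit
function tools (click, double_click, point_and_type, key, scroll, drag, done).
Conversation history is kept in the input array and trimmed to the task prompt
plus the most recent items once it grows large. Each successful tool call is
followed by a fresh screenshot so the model always sees the current screen
state; images in older screenshot messages are removed to limit payload size.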

@see https://docs.lightcone.ai
"""

import asyncio
import json
from typing import Any
from kernel import Kernel
from tzafon import Lightcone

from tools import ComputerTool

# Model identifier passed to the Lightcone Responses API by default.
MODEL = "tzafon.northstar-cua-fast"

# Guidance sent as the `instructions` field of each request. Kept as one
# entry per guideline; every entry becomes its own newline-terminated line.
_INSTRUCTION_LINES = (
    "Use a mouse and keyboard to interact with a Chromium browser and take screenshots.",
    "* Chromium is already open on a Kernel cloud browser. If a startup wizard appears, ignore it.",
    "* The screen's coordinate space is a 0-999 grid.",
    "* To navigate to a URL, use point_and_type on the address bar, or key('ctrl+l') to focus it first.",
    "* Some pages may take time to load. Wait and take successive screenshots to confirm the result.",
    "* Whenever you click on an element, consult the screenshot to determine coordinates first.",
    "* Click buttons, links, and icons in the center of the element, not on edges.",
    "* If a click didn't work, try adjusting the coordinates slightly.",
    "* For full-page scrolling, prefer key('PageDown') / key('PageUp') over the scroll tool.",
    "* After each action, evaluate the screenshot to confirm it succeeded before moving on.",
    "* When the task is complete, call done() with a summary of what you found or accomplished.",
)

INSTRUCTIONS = "".join(f"{line}\n" for line in _INSTRUCTION_LINES)

def _grid_coord(label: str) -> dict[str, str]:
    """Return the schema for one integer coordinate in the 0-999 grid."""
    return {"type": "integer", "description": f"{label} in 0-999 grid"}


def _function_tool(
    name: str,
    description: str,
    properties: dict[str, Any],
    required: list[str],
) -> dict[str, Any]:
    """Return one function-tool definition with an object parameter schema."""
    return {
        "type": "function",
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


# Function tools offered to the model. All coordinates use the 0-999 grid.
TOOLS = [
    _function_tool(
        "click",
        "Single click at (x, y) in 0-999 grid.",
        {
            "x": _grid_coord("X"),
            "y": _grid_coord("Y"),
            "button": {"type": "string", "enum": ["left", "right"]},
        },
        ["x", "y"],
    ),
    _function_tool(
        "double_click",
        "Double click at (x, y) in 0-999 grid.",
        {"x": _grid_coord("X"), "y": _grid_coord("Y")},
        ["x", "y"],
    ),
    _function_tool(
        "point_and_type",
        "Click at position then type text. For input fields, search bars, address bars.",
        {
            "x": _grid_coord("X"),
            "y": _grid_coord("Y"),
            "text": {"type": "string"},
            "press_enter": {"type": "boolean", "description": "Press Enter after typing"},
        },
        ["x", "y", "text"],
    ),
    _function_tool(
        "key",
        "Press key combo (e.g. 'Enter', 'ctrl+a', 'Tab').",
        {"keys": {"type": "string"}},
        ["keys"],
    ),
    _function_tool(
        "scroll",
        "Scroll at (x, y) in 0-999 grid. Positive dy = down, negative = up.",
        {
            "x": _grid_coord("X"),
            "y": _grid_coord("Y"),
            "dy": {"type": "integer", "description": "Scroll notches. 3=down, -3=up."},
        },
        ["x", "y", "dy"],
    ),
    _function_tool(
        "drag",
        "Drag from (x1, y1) to (x2, y2) in 0-999 grid.",
        {
            "x1": _grid_coord("Start X"),
            "y1": _grid_coord("Start Y"),
            "x2": _grid_coord("End X"),
            "y2": _grid_coord("End Y"),
        },
        ["x1", "y1", "x2", "y2"],
    ),
    _function_tool(
        "done",
        "Task complete. Report findings.",
        {"result": {"type": "string"}},
        ["result"],
    ),
]


def _img(screenshot_url: str, text: str = "screenshot") -> dict:
return {
"role": "user",
"content": [
{"type": "input_text", "text": text},
{"type": "input_image", "image_url": screenshot_url, "detail": "auto"},
],
}


def _parse_tool_args(raw_args: Any) -> dict[str, Any]:
    """Decode a function call's arguments, falling back to an empty dict.

    Strings are parsed as JSON; other values are used as-is. Anything that
    fails to parse, or that is not a JSON object (e.g. ``null`` or a list),
    yields ``{}`` so callers can always use ``.get``.
    """
    try:
        parsed = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
    except (json.JSONDecodeError, TypeError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _message_texts(item: Any) -> list[str]:
    """Return the non-empty text of each content block of one message item.

    Blocks that have no ``text`` attribute are skipped instead of raising.
    """
    return [
        text
        for block in item.content or []
        if (text := getattr(block, "text", None))
    ]


def _trim_history(
    items: list[Any],
    max_items: int = 30,
    keep_head: int = 2,
    keep_tail: int = 20,
) -> list[Any]:
    """Bound the history to its first ``keep_head`` and last ``keep_tail`` items.

    Nothing is changed while ``items`` has at most ``max_items`` entries.
    After slicing, any ``function_call`` or ``function_call_output`` whose
    partner (same ``call_id``) was cut away is dropped as well, so the
    history never contains half of a tool-call pair.
    """
    if len(items) <= max_items:
        return items

    kept = items[:keep_head] + items[-keep_tail:]

    def call_ids(kind: str) -> set[Any]:
        return {
            entry.get("call_id")
            for entry in kept
            if isinstance(entry, dict) and entry.get("type") == kind
        }

    # NOTE(review): OpenAI-style Responses APIs reject a call without its
    # output (and vice versa) — confirm Lightcone enforces the same rule.
    paired = call_ids("function_call") & call_ids("function_call_output")
    tool_kinds = ("function_call", "function_call_output")
    return [
        entry
        for entry in kept
        if not (
            isinstance(entry, dict)
            and entry.get("type") in tool_kinds
            and entry.get("call_id") not in paired
        )
    ]


def _strip_old_screenshots(items: list[Any]) -> None:
    """Remove image parts, in place, from every item except the last one.

    An item whose content list becomes empty gets the placeholder string
    ``"(old screenshot)"`` instead.
    """
    for entry in items[:-1]:
        content = entry.get("content") if isinstance(entry, dict) else None
        if not isinstance(content, list):
            continue
        remaining = [
            part
            for part in content
            if not (isinstance(part, dict) and part.get("type") == "input_image")
        ]
        if len(remaining) != len(content):
            entry["content"] = remaining or "(old screenshot)"


async def sampling_loop(
    *,
    task: str,
    api_key: str,
    kernel: Kernel,
    session_id: str,
    model: str = MODEL,
    max_steps: int = 50,
    viewport_width: int = 1280,
    viewport_height: int = 800,
) -> dict[str, Any]:
    """Run the Northstar CUA loop until the model calls done() or max steps.

    Each step sends the history to the model, records its messages and
    function calls, executes the calls through ``ComputerTool`` and appends
    a fresh screenshot after every successful action.

    Args:
        task: Task text sent to the model together with the first screenshot.
        api_key: Key used to construct the Lightcone client.
        kernel: Kernel client handed to ``ComputerTool``.
        session_id: Session identifier handed to ``ComputerTool``.
        model: Model name passed to the Responses API.
        max_steps: Maximum number of model requests before giving up.
        viewport_width: Viewport width handed to ``ComputerTool``.
        viewport_height: Viewport height handed to ``ComputerTool``.

    Returns:
        ``{"messages": [], "final_result": <result>}`` when the model calls
        ``done``; otherwise ``{"messages": <texts of the last response>,
        "final_result": None}`` once ``max_steps`` is exhausted.

    Raises:
        Errors from the Lightcone client or from screenshot capture are not
        caught here and propagate. Errors raised while executing an action
        are caught and reported back to the model as the tool output.
    """
    tzafon = Lightcone(api_key=api_key)
    computer = ComputerTool(kernel, session_id, viewport_width, viewport_height)

    screenshot_url = computer.capture_screenshot()
    items: list[Any] = [_img(screenshot_url, text=f"{task}\n\nCurrent screenshot:")]
    resp: Any = None

    for step in range(max_steps):
        print(f"\n=== Step {step + 1}/{max_steps} ===")

        # Prevent unbounded payload growth — keep the task prompt + recent history
        items = _trim_history(items)

        resp = tzafon.responses.create(
            model=model,
            input=items,
            tools=TOOLS,
            instructions=INSTRUCTIONS,
            temperature=0,
            max_output_tokens=4096,
        )

        # Record the response in output order; tool calls are executed afterwards.
        calls: list[tuple[str, str, dict[str, Any]]] = []
        for item in resp.output or []:
            if item.type == "message":
                for text in _message_texts(item):
                    items.append({"role": "assistant", "content": text})
                    print(f" Model: {text[:150]}")

            elif item.type == "function_call":
                raw_args = item.arguments or "{}"
                calls.append((item.call_id, item.name, _parse_tool_args(raw_args)))
                items.append({
                    "type": "function_call",
                    "call_id": item.call_id,
                    "name": item.name,
                    "arguments": raw_args if isinstance(raw_args, str) else json.dumps(raw_args),
                })

        # A response without tool calls still uses up a step: the model is
        # asked again with its own message appended to the history.
        if not calls:
            continue

        for call_id, name, args in calls:
            print(f" [{step + 1}] {name}({json.dumps(args)[:100]})")

            if name == "done":
                result = args.get("result", "")
                print(f" Done: {result}")
                return {"messages": [], "final_result": result}

            try:
                await computer.execute_function(name, args)
            except Exception as e:
                # Best effort: tell the model the action failed and carry on
                # with the remaining calls instead of aborting the task.
                print(f" Action failed: {e}")
                items.append({"type": "function_call_output", "call_id": call_id, "output": f"Error: {e}"})
                continue

            # Short pause before capturing — presumably lets the page settle
            # (TODO confirm).
            await asyncio.sleep(0.5)
            screenshot_url = computer.capture_screenshot()

            # Replace old screenshots with placeholders to save payload space
            _strip_old_screenshots(items)

            items.append({"type": "function_call_output", "call_id": call_id, "output": "[screenshot]"})
            items.append(_img(screenshot_url))

    # Step budget exhausted: report whatever the model said in its last response.
    messages: list[str] = []
    if resp:
        for item in resp.output or []:
            if item.type == "message":
                messages.extend(_message_texts(item))

    return {"messages": messages, "final_result": None}
Loading
Loading