Skip to content

Commit 6770380

Browse files
committed
feat: add Tzafon Northstar computer use CLI templates (ts/python)
Add new CLI templates for Tzafon's Northstar CUA Fast model, enabling users to scaffold browser automation projects using Kernel's infrastructure. New templates: - TypeScript: `kernel create --template ts-tzafon-cua` - Python: `kernel create --template python-tzafon-cua` Both templates include: - Agentic sampling loop using Tzafon's Lightcone SDK responses API - Computer tool mapping Northstar actions (click, type, scroll, drag, key, navigate, wait) to Kernel's Computer Controls API - Session management with optional replay recording - 1280x800 default viewport Files changed: - pkg/templates/typescript/tzafon-computer-use/ - TypeScript template - pkg/templates/python/tzafon-computer-use/ - Python template - pkg/create/templates.go - Template registration
1 parent 6d54518 commit 6770380

File tree

21 files changed

+1486
-0
lines changed

21 files changed

+1486
-0
lines changed

pkg/create/templates.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ const (
1919
TemplateOpenAGIComputerUse = "openagi-computer-use"
2020
TemplateClaudeAgentSDK = "claude-agent-sdk"
2121
TemplateYutoriComputerUse = "yutori"
22+
TemplateTzafonComputerUse = "tzafon-computer-use"
2223
)
2324

2425
type TemplateInfo struct {
@@ -90,6 +91,11 @@ var Templates = map[string]TemplateInfo{
9091
Description: "Implements a Yutori n1 computer use agent",
9192
Languages: []string{LanguageTypeScript, LanguagePython},
9293
},
94+
TemplateTzafonComputerUse: {
95+
Name: "Tzafon Northstar Computer Use",
96+
Description: "Implements a Tzafon Northstar CUA Fast computer use agent",
97+
Languages: []string{LanguageTypeScript, LanguagePython},
98+
},
9399
}
94100

95101
// GetSupportedTemplatesForLanguage returns a list of all supported template names for a given language
@@ -116,6 +122,8 @@ func GetSupportedTemplatesForLanguage(language string) TemplateKeyValues {
116122
return 2
117123
case TemplateYutoriComputerUse:
118124
return 3
125+
case TemplateTzafonComputerUse:
126+
return 4
119127
default:
120128
return 10
121129
}
@@ -213,6 +221,11 @@ var Commands = map[string]map[string]DeployConfig{
213221
NeedsEnvFile: true,
214222
InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`,
215223
},
224+
TemplateTzafonComputerUse: {
225+
EntryPoint: "index.ts",
226+
NeedsEnvFile: true,
227+
InvokeCommand: `kernel invoke ts-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'`,
228+
},
216229
},
217230
LanguagePython: {
218231
TemplateSampleApp: {
@@ -260,6 +273,11 @@ var Commands = map[string]map[string]DeployConfig{
260273
NeedsEnvFile: true,
261274
InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`,
262275
},
276+
TemplateTzafonComputerUse: {
277+
EntryPoint: "main.py",
278+
NeedsEnvFile: true,
279+
InvokeCommand: `kernel invoke python-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'`,
280+
},
263281
},
264282
}
265283

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
TZAFON_API_KEY=your-tzafon-api-key
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Kernel Python Sample App - Tzafon Northstar Computer Use
2+
3+
This is a Kernel application that implements a CUA (computer use agent) loop using Tzafon's Northstar CUA Fast model with Kernel's Computer Controls API. The model is accessed via Tzafon's [Lightcone](https://docs.lightcone.ai) API platform.
4+
5+
[Northstar CUA Fast](https://docs.lightcone.ai) is a vision language model trained with reinforcement learning for computer use tasks.
6+
7+
## Setup
8+
9+
1. Get your API keys:
10+
- **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com)
11+
- **Tzafon**: [tzafon.ai](https://www.tzafon.ai)
12+
13+
2. Deploy the app:
14+
```bash
15+
kernel login
16+
cp .env.example .env # Add your TZAFON_API_KEY
17+
kernel deploy main.py --env-file .env
18+
```
19+
20+
## Usage
21+
22+
```bash
23+
kernel invoke python-tzafon-cua cua-task --payload '{"query": "Go to wikipedia.org and search for Alan Turing"}'
24+
```
25+
26+
## Recording Replays
27+
28+
> **Note:** Replay recording is only available to Kernel users on paid plans.
29+
30+
Add `"record_replay": true` to your payload to capture a video of the browser session:
31+
32+
```bash
33+
kernel invoke python-tzafon-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}'
34+
```
35+
36+
When enabled, the response will include a `replay_url` field with a link to view the recorded session.
37+
38+
## Viewport Configuration
39+
40+
Northstar CUA Fast works well with a **1280x800** viewport, which is the default.
41+
42+
## Supported Actions
43+
44+
| Action | Description |
45+
|--------|-------------|
46+
| `click` | Left or right mouse click at coordinates |
47+
| `double_click` | Double-click at coordinates |
48+
| `point_and_type` | Click at coordinates then type text (with optional Enter) |
49+
| `key` | Press key combo (e.g. `Enter`, `ctrl+a`) |
50+
| `scroll` | Scroll at coordinates |
51+
| `drag` | Click-and-drag from start to end coordinates |
52+
| `done` | Signal task completion with a result summary |
53+
54+
## Resources
55+
56+
- [Lightcone API Documentation](https://docs.lightcone.ai)
57+
- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
__pycache__/
2+
*.py[cod]
3+
*$py.class
4+
.env
5+
*.log
6+
.venv/
7+
venv/
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
"""
2+
Tzafon Northstar Sampling Loop
3+
4+
Runs the Northstar CUA model via the Lightcone Responses API using explicit
5+
function tools (click, double_click, point_and_type, key, scroll, drag, done). Full conversation
6+
history is maintained in the input array — each tool result includes a fresh
7+
screenshot so the model always sees the current screen state.
8+
9+
@see https://docs.lightcone.ai
10+
"""
11+
12+
import asyncio
13+
import json
14+
from typing import Any, Optional
15+
16+
from kernel import Kernel
17+
from tzafon import Lightcone
18+
19+
from tools import ComputerTool
20+
21+
# Default model identifier passed to the Responses API.
MODEL = "tzafon.northstar-cua-fast"


def _grid_coord(description: str) -> dict:
    """Return the JSON-schema fragment for one integer coordinate."""
    return {"type": "integer", "description": description}


def _xy() -> dict:
    """Return fresh ``x``/``y`` coordinate properties (a new dict per call, never shared)."""
    return {
        "x": _grid_coord("X in 0-999 grid"),
        "y": _grid_coord("Y in 0-999 grid"),
    }


def _function_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Assemble one function-tool definition with an object-typed parameter schema."""
    return {
        "type": "function",
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


# Function tools offered to the model. Coordinates are described to the model
# as positions on a 0-999 grid (see each tool's description string).
TOOLS = [
    _function_tool(
        "click",
        "Single click at (x, y) in 0-999 grid.",
        {**_xy(), "button": {"type": "string", "enum": ["left", "right"]}},
        ["x", "y"],
    ),
    _function_tool(
        "double_click",
        "Double click at (x, y) in 0-999 grid.",
        _xy(),
        ["x", "y"],
    ),
    _function_tool(
        "point_and_type",
        "Click at position then type text. For input fields, search bars, address bars.",
        {
            **_xy(),
            "text": {"type": "string"},
            "press_enter": {"type": "boolean", "description": "Press Enter after typing"},
        },
        ["x", "y", "text"],
    ),
    _function_tool(
        "key",
        "Press key combo (e.g. 'Enter', 'ctrl+a', 'Tab').",
        {"keys": {"type": "string"}},
        ["keys"],
    ),
    _function_tool(
        "scroll",
        "Scroll at (x, y) in 0-999 grid. Positive dy = down, negative = up.",
        {
            **_xy(),
            "dy": {"type": "integer", "description": "Scroll amount. 300=down, -300=up."},
        },
        ["x", "y", "dy"],
    ),
    _function_tool(
        "drag",
        "Drag from (x1, y1) to (x2, y2) in 0-999 grid.",
        {
            "x1": _grid_coord("Start X in 0-999 grid"),
            "y1": _grid_coord("Start Y in 0-999 grid"),
            "x2": _grid_coord("End X in 0-999 grid"),
            "y2": _grid_coord("End Y in 0-999 grid"),
        },
        ["x1", "y1", "x2", "y2"],
    ),
    _function_tool(
        "done",
        "Task complete. Report findings.",
        {"result": {"type": "string"}},
        ["result"],
    ),
]
109+
110+
111+
def _get(obj: Any, key: str, default: Any = None) -> Any:
112+
"""Attribute-or-key access — the Lightcone SDK may return dicts or objects."""
113+
return obj.get(key, default) if isinstance(obj, dict) else getattr(obj, key, default)
114+
115+
116+
def _img(screenshot_url: str, text: str = "screenshot") -> dict:
117+
return {
118+
"role": "user",
119+
"content": [
120+
{"type": "input_text", "text": text},
121+
{"type": "input_image", "image_url": screenshot_url, "detail": "auto"},
122+
],
123+
}
124+
125+
126+
def _parse_tool_args(raw_args: Any) -> dict:
    """Decode a function call's arguments into a dict.

    String arguments are parsed as JSON. Malformed JSON, or a value that is
    not a dict (e.g. ``null`` or a list), yields ``{}`` so later ``.get``
    lookups on the arguments cannot fail.
    """
    try:
        parsed = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
    except (json.JSONDecodeError, TypeError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _strip_old_screenshots(items: list[Any]) -> None:
    """Drop ``input_image`` parts from every item in ``items``, in place.

    Items whose content becomes empty get the placeholder string
    ``"(old screenshot)"``. Call this before appending the fresh screenshot
    so that only the newest one carries image data.
    """
    for entry in items:
        content = entry.get("content") if isinstance(entry, dict) else None
        if not isinstance(content, list):
            continue
        kept = [
            part for part in content
            if not (isinstance(part, dict) and part.get("type") == "input_image")
        ]
        if len(kept) != len(content):
            entry["content"] = kept or "(old screenshot)"


async def sampling_loop(
    *,
    task: str,
    api_key: str,
    kernel: Kernel,
    session_id: str,
    model: str = MODEL,
    max_steps: int = 50,
    viewport_width: int = 1280,
    viewport_height: int = 800,
) -> dict[str, Any]:
    """Run the Northstar CUA loop until the model calls done() or max steps.

    Each step sends the accumulated input items to the model, records any
    assistant text and function calls it returns, executes the calls through
    ``ComputerTool``, and appends each call's output plus a fresh screenshot.

    Args:
        task: Instruction sent with the initial screenshot.
        api_key: Key used to construct the ``Lightcone`` client.
        kernel: Kernel client handed to ``ComputerTool``.
        session_id: Session identifier handed to ``ComputerTool``.
        model: Model name for ``responses.create``.
        max_steps: Maximum number of model round-trips.
        viewport_width: Viewport width handed to ``ComputerTool``.
        viewport_height: Viewport height handed to ``ComputerTool``.

    Returns:
        ``{"messages": [], "final_result": <result>}`` when the model calls
        ``done``. Otherwise ``{"messages": [...], "final_result": None}``,
        where ``messages`` holds the text blocks of the last response (empty
        if no request was made).
    """
    tzafon = Lightcone(api_key=api_key)
    computer = ComputerTool(kernel, session_id, viewport_width, viewport_height)

    screenshot_url = computer.capture_screenshot()
    items: list[Any] = [_img(screenshot_url, text=f"{task}\n\nCurrent screenshot:")]

    # Bound up front so the message collection after the loop still works
    # when max_steps <= 0 and the loop body never runs.
    resp: Any = None

    for step in range(max_steps):
        print(f"\n=== Step {step + 1}/{max_steps} ===")

        # Prevent unbounded payload growth — keep the task prompt + recent history
        # NOTE(review): this positional slice can separate a function_call from
        # its function_call_output; confirm the Responses API accepts that.
        if len(items) > 30:
            items = items[:2] + items[-20:]

        resp = tzafon.responses.create(
            model=model, input=items, tools=TOOLS,
            temperature=0, max_output_tokens=4096,
        )

        calls: list[tuple[str, str, dict]] = []
        for item in _get(resp, "output") or []:
            item_type = _get(item, "type")

            if item_type == "message":
                for block in _get(item, "content") or []:
                    text = _get(block, "text", "")
                    if text:
                        items.append({"role": "assistant", "content": text})
                        print(f" Model: {text[:150]}")

            elif item_type == "function_call":
                call_id = _get(item, "call_id")
                name = _get(item, "name")
                raw_args = _get(item, "arguments", "{}")
                calls.append((call_id, name, _parse_tool_args(raw_args)))
                # Echo the call back with its arguments as a string.
                items.append({
                    "type": "function_call", "call_id": call_id, "name": name,
                    "arguments": raw_args if isinstance(raw_args, str) else json.dumps(raw_args),
                })

        if not calls:
            # Text-only response: ask again with the assistant text in history.
            continue

        for call_id, name, args in calls:
            print(f" [{step + 1}] {name}({json.dumps(args)[:100]})")

            if name == "done":
                result = args.get("result", "")
                items.append({"type": "function_call_output", "call_id": call_id, "output": "ok"})
                print(f" Done: {result}")
                return {"messages": [], "final_result": result}

            try:
                await computer.execute_function(name, args)
            except Exception as e:
                # Report the failure to the model instead of aborting the run.
                print(f" Action failed: {e}")
                items.append({"type": "function_call_output", "call_id": call_id, "output": f"Error: {e}"})
                continue

            # Short pause before capturing the post-action screenshot.
            await asyncio.sleep(0.5)
            screenshot_url = computer.capture_screenshot()

            # Replace old screenshots with placeholders to save payload space.
            # Every existing item is scanned: the new screenshot is not appended
            # yet, and an earlier call in this same step may have left its
            # screenshot as the last item.
            _strip_old_screenshots(items)

            items.append({"type": "function_call_output", "call_id": call_id, "output": "[screenshot]"})
            items.append(_img(screenshot_url))

    # Step budget exhausted: return the text of the last model response.
    messages: list[str] = []
    for item in _get(resp, "output") or []:
        if _get(item, "type") == "message":
            for block in _get(item, "content") or []:
                text = _get(block, "text")
                if text:
                    messages.append(text)

    return {"messages": messages, "final_result": None}

0 commit comments

Comments
 (0)