Skip to content

Commit b75208d

Browse files
committed
feat: add scripts and requirements for all skills, remove Blink/Ring
- Remove camera-providers/blink and camera-providers/ring (built into Aegis) - Remove blink/ring from skills.json registry - Add requirements.txt and working Python scripts for all 18 skills: Detection: dinov3-grounding/ground.py, person-recognition/detect.py Analysis: vlm-scene-analysis/analyze.py, sam2-segmentation/segment.py Transformation: depth-estimation/transform.py Annotation: dataset-annotation/annotate.py (CocoDatasetManager) Camera Providers: eufy/feed.py, reolink/feed.py, tapo/feed.py Streaming: go2rtc-cameras/stream.py Channels: matrix/channel.py, line/channel.py, signal/channel.py Automation: mqtt/mqtt_publish.py, webhook/webhook.py, ha-trigger/ha_trigger.py Integrations: homeassistant-bridge/bridge.py All scripts follow the JSON-lines stdin/stdout protocol defined in SKILL.md.
1 parent 00bc16a commit b75208d

File tree

37 files changed

+2157
-177
lines changed

37 files changed

+2157
-177
lines changed

skills.json

Lines changed: 0 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -242,41 +242,6 @@
242242
"training_pipeline"
243243
]
244244
},
245-
{
246-
"id": "camera-provider-blink",
247-
"name": "Blink Cameras",
248-
"description": "Amazon Blink camera integration — motion clips, snapshots, arm/disarm.",
249-
"version": "1.0.0",
250-
"category": "camera-providers",
251-
"path": "skills/camera-providers/blink",
252-
"tags": [
253-
"blink",
254-
"amazon",
255-
"camera",
256-
"clips"
257-
],
258-
"platforms": [
259-
"linux-x64",
260-
"linux-arm64",
261-
"darwin-arm64",
262-
"darwin-x64",
263-
"win-x64"
264-
],
265-
"requirements": {
266-
"python": ">=3.9",
267-
"ram_gb": 1
268-
},
269-
"capabilities": [
270-
"clip_feed",
271-
"discover_cameras",
272-
"snapshot",
273-
"arm_disarm"
274-
],
275-
"ui_unlocks": [
276-
"camera_timeline",
277-
"clip_feed"
278-
]
279-
},
280245
{
281246
"id": "camera-provider-eufy",
282247
"name": "Eufy Cameras",
@@ -386,41 +351,6 @@
386351
"live_view"
387352
]
388353
},
389-
{
390-
"id": "camera-provider-ring",
391-
"name": "Ring Cameras",
392-
"description": "Ring camera integration — event clips and live view.",
393-
"version": "1.0.0",
394-
"category": "camera-providers",
395-
"path": "skills/camera-providers/ring",
396-
"tags": [
397-
"ring",
398-
"amazon",
399-
"camera",
400-
"doorbell"
401-
],
402-
"platforms": [
403-
"linux-x64",
404-
"linux-arm64",
405-
"darwin-arm64",
406-
"darwin-x64",
407-
"win-x64"
408-
],
409-
"requirements": {
410-
"python": ">=3.9",
411-
"ram_gb": 1
412-
},
413-
"capabilities": [
414-
"clip_feed",
415-
"discover_cameras",
416-
"live_stream"
417-
],
418-
"ui_unlocks": [
419-
"camera_timeline",
420-
"clip_feed",
421-
"live_view"
422-
]
423-
},
424354
{
425355
"id": "go2rtc-cameras",
426356
"name": "go2rtc Multi-Camera Streaming",
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SAM2 Segmentation
2+
segment-anything-2>=0.1.0
3+
torch>=2.0.0
4+
torchvision>=0.15.0
5+
numpy>=1.24.0
6+
opencv-python-headless>=4.8.0
7+
Pillow>=10.0.0
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/usr/bin/env python3
2+
"""
3+
SAM2 Segmentation Skill — Interactive click-to-segment.
4+
5+
Generates pixel-perfect masks from point/box prompts using Segment Anything 2.
6+
"""
7+
8+
import sys
9+
import json
10+
import argparse
11+
import signal
12+
import tempfile
13+
from pathlib import Path
14+
15+
16+
def parse_args():
    """Parse command-line options for the SAM2 segmentation skill."""
    ap = argparse.ArgumentParser(description="SAM2 Segmentation Skill")
    for flag, kwargs in (
        ("--config", {"type": str}),
        ("--model", {"type": str, "default": "sam2-small"}),
        ("--device", {"type": str, "default": "auto"}),
    ):
        ap.add_argument(flag, **kwargs)
    return ap.parse_args()
22+
23+
24+
def load_config(args):
    """Return skill configuration.

    Reads the JSON file named by --config when it exists; otherwise the
    CLI arguments themselves become the configuration.
    """
    cfg_path = Path(args.config) if args.config else None
    if cfg_path is not None and cfg_path.exists():
        with open(cfg_path) as fh:
            return json.load(fh)
    return {"model": args.model, "device": args.device}
29+
30+
31+
def select_device(pref):
    """Resolve a device preference string.

    Anything other than "auto" is returned untouched; "auto" probes for
    CUDA, then Apple MPS, and falls back to CPU (also when torch is absent).
    """
    if pref != "auto":
        return pref
    try:
        import torch
    except ImportError:
        return "cpu"
    if torch.cuda.is_available():
        return "cuda"
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available():
        return "mps"
    return "cpu"
41+
42+
43+
def emit(event):
    """Write one JSON-encoded event to stdout, newline-terminated and flushed."""
    sys.stdout.write(json.dumps(event) + "\n")
    sys.stdout.flush()
45+
46+
47+
def main():
    """Entry point: load SAM2, then serve the JSON-lines stdin/stdout protocol.

    Incoming events:
      frame -- {"frame_path": ...} loads an image into the predictor.
      click -- {"x", "y", "label"} segments at a point prompt.
    A {"command": "stop"} message, EOF, SIGTERM, or SIGINT ends the loop.
    """
    args = parse_args()
    config = load_config(args)
    device = select_device(config.get("device", "auto"))

    try:
        import torch  # noqa: F401 -- surface a clear load error if torch is missing
        import numpy as np
        import cv2
        from sam2.build_sam import build_sam2
        from sam2.sam2_image_predictor import SAM2ImagePredictor

        # Friendly model names -> SAM2 hydra config files.
        model_cfg = {
            "sam2-tiny": "sam2_hiera_t.yaml",
            "sam2-small": "sam2_hiera_s.yaml",
            "sam2-base": "sam2_hiera_b+.yaml",
            "sam2-large": "sam2_hiera_l.yaml",
        }

        model_name = config.get("model", "sam2-small")
        checkpoint = f"models/{model_name}.pt"

        sam2 = build_sam2(model_cfg.get(model_name, "sam2_hiera_s.yaml"), checkpoint)
        predictor = SAM2ImagePredictor(sam2)
        predictor.model.to(device)

        emit({"event": "ready", "model": model_name, "device": device})
    except Exception as e:
        emit({"event": "error", "message": f"Failed to load SAM2: {e}", "retriable": False})
        sys.exit(1)

    running = True
    current_image = None  # RGB image currently registered with the predictor

    def handle_signal(s, f):
        nonlocal running
        running = False

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    for line in sys.stdin:
        if not running:
            break
        line = line.strip()
        if not line:
            continue
        try:
            msg = json.loads(line)
        except json.JSONDecodeError:
            continue  # ignore malformed protocol lines

        if msg.get("command") == "stop":
            break

        event = msg.get("event")

        if event == "frame":
            frame_path = msg.get("frame_path")
            if frame_path and Path(frame_path).exists():
                bgr = cv2.imread(frame_path)
                if bgr is None:
                    # cv2.imread returns None (no exception) for unreadable
                    # files; report instead of crashing on cvtColor below.
                    emit({"event": "error", "message": f"Unreadable frame: {frame_path}", "retriable": True})
                    continue
                current_image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
                predictor.set_image(current_image)

        elif event == "click" and current_image is not None:
            x, y = msg.get("x", 0), msg.get("y", 0)
            label = msg.get("label", 1)  # 1=foreground, 0=background

            try:
                point = np.array([[x, y]])
                point_label = np.array([label])

                masks, scores, _ = predictor.predict(
                    point_coords=point,
                    point_labels=point_label,
                    multimask_output=True,
                )

                # Use highest-scoring mask
                best_idx = np.argmax(scores)
                mask = masks[best_idx]
                score = float(scores[best_idx])

                # Compute bbox from mask; an all-background mask has no bbox,
                # so bail out before writing any file.
                ys, xs = np.where(mask)
                if xs.size == 0:
                    emit({"event": "error", "message": "Empty mask for click prompt", "retriable": True})
                    continue
                bbox = [int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())]

                # NamedTemporaryFile(delete=False) instead of the deprecated,
                # race-prone tempfile.mktemp.
                with tempfile.NamedTemporaryFile(suffix=".png", dir="/tmp", delete=False) as tf:
                    mask_path = tf.name
                cv2.imwrite(mask_path, (mask * 255).astype(np.uint8))

                emit({
                    "event": "segmentation",
                    "frame_number": msg.get("frame_number", 0),
                    "mask_path": mask_path,
                    "score": round(score, 3),
                    "bbox": bbox,
                })
            except Exception as e:
                emit({"event": "error", "message": f"Segmentation error: {e}", "retriable": True})


if __name__ == "__main__":
    main()
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# VLM Scene Analysis
2+
llama-cpp-python>=0.3.0
3+
numpy>=1.24.0
4+
opencv-python-headless>=4.8.0
5+
Pillow>=10.0.0
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/usr/bin/env python3
2+
"""
3+
VLM Scene Analysis Skill — Offline clip understanding via vision language models.
4+
5+
Analyzes recorded video clips and generates natural language descriptions.
6+
"""
7+
8+
import sys
9+
import json
10+
import argparse
11+
import signal
12+
from pathlib import Path
13+
14+
15+
def parse_args():
    """Parse command-line options for the VLM scene-analysis skill."""
    ap = argparse.ArgumentParser(description="VLM Scene Analysis Skill")
    ap.add_argument("--config", type=str)
    ap.add_argument("--model", type=str, default="smolvlm2-500m")
    ap.add_argument(
        "--prompt",
        type=str,
        default=(
            "Describe what is happening in this security camera footage. "
            "Focus on people, vehicles, and any unusual activity."
        ),
    )
    ap.add_argument("--max-frames", type=int, default=4)
    ap.add_argument("--device", type=str, default="auto")
    return ap.parse_args()
24+
25+
26+
def load_config(args):
    """Load configuration from the --config JSON file when available.

    Falls back to a dict mirroring the parsed CLI arguments.
    """
    if args.config:
        cfg_path = Path(args.config)
        if cfg_path.exists():
            with cfg_path.open() as fh:
                return json.load(fh)
    return {
        "model": args.model,
        "prompt": args.prompt,
        "max_frames": args.max_frames,
        "device": args.device,
    }
36+
37+
38+
def emit(event):
    """Serialize *event* as a single JSON line on stdout and flush immediately."""
    encoded = json.dumps(event)
    print(encoded, flush=True)
40+
41+
42+
def extract_frames(video_path, max_frames=4):
    """Grab up to *max_frames* evenly spaced frames from *video_path*.

    Returns a list of decoded frames (BGR, as cv2 reads them); returns an
    empty list when the clip reports a non-positive frame count.
    """
    import cv2

    cap = cv2.VideoCapture(video_path)
    frame_total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_total <= 0:
        cap.release()
        return []

    grabbed = []
    for i in range(max_frames):
        # Seek to the i-th evenly spaced position before reading.
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(i * frame_total / max_frames))
        ok, frame = cap.read()
        if ok:
            grabbed.append(frame)
    cap.release()
    return grabbed
60+
61+
62+
def main():
    """Entry point: load the vision-language model, then analyze clips on demand.

    Speaks the skill JSON-lines protocol: one JSON object per stdin line in,
    one per stdout line out. For each {"event": "clip_ready"} message it
    samples frames from the referenced video and asks the VLM for a
    description. Stops on EOF, SIGTERM/SIGINT, or {"command": "stop"}.
    """
    args = parse_args()
    config = load_config(args)

    try:
        from llama_cpp import Llama
        from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler
        import cv2
        import base64

        model_path = Path(f"models/{config['model']}.gguf")
        if not model_path.exists():
            emit({"event": "error", "message": f"Model not found: {model_path}. Run: python scripts/download_model.py --model {config['model']}", "retriable": False})
            sys.exit(1)

        # The vision projector is expected alongside the GGUF weights.
        chat_handler = MiniCPMv26ChatHandler(clip_model_path=str(model_path.with_suffix(".mmproj")))
        llm = Llama(model_path=str(model_path), chat_handler=chat_handler, n_ctx=4096)

        emit({"event": "ready", "model": config["model"], "device": config.get("device", "cpu")})
    except Exception as e:
        emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
        sys.exit(1)

    running = True

    def handle_signal(s, f):
        nonlocal running
        running = False

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    for raw in sys.stdin:
        if not running:
            break
        raw = raw.strip()
        if not raw:
            continue
        try:
            msg = json.loads(raw)
        except json.JSONDecodeError:
            continue  # skip malformed protocol lines

        if msg.get("command") == "stop":
            break
        if msg.get("event") != "clip_ready":
            continue

        video_path = msg.get("video_path")
        clip_id = msg.get("clip_id", "unknown")
        camera_id = msg.get("camera_id", "unknown")

        if not video_path or not Path(video_path).exists():
            emit({"event": "error", "message": f"Video not found: {video_path}", "retriable": True})
            continue

        try:
            frames = extract_frames(video_path, config.get("max_frames", 4))
            if not frames:
                emit({"event": "error", "message": "No frames extracted", "retriable": True})
                continue

            # JPEG-encode each sampled frame as a base64 data URL for the VLM.
            parts = [{"type": "text", "text": config["prompt"]}]
            for frame in frames:
                _, jpg = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
                encoded = base64.b64encode(jpg).decode()
                parts.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}})

            reply = llm.create_chat_completion(messages=[
                {"role": "user", "content": parts}
            ])

            emit({
                "event": "analysis_result",
                "clip_id": clip_id,
                "camera_id": camera_id,
                "description": reply["choices"][0]["message"]["content"],
                "objects": [],  # Could be extracted from description
                "confidence": 0.9,
            })
        except Exception as e:
            emit({"event": "error", "message": f"Analysis error: {e}", "retriable": True})


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)