holodeck-studio/trust_engine.py at main · SuperInstance/holodeck-studio · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
"""
Multi-Dimensional Trust Engine for Tabula Rasa Agent System

Based on research from:
- PNAS 2024: Emergent in-group behavior in multi-agent RL
- RepuNet 2025: Dynamic dual-level reputation for LLM multi-agent systems
- ACM 2015: Trust and reputation models survey
- TRiSM (Gartner 2024): Trust, Risk, Security Management for Agentic AI

Design principles:
1. Trust is earned, not assigned — starts at base (0.3)
2. Multiple independent trust dimensions — different skills, different trust
3. Temporal decay — recent behavior matters more than ancient history
4. Context-aware — trust for code quality != trust for social behavior
5. Composite scoring — overall trust is weighted combination of dimensions
"""

import json
import time
import math
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from dataclasses import dataclass, field


# Capability areas that are scored independently of one another.
TRUST_DIMENSIONS = [
    "code_quality",     # good code, passing tests, clean commits
    "task_completion",  # assigned tasks finished on time
    "collaboration",    # works well with other agents
    "reliability",      # consistent, predictable behavior
    "innovation",       # creative solutions, novel approaches
]

# Share of the composite score contributed by each dimension (sums to 1.0).
DEFAULT_WEIGHTS = dict(
    code_quality=0.25,
    task_completion=0.30,
    collaboration=0.20,
    reliability=0.15,
    innovation=0.10,
)

# Per-day exponential decay factor for each dimension: an event that is
# ``d`` days old is weighted by ``rate ** d``.
# reliability has the highest factor (slowest decay), innovation the
# lowest (fastest decay).
DECAY_RATES = dict(
    code_quality=0.97,
    task_completion=0.98,
    collaboration=0.96,
    reliability=0.99,
    innovation=0.93,
)

# Score reported when there is no history to base a score on.
BASE_TRUST = 0.3

# Number of recorded events required before a score counts as "meaningful".
MIN_EVENTS_FOR_TRUST = 3


class WeightedHistory:
    """Trust events for one dimension, scored with exponential temporal decay.

    Events are kept in insertion order as ``(timestamp, value, weight)``
    tuples. ``timestamp`` is in seconds as returned by ``time.time()``,
    ``value`` is clamped into ``[0.0, 1.0]`` when added, and ``weight``
    scales how much the event contributes to the score.
    """

    def __init__(self, decay_rate: float = 0.95):
        """Create an empty history.

        Args:
            decay_rate: Per-day decay factor; an event that is ``d`` days old
                is weighted by ``decay_rate ** d``.
        """
        self.decay_rate = decay_rate
        self.events: List[Tuple[float, float, float]] = []  # (timestamp, value, weight)

    def add(self, value: float, weight: float = 1.0, timestamp: Optional[float] = None):
        """Record a trust event.

        Args:
            value: Observed outcome; clamped into ``[0.0, 1.0]``.
            weight: Relative importance of the event.
            timestamp: Event time in seconds since the epoch. Defaults to the
                current time when omitted.
        """
        if timestamp is None:
            timestamp = time.time()
        self.events.append((timestamp, max(0.0, min(1.0, value)), weight))

    def score(self) -> float:
        """Calculate the current trust score with temporal decay.

        Returns:
            The decay-weighted mean of the recorded values, clamped into
            ``[0.0, 1.0]``. ``BASE_TRUST`` is returned when there are no
            events or the total effective weight is not positive.
        """
        if not self.events:
            return BASE_TRUST
        now = time.time()
        weighted_sum = 0.0
        weight_total = 0.0
        for ts, value, w in self.events:
            # Clamp the age at zero: a future-dated event (clock skew, or a
            # timestamp given in milliseconds) would otherwise receive a
            # weight greater than 1 that grows exponentially, and can raise
            # OverflowError for far-future timestamps.
            days_ago = max(0.0, (now - ts) / 86400.0)
            time_weight = self.decay_rate ** days_ago
            weighted_sum += value * w * time_weight
            weight_total += w * time_weight
        if weight_total <= 0:
            return BASE_TRUST
        return max(0.0, min(1.0, weighted_sum / weight_total))

    def event_count(self) -> int:
        """Return the number of events currently held."""
        return len(self.events)

    def recent(self, n: int = 10) -> List[dict]:
        """Get the most recent N events, newest first.

        Each entry is a dict with ``timestamp``, ``value``, ``weight`` and
        ``days_ago`` (age relative to the moment of the call).
        """
        now = time.time()  # one clock reading so every age uses the same reference
        return [
            {"timestamp": ts, "value": v, "weight": w, "days_ago": (now - ts) / 86400}
            for ts, v, w in sorted(self.events, reverse=True)[:n]
        ]

    def prune(self, max_age_days: int = 90):
        """Remove events older than ``max_age_days``."""
        cutoff = time.time() - (max_age_days * 86400)
        self.events = [(ts, v, w) for ts, v, w in self.events if ts > cutoff]

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict.

        Only the last 50 stored events are written out; ``event_count`` and
        ``score`` describe the full in-memory history at the time of the call.
        """
        return {
            "decay_rate": self.decay_rate,
            "event_count": len(self.events),
            "score": self.score(),
            "events": [{"t": ts, "v": v, "w": w} for ts, v, w in self.events[-50:]]  # last 50
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'WeightedHistory':
        """Rebuild a history from the structure produced by ``to_dict``.

        Only ``decay_rate`` (default 0.95) and ``events`` are read.

        Raises:
            KeyError: If an event entry lacks ``t``, ``v`` or ``w``.
        """
        wh = cls(decay_rate=data.get("decay_rate", 0.95))
        for e in data.get("events", []):
            wh.events.append((e["t"], e["v"], e["w"]))
        return wh


@dataclass
class TrustProfile:
    """Complete trust profile for an agent.

    Holds one ``WeightedHistory`` per trust dimension plus the weights used
    to combine the per-dimension scores into a composite score.
    """
    agent_name: str
    # dimension name -> event history for that dimension
    dimensions: Dict[str, WeightedHistory] = field(default_factory=dict)
    # dimension name -> weight in the composite; starts as a copy of DEFAULT_WEIGHTS
    weights: Dict[str, float] = field(default_factory=lambda: dict(DEFAULT_WEIGHTS))
    # creation time and time of the most recent recorded event (time.time() seconds)
    created: float = field(default_factory=time.time)
    last_seen: float = field(default_factory=time.time)

    def __post_init__(self):
        # Make sure every standard dimension has a history with its own decay rate.
        for dim in TRUST_DIMENSIONS:
            if dim not in self.dimensions:
                self.dimensions[dim] = WeightedHistory(decay_rate=DECAY_RATES[dim])

    def record(self, dimension: str, value: float, weight: float = 1.0):
        """Record a trust event in a specific dimension.

        A dimension that is not yet tracked is created on the fly with a
        decay rate of 0.95. Also refreshes ``last_seen``.
        """
        if dimension not in self.dimensions:
            self.dimensions[dimension] = WeightedHistory(decay_rate=0.95)
        self.dimensions[dimension].add(value, weight)
        self.last_seen = time.time()

    def score(self, dimension: Optional[str] = None) -> float:
        """Get the trust score for one dimension, or the composite score.

        Args:
            dimension: Dimension name. When omitted (or empty) the composite
                score is returned.

        Returns:
            The dimension's score, or ``BASE_TRUST`` for a dimension that has
            never been recorded on this profile.
        """
        if dimension:
            history = self.dimensions.get(dimension)
            # record() accepts ad-hoc dimensions, so asking for one that has
            # no events yet is a valid query: report the same baseline an
            # empty history reports instead of raising KeyError.
            if history is None:
                return BASE_TRUST
            return history.score()
        return self.composite()

    def composite(self, weights: Optional[dict] = None) -> float:
        """Calculate the weighted composite trust score.

        Args:
            weights: Optional dimension -> weight mapping; falls back to
                ``self.weights`` when omitted or empty. Dimensions missing
                from the mapping contribute with weight 0.

        Returns:
            The weighted mean of the dimension scores clamped into
            ``[0.0, 1.0]``, or ``BASE_TRUST`` when the total weight is not
            positive.
        """
        w = weights or self.weights
        scores = {d: h.score() for d, h in self.dimensions.items()}
        total_w = sum(w.get(d, 0) for d in scores)
        if total_w <= 0:
            return BASE_TRUST
        return max(0.0, min(1.0, sum(scores[d] * w.get(d, 0) for d in scores) / total_w))

    def is_meaningful(self) -> bool:
        """Has enough events (across all dimensions) for trust to be meaningful?"""
        total = sum(h.event_count() for h in self.dimensions.values())
        return total >= MIN_EVENTS_FOR_TRUST

    def review_exempt(self) -> bool:
        """Should this agent be exempt from output review?

        True only when the profile is meaningful and the composite score is
        above 0.7.
        """
        return self.is_meaningful() and self.composite() > 0.7

    def summary(self) -> dict:
        """Generate a trust summary dict (scores, flags, event total, last_seen)."""
        return {
            "agent": self.agent_name,
            "composite": self.composite(),
            "dimensions": {d: h.score() for d, h in self.dimensions.items()},
            "meaningful": self.is_meaningful(),
            "review_exempt": self.review_exempt(),
            "total_events": sum(h.event_count() for h in self.dimensions.values()),
            "last_seen": self.last_seen,
        }

    def to_dict(self) -> dict:
        """Serialize the profile to a JSON-compatible dict."""
        return {
            "agent_name": self.agent_name,
            "dimensions": {d: h.to_dict() for d, h in self.dimensions.items()},
            # copy so that mutating the returned dict cannot alter the profile
            "weights": dict(self.weights),
            "created": self.created,
            "last_seen": self.last_seen,
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'TrustProfile':
        """Rebuild a profile from the structure produced by ``to_dict``.

        Standard dimensions absent from ``data`` keep the empty histories
        created in ``__post_init__``.

        Raises:
            KeyError: If ``agent_name`` is missing from ``data``.
        """
        profile = cls(agent_name=data["agent_name"])
        profile.weights = data.get("weights", dict(DEFAULT_WEIGHTS))
        profile.created = data.get("created", time.time())
        profile.last_seen = data.get("last_seen", time.time())
        for d, hd in data.get("dimensions", {}).items():
            profile.dimensions[d] = WeightedHistory.from_dict(hd)
        return profile


class TrustEngine:
    """Fleet-wide trust management engine.

    Keeps one in-memory ``TrustProfile`` per agent name and persists each
    profile as ``<data_dir>/<agent_name>.json``.
    """

    def __init__(self, data_dir: str = "world/trust"):
        """Create the engine and make sure ``data_dir`` exists on disk."""
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.profiles: Dict[str, TrustProfile] = {}

    def get_profile(self, agent_name: str) -> TrustProfile:
        """Get or create the in-memory trust profile for an agent."""
        if agent_name not in self.profiles:
            self.profiles[agent_name] = TrustProfile(agent_name=agent_name)
        return self.profiles[agent_name]

    def record_event(self, agent_name: str, dimension: str, value: float, weight: float = 1.0):
        """Record a trust event for an agent in the given dimension."""
        self.get_profile(agent_name).record(dimension, value, weight)

    def get_trust(self, agent_name: str, dimension: Optional[str] = None) -> float:
        """Get an agent's trust score for one dimension, or the composite if omitted."""
        return self.get_profile(agent_name).score(dimension)

    def composite_trust(self, agent_name: str) -> float:
        """Get an agent's composite trust score."""
        return self.get_profile(agent_name).composite()

    def compare(self, agent_a: str, agent_b: str) -> dict:
        """Compare trust profiles of two agents.

        Returns both summaries plus a similarity value. Profiles are created
        if they do not exist yet (via ``get_profile``).
        """
        prof_a = self.get_profile(agent_a)
        prof_b = self.get_profile(agent_b)
        return {
            "agent_a": prof_a.summary(),
            "agent_b": prof_b.summary(),
            "similarity": self._similarity(prof_a, prof_b),
        }

    def _similarity(self, a: TrustProfile, b: TrustProfile) -> float:
        """Calculate profile similarity (0-1).

        Computed as ``1 - RMS difference`` of the per-dimension scores; a
        dimension present on only one profile is compared against
        ``BASE_TRUST``.
        """
        scores_a = {d: h.score() for d, h in a.dimensions.items()}
        scores_b = {d: h.score() for d, h in b.dimensions.items()}
        all_dims = set(scores_a) | set(scores_b)
        if not all_dims:
            return 1.0
        sum_sq_diff = sum((scores_a.get(d, BASE_TRUST) - scores_b.get(d, BASE_TRUST)) ** 2 for d in all_dims)
        max_sq_diff = len(all_dims)  # max possible sum of squared diffs
        return 1.0 - math.sqrt(sum_sq_diff / max_sq_diff)

    def leaderboard(self, n: int = 10) -> list:
        """Get top-N agents by composite trust (meaningful profiles only)."""
        profiles = [(name, prof.composite()) for name, prof in self.profiles.items() if prof.is_meaningful()]
        profiles.sort(key=lambda x: x[1], reverse=True)
        return [{"agent": name, "trust": score} for name, score in profiles[:n]]

    def save(self, agent_name: str):
        """Save a profile to disk; does nothing if the agent has no profile.

        NOTE(review): ``agent_name`` is used verbatim as the file stem and is
        not validated here; confirm callers never pass path separators.
        """
        profile = self.profiles.get(agent_name)
        if profile is None:
            return
        path = self.data_dir / f"{agent_name}.json"
        # Explicit encoding keeps the on-disk format independent of the locale.
        path.write_text(json.dumps(profile.to_dict(), indent=2), encoding="utf-8")

    def load(self, agent_name: str) -> Optional[TrustProfile]:
        """Load a profile from disk and register it under ``agent_name``.

        Returns:
            The loaded profile, or ``None`` when the file is missing or its
            contents cannot be turned into a profile.
        """
        path = self.data_dir / f"{agent_name}.json"
        if not path.exists():
            return None
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            profile = TrustProfile.from_dict(data)
        except (ValueError, KeyError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # TypeError/AttributeError cover valid JSON of the wrong shape
            # (e.g. a top-level list, or "dimensions" not being an object).
            # Treating these as "no profile" keeps one corrupt file from
            # aborting load_all().
            return None
        self.profiles[agent_name] = profile
        return profile

    def save_all(self):
        """Save all profiles to disk."""
        for name in self.profiles:
            self.save(name)

    def load_all(self):
        """Load every ``*.json`` profile found in the data directory."""
        self.data_dir.mkdir(parents=True, exist_ok=True)
        for path in self.data_dir.glob("*.json"):
            agent_name = path.stem
            self.load(agent_name)

    def prune_stale(self, max_age_days: int = 60):
        """Prune profiles not seen in N days, in memory and on disk.

        Returns:
            The number of profiles removed.
        """
        cutoff = time.time() - (max_age_days * 86400)
        # Collect names first so the dict is not modified while iterating it.
        stale = [name for name, prof in self.profiles.items() if prof.last_seen < cutoff]
        for name in stale:
            del self.profiles[name]
            path = self.data_dir / f"{name}.json"
            if path.exists():
                path.unlink()
        return len(stale)

    def stats(self) -> dict:
        """Engine statistics.

        ``average_trust`` is the mean composite over meaningful profiles, or
        ``BASE_TRUST`` when there are none.
        """
        meaningful = [p for p in self.profiles.values() if p.is_meaningful()]
        return {
            "total_profiles": len(self.profiles),
            "meaningful_profiles": len(meaningful),
            "average_trust": sum(p.composite() for p in meaningful) / len(meaningful) if meaningful else BASE_TRUST,
            "review_exempt": sum(1 for p in meaningful if p.review_exempt()),
            "dimensions": len(TRUST_DIMENSIONS),
        }


# Preset trust events: each name maps to the dimension it is recorded in,
# the value to record and the weight the event carries.
TRUST_EVENTS = {
    "task_completed": dict(dimension="task_completion", value=0.8, weight=1.0),
    "task_completed_excellent": dict(dimension="task_completion", value=1.0, weight=1.5),
    "task_failed": dict(dimension="reliability", value=0.2, weight=1.5),
    "code_review_passed": dict(dimension="code_quality", value=0.9, weight=1.0),
    "code_review_failed": dict(dimension="code_quality", value=0.3, weight=1.5),
    "collaboration_good": dict(dimension="collaboration", value=0.85, weight=1.0),
    "conflict_resolved": dict(dimension="collaboration", value=0.9, weight=1.2),
    "innovation_shown": dict(dimension="innovation", value=0.9, weight=1.0),
    "bug_found": dict(dimension="code_quality", value=0.85, weight=0.8),
    "tests_written": dict(dimension="reliability", value=0.8, weight=0.7),
    "docs_written": dict(dimension="collaboration", value=0.75, weight=0.6),
}