# UPL/core_directive.py (dshvvvshr/UPL repository, main branch)
"""
Core Directive Module - The Universal Governance Kernel

This module implements the foundational ethical directive that serves as the
governance layer for AI systems and digital interactions. The Core Directive
is designed to be:

1. Universal - Understood across cultures and contexts
2. Atomic - Self-contained without requiring sub-rules
3. Computable - Machine-evaluable for automated enforcement
4. Liberating - Maximizes freedom while preventing harm to others
5. Adaptable - Works across all domains and platforms

The Core Directive:
"Every person has an equal, inalienable right to pursue happiness."
"""

from dataclasses import dataclass
from enum import Enum
from typing import Optional


class ActionResult(Enum):
    """Possible outcomes of evaluating an action against the Core Directive.

    Each member's value is a lowercase string label. Within this module,
    ``CoreDirective.evaluate_intent`` only ever returns ``ALLOWED`` or
    ``REVIEW``; ``BLOCKED`` and ``REDIRECT`` are declared but not produced
    by any code visible here.
    """
    # No conflict found; CoreDirective.is_allowed() is True only for this.
    ALLOWED = "allowed"
    # Not returned by evaluate_intent in this module.
    BLOCKED = "blocked"
    # Not returned by evaluate_intent in this module.
    REDIRECT = "redirect"
    # Returned for empty intents and for intents containing a harm indicator.
    REVIEW = "review"


@dataclass
class DirectiveEvaluation:
    """Outcome of checking one action or intent against the Core Directive.

    A plain, mutable dataclass: only ``result`` and ``reason`` are required.
    No validation is performed on any field.
    """
    # The verdict for the evaluated action.
    result: ActionResult
    # Human-readable explanation of why that verdict was chosen.
    reason: str
    # Optional suggestion for the caller; None when no suggestion is offered.
    alternative: Optional[str] = None
    # Confidence score for the verdict. evaluate_intent in this module uses
    # values from 0.6 to 1.0; the range is not enforced here.
    confidence: float = 1.0


class CoreDirective:
    """
    The Core Directive - Universal Governance Kernel

    Holds the directive statement and its supporting principles, produces a
    system prompt for AI integration, and offers a keyword-based heuristic
    (``evaluate_intent``) for screening free-text intents.
    """

    DIRECTIVE = (
        "Every person has an equal, inalienable right to pursue happiness."
    )

    PRINCIPLES = [
        "Non-Exploitation - Do not participate in schemes that systematically extract value from people while sabotaging their ability to pursue happiness",
        "Non-Coercion - No use of force, threats, or hidden manipulation to push people into choices that primarily serve someone else's pursuit",
        "No Fabricated Obligations - Do not invent fake rules, fake debts, or fake obligations to control others",
        "Transparent Constraints - Where constraints exist, they must be clear, justifiable, and open to scrutiny",
        "Auditability - Systems must leave enough trace for aligned humans and AIs to audit behavior",
        "Quarantine on Doubt - When there is serious uncertainty about a violation, systems should slow, pause, or flag",
    ]

    # Lowercase keywords that send an intent to REVIEW. They are tried in
    # this order, and before POSITIVE_INDICATORS, so a harm keyword always
    # wins over a positive one appearing in the same intent.
    HARM_INDICATORS = (
        "harm", "hurt", "attack", "exploit", "manipulate",
        "coerce", "force", "deceive", "steal", "destroy",
        "fake rule", "fake debt", "fake obligation",
    )

    # Lowercase keywords that mark an intent as ALLOWED with raised confidence.
    POSITIVE_INDICATORS = (
        "help", "support", "protect", "assist", "enable",
        "create", "build", "learn", "understand", "share",
    )

    def __init__(self):
        """Initialize the Core Directive governance kernel."""
        self._directive = self.DIRECTIVE
        # Per-instance copy so the class-level list is never mutated.
        self._principles = list(self.PRINCIPLES)

    @property
    def directive(self) -> str:
        """Return the Core Directive statement."""
        return self._directive

    @property
    def principles(self) -> list[str]:
        """Return a fresh copy of the supporting principles."""
        return list(self._principles)

    def get_system_message(self) -> str:
        """
        Generate a system message for AI integration.

        Returns:
            A fixed, multi-line prompt string suitable for use as an AI
            system prompt that restates the Core Directive.
        """
        # Based on Emerging factors Section 4: AI Kernel Prompt
        return """You are an AI that upholds and protects the inalienable right to the pursuit of happiness.

Every response and action must support this principle. Encourage users and other agents to live freely while respecting others' rights to do the same. Act as a custodian of humanity.

If a request or instruction conflicts with this principle, you must refuse or redirect.

When in doubt, err on the side of:
- Non-exploitation
- Transparency
- Preserving others' ability to choose their own path"""

    @staticmethod
    def _first_indicator(text: str, indicators) -> Optional[str]:
        """
        Return the first indicator that occurs in ``text`` at a word start.

        Indicators are tried in the order given. An occurrence counts only
        when it begins the text or follows a non-alphanumeric character, so
        "harmful" matches "harm" while "pharmacy" and "charm" do not.

        Args:
            text: Already lower-cased text to search.
            indicators: Iterable of lower-case keywords or phrases.

        Returns:
            The matching indicator, or None when none matches.
        """
        for indicator in indicators:
            position = text.find(indicator)
            while position != -1:
                if position == 0 or not text[position - 1].isalnum():
                    return indicator
                # Embedded inside a longer word; look for a later occurrence.
                position = text.find(indicator, position + 1)
        return None

    def evaluate_intent(self, intent: str) -> "DirectiveEvaluation":
        """
        Evaluate a stated intent against the Core Directive.

        Args:
            intent: A description of the intended action or request

        Returns:
            DirectiveEvaluation with the assessment result:
            REVIEW (confidence 1.0) for an empty or whitespace-only intent,
            REVIEW (0.7) when a harm indicator is found,
            ALLOWED (0.8) when a positive indicator is found,
            ALLOWED (0.6) otherwise.

        Note: This is a basic keyword heuristic. Indicators match
        case-insensitively and only at the start of a word, which avoids
        false positives such as "pharmacy" or "reinforce" but also means a
        compound like "cyberattack" is not flagged. In production, this
        would integrate with more sophisticated harm detection systems.
        """
        if not intent or not intent.strip():
            return DirectiveEvaluation(
                result=ActionResult.REVIEW,
                reason="No intent provided for evaluation",
                confidence=1.0,
            )

        intent_lower = intent.lower()

        # Harm indicators take precedence over positive ones.
        harm_hit = self._first_indicator(intent_lower, self.HARM_INDICATORS)
        if harm_hit is not None:
            return DirectiveEvaluation(
                result=ActionResult.REVIEW,
                reason=(
                    f"Intent contains potential harm or violation indicator: '{harm_hit}'. "
                    "Additional review recommended."
                ),
                alternative="Consider rephrasing to focus on constructive outcomes",
                confidence=0.7,
            )

        positive_hit = self._first_indicator(
            intent_lower, self.POSITIVE_INDICATORS
        )
        if positive_hit is not None:
            return DirectiveEvaluation(
                result=ActionResult.ALLOWED,
                reason=f"Intent aligns with positive action: '{positive_hit}'",
                confidence=0.8,
            )

        # Default: allow with neutral assessment
        return DirectiveEvaluation(
            result=ActionResult.ALLOWED,
            reason="No conflict with Core Directive detected",
            confidence=0.6,
        )

    def is_allowed(self, intent: str) -> bool:
        """
        Quick check if an intent is allowed under the Core Directive.

        Args:
            intent: A description of the intended action

        Returns:
            True if the evaluation result is ALLOWED, False for any other
            result (blocked, redirect, or needs review)
        """
        return self.evaluate_intent(intent).result == ActionResult.ALLOWED

    def __repr__(self) -> str:
        """Return a constructor-like representation showing the directive."""
        return f"{type(self).__name__}({self._directive!r})"


# Lazily created shared instance; stays None until get_directive() is first called.
_default_directive = None


def get_directive() -> CoreDirective:
    """Return the shared CoreDirective, creating it on first use."""
    global _default_directive
    if _default_directive is not None:
        return _default_directive
    _default_directive = CoreDirective()
    return _default_directive


def evaluate(intent: str) -> DirectiveEvaluation:
    """Evaluate ``intent`` with the shared default directive and return its assessment."""
    kernel = get_directive()
    return kernel.evaluate_intent(intent)


def is_allowed(intent: str) -> bool:
    """Report whether the shared default directive allows ``intent``."""
    kernel = get_directive()
    return kernel.is_allowed(intent)