"""
In information theory, entropy is a measure of the uncertainty or randomness of a
source of data. It quantifies the expected amount of information contained in each
message from the source.

The core formula for Shannon entropy H(X) is:

    H(X) = -Σ P(x) * log2(P(x))

where P(x) is the probability of an event x occurring.

This concept mirrors thermodynamic entropy in physics, representing the level of
disorder in a system. In a digital context, it sets the theoretical lower bound on
lossless data compression.

Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
"""

from __future__ import annotations

import math
from collections import Counter


def shannon_entropy(probabilities: list[float]) -> float:
    """
    Calculates the Shannon entropy of a given probability distribution.

    Args:
        probabilities: A list of probabilities representing a discrete
            distribution. Distributions that do not sum to 1.0 are normalized
            before the entropy is computed.

    Returns:
        The entropy value in bits.

    Raises:
        ValueError: If any probability is negative.

    Examples:
        >>> shannon_entropy([0.5, 0.5])
        1.0
        >>> shannon_entropy([1.0, 0.0])
        0.0
        >>> shannon_entropy([0.25, 0.25, 0.25, 0.25])
        2.0
    """
    if any(p < 0 for p in probabilities):
        raise ValueError("Probabilities cannot be negative.")
    total = sum(probabilities)
    # Due to floating point precision, check for closeness to 1.0 and normalize
    # the distribution if it has mass but does not sum to 1.0.
    if total > 0 and not math.isclose(total, 1.0, rel_tol=1e-9):
        probabilities = [p / total for p in probabilities]
    entropy = 0.0
    for p in probabilities:
        # 0 * log2(0) is taken as 0 by convention, so zero-probability events
        # contribute nothing.
        if p > 0:
            entropy -= p * math.log2(p)
    return entropy
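
# Illustrative note (an addition, not one of the original doctests): a uniform
# distribution over n outcomes maximizes entropy at log2(n) bits, e.g.
# shannon_entropy([0.125] * 8) == 3.0 because log2(8) = 3.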


def analyze_text_entropy(text: str) -> dict[str, float]:
    """
    Analyzes the entropy of a given text at different levels (1-gram, 2-gram).

    Args:
        text: The input string to analyze.

    Returns:
        A dictionary containing entropy values for the 1-gram and 2-gram levels,
        plus an estimate of the conditional entropy of a character given the
        character before it.

    Examples:
        >>> result = analyze_text_entropy("aaaaa")
        >>> result['1-gram']
        0.0
        >>> result = analyze_text_entropy("abab")
        >>> round(result['1-gram'], 2)
        1.0
    """
    if not text:
        return {"1-gram": 0.0, "2-gram": 0.0, "conditional_entropy": 0.0}

    # 1-gram analysis (individual characters)
    counts_1gram = Counter(text)
    total_chars = len(text)
    probs_1gram = [count / total_chars for count in counts_1gram.values()]
    entropy_1gram = shannon_entropy(probs_1gram)

    # 2-gram analysis (overlapping pairs of characters)
    if len(text) < 2:
        entropy_2gram = 0.0
    else:
        pairs = [text[i : i + 2] for i in range(len(text) - 1)]
        counts_2gram = Counter(pairs)
        total_pairs = len(pairs)
        probs_2gram = [count / total_pairs for count in counts_2gram.values()]
        entropy_2gram = shannon_entropy(probs_2gram)

    return {
        "1-gram": entropy_1gram,
        "2-gram": entropy_2gram,
        # By the chain rule, H(pair) - H(single) estimates H(next char | previous
        # char); it is clamped at zero to avoid negative sampling artifacts.
        "conditional_entropy": max(0.0, entropy_2gram - entropy_1gram),
    }
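
# Illustrative note (an addition to the original file): in "abababab" the previous
# character almost determines the next one, so the 2-gram entropy (~0.99 bits) does
# not exceed the 1-gram entropy (1.0 bit) and the reported conditional entropy
# clamps to 0.0, whereas text with independent characters keeps it near the
# 1-gram value.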
if __name__ == "__main__":
import doctest
doctest.testmod()
# Manual demonstration
sample_text = "Behind Winston's back the voice from the telescreen was still"
entropy_stats = analyze_text_entropy(sample_text)
print(f"Text: '{sample_text[:30]}...'")
for level, value in entropy_stats.items():
print(f"{level:>20}: {value:.4f} bits")