Python/machine_learning/apriori_algorithm.py at b397f526632f125f5c7f6bff9e857cb65954ec57 · TheAlgorithms/Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
Apriori Algorithm with Association Rules (support, confidence, lift).

This implementation finds:
- Frequent itemsets
- Association rules with minimum confidence and lift

WIKI: https://en.wikipedia.org/wiki/Apriori_algorithm
"""

from collections import defaultdict
from collections import Counter
from itertools import combinations


def load_data() -> list[list[str]]:
    """
    Returns a sample transaction dataset.

    >>> load_data()
    [['milk'], ['milk', 'butter'], ['milk', 'bread'], ['milk', 'bread', 'chips']]
    """
    return [["milk"], ["milk", "butter"], ["milk", "bread"], ["milk", "bread", "chips"]]


class Apriori:
    """Apriori algorithm class with support, confidence, and lift filtering."""

    def __init__(
        self,
        transactions: list[list[str]],
        min_support: float = 0.25,
        min_confidence: float = 0.5,
        min_lift: float = 1.0,
    ) -> None:
        self.transactions: list[set[str]] = [set(t) for t in transactions]
        self.min_support: float = min_support
        self.min_confidence: float = min_confidence
        self.min_lift: float = min_lift
        self.itemsets: list[dict[frozenset, float]] = []
        self.rules: list[tuple[frozenset, frozenset, float, float]] = []

        self.find_frequent_itemsets()
        self.generate_association_rules()

    def _get_support(self, itemset: frozenset) -> float:
        """Return support of an itemset."""
        return sum(1 for t in self.transactions if itemset.issubset(t)) / len(
            self.transactions
        )

    def confidence(self, antecedent: frozenset, consequent: frozenset) -> float:
        """Calculate confidence of a rule A -> B."""
        support_antecedent = self._get_support(antecedent)
        support_both = self._get_support(antecedent | consequent)
        return support_both / support_antecedent if support_antecedent > 0 else 0.0

    def lift(self, antecedent: frozenset, consequent: frozenset) -> float:
        """Calculate lift of a rule A -> B."""
        support_consequent = self._get_support(consequent)
        conf = self.confidence(antecedent, consequent)
        return conf / support_consequent if support_consequent > 0 else 0.0

    def find_frequent_itemsets(self) -> list[dict[frozenset, float]]:
        """Generate all frequent itemsets."""
        item_counts: dict[frozenset, int] = defaultdict(int)
        for t in self.transactions:
            for item in t:
                item_counts[frozenset([item])] += 1

        total: int = len(self.transactions)
        current_itemsets: dict[frozenset, float] = {
            k: v / total
            for k, v in item_counts.items()
            if v / total >= self.min_support
        }
        if current_itemsets:
            self.itemsets.append(current_itemsets)

        k: int = 2
        while current_itemsets:
            candidates: set[frozenset] = set()
            keys: list[frozenset] = list(current_itemsets.keys())
            for i in range(len(keys)):
                for j in range(i + 1, len(keys)):
                    union = keys[i] | keys[j]
                    if len(union) == k and all(
                        frozenset(sub) in current_itemsets
                        for sub in combinations(union, k - 1)
                    ):
                        candidates.add(union)

            freq_candidates: dict[frozenset, float] = {
                c: self._get_support(c)
                for c in candidates
                if self._get_support(c) >= self.min_support
            }
            if not freq_candidates:
def prune(itemset: list, candidates: list, length: int) -> list:
    """
    Prune candidate itemsets that are not frequent.
    The goal of pruning is to filter out candidate itemsets that are not frequent.  This
    is done by checking if all the (k-1) subsets of a candidate itemset are present in
    the frequent itemsets of the previous iteration (valid subsequences of the frequent
    itemsets from the previous iteration).

    Prunes candidate itemsets that are not frequent.

    >>> itemset = ['X', 'Y', 'Z']
    >>> candidates = [['X', 'Y'], ['X', 'Z'], ['Y', 'Z']]
    >>> prune(itemset, candidates, 2)
    [['X', 'Y'], ['X', 'Z'], ['Y', 'Z']]

    >>> itemset = ['1', '2', '3', '4']
    >>> candidates = ['1', '2', '4']
    >>> prune(itemset, candidates, 3)
    []
    """
    itemset_counter = Counter(tuple(item) for item in itemset)
    pruned = []
    for candidate in candidates:
        is_subsequence = True
        for item in candidate:
            item_tuple = tuple(item)
            if (
                item_tuple not in itemset_counter
                or itemset_counter[item_tuple] < length - 1
            ):
                is_subsequence = False
                break

            self.itemsets.append(freq_candidates)
            current_itemsets = freq_candidates
            k += 1

        return self.itemsets

    def generate_association_rules(
        self,
    ) -> list[tuple[frozenset, frozenset, float, float]]:
        """Generate association rules with min confidence and lift."""
        for level in self.itemsets:
            for itemset in level:
                if len(itemset) < 2:
                    continue
                for i in range(1, len(itemset)):
                    for antecedent in combinations(itemset, i):
                        antecedent_set = frozenset(antecedent)
                        consequent_set = itemset - antecedent_set
                        conf = self.confidence(antecedent_set, consequent_set)
                        lft = self.lift(antecedent_set, consequent_set)
                        if conf >= self.min_confidence and lft >= self.min_lift:
                            self.rules.append(
                                (antecedent_set, consequent_set, conf, lft)
                            )
        return self.rules


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    transactions = load_data()
    model = Apriori(transactions, min_support=0.25, min_confidence=0.1, min_lift=0.0)

    print("Frequent itemsets:")
    for level in model.itemsets:
        for items, sup in level.items():
            print(f"{set(items)}: {sup:.2f}")

    print("\nAssociation Rules:")
    for rule in model.rules:
        antecedent, consequent, conf, lift = rule
        print(
            f"{set(antecedent)} -> {set(consequent)}, conf={conf:.2f}, lift={lift:.2f}"
        )