Skip to content

Commit edbaed1

Browse files
Merge pull request #88 from tombarys/apriori
Adding an article about apriori algorithm before Macroexpand conference
2 parents ab39960 + e467a6c commit edbaed1

12 files changed

+3222
-3
lines changed

deps.edn

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
clj-thamil/clj-thamil {:mvn/version "0.2.0"}
1414
org.scicloj/clay {#_#_:mvn/version "2-beta54"
1515
:git/url "https://github.com/scicloj/clay.git"
16-
:git/sha "26fb97b5ea2f72a98f8a64d5b8b8abf705767a39"}
16+
:git/sha "c4fa35c11034614d42c48c1223d8767267cad385"}
17+
org.scicloj/kindly {:mvn/version "4-beta20"}
1718
thi.ng/geom {:mvn/version "1.0.1"}
1819
org.eclipse.elk/org.eclipse.elk.core {:mvn/version "0.10.0"}
1920
org.eclipse.elk/org.eclipse.elk.graph {:mvn/version "0.10.0"}

site/db.edn

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@
116116
:email "edwardaw@connect.hku.hk"
117117
:affiliation [:cim]
118118
:links [{:icon "github" :href "https://github.com/W-Edward"}]}
119+
{:id :tombarys
120+
:name "Tomáš Baránek"
121+
:image "https://avatars.githubusercontent.com/u/5729352?v=4"
122+
:url "https://barys.me/#english-section"
123+
:email "tom@barys.me"
124+
:links [{:icon "github" :href "https://github.com/tombarys"}]}
119125
{:id :mattb
120126
:name "Matthias Buehlmaier"
121127
:affiliation [:cim]}
@@ -145,8 +151,7 @@
145151
:url "https://techascent.com"}
146152
{:id :cim
147153
:name "cim"
148-
:url "https://cim.hkubs.hku.hk/"
149-
}]
154+
:url "https://cim.hkubs.hku.hk/"}]
150155

151156
:topic
152157
[{:id :core

src/data_analysis/book_sales_analysis/about_apriori.clj

Lines changed: 349 additions & 0 deletions
Large diffs are not rendered by default.

src/data_analysis/book_sales_analysis/anonymized-all-sliced-sharing-V2.csv

Lines changed: 2001 additions & 0 deletions
Large diffs are not rendered by default.
442 KB
Loading
Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
(ns data-analysis.book-sales-analysis.core-helpers-v2
2+
(:import [java.text Normalizer Normalizer$Form]
3+
[java.io ByteArrayInputStream ObjectInputStream])
4+
(:require [tablecloth.api :as tc]
5+
[tech.v3.dataset :as ds]
6+
[tablecloth.column.api :as tcc]
7+
[clojure.string :as str]
8+
[java-time.api :as jt]
9+
[fastmath.stats :as stats]
10+
[scicloj.kindly.v4.kind :as kind]
11+
[data-analysis.book-sales-analysis.data-sources-v2 :as data]))
12+
13+
;; ## Data Transformation Functions
14+
;; Common data processing functions used across multiple analysis files
15+
16+
;; ### Scicloj Helpers
17+
18+
(defn merge-csvs
  "Loads every file in `file-list` as a dataset (each with the same `options`
  map passed to `tc/dataset`) and concatenates them into a single dataset."
  [file-list options]
  (apply tc/concat
         (map (fn [file] (tc/dataset file options)) file-list)))
21+
22+
;; ### Column and Content Sanitizers
23+
24+
(defn sanitize-str
  "Sanitizes a string for use as a slug or identifier.
  Replaces underscores and spaces with hyphens, removes diacritics and
  parentheses, and converts to lower-case. Intended for general-purpose
  text like book titles. Returns nil/empty input unchanged."
  [s]
  (if (empty? s)
    s
    (-> s
        (str/replace #"_" "-")
        str/trim
        ;; NFD decomposition splits accented characters into base + combining mark
        (#(Normalizer/normalize % Normalizer$Form/NFD))
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")
        (str/replace #" " "-")
        (str/replace #"\(|\)" "")
        str/lower-case)))
39+
40+
41+
(defn sanitize-column-name-str
  "Sanitizes a string for use as a dataset column name.
  More aggressive than `sanitize-str`: also converts slashes to hyphens,
  collapses hyphen runs, and removes the special '(YYYY-MM)' substring.
  Returns nil/empty input unchanged."
  [s]
  (if (seq s)
    (let [no-marker  (str/replace s #"\(YYYY-MM\)" "") ; drop the special date marker
          lowered    (-> no-marker str/trim str/lower-case)
          hyphenated (-> lowered
                         (str/replace #"_" "-")
                         (str/replace #" " "-")
                         (str/replace #"\/" "-"))
          collapsed  (str/replace hyphenated #"-{2,}" "-")
          ;; NFD decomposition, then strip the combining marks = remove diacritics
          ascii      (-> (Normalizer/normalize collapsed Normalizer$Form/NFD)
                         (str/replace #"\p{InCombiningDiacriticalMarks}+" ""))]
      (str/replace ascii #"\(|\)" ""))
    s))
59+
60+
(defn sanitize-category-str
  "Sanitizes a string representing categories.
  Similar to the other sanitizers, but specifically handles comma-separated
  lists by removing the space after a comma (e.g. 'a, b' -> 'a,b').
  Returns nil/empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (let [lowered    (-> s str/trim str/lower-case)
          compacted  (str/replace lowered #"\,\s" ",")    ; drop the space after a comma
          hyphenated (-> compacted
                         (str/replace #"\s" "-")          ; whitespace to hyphens
                         (str/replace #"\/" "-"))         ; slash to hyphens
          collapsed  (str/replace hyphenated #"-{2,}" "-") ; hyphen runs to one
          ;; NFD decomposition + strip combining marks = remove diacritics
          ascii      (-> (Normalizer/normalize collapsed Normalizer$Form/NFD)
                         (str/replace #"\p{InCombiningDiacriticalMarks}+" ""))]
      (str/replace ascii #"\(|\)" ""))))
77+
78+
(defn parse-book-name
  "Cleans a single book-name string (unlike `parse-books-from-list`, this
  takes just one string): drops commas and plus signs, trims, slugifies via
  `sanitize-category-str`, and prefixes a leading 3 or 5 with 'k'."
  [s]
  (let [stripped (-> s
                     (str/replace #"," "")
                     (str/replace #"\+" "")
                     str/trim)
        slug     (sanitize-category-str stripped)]
    ;; prefix leading digits so slugs like "365-..." become "k365-..."
    (-> slug
        (str/replace #"^3" "k3")
        (str/replace #"^5" "k5"))))
86+
87+
88+
(defn parse-csv-date
  "Parses a Czech-locale date string from the CSV export into a java-time local date.
  Two input shapes are handled, chosen by string length (> 6 chars = full form):
  - full form, positionally sliced as 'dd.MM.yyyy' (month at chars 3-4, year at 6-9);
  - short form '<czech-month-abbrev>.<yy>', e.g. 'led.23' (led = January); year assumed 20yy.
  NOTE(review): the intermediate string is built as 'yyyy-01-MM' and parsed with the
  pattern \"yyyy-dd-MM\", so the day is always pinned to 01 — the result has
  month-level resolution only; confirm that is intended.
  NOTE(review): on any parse failure a Czech error STRING (\"Chyba: ...\") is returned
  instead of a date, giving callers a mixed return type — verify downstream handles it."
  [date-str]
  (let [;; Czech three-letter month abbreviations; index 0 = January
        month-names ["led" "úno" "bře" "dub" "kvě" "čvn" "čvc" "srp" "zář" "říj" "lis" "pro"]
        pad-month #(format "%02d" %)
        ;; full form -> 'yyyy-01-MM'
        parse-full-date (fn [s]
                          (let [month (Integer/parseInt (subs s 3 5))]
                            (str (subs s 6 10) "-01-" (pad-month month))))
        ;; short form -> 'yyyy-01-MM'; month index from the abbreviation table
        parse-short-date (fn [s]
                           (let [[month-str year-str] (str/split s #"\.")
                                 month (inc (.indexOf month-names month-str))
                                 year (+ 2000 (Integer/parseInt year-str))]
                             (str year "-01-" (pad-month month))))]
    (try
      (jt/local-date "yyyy-dd-MM"
                     (if (> (count date-str) 6)
                       (parse-full-date date-str)
                       (parse-short-date date-str)))
      (catch Exception _
        (str "Chyba: " date-str)))))
106+
107+
(defn parse-books-from-list
  "Parses comma-separated book names from string `s` into a vector of cleaned,
  distinct keywords; returns nil when `s` is blank.
  Pipeline: split on ', <digits>' (quantity markers — assumes quantities follow a
  comma, TODO confirm against the CSV), strip remaining digits/commas, drop format
  and bundle markers ('(P+E+A)', 'papír', 'audio', 'e-kniha', 'komplet', ...),
  slugify via `sanitize-str`, trim '--…' remainders and trailing hyphen runs,
  prefix a leading 3/5 with 'k', drop non-book merchandise, keywordize."
  [s]
  (if (seq s) (->> (str/split s #",\s\d+")
                   (map #(str/replace % #"\d\s" ""))
                   (map #(str/replace % #"," ""))
                   ;; remove edition/format/bundle markers embedded in the title
                   (map #(str/replace % #"\(A\+E\)|\[|\]|komplet|a\+e|\s\(P\+E\+A\)|\s\(e\-kniha\)|\s\(P\+E\)|\s\(P\+A\)|\s\(E\+A\)|papír|papir|audio|e\-kniha|taška" ""))
                   (map #(str/replace % #"\+" ""))
                   (map #(str/trim %))
                   (map sanitize-str)
                   ;; drop '--suffix' remainders and trailing hyphen runs left by removals
                   (map #(str/replace % #"\-\-.+$" ""))
                   (map #(str/replace % #"\-+$" ""))
                   ;; presumably to avoid slugs starting with a digit — TODO confirm
                   (map #(str/replace % #"^3" "k3"))
                   (map #(str/replace % #"^5" "k5"))
                   ;; exclude vouchers, notebooks, merch, etc. — not books
                   (remove (fn [item] (some (fn [substr] (str/includes? (name item) substr))
                                            ["balicek" "poukaz" "zapisnik" "limitovana-edice" "taska" "aktualizovane-vydani" "cd" "puvodni-vydani/neni-skladem"
                                             "merch"])))
                   distinct
                   (mapv keyword))
      nil))
127+
128+
;; ### Metadata Enriching and Convenience Functions
129+
130+
(defn czech-author?
  "Returns 1 when `book-title` (string or keyword) is a known Czech-authored
  title, otherwise 0. For anonymized titles starting with \"book\" a random
  0/1 flag is returned, so Czech-book flags exist in the anonymized dataset too."
  [book-title]
  (let [czech-books #{:k30-hodin
                      :k365-anglickych-cool-fraz-a-vyrazov
                      :k365-anglickych-cool-frazi-a-vyrazu
                      :bulbem-zachranare
                      :hacknuta-cestina
                      :handmade-byznys
                      :hot
                      :hry-site-porno
                      :jak-na-site
                      :jak-sbalit-zenu-2.0
                      :konec-prokrastinace
                      :let-your-english-september
                      :myty-a-nadeje-digitalniho-sveta
                      :na-volne-noze
                      :napoleonuv-vojak
                      :nedelni-party-s-picassem
                      :restart-kreativity
                      :sport-je-bolest
                      :stat-se-investorem
                      :temne-pocatky-ceskych-dejin
                      :uc-jako-umelec
                      :velka-kniha-fuckupu
                      :zamilujte-se-do-anglictiny
                      :pretizeny
                      :od-chaosu-ke-smyslu
                      :very-hard-orechy
                      :heureka!}]
    (cond
      ;; anonymized dataset: titles are renamed to "book…", so flag randomly
      (str/starts-with? (str book-title) "book") (rand-int 2)
      (contains? czech-books (keyword book-title)) 1
      :else 0)))
161+
162+
;; ### One-Hot Encoding Functions
163+
164+
165+
(defn onehot-encode-by-customers ;; FIXME needs refactor and simplification :)
  "One-hot encode dataset aggregated by customer.
  Each customer gets one row with 0/1 values for each book they bought.
  Used for market basket analysis, customer segmentation, etc.
  Column names are Czech: :zakaznik = customer, :produkt-produkty = product list."
  [raw-ds]
  (let [;; First, aggregate all purchases by customer: drop rows with missing or
        ;; blank customer / missing products, then join all products per customer
        ;; into one comma-separated string
        customer+orders (-> raw-ds
                            (ds/drop-missing :zakaznik)
                            (tc/drop-rows #(= "" (str/trim (:zakaznik %))))
                            (ds/drop-missing :produkt-produkty)
                            (tc/group-by [:zakaznik])
                            (tc/aggregate {:all-products #(str/join ", " (tc/column % :produkt-produkty))})
                            ;; NOTE(review): the aggregate above already names the column
                            ;; :all-products, so this rename looks like a no-op — confirm
                            (tc/rename-columns {:summary :all-products}))
        ;; Get all unique books from all the lines (sorted for stable column order)
        all-titles (->> (tc/column customer+orders :all-products)
                        (mapcat parse-books-from-list)
                        distinct
                        sort)
        ;; For each customer create one aggregated row with all purchases in 0/1 format
        customers->rows (map
                         (fn [customer-row]
                           (let [customer-name (:zakaznik customer-row)
                                 books-bought-set (set (parse-books-from-list (:all-products customer-row)))
                                 ;; 1 for every title the customer bought, 0 otherwise
                                 one-hot-map (reduce (fn [acc book]
                                                       (assoc acc book (if (contains? books-bought-set book) 1 0)))
                                                     {}
                                                     all-titles)]
                             (merge {:zakaznik customer-name}
                                    one-hot-map)))
                         (tc/rows customer+orders :as-maps))
        ;; Create new dataset from one-hot data
        one-hot-ds (tc/dataset customers->rows)]
    ;; Return dataset with one-hot encoding
    one-hot-ds))
199+
200+
201+
;; ### Statistical Functions for Apriori Analysis
202+
203+
(defn calculate-support
  "Calculate support for a given itemset in a one-hot-encoded dataset.
  Support = (rows containing itemset) / (total rows); 0.0 for an empty dataset."
  [dataset itemset]
  (let [total (tc/row-count dataset)]
    (if (zero? total)
      0.0
      (let [;; a row contains the itemset when every item's 0/1 cell is non-zero
            contains-itemset? (fn [row] (every? #(not (zero? (get row %))) itemset))
            hits (-> dataset
                     (tc/select-rows contains-itemset?)
                     tc/row-count)]
        (double (/ hits total))))))
214+
215+
216+
^:kindly/hide-code
(defn calculate-adaptive-coefficient
  "Calculates the adaptive coefficient for popularity bias correction.
  Sized so that a rule with maximally popular items on both sides is reduced
  by roughly 50% of the median lift."
  [rules popularity-index]
  (let [median-lift     (tcc/median (map :lift rules))
        peak-popularity (reduce max (vals popularity-index))]
    (/ (* 0.5 median-lift)
       (* 2 peak-popularity))))
226+
227+
^:kindly/hide-code
(defn improved-adjusted-lift
  "Lift adjustment using adaptive popularity bias correction.
  Dampens the rule's :lift in proportion to the average popularity of the
  items on both sides, so popular combinations get a reduced lift."
  [rule popularity-index adaptive-coefficient]
  (let [avg-popularity (fn [items]
                         (tcc/mean (vals (select-keys popularity-index items))))
        antecedent-mean (avg-popularity (:antecedent rule))
        consequent-mean (avg-popularity (seq (:consequent rule)))
        ;; penalty >= 1; grows with combined popularity of both sides
        penalty (+ 1 (* adaptive-coefficient
                        (+ antecedent-mean consequent-mean)))]
    (assoc rule :lift (double (/ (:lift rule) penalty)))))
243+
244+
245+
;; ### Visuals
246+
247+
(defn color-hex
  "Returns the base gold color #c1ab55 with an alpha channel scaled by where
  `support` falls between `min-support` and `max-support` (opacity 20..255).
  When min equals max, the midpoint opacity is used."
  [support min-support max-support]
  (let [lo 20
        hi 255
        raw (if (= min-support max-support)
              ;; degenerate range: fall back to the midpoint
              (int (/ (+ lo hi) 2))
              ;; linear map of support from [min,max] onto [lo,hi]
              (int (+ lo (* (- hi lo)
                            (/ (- support min-support)
                               (- max-support min-support))))))
        bounded (-> raw (max lo) (min hi))]
    (format "#c1ab55%02x" bounded)))
262+
263+
264+
;; ### Correlation functions
265+
266+
(defn corr-a-x-b
  "Creates a correlation matrix with book columns and the added :book column \n
  - `ds` is dataset \n
  Example: \n
  => _unnamed [2 3]: \n
  | :a | :b | :book |
  |------------:|------------:|-------|
  | 1.00000000 | -0.12121831 | :a |
  | -0.12121831 | 1.00000000 | :b |"
  [ds]
  (let [clean-ds (-> ds
                     (tc/drop-columns [:zakaznik]))
        ;; FIX: column labels must come from clean-ds, not ds — the correlation
        ;; matrix is computed after dropping :zakaznik, so using (tc/column-names ds)
        ;; misaligned labels with matrix rows (and the :book column length).
        columns  (tc/column-names clean-ds)]
    (-> (zipmap columns (stats/correlation-matrix (tc/columns clean-ds)))
        tc/dataset
        (tc/add-column :book columns))))
283+
284+
(defn corr-3-col
  "Creates a correlation matrix with two columns of books \n
  => _unnamed [4 3]: \n
  | :book-0 | :book-1 | :correlation |
  |---------|---------|-------------:|
  | :a | :a | 1.00000000 |
  | :a | :b | -0.12121831 |
  | :b | :a | -0.12121831 |
  | :b | :b | 1.00000000 | \n
  - `flatten` is used here to make a linear sequence of numbers which should match corresponding variable names. \n
  - Since we make pairs of names (`(for ... [a b])` creates a cartesian product) we need to separate these to individual columns; tc/separate-column does the trick, refer: https://scicloj.github.io/tablecloth/#separate"
  [ds]
  (let [names (tc/column-names ds)
        ;; row-major flattening pairs up with the cartesian product (for [a names b names])
        mat (flatten (stats/correlation-matrix (tc/columns ds)))]
    (-> (tc/dataset {:book (for [a names b names] [a b])
                     :correlation mat})
        (tc/separate-column :book)
        ;; NOTE(review): these rename keys are STRINGS ":book-0"/":book-1"; if
        ;; separate-column produces keyword column names, this rename is a no-op
        ;; and :titul-knihy never appears — confirm against tablecloth's output
        (tc/rename-columns {":book-0" :titul-knihy
                            ":book-1" :book-1}))))
303+
304+
305+
;; ### Export helper functions from other namespaces for convenience
;; NOTE(review): the defs below re-bind vars that are already defined in THIS
;; namespace to themselves, so they are effectively no-ops — the "from other
;; namespaces" intent does not match the code; confirm whether these should
;; alias vars from another namespace or be removed.

(def sanitize-str sanitize-str)
(def merge-csvs merge-csvs)
(def parse-books-from-list parse-books-from-list)
(def sanitize-column-name-str sanitize-column-name-str)
(def parse-csv-date parse-csv-date)

;; load-time side effect: confirms the namespace was evaluated
(println "Core helpers loaded.")
203 KB
Binary file not shown.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
(ns data-analysis.book-sales-analysis.data-sources-v2
2+
(:require [tablecloth.api :as tc]
3+
[data-analysis.book-sales-analysis.core-helpers-v2 :as helpers]))
4+
5+
;; ## Central Data Loading
6+
;; All CSV datasets are loaded here with consistent names and simple tc/dataset calls
7+
8+
;; ### Main Orders Data (WooCommerce exports)
9+
10+
;; Disabled via the #_ reader macro: presentation-only dataset whose CSV is not
;; part of the shared repository; kept for local use.
#_(def anonymized-presentation-ds
    "Anonymized customers full dataset of all orders - for presentation purposes
  ❗ NOT FOR SHARING"
    (tc/dataset
     (helpers/merge-csvs
      ["data/anonymized-customers-only-presentation-v2.csv"]
      {:header? true
       :separator ","
       :key-fn #(keyword (helpers/sanitize-str %))})))
19+
20+
;; Eagerly reads the shareable CSV at namespace load time; column names are
;; keywordized through helpers/sanitize-str.
(def anonymized-shareable-ds
  "Fully anonymized slice of db - for sharing purposes
  ✅ SAFE TO SHARE"
  (tc/dataset
   (helpers/merge-csvs
    ["src/data_analysis/book_sales_analysis/anonymized-all-sliced-sharing-V2.csv"]
    {:header? true
     :separator ","
     :key-fn #(keyword (helpers/sanitize-str %))})))
29+
30+
;; ### Book Metadata
31+
32+
;; ## Quick Access
33+
;; Most commonly used datasets with short aliases
34+
35+
;; Short alias for the shareable orders dataset
(def orders-share anonymized-shareable-ds)
;; Disabled alias for the (also disabled) presentation dataset
#_(def orders-slides anonymized-presentation-ds)

;; Precomputed correlation matrix, deserialized from nippy at load time
(def corr-matrix-precalculated
  (tc/dataset "src/data_analysis/book_sales_analysis/corr-matrix.nippy"))

;; bare top-level expression — presumably here so Clay renders the dataset in
;; the generated notebook; confirm it is intentional outside a notebook context
corr-matrix-precalculated

;; load-time side effect: confirms the namespace was evaluated
(prn "Data sources loaded.")
Binary file not shown.
131 KB
Loading

0 commit comments

Comments
 (0)