1+ (ns data-analysis.book-sales-analysis.core-helpers-v2
2+ (:import [java.text Normalizer Normalizer$Form]
3+ [java.io ByteArrayInputStream ObjectInputStream])
4+ (:require [tablecloth.api :as tc]
5+ [tech.v3.dataset :as ds]
6+ [tablecloth.column.api :as tcc]
7+ [clojure.string :as str]
8+ [java-time.api :as jt]
9+ [fastmath.stats :as stats]
10+ [scicloj.kindly.v4.kind :as kind]
11+ [data-analysis.book-sales-analysis.data-sources-v2 :as data]))
12+
;; ## Data Transformation Functions
;; Common data processing functions used across multiple analysis files

;; ### Scicloj Helpers
17+
(defn merge-csvs
  "Reads every file in `file-list` as a dataset using `options`
  and concatenates them into one dataset."
  [file-list options]
  (apply tc/concat (mapv (fn [file] (tc/dataset file options)) file-list)))
21+
;; ### Column and Content Sanitizers
23+
(defn sanitize-str
  "Sanitizes a string for use as a slug or identifier.
  Replaces underscores and spaces with hyphens, removes diacritics and
  parentheses, and converts to lower-case.
  Intended for general-purpose text like book titles.
  Returns nil or empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (let [hyphens        (str/replace s #"_" "-")
          trimmed        (str/trim hyphens)
          ;; NFD decomposition splits accented chars into base char + combining mark
          nfd-normalized (Normalizer/normalize trimmed Normalizer$Form/NFD)
          no-diacritics  (str/replace nfd-normalized #"\p{InCombiningDiacriticalMarks}+" "")
          no-spaces      (str/replace no-diacritics #" " "-")
          no-brackets    (str/replace no-spaces #"\(|\)" "")
          lower-cased    (str/lower-case no-brackets)]
      lower-cased)))
39+
40+
(defn sanitize-column-name-str
  "Sanitizes a string for use as a dataset column name.
  More aggressive than `sanitize-str`: also converts slashes to hyphens,
  collapses multiple hyphens, and removes the special substring '(YYYY-MM)'.
  Returns nil or empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        (str/replace #"\(YYYY-MM\)" "")  ;; special-case removal
        str/trim
        (str/lower-case)
        (str/replace #"_" "-")           ;; underscores to hyphens
        (str/replace #" " "-")           ;; spaces to hyphens
        (str/replace #"\/" "-")          ;; slashes to hyphens
        (str/replace #"-{2,}" "-")       ;; collapse runs of hyphens
        (#(Normalizer/normalize % Normalizer$Form/NFD))       ;; NFD decomposition
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")  ;; strip diacritics
        (str/replace #"\(|\)" ""))))
59+
(defn sanitize-category-str
  "Sanitizes a string representing categories.
  Similar to the other sanitizers, but specifically handles comma-separated
  lists by removing the space after a comma (e.g., 'a, b' -> 'a,b').
  Returns nil or empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        str/trim
        (str/lower-case)
        (str/replace #"\,\s" ",")        ;; drop the space after a comma
        (str/replace #"\s" "-")          ;; whitespace to hyphens
        (str/replace #"\/" "-")          ;; slashes to hyphens
        (str/replace #"-{2,}" "-")       ;; collapse runs of hyphens
        (#(Normalizer/normalize % Normalizer$Form/NFD))       ;; NFD decomposition
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")  ;; strip diacritics
        (str/replace #"\(|\)" ""))))
77+
(defn parse-book-name
  "Cleans a single book-name string (unlike `parse-books-from-list`, this
  takes just one string, not a comma-separated list).
  Strips commas and plus signs, sanitizes the result, and prefixes a leading
  '3' or '5' with 'k' — presumably to avoid identifiers starting with a digit;
  TODO confirm against the book catalogue."
  [s]
  (-> s
      (str/replace #"," "")
      (str/replace #"\+" "")
      (str/trim)
      sanitize-category-str
      (str/replace #"^3" "k3")
      (str/replace #"^5" "k5")))
86+
87+
(defn parse-csv-date
  "Parses a Czech CSV date string into a java-time LocalDate pinned to the
  first day of the month.
  Two input shapes are handled, distinguished by length:
  - longer than 6 chars: full date 'dd.MM.yyyy' -> month and year extracted
  - otherwise: short form with a Czech month abbreviation and 2-digit year
    (e.g. 'čvn.23')
  On any parse failure returns the string \"Chyba: <input>\" instead of
  throwing."
  [date-str]
  (let [month-names ["led" "úno" "bře" "dub" "kvě" "čvn" "čvc" "srp" "zář" "říj" "lis" "pro"]
        pad-month #(format "%02d" %)
        parse-full-date (fn [s]
                          (let [month (Integer/parseInt (subs s 3 5))]
                            (str (subs s 6 10) "-01-" (pad-month month))))
        parse-short-date (fn [s]
                           (let [[month-str year-str] (str/split s #"\.")
                                 month (inc (.indexOf month-names month-str))
                                 ;; trim guards against a space after the dot ("čvn. 23")
                                 year (+ 2000 (Integer/parseInt (str/trim year-str)))]
                             (str year "-01-" (pad-month month))))]
    (try
      ;; The intermediate string is "yyyy-01-MM", hence the swapped
      ;; day/month positions in the parse pattern.
      (jt/local-date "yyyy-dd-MM"
                     (if (> (count date-str) 6)
                       (parse-full-date date-str)
                       (parse-short-date date-str)))
      (catch Exception _
        (str "Chyba: " date-str)))))
106+
(defn parse-books-from-list
  "Parses book names from string `s` (comma-separated) into a vector of
  cleaned, distinct keywords.
  Strips quantity prefixes (\"2× \"), format/bundle markers (\"(P+E)\",
  \"e-kniha\", \"komplet\", ...), then sanitizes each name and filters out
  non-book items (vouchers, notebooks, merch, ...).
  Returns nil for blank input."
  [s]
  (if (seq s)
    (->> (str/split s #",\s\d+")
         (map #(str/replace % #"\d*×\s" ""))      ;; drop leftover quantity marker
         (map #(str/replace % #"," ""))
         ;; remove format/bundle markers and media-type suffixes
         (map #(str/replace % #"\(A\+E\)|\[|\]|komplet|a\+e|\s\(P\+E\+A\)|\s\(e\-kniha\)|\s\(P\+E\)|\s\(P\+A\)|\s\(E\+A\)|papír|papir|audio|e\-kniha|taška" ""))
         (map #(str/replace % #"\+" ""))
         (map #(str/trim %))
         (map sanitize-str)
         (map #(str/replace % #"\-\-.+$" ""))     ;; drop trailing "--..." remnants
         (map #(str/replace % #"\-+$" ""))        ;; drop trailing hyphens
         (map #(str/replace % #"^3" "k3"))        ;; keywords must not start with a digit
         (map #(str/replace % #"^5" "k5"))
         (remove (fn [item] (some (fn [substr] (str/includes? (name item) substr))
                                  ["balicek" "poukaz" "zapisnik" "limitovana-edice" "taska"
                                   "aktualizovane-vydani" "cd" "puvodni-vydani/neni-skladem"
                                   "merch"])))
         distinct
         (mapv keyword))
    nil))
127+
;; ### Metadata Enriching and Convenience Functions
129+
(defn czech-author?
  "Returns 1 when `book-title` (keyword or string) is in the hard-coded set of
  books by Czech authors, otherwise 0.
  Special case: when the title's string form starts with \"book\" (the fully
  anonymized dataset), the Czech flag is assigned at random via (rand-int 2).
  NOTE(review): for a keyword like :book-1, (str book-title) yields \":book-1\",
  which does NOT start with \"book\" — the anonymized branch only fires for
  plain strings; confirm that is intended."
  [book-title]
  (let [czech-books #{:k30-hodin
                      :k365-anglickych-cool-fraz-a-vyrazov
                      :k365-anglickych-cool-frazi-a-vyrazu
                      :bulbem-zachranare
                      :hacknuta-cestina
                      :handmade-byznys
                      :hot
                      :hry-site-porno
                      :jak-na-site
                      :jak-sbalit-zenu-2.0
                      :konec-prokrastinace
                      :let-your-english-september
                      :myty-a-nadeje-digitalniho-sveta
                      :na-volne-noze
                      :napoleonuv-vojak
                      :nedelni-party-s-picassem
                      :restart-kreativity
                      :sport-je-bolest
                      :stat-se-investorem
                      :temne-pocatky-ceskych-dejin
                      :uc-jako-umelec
                      :velka-kniha-fuckupu
                      :zamilujte-se-do-anglictiny
                      :pretizeny
                      :od-chaosu-ke-smyslu
                      :very-hard-orechy
                      :heureka!}]
    (if (str/starts-with? (str book-title) "book") ;; anonymized dataset: random flag
      (rand-int 2)
      (if (contains? czech-books (keyword book-title)) 1 0))))
161+
;; ### One-Hot Encoding Functions

164+
(defn onehot-encode-by-customers ;; FIXME needs refactor and simplification :)
  "One-hot encodes a dataset aggregated by customer.
  Each customer gets one row with 0/1 values for each book they bought.
  Used for market basket analysis, customer segmentation, etc.
  Expects `raw-ds` to have :zakaznik (customer) and :produkt-produkty
  (product list) columns."
  [raw-ds]
  (let [;; First, aggregate all purchases by customer
        customer+orders (-> raw-ds
                            (ds/drop-missing :zakaznik)
                            ;; drop customers whose name is blank after trimming
                            (tc/drop-rows #(= "" (str/trim (:zakaznik %))))
                            (ds/drop-missing :produkt-produkty)
                            (tc/group-by [:zakaznik])
                            (tc/aggregate {:all-products #(str/join ", " (tc/column % :produkt-produkty))})
                            (tc/rename-columns {:summary :all-products}))
        ;; All unique book titles across every customer's purchase list
        all-titles (->> (tc/column customer+orders :all-products)
                        (mapcat parse-books-from-list)
                        distinct
                        sort)
        ;; One aggregated row per customer, purchases encoded as 0/1
        customers->rows (map
                         (fn [customer-row]
                           (let [customer-name (:zakaznik customer-row)
                                 books-bought-set (set (parse-books-from-list (:all-products customer-row)))
                                 one-hot-map (reduce (fn [acc book]
                                                       (assoc acc book (if (contains? books-bought-set book) 1 0)))
                                                     {}
                                                     all-titles)]
                             (merge {:zakaznik customer-name}
                                    one-hot-map)))
                         (tc/rows customer+orders :as-maps))
        ;; Create new dataset from one-hot data
        one-hot-ds (tc/dataset customers->rows)]
    ;; Return dataset with one-hot encoding
    one-hot-ds))
199+
200+
;; ### Statistical Functions for Apriori Analysis
202+
(defn calculate-support
  "Calculates support for `itemset` in a one-hot-encoded dataset.
  Support = (rows where every item column is non-zero) / (total rows).
  Returns 0.0 for an empty dataset."
  [dataset itemset]
  (let [total (tc/row-count dataset)
        row-has-itemset? (fn [row] (not-any? #(zero? (get row %)) itemset))
        matching (tc/row-count (tc/select-rows dataset row-has-itemset?))]
    (if (zero? total)
      0.0
      (double (/ matching total)))))
214+
215+
^:kindly/hide-code
(defn calculate-adaptive-coefficient
  "Calculates the adaptive coefficient for popularity bias correction.
  Sized so that a rule with maximum-popularity items on both sides is reduced
  by roughly 50% of the median lift."
  [rules popularity-index]
  (let [median-lift    (tcc/median (map :lift rules))
        max-popularity (apply max (vals popularity-index))]
    (/ (* 0.5 median-lift)
       (* 2 max-popularity))))
226+
^:kindly/hide-code
(defn improved-adjusted-lift
  "Adjusts a rule's :lift using adaptive popularity bias correction.
  Computes the mean popularity of the antecedent and consequent items and
  divides the lift by (1 + coefficient * (antecedent-pop + consequent-pop)),
  so combinations of popular items get a dampened lift.
  Returns the rule with :lift replaced."
  [rule popularity-index adaptive-coefficient]
  (let [antecedent-pop (tcc/mean (vals (select-keys popularity-index (:antecedent rule))))
        consequent-pop (tcc/mean (vals (select-keys popularity-index (seq (:consequent rule)))))
        ;; Dampening factor for popular items
        penalty        (inc (* adaptive-coefficient (+ antecedent-pop consequent-pop)))
        adjusted       (double (/ (:lift rule) penalty))]
    (assoc rule :lift adjusted)))
243+
244+
;; ### Visuals
246+
(defn color-hex
  "Maps `support` from [min-support, max-support] to an opacity in [20, 255]
  and returns the base color \"#c1ab55\" with that opacity appended as a
  two-digit hex alpha channel.
  When min-support equals max-support, the midpoint opacity is used."
  [support min-support max-support]
  (let [min-opacity 20
        max-opacity 255
        ;; Map support from [min-support, max-support] to [min-opacity, max-opacity]
        opacity (if (= min-support max-support)
                  ;; Degenerate range: fall back to the midpoint
                  (int (/ (+ min-opacity max-opacity) 2))
                  (int (+ min-opacity
                          (* (- max-opacity min-opacity)
                             (/ (- support min-support)
                                (- max-support min-support))))))
        ;; Clamp in case support falls outside [min-support, max-support]
        clamped-opacity (min max-opacity (max min-opacity opacity))
        hex-opacity (format "%02x" clamped-opacity)]
    (str "#c1ab55" hex-opacity)))
262+
263+
;; ### Correlation functions
265+
(defn corr-a-x-b
  "Creates a correlation matrix with book columns and an added :book column. \n
  - `ds` is a dataset (a :zakaznik column, if present, is excluded) \n
  Example: \n
  => _unnamed [2 3]: \n
  |          :a |          :b | :book |
  |------------:|------------:|-------|
  |  1.00000000 | -0.12121831 |    :a |
  | -0.12121831 |  1.00000000 |    :b |"
  [ds]
  (let [clean-ds (-> ds
                     (tc/drop-columns [:zakaznik]))
        ;; BUGFIX: take the names from clean-ds, not ds — the matrix is computed
        ;; without :zakaznik, so including it here misaligned the labels.
        columns  (tc/column-names clean-ds)]
    (-> (zipmap columns (stats/correlation-matrix (tc/columns clean-ds)))
        tc/dataset
        (tc/add-column :book columns))))
283+
(defn corr-3-col
  "Creates a long-format correlation table with two columns of books. \n
  => _unnamed [4 3]: \n
  | :titul-knihy | :book-1 | :correlation |
  |--------------|---------|-------------:|
  |           :a |      :a |   1.00000000 |
  |           :a |      :b |  -0.12121831 |
  |           :b |      :a |  -0.12121831 |
  |           :b |      :b |   1.00000000 | \n
  - `flatten` makes a linear sequence of numbers matching the cartesian product
    of names built by `(for [a names b names] [a b])`. \n
  - tc/separate-column splits the name pairs into individual columns,
    refer: https://scicloj.github.io/tablecloth/#separate"
  [ds]
  (let [names (tc/column-names ds)
        mat (flatten (stats/correlation-matrix (tc/columns ds)))]
    (-> (tc/dataset {:book (for [a names b names] [a b])
                     :correlation mat})
        (tc/separate-column :book)
        (tc/rename-columns {":book-0" :titul-knihy
                            ":book-1" :book-1}))))
303+
304+
;; ### Convenience aliases (note: these vars are defined in this same namespace)
306+
;; NOTE(review): each def below re-binds a var to its own current value within
;; this namespace — a no-op self-alias, presumably left over from when these
;; helpers lived elsewhere. Harmless, but candidates for removal; confirm no
;; tooling depends on the re-definition order.
(def sanitize-str sanitize-str )
(def merge-csvs merge-csvs )
(def parse-books-from-list parse-books-from-list )
(def sanitize-column-name-str sanitize-column-name-str )
(def parse-csv-date parse-csv-date )
312+
;; Loading marker printed when the namespace is evaluated (visible in REPL/render log).
(println "Core helpers loaded.")