/*
Random Forest Model for Loan Action Prediction

This program trains a Random Forest classifier on mortgage/loan data to predict
the loan action taken. It expects TrainingSet.csv, TestSet.csv, and
ValidationSet.csv in the working directory. Converted from a Python
implementation to Go.
*/
package main
import (
"encoding/csv"
"fmt"
"log"
"math"
"math/rand"
"os"
"sort"
"strconv"
"strings"
"time"
)
// Dataset represents a collection of data samples
type Dataset struct {
Features [][]float64
Labels []int
Headers []string
}
// DataSplit contains training, test, and validation datasets
type DataSplit struct {
XTrain, XTest, XVal [][]float64
YTrain, YTest, YVal []int
Headers []string
LabelEncoders map[string]map[string]int
}
// RandomForest represents the Random Forest model
type RandomForest struct {
Trees []*DecisionTree
NTrees int
MaxDepth int
MinSplit int
MinLeaf int
FeatureSubsetSize int
}
// DecisionTree represents a single decision tree
type DecisionTree struct {
Root *TreeNode
}
// TreeNode represents a node in the decision tree
type TreeNode struct {
Feature int
Threshold float64
Left *TreeNode
Right *TreeNode
IsLeaf bool
Class int
Samples int
}
// FeatureImportance represents feature importance score
type FeatureImportance struct {
Feature string
Importance float64
}
// ModelMetrics contains evaluation metrics
type ModelMetrics struct {
Accuracy float64
ConfusionMatrix [][]int
ClassReport map[int]map[string]float64
}
func main() {
fmt.Println(strings.Repeat("=", 60))
fmt.Println("RANDOM FOREST MODEL FOR LOAN ACTION PREDICTION")
fmt.Println(strings.Repeat("=", 60))
// File paths
trainPath := "TrainingSet.csv"
testPath := "TestSet.csv"
validationPath := "ValidationSet.csv"
// Load and explore data
trainData, testData, validationData, err := loadAndExploreData(trainPath, testPath, validationPath)
if err != nil {
log.Fatalf("Error loading data: %v", err)
}
// Preprocess data
dataSplit, err := preprocessData(trainData, testData, validationData)
if err != nil {
log.Fatalf("Error preprocessing data: %v", err)
}
// Train Random Forest model
fmt.Println("\nTraining Random Forest model...")
rf := NewRandomForest(100, 10, 5, 2)
err = rf.Train(dataSplit.XTrain, dataSplit.YTrain)
if err != nil {
log.Fatalf("Error training model: %v", err)
}
fmt.Println("Model training completed!")
// Calculate feature importance
featureImportances := rf.CalculateFeatureImportance(dataSplit.Headers)
fmt.Println("\nTop 10 Most Important Features:")
for i, fi := range featureImportances[:min(10, len(featureImportances))] {
fmt.Printf("%d. %s: %.4f\n", i+1, fi.Feature, fi.Importance)
}
// Evaluate model
fmt.Println("\nEvaluating model performance...")
testMetrics := evaluateModel(rf, dataSplit.XTest, dataSplit.YTest)
fmt.Printf("Test Accuracy: %.4f\n", testMetrics.Accuracy)
// Evaluate on validation set
if len(dataSplit.XVal) > 0 {
valMetrics := evaluateModel(rf, dataSplit.XVal, dataSplit.YVal)
fmt.Printf("Validation Accuracy: %.4f\n", valMetrics.Accuracy)
}
// Cross-validation
fmt.Println("\nPerforming 5-fold cross-validation on training set...")
cvScores := crossValidation(dataSplit.XTrain, dataSplit.YTrain, 5)
fmt.Printf("Cross-validation scores: %v\n", cvScores)
fmt.Printf("Mean CV accuracy: %.4f (+/- %.4f)\n", mean(cvScores), std(cvScores)*2)
fmt.Println("\n" + strings.Repeat("=", 60))
fmt.Println("MODEL TRAINING AND EVALUATION COMPLETED!")
fmt.Println(strings.Repeat("=", 60))
}
// loadAndExploreData loads CSV files and performs basic exploration
func loadAndExploreData(trainPath, testPath, validationPath string) (*Dataset, *Dataset, *Dataset, error) {
fmt.Println("Loading datasets...")
trainData, err := loadCSV(trainPath)
if err != nil {
return nil, nil, nil, fmt.Errorf("loading training data: %v", err)
}
testData, err := loadCSV(testPath)
if err != nil {
return nil, nil, nil, fmt.Errorf("loading test data: %v", err)
}
validationData, err := loadCSV(validationPath)
if err != nil {
return nil, nil, nil, fmt.Errorf("loading validation data: %v", err)
}
fmt.Printf("Training set shape: (%d, %d)\n", len(trainData.Features), len(trainData.Headers))
fmt.Printf("Test set shape: (%d, %d)\n", len(testData.Features), len(testData.Headers))
fmt.Printf("Validation set shape: (%d, %d)\n", len(validationData.Features), len(validationData.Headers))
// Print target variable distribution
fmt.Println("\nTarget variable (action_taken) distribution in training set:")
labelCounts := make(map[int]int)
for _, label := range trainData.Labels {
labelCounts[label]++
}
for label, count := range labelCounts {
fmt.Printf("Class %d: %d samples\n", label, count)
}
return trainData, testData, validationData, nil
}
// loadCSV loads a CSV file and returns a Dataset
func loadCSV(filename string) (*Dataset, error) {
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer file.Close()
reader := csv.NewReader(file)
records, err := reader.ReadAll()
if err != nil {
return nil, err
}
if len(records) < 2 {
return nil, fmt.Errorf("CSV file must have at least header and one data row")
}
headers := records[0]
data := records[1:]
// Find target column (action_taken)
targetCol := -1
for i, header := range headers {
if strings.ToLower(header) == "action_taken" {
targetCol = i
break
}
}
if targetCol == -1 {
return nil, fmt.Errorf("target column 'action_taken' not found")
}
// Prepare feature headers (excluding target)
var featureHeaders []string
for i, header := range headers {
if i != targetCol {
featureHeaders = append(featureHeaders, header)
}
}
var features [][]float64
var labels []int
for _, record := range data {
// Extract label
label, err := strconv.Atoi(record[targetCol])
if err != nil {
return nil, fmt.Errorf("invalid label value: %s", record[targetCol])
}
labels = append(labels, label)
// Extract features
var row []float64
for i, value := range record {
if i != targetCol {
// Try to parse as float, if fails treat as categorical string
if val, err := strconv.ParseFloat(value, 64); err == nil {
row = append(row, val)
} else {
// For now, use a simple hash for categorical values. A real implementation
// would use proper label encoding; a hedged sketch of such an encoder
// appears after this function.
row = append(row, float64(simpleHash(value)))
}
}
}
features = append(features, row)
}
return &Dataset{
Features: features,
Labels: labels,
Headers: featureHeaders,
}, nil
}
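// The hash-based encoding in loadCSV is only a stopgap: distinct category
// strings can collide and the resulting codes carry no meaning. A minimal
// sketch of a consistent label encoder (hypothetical helper, not used by the
// program above) would assign each new category the next integer code and
// reuse codes for categories already seen.
type labelEncoder struct {
	codes map[string]int
}

func newLabelEncoder() *labelEncoder {
	return &labelEncoder{codes: make(map[string]int)}
}

// Encode returns a stable integer code for a categorical value, allocating a
// new code the first time the value is seen.
func (e *labelEncoder) Encode(value string) int {
	if code, ok := e.codes[value]; ok {
		return code
	}
	code := len(e.codes)
	e.codes[value] = code
	return code
}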
// preprocessData preprocesses the datasets for machine learning
func preprocessData(trainData, testData, validationData *Dataset) (*DataSplit, error) {
fmt.Println("\nPreprocessing data...")
// For simplicity, assume the data is already properly formatted.
// A real implementation would also:
// 1. Handle missing values (a hedged imputation sketch follows this function)
// 2. Properly encode categorical variables (see the labelEncoder sketch after loadCSV)
// 3. Scale numerical features if needed
fmt.Println("Handling missing values and data types...")
// Create label encoders map (simplified)
labelEncoders := make(map[string]map[string]int)
// For this implementation, we'll use the data as-is
// The loadCSV function already handles basic conversion
fmt.Printf("Final feature matrix shape: (%d, %d)\n", len(trainData.Features), len(trainData.Headers))
return &DataSplit{
XTrain: trainData.Features,
XTest: testData.Features,
XVal: validationData.Features,
YTrain: trainData.Labels,
YTest: testData.Labels,
YVal: validationData.Labels,
Headers: trainData.Headers,
LabelEncoders: labelEncoders,
}, nil
}
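// Step 1 in the comments above (missing values) is skipped in this simplified
// version. A minimal sketch of mean imputation, assuming missing numeric
// entries were loaded as NaN (hypothetical helper, not called by
// preprocessData): each NaN is replaced by the mean of its column computed
// over the non-missing rows.
func imputeColumnMeans(X [][]float64) {
	if len(X) == 0 {
		return
	}
	for col := 0; col < len(X[0]); col++ {
		sum, count := 0.0, 0
		for _, row := range X {
			if !math.IsNaN(row[col]) {
				sum += row[col]
				count++
			}
		}
		if count == 0 {
			continue // every value in the column is missing; leave it untouched
		}
		m := sum / float64(count)
		for _, row := range X {
			if math.IsNaN(row[col]) {
				row[col] = m
			}
		}
	}
}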
// NewRandomForest creates a new Random Forest model
func NewRandomForest(nTrees, maxDepth, minSplit, minLeaf int) *RandomForest {
return &RandomForest{
NTrees: nTrees,
MaxDepth: maxDepth,
MinSplit: minSplit,
MinLeaf: minLeaf,
FeatureSubsetSize: 0, // set to sqrt(number of features) during training
}
}
// Train trains the Random Forest model
func (rf *RandomForest) Train(X [][]float64, y []int) error {
if len(X) == 0 || len(X) != len(y) {
return fmt.Errorf("invalid training data dimensions")
}
nFeatures := len(X[0])
rf.FeatureSubsetSize = int(math.Sqrt(float64(nFeatures)))
if rf.FeatureSubsetSize < 1 {
rf.FeatureSubsetSize = 1
}
rf.Trees = make([]*DecisionTree, rf.NTrees)
// Train each tree with bootstrap sampling
for i := 0; i < rf.NTrees; i++ {
// Bootstrap sampling
bootX, bootY := bootstrapSample(X, y)
// Create and train tree
tree := &DecisionTree{}
tree.Root = rf.buildTree(bootX, bootY, 0, getRandomFeatureSubset(nFeatures, rf.FeatureSubsetSize))
rf.Trees[i] = tree
}
return nil
}
// Predict makes predictions using the Random Forest
func (rf *RandomForest) Predict(X [][]float64) []int {
predictions := make([]int, len(X))
for i, sample := range X {
votes := make(map[int]int)
// Get prediction from each tree
for _, tree := range rf.Trees {
pred := tree.predict(sample)
votes[pred]++
}
// Find majority vote
maxVotes := 0
prediction := 0
for class, count := range votes {
if count > maxVotes {
maxVotes = count
prediction = class
}
}
predictions[i] = prediction
}
return predictions
}
// buildTree recursively builds a decision tree
func (rf *RandomForest) buildTree(X [][]float64, y []int, depth int, featureSubset []int) *TreeNode {
// Check stopping criteria
if depth >= rf.MaxDepth || len(X) < rf.MinSplit || isPure(y) {
return &TreeNode{
IsLeaf: true,
Class: majorityClass(y),
Samples: len(y),
}
}
// Find best split
bestFeature, bestThreshold, _ := findBestSplit(X, y, featureSubset)
// If no good split found, create leaf
if bestFeature == -1 {
return &TreeNode{
IsLeaf: true,
Class: majorityClass(y),
Samples: len(y),
}
}
// Split data
leftX, leftY, rightX, rightY := splitData(X, y, bestFeature, bestThreshold)
// Check minimum leaf size
if len(leftY) < rf.MinLeaf || len(rightY) < rf.MinLeaf {
return &TreeNode{
IsLeaf: true,
Class: majorityClass(y),
Samples: len(y),
}
}
// Create internal node
node := &TreeNode{
Feature: bestFeature,
Threshold: bestThreshold,
IsLeaf: false,
Samples: len(y),
}
// Recursively build left and right subtrees, drawing a fresh random feature
// subset for each child so every node picks its split from its own subset.
node.Left = rf.buildTree(leftX, leftY, depth+1, getRandomFeatureSubset(len(X[0]), rf.FeatureSubsetSize))
node.Right = rf.buildTree(rightX, rightY, depth+1, getRandomFeatureSubset(len(X[0]), rf.FeatureSubsetSize))
return node
}
// predict makes a prediction for a single sample using the decision tree
func (tree *DecisionTree) predict(sample []float64) int {
return tree.traverseTree(tree.Root, sample)
}
// traverseTree traverses the tree to make a prediction
func (tree *DecisionTree) traverseTree(node *TreeNode, sample []float64) int {
if node.IsLeaf {
return node.Class
}
if sample[node.Feature] <= node.Threshold {
return tree.traverseTree(node.Left, sample)
}
return tree.traverseTree(node.Right, sample)
}
// Helper functions
func simpleHash(s string) int {
hash := 0
for _, c := range s {
hash = hash*31 + int(c)
}
if hash < 0 {
hash = -hash
}
return hash % 1000 // Keep it reasonable
}
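// bootstrapSample draws len(X) rows from (X, y) with replacement. This bagging
// step is what gives each tree in the forest a slightly different view of the
// training data.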
func bootstrapSample(X [][]float64, y []int) ([][]float64, []int) {
n := len(X)
bootX := make([][]float64, n)
bootY := make([]int, n)
for i := 0; i < n; i++ {
idx := rand.Intn(n)
bootX[i] = make([]float64, len(X[idx]))
copy(bootX[i], X[idx])
bootY[i] = y[idx]
}
return bootX, bootY
}
func getRandomFeatureSubset(nFeatures, subsetSize int) []int {
features := make([]int, nFeatures)
for i := range features {
features[i] = i
}
// Fisher-Yates shuffle
for i := len(features) - 1; i > 0; i-- {
j := rand.Intn(i + 1)
features[i], features[j] = features[j], features[i]
}
return features[:subsetSize]
}
func isPure(y []int) bool {
if len(y) <= 1 {
return true
}
first := y[0]
for _, label := range y {
if label != first {
return false
}
}
return true
}
func majorityClass(y []int) int {
counts := make(map[int]int)
for _, label := range y {
counts[label]++
}
maxCount := 0
majority := 0
for class, count := range counts {
if count > maxCount {
maxCount = count
majority = class
}
}
return majority
}
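// calculateGini returns the Gini impurity 1 - sum(p_i^2) over the class
// proportions p_i: 0 for a pure set of labels, approaching 1 - 1/k when k
// classes are evenly mixed.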
func calculateGini(y []int) float64 {
if len(y) == 0 {
return 0
}
counts := make(map[int]int)
for _, label := range y {
counts[label]++
}
gini := 1.0
n := float64(len(y))
for _, count := range counts {
p := float64(count) / n
gini -= p * p
}
return gini
}
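// findBestSplit evaluates candidate thresholds (midpoints between consecutive
// distinct sorted values) for every feature in featureSubset and returns the
// feature index, threshold, and weighted Gini impurity of the best split, or
// feature -1 when no valid split exists.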
func findBestSplit(X [][]float64, y []int, featureSubset []int) (int, float64, float64) {
bestGini := math.Inf(1)
bestFeature := -1
bestThreshold := 0.0
for _, feature := range featureSubset {
// Get unique values for this feature
values := make([]float64, len(X))
for i := range X {
values[i] = X[i][feature]
}
sort.Float64s(values)
// Try splits between unique values
for i := 0; i < len(values)-1; i++ {
if values[i] == values[i+1] {
continue
}
threshold := (values[i] + values[i+1]) / 2
// Calculate weighted Gini impurity for this split
leftY, rightY := splitLabels(X, y, feature, threshold)
if len(leftY) == 0 || len(rightY) == 0 {
continue
}
n := float64(len(y))
leftWeight := float64(len(leftY)) / n
rightWeight := float64(len(rightY)) / n
weightedGini := leftWeight*calculateGini(leftY) + rightWeight*calculateGini(rightY)
if weightedGini < bestGini {
bestGini = weightedGini
bestFeature = feature
bestThreshold = threshold
}
}
}
return bestFeature, bestThreshold, bestGini
}
func splitLabels(X [][]float64, y []int, feature int, threshold float64) ([]int, []int) {
var leftY, rightY []int
for i, sample := range X {
if sample[feature] <= threshold {
leftY = append(leftY, y[i])
} else {
rightY = append(rightY, y[i])
}
}
return leftY, rightY
}
func splitData(X [][]float64, y []int, feature int, threshold float64) ([][]float64, []int, [][]float64, []int) {
var leftX, rightX [][]float64
var leftY, rightY []int
for i, sample := range X {
if sample[feature] <= threshold {
leftX = append(leftX, sample)
leftY = append(leftY, y[i])
} else {
rightX = append(rightX, sample)
rightY = append(rightY, y[i])
}
}
return leftX, leftY, rightX, rightY
}
// CalculateFeatureImportance calculates feature importance scores
func (rf *RandomForest) CalculateFeatureImportance(headers []string) []FeatureImportance {
importance := make([]float64, len(headers))
// Simple feature importance: each internal split credits its feature with the
// number of training samples reaching that node, summed over all trees.
for _, tree := range rf.Trees {
rf.addTreeImportance(tree.Root, importance)
}
// Normalize by number of trees
for i := range importance {
importance[i] /= float64(rf.NTrees)
}
// Create sorted list
var featureImportances []FeatureImportance
for i, imp := range importance {
featureImportances = append(featureImportances, FeatureImportance{
Feature: headers[i],
Importance: imp,
})
}
sort.Slice(featureImportances, func(i, j int) bool {
return featureImportances[i].Importance > featureImportances[j].Importance
})
return featureImportances
}
func (rf *RandomForest) addTreeImportance(node *TreeNode, importance []float64) {
if node == nil || node.IsLeaf {
return
}
importance[node.Feature] += float64(node.Samples)
rf.addTreeImportance(node.Left, importance)
rf.addTreeImportance(node.Right, importance)
}
// evaluateModel evaluates the model performance
func evaluateModel(rf *RandomForest, X [][]float64, y []int) ModelMetrics {
predictions := rf.Predict(X)
// Calculate accuracy
correct := 0
for i := range predictions {
if predictions[i] == y[i] {
correct++
}
}
accuracy := float64(correct) / float64(len(y))
// Calculate confusion matrix
// Include classes that appear only in the predictions so they are not
// silently mapped to index 0 of the confusion matrix.
classes := getUniqueClasses(append(append([]int{}, y...), predictions...))
confusionMatrix := make([][]int, len(classes))
for i := range confusionMatrix {
confusionMatrix[i] = make([]int, len(classes))
}
classToIndex := make(map[int]int)
for i, class := range classes {
classToIndex[class] = i
}
for i := range predictions {
actualIdx := classToIndex[y[i]]
predIdx := classToIndex[predictions[i]]
confusionMatrix[actualIdx][predIdx]++
}
return ModelMetrics{
Accuracy: accuracy,
ConfusionMatrix: confusionMatrix,
}
}
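// ModelMetrics declares a ClassReport field that evaluateModel never fills in.
// A minimal sketch of deriving per-class precision and recall from the
// confusion matrix built above (rows index actual classes, columns index
// predicted classes); this is a hypothetical helper, not wired into
// evaluateModel.
func classReportFromConfusion(classes []int, cm [][]int) map[int]map[string]float64 {
	report := make(map[int]map[string]float64)
	for i, class := range classes {
		tp := cm[i][i]
		rowTotal, colTotal := 0, 0
		for j := range classes {
			rowTotal += cm[i][j] // samples whose true label is this class
			colTotal += cm[j][i] // samples predicted as this class
		}
		precision, recall := 0.0, 0.0
		if colTotal > 0 {
			precision = float64(tp) / float64(colTotal)
		}
		if rowTotal > 0 {
			recall = float64(tp) / float64(rowTotal)
		}
		report[class] = map[string]float64{"precision": precision, "recall": recall}
	}
	return report
}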
// crossValidation performs k-fold cross-validation
func crossValidation(X [][]float64, y []int, k int) []float64 {
n := len(X)
foldSize := n / k
scores := make([]float64, k)
for i := 0; i < k; i++ {
// Create train and validation sets
start := i * foldSize
end := start + foldSize
if i == k-1 {
end = n
}
var trainX, valX [][]float64
var trainY, valY []int
for j := 0; j < n; j++ {
if j >= start && j < end {
valX = append(valX, X[j])
valY = append(valY, y[j])
} else {
trainX = append(trainX, X[j])
trainY = append(trainY, y[j])
}
}
// Train model on fold
rf := NewRandomForest(50, 8, 5, 2) // Smaller model for CV
if err := rf.Train(trainX, trainY); err != nil {
log.Fatalf("Error training cross-validation fold %d: %v", i+1, err)
}
// Evaluate on validation set
metrics := evaluateModel(rf, valX, valY)
scores[i] = metrics.Accuracy
}
return scores
}
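// crossValidation above carves contiguous folds, which is only representative
// if the rows are already in random order. If the training CSV happens to be
// sorted (for example by action_taken), shuffling row indices before fold
// assignment avoids skewed folds. A minimal sketch (hypothetical helper, not
// used by crossValidation):
func shuffledIndices(n int) []int {
	idx := make([]int, n)
	for i := range idx {
		idx[i] = i
	}
	rand.Shuffle(n, func(i, j int) { idx[i], idx[j] = idx[j], idx[i] })
	return idx
}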
// Utility functions
func getUniqueClasses(y []int) []int {
classSet := make(map[int]bool)
for _, class := range y {
classSet[class] = true
}
var classes []int
for class := range classSet {
classes = append(classes, class)
}
sort.Ints(classes)
return classes
}
func mean(values []float64) float64 {
sum := 0.0
for _, v := range values {
sum += v
}
return sum / float64(len(values))
}
func std(values []float64) float64 {
m := mean(values)
sumSquares := 0.0
for _, v := range values {
diff := v - m
sumSquares += diff * diff
}
return math.Sqrt(sumSquares / float64(len(values)))
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
func init() {
rand.Seed(time.Now().UnixNano())
}