CaseManagementAI · Troy08 · Mar 19, 2025 · Mar 19, 2025 · Mar 19, 2025 · Mar 20, 2025
diff --git a/app/clients/service/MLmodels/linear_model_subset.pkl b/app/clients/service/MLmodels/linear_model_subset.pkl
diff --git a/app/clients/service/model.pkl → app/clients/service/MLmodels/model.pkl b/app/clients/service/model.pkl → app/clients/service/MLmodels/model.pkl
diff --git a/app/clients/service/MLmodels/ridge_model.pkl b/app/clients/service/MLmodels/ridge_model.pkl
diff --git a/app/clients/service/MLmodels/xgboost_model.pkl b/app/clients/service/MLmodels/xgboost_model.pkl
diff --git a/app/clients/service/analyze_data/analyze_features.py b/app/clients/service/analyze_data/analyze_features.py
@@ -0,0 +1,33 @@
+import os
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def analyze_features(csv_path, target_col="success_rate", threshold=0.1):
+    """Show a correlation heatmap for the CSV at ``csv_path`` and return the columns
+    whose absolute correlation with ``target_col`` is at least ``threshold``."""
+    df = pd.read_csv(csv_path)
+    # Correlate numeric columns only: pandas >= 2.0 raises on non-numeric columns.
+    corr = df.select_dtypes(include="number").corr()
+    plt.figure(figsize=(14, 12))
+    sns.heatmap(corr, annot=True, cmap="coolwarm")
+    plt.title("Correlation Heatmap")
+    plt.show()
+
+    # Keep features whose |correlation| with the target reaches the threshold
+    target_corr = corr[target_col].drop(target_col)  # drop self-correlation
+    recommended_features = target_corr[target_corr.abs() >= threshold].index.tolist()
+
+    print(f"Recommended features (|correlation with '{target_col}'| >= {threshold:.2f}):")
+    print(recommended_features)
+
+    return recommended_features
+
+
+if __name__ == "__main__":
+    # The dataset sits one directory above this script, in app/clients/service/
+    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+    csv_path = os.path.join(BASE_DIR, '..', 'data_commontool_synthetic.csv')
+
+    # Call feature analysis
+    selected_features = analyze_features(csv_path)
diff --git a/app/clients/service/data_commontool_synthetic.csv b/app/clients/service/data_commontool_synthetic.csv
diff --git a/app/clients/service/data_commontool_synthetic_testdata.csv b/app/clients/service/data_commontool_synthetic_testdata.csv
diff --git a/app/clients/service/final_MLmodels/final_linear_model_subset.pkl b/app/clients/service/final_MLmodels/final_linear_model_subset.pkl
diff --git a/app/clients/service/final_MLmodels/final_ridge_model.pkl b/app/clients/service/final_MLmodels/final_ridge_model.pkl
diff --git a/app/clients/service/final_MLmodels/final_xgboost_gridcv_model.pkl b/app/clients/service/final_MLmodels/final_xgboost_gridcv_model.pkl
diff --git a/app/clients/service/initial_model_trainers/XGBoost_model.py b/app/clients/service/initial_model_trainers/XGBoost_model.py
@@ -0,0 +1,89 @@
+import os
+import pickle
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split, cross_val_score, KFold
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from xgboost import XGBRegressor
+
+# Model input columns; load_data() selects them in exactly this order
+FEATURES = [
+    "age", "gender", "work_experience", "canada_workex", "dep_num", "canada_born",
+    "citizen_status", "level_of_schooling", "fluent_english", "reading_english_scale",
+    "speaking_english_scale", "writing_english_scale", "numeracy_scale", "computer_scale",
+    "transportation_bool", "caregiver_bool", "housing", "income_source", "felony_bool",
+    "attending_school", "currently_employed", "substance_use", "time_unemployed",
+    "need_mental_health_support_bool", "employment_assistance", "life_stabilization",
+    "retention_services", "specialized_services", "employment_related_financial_supports",
+    "employer_financial_supports", "enhanced_referrals",
+]
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+DATA_PATH = os.path.join(BASE_DIR, '../data_commontool_synthetic.csv')
+MODEL_DIR = os.path.join(BASE_DIR, '../MLmodels/')
+os.makedirs(MODEL_DIR, exist_ok=True)  # runs at import time; creates the model directory if missing
+
+def load_data(selected_features):
+    """Read the CSV at DATA_PATH and return ``(X, y)`` as NumPy arrays:
+    X holds the ``selected_features`` columns, y the 'success_rate' column."""
+    frame = pd.read_csv(DATA_PATH)
+    columns, target = frame[selected_features], frame['success_rate']
+    return np.array(columns), np.array(target)
+
+def train_xgboost(X_train, y_train):
+    """Fit an XGBRegressor with fixed hyperparameters on the training data.
+
+    Returns the fitted regressor."""
+    hyperparams = dict(
+        n_estimators=100, max_depth=5, learning_rate=0.1,
+        objective='reg:squarederror', random_state=42,
+    )
+    regressor = XGBRegressor(**hyperparams)
+    regressor.fit(X_train, y_train)
+    return regressor
+
+def evaluate_model(model, X_test, y_test):
+    """Print MAE, MSE and R2 of the model's predictions on the hold-out set."""
+    y_pred = model.predict(X_test)
+    mae, mse, r2 = (
+        metric(y_test, y_pred) for metric in (mean_absolute_error, mean_squared_error, r2_score)
+    )
+    print(f"Hold-out Test Set - MAE: {mae:.2f} | MSE: {mse:.2f} | R2: {r2:.2f}")
+
+def cross_validate(model, X, y):
+    """Run shuffled 5-fold cross-validation and print per-fold and mean R² / MAE."""
+    folds = KFold(n_splits=5, shuffle=True, random_state=42)
+    r2_per_fold = cross_val_score(model, X, y, cv=folds, scoring="r2")
+    # The scorer returns negated MAE, so flip the sign back to a positive error
+    mae_per_fold = -cross_val_score(model, X, y, cv=folds, scoring="neg_mean_absolute_error")
+    print(f"Cross-Validation R² scores: {r2_per_fold}")
+    print(f"Mean R²: {np.mean(r2_per_fold):.3f}")
+    print(f"Cross-Validation MAE scores: {mae_per_fold}")
+    print(f"Mean MAE: {np.mean(mae_per_fold):.3f}")
+
+def save_model(model, filename):
+    """Pickle ``model`` to ``filename`` inside MODEL_DIR and print the saved path."""
+    destination = os.path.join(MODEL_DIR, filename)
+    with open(destination, "wb") as handle:
+        pickle.dump(model, handle)
+    print(f"✅ XGBoost model saved at {destination}")
+
+def main():
+    """Train, evaluate, cross-validate and save the XGBoost model."""
+    features, target = load_data(FEATURES)
+    # Fixed-seed 80/20 split so the hold-out set is the same on every run
+    X_train, X_test, y_train, y_test = train_test_split(
+        features, target, test_size=0.2, random_state=42
+    )
+
+    model = train_xgboost(X_train, y_train)
+    evaluate_model(model, X_test, y_test)
+
+    # Cross-validation is run on the full dataset, not only the training split
+    cross_validate(model, features, target)
+
+    # Save the trained model under MODEL_DIR
+    save_model(model, "xgboost_model.pkl")
+
+if __name__ == "__main__":
+    main()
diff --git a/app/clients/service/initial_model_trainers/get_synthetic_data.py b/app/clients/service/initial_model_trainers/get_synthetic_data.py
@@ -0,0 +1,90 @@
+import pandas as pd
+import numpy as np
+import os
+
+num_samples = 7000  # Total number of samples to generate
+np.random.seed(7)  # Reproducible output; deliberately not 42, which get_synthetic_testdata.py uses
+
+# Age distribution: Normal distribution centered around 30, capped between 18 and 60
+age = np.clip(np.random.normal(loc=30, scale=8, size=num_samples).astype(int), 18, 60)
+
+# Work experience based on age (years since 18 minus 0-3 gap years), floored at 0
+work_experience = (age - 18) - np.random.randint(0, 4, num_samples)
+work_experience = np.clip(work_experience, 0, 42)
+
+# Canadian work experience is a random 30-90% share of total work experience (truncated to int)
+canada_workex = (work_experience * np.random.uniform(0.3, 0.9, num_samples)).astype(int)
+
+# Education level: weighted choice over five fixed levels
+level_of_schooling = np.random.choice([6, 8, 10, 12, 14], size=num_samples, p=[0.1, 0.25, 0.3, 0.25, 0.1])
+
+# English proficiency: fluent_english is 4-9; the other scores are it plus an offset of -1..1
+fluent_english = np.random.randint(4, 10, num_samples)
+reading_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
+speaking_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
+writing_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
+
+# Numeracy and computer skills: independent integers in 3-9
+numeracy = np.random.randint(3, 10, num_samples)
+computer = np.random.randint(3, 10, num_samples)
+
+def binary_features():  # a def rather than an assigned lambda (PEP 8)
+    return np.random.randint(0, 2, num_samples)  # fresh array of random 0/1 flags per call
+
+# Base success rate formula influenced by key features
+success_rate_base = (
+    fluent_english * 2 +
+    level_of_schooling * 2 +
+    computer * 2 +
+    numeracy * 1.5 +
+    work_experience * 1.2 +
+    (binary_features() * 10)  # random 0/10 bonus; this draw is not kept as a column
+)
+
+# Normalize so the top base score is 100, then add Gaussian noise (sd 5)
+success_rate = (success_rate_base / success_rate_base.max() * 100) + np.random.normal(0, 5, num_samples)
+success_rate = np.clip(success_rate.astype(int), 0, 100)
+
+# Create the dataframe; each binary_features()/randint call below draws a fresh random column
+data = pd.DataFrame({
+    "age": age,
+    "gender": binary_features(),
+    "work_experience": work_experience,
+    "canada_workex": canada_workex,
+    "dep_num": np.random.randint(0, 5, num_samples),  # integers 0-4
+    "canada_born": binary_features(),
+    "citizen_status": binary_features(),
+    "level_of_schooling": level_of_schooling,
+    "fluent_english": fluent_english,
+    "reading_english_scale": reading_english,
+    "speaking_english_scale": speaking_english,
+    "writing_english_scale": writing_english,
+    "numeracy_scale": numeracy,
+    "computer_scale": computer,
+    "transportation_bool": binary_features(),
+    "caregiver_bool": binary_features(),
+    "housing": np.random.randint(1, 10, num_samples),  # integers 1-9
+    "income_source": np.random.randint(1, 10, num_samples),  # integers 1-9
+    "felony_bool": binary_features(),
+    "attending_school": binary_features(),
+    "currently_employed": binary_features(),
+    "substance_use": binary_features(),
+    "time_unemployed": np.random.randint(0, 8, num_samples),  # integers 0-7
+    "need_mental_health_support_bool": binary_features(),
+    "employment_assistance": binary_features(),
+    "life_stabilization": binary_features(),
+    "retention_services": binary_features(),
+    "specialized_services": binary_features(),
+    "employment_related_financial_supports": binary_features(),
+    "employer_financial_supports": binary_features(),
+    "enhanced_referrals": binary_features(),
+    "success_rate": success_rate  # computed above; the columns drawn inline here do not feed it
+})
+
+# Save to CSV one directory up (app/clients/service/), where the trainers read it from;
+# writing next to this script would leave that copy stale.
+output_path = os.path.join(os.path.dirname(__file__), "../data_commontool_synthetic.csv")
+
+data.to_csv(output_path, index=False)
+
+print(f"✅ {num_samples} synthetic data rows generated and saved to data_commontool_synthetic.csv")
diff --git a/app/clients/service/initial_model_trainers/get_synthetic_testdata.py b/app/clients/service/initial_model_trainers/get_synthetic_testdata.py
@@ -0,0 +1,130 @@
+import pandas as pd
+import numpy as np
+import os
+
+np.random.seed(42)  # For reproducibility: every draw below consumes this seeded global stream in order
+
+num_samples = 4000  # Number of rows
+
+# Age distribution: Normal distribution centered around 30, capped between 18 and 60
+age = np.random.normal(loc=30, scale=8, size=num_samples).astype(int)
+age = np.clip(age, 18, 60)
+
+# Work experience based on age (years since 18 minus 0-3 gap years), floored at 0
+work_experience = (age - 18) - np.random.randint(0, 4, num_samples)
+work_experience = np.clip(work_experience, 0, 42)
+
+# Canadian work experience is a random 30-90% share of total work experience (truncated to int)
+canada_workex = (work_experience * np.random.uniform(0.3, 0.9, num_samples)).astype(int)
+
+# Education level: weighted choice over five fixed levels
+level_of_schooling = np.random.choice([6, 8, 10, 12, 14], size=num_samples, p=[0.1, 0.25, 0.3, 0.25, 0.1])
+
+# English proficiency: fluent_english is 4-9; the other scores are it plus an offset of -1..1
+fluent_english = np.random.randint(4, 10, num_samples)
+reading_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
+speaking_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
+writing_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
+
+# Numeracy and computer skills: independent integers in 3-9
+numeracy = np.random.randint(3, 10, num_samples)
+computer = np.random.randint(3, 10, num_samples)
+
+# Binary features generator: each call returns a fresh array of num_samples random 0/1 values
+binary_features = lambda: np.random.randint(0, 2, num_samples)
+
+# Other categorical and binary features; some of these feed the success-rate logic further down
+gender = binary_features()
+dep_num = np.random.randint(0, 5, num_samples)  # integers 0-4
+canada_born = binary_features()
+citizen_status = binary_features()
+transportation_bool = binary_features()
+caregiver_bool = binary_features()
+housing = np.random.randint(1, 10, num_samples)  # integers 1-9
+income_source = np.random.randint(1, 10, num_samples)  # integers 1-9
+felony_bool = binary_features()
+attending_school = binary_features()
+currently_employed = binary_features()
+substance_use = binary_features()
+time_unemployed = np.random.randint(0, 8, num_samples)  # integers 0-7
+need_mental_health_support_bool = binary_features()
+employment_assistance = binary_features()
+life_stabilization = binary_features()
+retention_services = binary_features()
+specialized_services = binary_features()
+employment_related_financial_supports = binary_features()
+employer_financial_supports = binary_features()
+enhanced_referrals = binary_features()
+
+# Success-rate model: weighted skills plus bonuses and penalties (all flags are 0/1)
+
+felony_penalty = felony_bool * -15
+substance_penalty = substance_use * -10
+long_unemployed_penalty = (time_unemployed >= 4) * -5
+
+# Each of these four services adds 3 points when its flag is set
+service_flags = [employment_assistance, life_stabilization, retention_services, specialized_services]
+bonus_services = sum(service_flags) * 3
+
+employment_bonus = currently_employed * 5
+education_effect = np.minimum(level_of_schooling, 12)  # schooling above 12 adds nothing extra
+
+life_variance = np.random.randint(0, 2, num_samples) * 5  # Random life variance
+skill_score = fluent_english * 2 + education_effect * 2 + computer * 2 + numeracy * 1.5
+success_rate_base = (
+    skill_score +
+    work_experience * 1.0 +
+    employment_bonus +
+    bonus_services +
+    felony_penalty +
+    substance_penalty +
+    long_unemployed_penalty +
+    life_variance
+)
+
+# Scale so the best base score maps to 100, then add Gaussian noise (sd 5)
+noise = np.random.normal(0, 5, num_samples)
+success_rate = (success_rate_base / success_rate_base.max() * 100) + noise
+success_rate = np.clip(success_rate.astype(int), 0, 100)
+
+# Assemble the columns into a DataFrame (keyword order is the CSV column order)
+data = pd.DataFrame(dict(
+    age=age,
+    gender=gender,
+    work_experience=work_experience,
+    canada_workex=canada_workex,
+    dep_num=dep_num,
+    canada_born=canada_born,
+    citizen_status=citizen_status,
+    level_of_schooling=level_of_schooling,
+    fluent_english=fluent_english,
+    reading_english_scale=reading_english,
+    speaking_english_scale=speaking_english,
+    writing_english_scale=writing_english,
+    numeracy_scale=numeracy,
+    computer_scale=computer,
+    transportation_bool=transportation_bool,
+    caregiver_bool=caregiver_bool,
+    housing=housing,
+    income_source=income_source,
+    felony_bool=felony_bool,
+    attending_school=attending_school,
+    currently_employed=currently_employed,
+    substance_use=substance_use,
+    time_unemployed=time_unemployed,
+    need_mental_health_support_bool=need_mental_health_support_bool,
+    employment_assistance=employment_assistance,
+    life_stabilization=life_stabilization,
+    retention_services=retention_services,
+    specialized_services=specialized_services,
+    employment_related_financial_supports=employment_related_financial_supports,
+    employer_financial_supports=employer_financial_supports,
+    enhanced_referrals=enhanced_referrals,
+    success_rate=success_rate,
+))
+
+# Save to CSV one directory above this script (app/clients/service/)
+output_path = os.path.join(os.path.dirname(__file__), "../data_commontool_synthetic_testdata.csv")
+data.to_csv(output_path, index=False)
+
+print(f" {num_samples} diversified synthetic data rows generated and saved to data_commontool_synthetic_testdata.csv")