Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added app/clients/service/MLmodels/linear_model_subset.pkl
Binary file not shown.
File renamed without changes.
Binary file added app/clients/service/MLmodels/ridge_model.pkl
Binary file not shown.
Binary file added app/clients/service/MLmodels/xgboost_model.pkl
Binary file not shown.
33 changes: 33 additions & 0 deletions app/clients/service/analyze_data/analyze_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_features(csv_path, target_col="success_rate", threshold=0.1):
    """Plot a correlation heatmap and list the features tied to the target.

    Args:
        csv_path: Path of the CSV dataset to load.
        target_col: Column whose correlations drive the selection.
        threshold: Minimum absolute correlation with ``target_col`` for a
            feature to be recommended.

    Returns:
        Names of the columns whose absolute correlation with ``target_col``
        is at least ``threshold``; the target itself is excluded.

    Raises:
        KeyError: If ``target_col`` is absent from the dataset or is not a
            numeric column.
    """
    df = pd.read_csv(csv_path)

    # Correlation is only defined for numeric data. pandas >= 2.0 raises on
    # text columns instead of silently skipping them, so filter explicitly.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if target_col not in numeric_df.columns:
        raise KeyError(
            "target column {!r} is missing from {!r} or is not numeric".format(
                target_col, csv_path
            )
        )

    # Plot correlation heatmap for all numeric features
    corr = numeric_df.corr()
    plt.figure(figsize=(14, 12))
    sns.heatmap(corr, annot=True, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.show()

    # Select features with correlation above threshold w.r.t. target.
    # Columns with undefined (NaN) correlation fail the comparison and are
    # therefore never recommended.
    target_corr = corr[target_col].drop(target_col)  # drop self-correlation
    recommended_features = target_corr[target_corr.abs() >= threshold].index.tolist()

    print("Recommended features (|correlation with '{}'| >= {:.2f}):".format(target_col, threshold))
    print(recommended_features)

    return recommended_features


if __name__ == "__main__":
    # Resolve the dataset relative to this file, not the working directory.
    # This script sits in app/clients/service/analyze_data/ and the CSV in
    # app/clients/service/, i.e. exactly one directory up.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(BASE_DIR, '../data_commontool_synthetic.csv')

    # Call feature analysis
    selected_features = analyze_features(csv_path)
7,001 changes: 7,001 additions & 0 deletions app/clients/service/data_commontool_synthetic.csv

Large diffs are not rendered by default.

4,001 changes: 4,001 additions & 0 deletions app/clients/service/data_commontool_synthetic_testdata.csv

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
89 changes: 89 additions & 0 deletions app/clients/service/initial_model_trainers/XGBoost_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Input columns handed to the regressor; the list order is the column order
# of the feature matrix built by load_data().
FEATURES = [
    # Demographics and work history
    'age',
    'gender',
    'work_experience',
    'canada_workex',
    'dep_num',
    'canada_born',
    'citizen_status',
    # Education and skills
    'level_of_schooling',
    'fluent_english',
    'reading_english_scale',
    'speaking_english_scale',
    'writing_english_scale',
    'numeracy_scale',
    'computer_scale',
    # Personal circumstances
    'transportation_bool',
    'caregiver_bool',
    'housing',
    'income_source',
    'felony_bool',
    'attending_school',
    'currently_employed',
    'substance_use',
    'time_unemployed',
    'need_mental_health_support_bool',
    # Interventions
    'employment_assistance',
    'life_stabilization',
    'retention_services',
    'specialized_services',
    'employment_related_financial_supports',
    'employer_financial_supports',
    'enhanced_referrals',
]

# Set relative paths
# Everything is anchored on this file's own directory rather than the current
# working directory, so the script resolves the same files wherever it is
# launched from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Training CSV, one directory above this script.
DATA_PATH = os.path.join(BASE_DIR, '../data_commontool_synthetic.csv')
# Directory that receives the pickled models, also one directory above.
MODEL_DIR = os.path.join(BASE_DIR, '../MLmodels/')
# NOTE: executed at import time, not only when the script is run directly.
os.makedirs(MODEL_DIR, exist_ok=True) # Auto-create directory if it doesn't exist

def load_data(selected_features):
    """Read the training CSV and split it into inputs and target.

    Args:
        selected_features: Column names to use as model inputs, in the
            order they should appear in the feature matrix.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays: ``X`` holds the selected columns
        and ``y`` holds the ``success_rate`` column.
    """
    frame = pd.read_csv(DATA_PATH)
    feature_matrix = np.array(frame[selected_features])
    target_vector = np.array(frame['success_rate'])
    return feature_matrix, target_vector

def train_xgboost(X_train, y_train):
    """Fit an XGBoost regressor on the training split and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``XGBRegressor``.
    """
    hyperparameters = {
        "n_estimators": 100,
        "max_depth": 5,
        "learning_rate": 0.1,
        "objective": 'reg:squarederror',
        "random_state": 42,  # fixed so repeated runs build the same model
    }
    regressor = XGBRegressor(**hyperparameters)
    regressor.fit(X_train, y_train)
    return regressor

def evaluate_model(model, X_test, y_test):
    """Print MAE, MSE and R2 of ``model`` on the hold-out test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Hold-out feature matrix.
        y_test: True target values for ``X_test``.
    """
    predicted = model.predict(X_test)
    # Evaluate the three metrics in the order they are reported.
    mae, mse, r2 = (
        metric(y_test, predicted)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    print(f"Hold-out Test Set - MAE: {mae:.2f} | MSE: {mse:.2f} | R2: {r2:.2f}")

def cross_validate(model, X, y):
    """Perform 5-fold cross-validation and print per-fold R² and MAE.

    Args:
        model: Estimator to evaluate.
        X: Full feature matrix.
        y: Full target vector.
    """
    # Imported locally under an alias because this function has the same
    # name as the scikit-learn helper it relies on.
    from sklearn.model_selection import cross_validate as sk_cross_validate

    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Request both metrics in a single run so each fold is fitted once
    # instead of once per metric.
    results = sk_cross_validate(
        model, X, y, cv=cv, scoring=("r2", "neg_mean_absolute_error")
    )
    r2_scores = results["test_r2"]
    # scikit-learn reports errors negated (higher is better); flip the sign.
    mae_scores = -results["test_neg_mean_absolute_error"]

    print(f"Cross-Validation R² scores: {r2_scores}")
    print(f"Mean R²: {np.mean(r2_scores):.3f}")
    print(f"Cross-Validation MAE scores: {mae_scores}")
    print(f"Mean MAE: {np.mean(mae_scores):.3f}")

def save_model(model, filename):
    """Pickle ``model`` into the model directory under ``filename``.

    Args:
        model: Trained model object to serialise.
        filename: File name (not a full path) to create inside ``MODEL_DIR``.
    """
    model_path = os.path.join(MODEL_DIR, filename)
    with open(model_path, "wb") as handle:
        pickle.dump(model, handle)
    print(f"✅ XGBoost model saved at {model_path}")

def main():
    """Train, evaluate, cross-validate and persist the XGBoost model."""
    # Load the full dataset and hold out 20% of it for the final check
    X, y = load_data(FEATURES)
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    # Fit on the training portion only
    fitted = train_xgboost(X_train, y_train)

    # Report hold-out metrics, then cross-validated metrics on all rows
    evaluate_model(fitted, X_test, y_test)
    cross_validate(fitted, X, y)

    # Persist the model fitted on the training portion
    save_model(fitted, "xgboost_model.pkl")


if __name__ == "__main__":
    main()
90 changes: 90 additions & 0 deletions app/clients/service/initial_model_trainers/get_synthetic_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import pandas as pd
import numpy as np
import os

# Fixed seed so the dataset can be regenerated exactly. Deliberately not 42,
# the seed used by get_synthetic_testdata.py, so the two datasets do not
# start from the same random draws.
np.random.seed(7)

num_samples = 7000  # Total number of samples to generate

# Age distribution: Normal distribution centered around 30, capped between 18 and 60
age = np.random.normal(loc=30, scale=8, size=num_samples).astype(int)
age = np.clip(age, 18, 60)

# Work experience based on age (years since 18 minus some gap years)
work_experience = (age - 18) - np.random.randint(0, 4, num_samples)
work_experience = np.clip(work_experience, 0, 42)

# Canadian work experience is a fraction of total work experience
canada_workex = (work_experience * np.random.uniform(0.3, 0.9, num_samples)).astype(int)

# Education level: weighted choice simulating real-world distribution
level_of_schooling = np.random.choice(
    [6, 8, 10, 12, 14], size=num_samples, p=[0.1, 0.25, 0.3, 0.25, 0.1]
)

# English proficiency: fluent_english impacts reading/speaking/writing scores
fluent_english = np.random.randint(4, 10, num_samples)
reading_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
speaking_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
writing_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)

# Numeracy and computer skills
numeracy = np.random.randint(3, 10, num_samples)
computer = np.random.randint(3, 10, num_samples)


def binary_features():
    """Return ``num_samples`` independent random 0/1 values."""
    return np.random.randint(0, 2, num_samples)


# Base success rate formula influenced by key features
success_rate_base = (
    fluent_english * 2
    + level_of_schooling * 2
    + computer * 2
    + numeracy * 1.5
    + work_experience * 1.2
    + (binary_features() * 10)  # bonus random life factors
)

# Normalize to a 0-100 scale relative to the best base score, add Gaussian
# noise, then truncate to whole numbers and clamp back into 0-100.
success_rate = (success_rate_base / success_rate_base.max() * 100) + np.random.normal(0, 5, num_samples)
success_rate = np.clip(success_rate.astype(int), 0, 100)

# Create the dataframe
data = pd.DataFrame({
    "age": age,
    "gender": binary_features(),
    "work_experience": work_experience,
    "canada_workex": canada_workex,
    "dep_num": np.random.randint(0, 5, num_samples),
    "canada_born": binary_features(),
    "citizen_status": binary_features(),
    "level_of_schooling": level_of_schooling,
    "fluent_english": fluent_english,
    "reading_english_scale": reading_english,
    "speaking_english_scale": speaking_english,
    "writing_english_scale": writing_english,
    "numeracy_scale": numeracy,
    "computer_scale": computer,
    "transportation_bool": binary_features(),
    "caregiver_bool": binary_features(),
    "housing": np.random.randint(1, 10, num_samples),
    "income_source": np.random.randint(1, 10, num_samples),
    "felony_bool": binary_features(),
    "attending_school": binary_features(),
    "currently_employed": binary_features(),
    "substance_use": binary_features(),
    "time_unemployed": np.random.randint(0, 8, num_samples),
    "need_mental_health_support_bool": binary_features(),
    "employment_assistance": binary_features(),
    "life_stabilization": binary_features(),
    "retention_services": binary_features(),
    "specialized_services": binary_features(),
    "employment_related_financial_supports": binary_features(),
    "employer_financial_supports": binary_features(),
    "enhanced_referrals": binary_features(),
    "success_rate": success_rate,
})

# Save to CSV one directory above this script. That is where the model
# trainer reads the dataset from and where the test-data generator writes
# its own file.
output_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "../data_commontool_synthetic.csv"
)

data.to_csv(output_path, index=False)

print(f"✅ {num_samples} synthetic data rows generated and saved to data_commontool_synthetic.csv")
130 changes: 130 additions & 0 deletions app/clients/service/initial_model_trainers/get_synthetic_testdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
import numpy as np
import os

np.random.seed(42)  # For reproducibility

num_samples = 4000  # Number of rows

# NOTE: the seed makes the output depend on the exact order of the random
# draws below; reordering any of them changes the generated dataset.

# Age distribution: Normal distribution centered around 30, capped between 18 and 60
age = np.random.normal(loc=30, scale=8, size=num_samples).astype(int)
age = np.clip(age, 18, 60)

# Work experience based on age (years since 18 minus some gap years)
work_experience = (age - 18) - np.random.randint(0, 4, num_samples)
work_experience = np.clip(work_experience, 0, 42)

# Canadian work experience is a fraction of total work experience
canada_workex = (work_experience * np.random.uniform(0.3, 0.9, num_samples)).astype(int)

# Education level: weighted choice
level_of_schooling = np.random.choice(
    [6, 8, 10, 12, 14], size=num_samples, p=[0.1, 0.25, 0.3, 0.25, 0.1]
)

# English proficiency
fluent_english = np.random.randint(4, 10, num_samples)
reading_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
speaking_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)
writing_english = np.clip(fluent_english + np.random.randint(-1, 2, num_samples), 1, 10)

# Numeracy and computer skills
numeracy = np.random.randint(3, 10, num_samples)
computer = np.random.randint(3, 10, num_samples)


def binary_features():
    """Return ``num_samples`` independent random 0/1 values."""
    return np.random.randint(0, 2, num_samples)


# Other categorical and binary features
gender = binary_features()
dep_num = np.random.randint(0, 5, num_samples)
canada_born = binary_features()
citizen_status = binary_features()
transportation_bool = binary_features()
caregiver_bool = binary_features()
housing = np.random.randint(1, 10, num_samples)
income_source = np.random.randint(1, 10, num_samples)
felony_bool = binary_features()
attending_school = binary_features()
currently_employed = binary_features()
substance_use = binary_features()
time_unemployed = np.random.randint(0, 8, num_samples)
need_mental_health_support_bool = binary_features()
employment_assistance = binary_features()
life_stabilization = binary_features()
retention_services = binary_features()
specialized_services = binary_features()
employment_related_financial_supports = binary_features()
employer_financial_supports = binary_features()
enhanced_referrals = binary_features()

# Success Rate Logic

# Penalties lower the base score for the flagged rows
felony_penalty = np.where(felony_bool == 1, -15, 0)
substance_penalty = np.where(substance_use == 1, -10, 0)
long_unemployed_penalty = np.where(time_unemployed >= 4, -5, 0)

# Each of these four services received adds 3 points
bonus_services = (
    (employment_assistance + life_stabilization + retention_services + specialized_services) * 3
)

employment_bonus = np.where(currently_employed == 1, 5, 0)
education_effect = np.minimum(level_of_schooling, 12)  # Cap effect after a certain level

success_rate_base = (
    fluent_english * 2
    + education_effect * 2
    + computer * 2
    + numeracy * 1.5
    + work_experience * 1.0
    + employment_bonus
    + bonus_services
    + felony_penalty
    + substance_penalty
    + long_unemployed_penalty
    + (np.random.randint(0, 2, num_samples) * 5)  # Random life variance
)

# Normalize to a 0-100 scale relative to the best base score, add Gaussian
# noise, then truncate to whole numbers and clamp back into 0-100.
success_rate = (success_rate_base / success_rate_base.max() * 100) + np.random.normal(0, 5, num_samples)
success_rate = np.clip(success_rate.astype(int), 0, 100)

# Create DataFrame
data = pd.DataFrame({
    "age": age,
    "gender": gender,
    "work_experience": work_experience,
    "canada_workex": canada_workex,
    "dep_num": dep_num,
    "canada_born": canada_born,
    "citizen_status": citizen_status,
    "level_of_schooling": level_of_schooling,
    "fluent_english": fluent_english,
    "reading_english_scale": reading_english,
    "speaking_english_scale": speaking_english,
    "writing_english_scale": writing_english,
    "numeracy_scale": numeracy,
    "computer_scale": computer,
    "transportation_bool": transportation_bool,
    "caregiver_bool": caregiver_bool,
    "housing": housing,
    "income_source": income_source,
    "felony_bool": felony_bool,
    "attending_school": attending_school,
    "currently_employed": currently_employed,
    "substance_use": substance_use,
    "time_unemployed": time_unemployed,
    "need_mental_health_support_bool": need_mental_health_support_bool,
    "employment_assistance": employment_assistance,
    "life_stabilization": life_stabilization,
    "retention_services": retention_services,
    "specialized_services": specialized_services,
    "employment_related_financial_supports": employment_related_financial_supports,
    "employer_financial_supports": employer_financial_supports,
    "enhanced_referrals": enhanced_referrals,
    "success_rate": success_rate,
})

# Save to CSV, one directory above this script
output_path = os.path.join(os.path.dirname(__file__), "../data_commontool_synthetic_testdata.csv")
data.to_csv(output_path, index=False)

# Report the file that was actually written (the test-data CSV).
print(f" {num_samples} diversified synthetic data rows generated and saved to data_commontool_synthetic_testdata.csv")
Loading