-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp_kmeans.py
More file actions
112 lines (90 loc) · 3.84 KB
/
app_kmeans.py
File metadata and controls
112 lines (90 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import logging
from flask import Flask, request, jsonify
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from dotenv import load_dotenv
# Load environment variables from .env file (DATASET_PATH, N_CLUSTERS, etc.)
load_dotenv()
# Enable logging
logging.basicConfig(level=logging.INFO)
# Initialize NLTK data needed at runtime:
# 'punkt' backs nltk.word_tokenize; 'stopwords' backs stopwords.words('english').
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' — verify.
nltk.download('punkt')
nltk.download('stopwords')
# Flask application
app = Flask(__name__)
class TextPreprocessor:
    """Normalizes raw text: tokenize, lowercase, drop stopwords, stem."""

    def __init__(self):
        # Built once and reused for every call to preprocess().
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Return *text* as one space-joined string of stemmed, lowercased,
        non-stopword tokens (same pipeline order as training data must use)."""
        stem = self.stemmer.stem
        kept = (
            stem(token)
            for token in (raw.lower() for raw in nltk.word_tokenize(text))
            if token not in self.stop_words
        )
        return ' '.join(kept)
class JobClusterModel:
    """TF-IDF + KMeans clustering of (already preprocessed) job descriptions.

    Attributes:
        top_keywords: cluster index -> list of its 10 highest-weight terms,
            most important first (populated by fit()).
        cluster_labels: human-readable names per cluster index; indices past
            the end of this list map to "Unknown".
    """

    def __init__(self, n_clusters=4, max_features=1000):
        self.n_clusters = n_clusters
        self.vectorizer = TfidfVectorizer(max_features=max_features)
        # n_init pinned explicitly so results stay reproducible across
        # scikit-learn versions (the default changed from 10 to 'auto').
        self.model = KMeans(n_clusters=self.n_clusters, init='k-means++',
                            n_init=10, random_state=42)
        self.top_keywords = {}
        self.cluster_labels = ["Engineering", "Support", "Data Science", "Marketing"]

    def fit(self, job_descriptions):
        """Fit the vectorizer and KMeans on an iterable of preprocessed strings."""
        # Keep the TF-IDF matrix sparse: KMeans accepts sparse input, and the
        # original .toarray() densification wastes memory on large corpora.
        X = self.vectorizer.fit_transform(job_descriptions)
        self.model.fit(X)
        self._extract_keywords()

    def predict(self, job_description):
        """Return the cluster index (plain int) for one preprocessed string."""
        X_new = self.vectorizer.transform([job_description])
        return int(self.model.predict(X_new)[0])

    def _extract_keywords(self):
        """Cache each cluster's top-10 terms, highest weight first."""
        centers = self.model.cluster_centers_
        terms = self.vectorizer.get_feature_names_out()
        for i in range(self.n_clusters):
            # argsort is ascending, so take the last 10 and reverse them:
            # the original listed the "top" keywords least-important-first.
            top = centers[i].argsort()[-10:][::-1]
            self.top_keywords[i] = [terms[j] for j in top]

    def get_keywords(self, cluster):
        """Top keywords for *cluster*, or [] when the cluster is unknown."""
        return self.top_keywords.get(cluster, [])

    def get_cluster_label(self, cluster):
        """Human-readable label for *cluster* ("Unknown" when out of range)."""
        return self.cluster_labels[cluster] if cluster < len(self.cluster_labels) else "Unknown"
def load_dataset(file_path):
    """Read the jobs CSV at *file_path* and return the non-null
    'Job Description' column as a pandas Series."""
    logging.info("Loading dataset from %s", file_path)
    descriptions = pd.read_csv(file_path)['Job Description']
    return descriptions.dropna()
@app.route('/predict', methods=['POST'])
def predict():
    """POST /predict — body: {"job_description": str}.

    Returns JSON with the cluster index, its human-readable label and the
    cluster's top keywords; 400 when no description is supplied, 500 on
    unexpected failures.
    """
    try:
        # silent=True: a missing/malformed JSON body (or wrong Content-Type)
        # yields None instead of a framework-raised error, so the caller
        # always gets our consistent 400 error body below.
        data = request.get_json(silent=True) or {}
        logging.info("Received request: %s", data)
        job_description = data.get('job_description')
        if not job_description:
            logging.error("Job description not provided")
            return jsonify({'error': 'Job description not provided'}), 400
        # preprocessor/model are module-level globals created at startup
        # in the __main__ block.
        preprocessed_text = preprocessor.preprocess(job_description)
        cluster = model.predict(preprocessed_text)
        cluster_label = model.get_cluster_label(cluster)
        logging.info("Predicted cluster: %d (%s)", cluster, cluster_label)
        return jsonify({
            'cluster': int(cluster),
            'cluster_label': cluster_label,
            'keywords': model.get_keywords(cluster)
        })
    except Exception as e:
        # Top-level route boundary: log the full traceback, return generic 500.
        logging.exception("Error during prediction")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    # Runtime configuration, overridable via the environment / .env file.
    dataset_path = os.getenv("DATASET_PATH", "./data/Booking_Jobs_All_220218.csv")
    n_clusters = int(os.getenv("N_CLUSTERS", "4"))
    max_features = int(os.getenv("TFIDF_MAX_FEATURES", "1000"))
    port = int(os.getenv("FLASK_PORT", "6000"))
    # Debug defaults to on (preserving the original hard-coded debug=True)
    # but can now be disabled for production via FLASK_DEBUG=false.
    debug = os.getenv("FLASK_DEBUG", "true").lower() in ("1", "true", "yes")

    # Train once at startup: load -> preprocess -> fit the cluster model.
    job_descriptions = load_dataset(dataset_path)
    preprocessor = TextPreprocessor()
    processed_descriptions = job_descriptions.apply(preprocessor.preprocess)
    model = JobClusterModel(n_clusters=n_clusters, max_features=max_features)
    model.fit(processed_descriptions)
    app.run(debug=debug, port=port)