-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
81 lines (62 loc) · 2.79 KB
/
main.py
File metadata and controls
81 lines (62 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import zipfile
import pandas as pd
import pickle
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# The stopword corpora must be present before the vectorizer is built.
nltk.download('stopwords')
from nltk.corpus import stopwords

model = None
vectorizer = None

# Bilingual (English + Russian) toxicity classifier: TF-IDF features over
# both languages' tokens, with both languages' stopwords removed.
stop_words = stopwords.words("english") + stopwords.words("russian")
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    min_df=1,
    max_df=0.7,
    lowercase=True,
    # Words of >= 2 Cyrillic or Latin letters. NOTE: the original pattern
    # was [а-яА-Я-a-zA-Z]; the stray '-' after 'А-Я' formed the regex range
    # '-'..'a' (digits and punctuation), letting garbage tokens into the
    # vocabulary. The hyphen is removed here.
    token_pattern=r"(?u)\b[а-яА-Яa-zA-Z]{2,}\b"
)

if not os.path.exists('model.pkl'):
    # --- First run: download data, train, evaluate, and cache the model. ---
    dataset = "dataset"
    eng_dataset_zip = os.path.join(dataset, "jigsaw-toxic-comment-classification-challenge.zip")
    ru_dataset_zip = os.path.join(dataset, "russian-language-toxic-comments.zip")

    # Fetch and unpack each Kaggle dataset only if its archive is missing.
    if not os.path.exists(eng_dataset_zip):
        os.system(f"kaggle datasets download -d julian3833/jigsaw-toxic-comment-classification-challenge -p {dataset}")
    with zipfile.ZipFile(eng_dataset_zip) as zip_ref:
        zip_ref.extractall(dataset)
    if not os.path.exists(ru_dataset_zip):
        os.system(f"kaggle datasets download -d blackmoon/russian-language-toxic-comments -p {dataset}")
    with zipfile.ZipFile(ru_dataset_zip) as zip_ref:
        zip_ref.extractall(dataset)

    # os.path.join keeps the paths portable (the original used literal '\\',
    # which only works on Windows).
    eng_train_df = pd.read_csv(os.path.join(dataset, "train.csv"))
    eng_test_labels_df = pd.read_csv(os.path.join(dataset, "test_labels.csv"))
    eng_test_df = pd.read_csv(os.path.join(dataset, "test.csv")).merge(eng_test_labels_df, on="id")
    # Jigsaw marks test rows that were never scored with toxic == -1;
    # keeping them would corrupt the accuracy metric.
    eng_test_df = eng_test_df[eng_test_df["toxic"] != -1]

    # Align the Russian dataset's text column name with the English one.
    rus_df = pd.read_csv(os.path.join(dataset, "labeled.csv")).rename(columns={"comment": "comment_text"})
    rus_train_df, rus_test_df = train_test_split(rus_df, test_size=0.2, random_state=42)

    combined_train_df = pd.concat([eng_train_df, rus_train_df], ignore_index=True)
    # BUGFIX: the original concatenated rus_train_df here, so Russian
    # training rows leaked into the test set and rus_test_df went unused.
    combined_test_df = pd.concat([eng_test_df, rus_test_df], ignore_index=True)
    print(combined_train_df[combined_train_df["toxic"] == 1.0])

    # Fit TF-IDF on training text only; reuse the fitted vocabulary for test.
    target_train = combined_train_df["toxic"]
    data_train = vectorizer.fit_transform(combined_train_df["comment_text"])
    target_test = combined_test_df["toxic"]
    data_test = vectorizer.transform(combined_test_df["comment_text"])

    model = LogisticRegression()
    _ = model.fit(data_train, target_train)

    # Persist model and vectorizer together so later runs skip training.
    with open('model.pkl', 'wb') as file:
        pickle.dump((model, vectorizer), file)

    pred = model.predict(data_test)
    accuracy = accuracy_score(target_test, pred)
    print(f"Accuracy: {accuracy:.4f}")
else:
    # --- Subsequent runs: load the cached model/vectorizer pair. ---
    # NOTE: pickle.load is only safe because model.pkl is produced locally
    # by this script; never load pickles from untrusted sources.
    with open('model.pkl', 'rb') as file:
        model, vectorizer = pickle.load(file)

# Classify a single user-supplied comment.
user_comment = input("write up your comment here: ")
comment_pred = model.predict(vectorizer.transform([user_comment]))
isToxic = {
    0: "Your comment is kindness.",
    1: "Your comment is toxic.",
}
print(isToxic.get(comment_pred[0]))