-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
81 lines (62 loc) · 2.79 KB
/
main.py
File metadata and controls
81 lines (62 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import zipfile
import pandas as pd
import pickle
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# The stopword corpora must be present before the vectorizer is built.
nltk.download('stopwords')
from nltk.corpus import stopwords

model = None
vectorizer = None

# Bilingual (English + Russian) toxicity classifier: TF-IDF features over
# both languages' tokens, with both languages' stopwords removed.
stop_words = stopwords.words("english") + stopwords.words("russian")
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    min_df=1,
    max_df=0.7,
    lowercase=True,
    # Words of >= 2 Cyrillic or Latin letters. NOTE: the original pattern
    # was [а-яА-Я-a-zA-Z]; the stray '-' after 'А-Я' formed the regex range
    # '-'..'a' (digits and punctuation), letting garbage tokens into the
    # vocabulary. The hyphen is removed here.
    token_pattern=r"(?u)\b[а-яА-Яa-zA-Z]{2,}\b"
)

if not os.path.exists('model.pkl'):
    # --- First run: download data, train, evaluate, and cache the model. ---
    dataset = "dataset"
    eng_dataset_zip = os.path.join(dataset, "jigsaw-toxic-comment-classification-challenge.zip")
    ru_dataset_zip = os.path.join(dataset, "russian-language-toxic-comments.zip")

    # Fetch and unpack each Kaggle dataset only if its archive is missing.
    if not os.path.exists(eng_dataset_zip):
        os.system(f"kaggle datasets download -d julian3833/jigsaw-toxic-comment-classification-challenge -p {dataset}")
    with zipfile.ZipFile(eng_dataset_zip) as zip_ref:
        zip_ref.extractall(dataset)
    if not os.path.exists(ru_dataset_zip):
        os.system(f"kaggle datasets download -d blackmoon/russian-language-toxic-comments -p {dataset}")
    with zipfile.ZipFile(ru_dataset_zip) as zip_ref:
        zip_ref.extractall(dataset)

    # os.path.join keeps the paths portable (the original used literal '\\',
    # which only works on Windows).
    eng_train_df = pd.read_csv(os.path.join(dataset, "train.csv"))
    eng_test_labels_df = pd.read_csv(os.path.join(dataset, "test_labels.csv"))
    eng_test_df = pd.read_csv(os.path.join(dataset, "test.csv")).merge(eng_test_labels_df, on="id")
    # Jigsaw marks test rows that were never scored with toxic == -1;
    # keeping them would corrupt the accuracy metric.
    eng_test_df = eng_test_df[eng_test_df["toxic"] != -1]

    # Align the Russian dataset's text column name with the English one.
    rus_df = pd.read_csv(os.path.join(dataset, "labeled.csv")).rename(columns={"comment": "comment_text"})
    rus_train_df, rus_test_df = train_test_split(rus_df, test_size=0.2, random_state=42)

    combined_train_df = pd.concat([eng_train_df, rus_train_df], ignore_index=True)
    # BUGFIX: the original concatenated rus_train_df here, so Russian
    # training rows leaked into the test set and rus_test_df went unused.
    combined_test_df = pd.concat([eng_test_df, rus_test_df], ignore_index=True)
    print(combined_train_df[combined_train_df["toxic"] == 1.0])

    # Fit TF-IDF on training text only; reuse the fitted vocabulary for test.
    target_train = combined_train_df["toxic"]
    data_train = vectorizer.fit_transform(combined_train_df["comment_text"])
    target_test = combined_test_df["toxic"]
    data_test = vectorizer.transform(combined_test_df["comment_text"])

    model = LogisticRegression()
    _ = model.fit(data_train, target_train)

    # Persist model and vectorizer together so later runs skip training.
    with open('model.pkl', 'wb') as file:
        pickle.dump((model, vectorizer), file)

    pred = model.predict(data_test)
    accuracy = accuracy_score(target_test, pred)
    print(f"Accuracy: {accuracy:.4f}")
else:
    # --- Subsequent runs: load the cached model/vectorizer pair. ---
    # NOTE: pickle.load is only safe because model.pkl is produced locally
    # by this script; never load pickles from untrusted sources.
    with open('model.pkl', 'rb') as file:
        model, vectorizer = pickle.load(file)

# Classify a single user-supplied comment.
user_comment = input("write up your comment here: ")
comment_pred = model.predict(vectorizer.transform([user_comment]))
isToxic = {
    0: "Your comment is kindness.",
    1: "Your comment is toxic.",
}
print(isToxic.get(comment_pred[0]))