import string

import pandas as pd
# Show up to 500 characters of each cell when displaying dataframes, so long comments stay readable
pd.set_option('display.max_colwidth', 500)
import numpy as np

import plotly.express as px

# It is used for splitting data into training and testing sets
from sklearn.model_selection import train_test_split
# Pipelines are used to chain multiple steps together in a machine learning workflow
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# It is used to convert text documents into numeric feature vectors
# using TF-IDF representation (term frequency inverse of document frequency).
from sklearn.feature_extraction.text import TfidfVectorizer
# These functions are used to evaluate the performance of classification models.
from sklearn.metrics import precision_score, recall_score, precision_recall_curve
# It is used for visualizing precision-recall curves.
from sklearn.metrics import plot_precision_recall_curve
# GridSearchCV tunes model hyperparameters with a cross-validated grid search.
from sklearn.model_selection import GridSearchCV

# Load the nltk library for natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK resources this notebook relies on:
#   * 'punkt'     - tokenizer data used by word_tokenize to split text into words or tokens;
#   * 'stopwords' - the stop-word corpus read further down via stopwords.words("russian").
# Without the second download, stopwords.words() raises LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('stopwords')
# SnowballStemmer performs stemming: it reduces words to their base (root) form.
from nltk.stem import SnowballStemmer


# Read the labeled comments from disk, then preview the first rows and the table dimensions
df = pd.read_csv(filepath_or_buffer='./data/comments_labeled.csv')
df.head(n=5), df.shape

(                                                                                                                                                                                     comment  \
 0                                                                                                                                                       Верблюдов-то за что? Дебилы, бл...\n   
 1                                                                 Хохлы, это отдушина затюканого россиянина, мол, вон, а у хохлов еще хуже. Если бы хохлов не было, кисель их бы придумал.\n   
 2                                                                                                                                                                  Собаке - собачья смерть\n   
 3  Страницу обнови, дебил. Это тоже не оскорбление, а доказанный факт - не-дебил про себя во множественном числе писать не будет. Или мы в тебя верим - это ты и твои воображаемые друзья?\n   
 4                                                                  тебя не убедил 6-страничный пдф в том, что Скрипалей отравила Россия? Анализировать и думать пытаешься? Ватник что ли?)\n   
 
    toxic  
 0    1.0  
 1    1.0  
 2    1.0  
 3    1.0  
 4    1.0  ,
 (14412, 2))


# The 'toxic' labels are shown as floats (1.0) in the preview; store them as integers instead
df = df.astype({'toxic': int})
df.head(n=5)


# Class balance: count how many comments carry each value of the 'toxic' label
toxic_label_counts = df['toxic'].value_counts()
toxic_label_counts

0    9586
1    4826
Name: toxic, dtype: int64


# The dataset contains many toxic comments, but only aggressive and offensive ones are marked as bad.
# Show five randomly drawn toxic comments next to the first five non-toxic ones.
is_toxic = df['toxic'] == 1
a = df.loc[is_toxic, ['comment']].sample(n=5)
b = df.loc[df['toxic'] == 0, ['comment']].head(n=5)
a, b

(                                                                                                                                                      comment
 13892  Хохлы крайне примитивны, если тонко их троллить они ничего не поймут, другое дело если с ходу вбросить говно, минимум один хохол порвется и ответит.\n
 6776                                                                                     Почитаю твои посты на досуге и посмотрю ebay, может что и подберу)\n
 737                                                                         если хохлы это лахта,то они существуют страно что еще накрутчик раги не врубили\n
 2156               В который раз убеждаюсь, что с идиотами нельзя разговаривать как с нормальными людьми. который обосрал их игру Я её ни разу не запускал.\n
 10767                                                                                                  Да пидор он, доктор - А откуда у вас такие картинки?\n,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 comment
 6                                                                                                                                                                                                            В шапке были ссылки на инфу по текущему фильму марвел. Эти ссылки были заменены на фразу Репортим брипидора, игнорируем его посты. Если этого недостаточно, чтобы понять, что модератор абсолютный неадекват, и его нужно лишить полномочий, тогда эта борда пробивает абсолютное дно по неадекватности.\n
 12                                                                                                                                                                                                                                                                                                                                                                       Почитайте посты у этого автора,может найдете что нибудь полезное. Надеюсь помог) https: pikabu.ru story obyichnyie budni dezsluzhbyi 4932098\n
 17  Про графику было обидно) я так то проходил все серии гта со второй части по пятую, кроме гта 4. И мне не мешала графика ни в одной из частей. На компе у меня было куча видеокарт. Начиная с 32мб RIVA TNT и заканчивая 2Гб 560Ti на которой я спокойно играю который год в танки, гта5, ведьмака3 купил на распродаже и начал проходить. Да, не на ультрах. С пониженными текстурами. И не мешает. Я не понимаю дрочева на графике, требовать графику уровня плойки 4 минимум. Мне надо чтобы глаза не резало, ...
 28                                                                                                                                                                                                                                                                                                                                                                                                                                                    https: pp.userapi.com c848520 v848520411 11627b cOhWqFbGjWE.jpg\n
 36                                                                                                     Может и старый, может и маразматик. Про то писать кириллицей или латинницей вам виднее, не спорю. Но как задвигают русский язык уже видно. Дальше скажут что все будет только на казахском и внезапно пол страны окажется вторым сортом. (Я надеюсь, что ошибаюсь, но это уже проходили в других странах азии, прибалтики, закавказья) А так я за мир во всем мире. Русский слон лучший друг Казахского слона.\n)


# Divide the data into a test dataframe (500 comments) and a training dataframe.
# random_state pins the shuffle, so the split - and every metric, curve and threshold
# derived from it further down - is identical on each run.
train_df, test_df = train_test_split(df, test_size=500, random_state=42)
test_df.shape, train_df.shape

((500, 2), (13912, 2))


# Label distribution inside the held-out test set
test_label_counts = test_df['toxic'].value_counts()
test_label_counts

0    328
1    172
Name: toxic, dtype: int64


# Walk through the preprocessing steps on a single example comment
sentence_example = df.loc[1]["comment"]
# split the comment into tokens (words, signs) and lower-case them, so that capitalised
# stop words (e.g. "Если" at the start of a sentence) match the lower-case stop-word list
tokens = [token.lower() for token in word_tokenize(sentence_example, language="russian")]
# remove punctuation marks: drop every token that consists solely of punctuation characters.
# (`token not in string.punctuation` is a substring test, so it would keep tokens like "...")
punctuation_chars = set(string.punctuation)
tokens_without_punctuation = [
    token for token in tokens
    if not all(char in punctuation_chars for char in token)
]
# remove stop words (a set makes each membership check constant-time)
russian_stop_words = stopwords.words("russian")
stop_word_set = set(russian_stop_words)
tokens_without_stop_words_and_punctuation = [
    token for token in tokens_without_punctuation if token not in stop_word_set
]
# bring words to their stem (stemming)
snowball = SnowballStemmer(language="russian")
stemmed_tokens = [snowball.stem(token) for token in tokens_without_stop_words_and_punctuation]


# Write a function for text preprocessing
snowball = SnowballStemmer(language="russian")
russian_stop_words = stopwords.words("russian")
# Lookup sets built once, so the per-token checks inside the function are constant-time
_STOP_WORD_SET = frozenset(russian_stop_words)
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def tokenize_sentence(sentence: str, remove_stop_words: bool = True) -> list:
    """Turn a Russian sentence into a list of stemmed tokens.

    Steps: tokenize with NLTK, lower-case, drop punctuation-only tokens,
    optionally drop stop words, then stem every remaining token.

    Args:
        sentence: Raw text to preprocess.
        remove_stop_words: When True (default), tokens found in the NLTK
            Russian stop-word list are removed before stemming.

    Returns:
        The stemmed tokens, in their original order.
    """
    # Lower-case first so capitalised stop words ("Если") are recognised as well.
    tokens = [token.lower() for token in word_tokenize(sentence, language="russian")]
    # A token is punctuation when *all* of its characters are; a plain
    # `token in string.punctuation` substring test would let "..." through.
    tokens = [
        token for token in tokens
        if not all(char in _PUNCTUATION_CHARS for char in token)
    ]
    if remove_stop_words:
        tokens = [token for token in tokens if token not in _STOP_WORD_SET]
    return [snowball.stem(token) for token in tokens]


tokenize_sentence(sentence_example)

['хохл',
 'эт',
 'отдушин',
 'затюкан',
 'россиянин',
 'мол',
 'вон',
 'хохл',
 'хуж',
 'есл',
 'хохл',
 'кисел',
 'придума']


# Create a TF-IDF vectorizer and replace its tokenizer with ours.
# tokenize_sentence is passed directly: remove_stop_words already defaults to True, so the
# lambda wrapper added nothing, and a lambda prevents the fitted vectorizer from being pickled.
vectorizer = TfidfVectorizer(tokenizer=tokenize_sentence)
# Fit the vectorizer on the training comments; fit_transform returns the ready-made
# features that we can pass to the machine learning model
features = vectorizer.fit_transform(train_df['comment'])


# Fit a logistic-regression classifier on the TF-IDF features and the training labels
train_labels = train_df['toxic']
model = LogisticRegression(random_state=10)
model.fit(X=features, y=train_labels)

LogisticRegression(random_state=10)


# Quick check of the trained model: predict the label of the first row of the training features
first_train_features = features[0]
model.predict(first_train_features)

array([1])


# The prediction looks correct. The comment is fetched by position rather than by label:
# splitting the dataset into training and test parts shuffled the rows, so the original
# index labels are no longer in order.
train_df['comment'].iat[0]

'Зигани на фюрера, полегчает...\n'


# Create a Pipeline that chains feature extraction and the model,
# so it accepts raw sentences as input, not vectors.
# tokenize_sentence is passed directly: remove_stop_words already defaults to True, and unlike
# a lambda a named function keeps the fitted pipeline picklable.
model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence)),
    ("model", LogisticRegression(random_state=10)),
])


# Pass it the training sentences and labels and train the model.
model_pipeline.fit(train_df["comment"], train_df["toxic"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000001A5C1D605E8>)),
                ('model', LogisticRegression(random_state=10))])


# Check the pipeline on a friendly comment
positive_comment = "какая-то фича, привет, всё нормально у тебя?"
model_pipeline.predict([positive_comment])

array([0])


# Check the pipeline on a rude comment
rude_comment = "пошел нахер отсюда"
model_pipeline.predict([rude_comment])

array([1])


# Precision = TP / (TP + FP), measured on the held-out test comments
test_predictions = model_pipeline.predict(test_df["comment"])
precision_score(y_true=test_df["toxic"], y_pred=test_predictions)

0.896


# Recall = TP / (TP + FN), measured on the held-out test comments
test_predictions = model_pipeline.predict(test_df["comment"])
recall_score(y_true=test_df["toxic"], y_pred=test_predictions)

0.6511627906976745


# The decision threshold is swept from 0 to 1; precision and recall are calculated for each
# threshold, giving the 3 arrays the graph is built from.
# The arguments are passed positionally: newer scikit-learn releases deprecate the
# `probas_pred` keyword in favour of `y_score`, so neither keyword works on every version.
test_scores = model_pipeline.predict_proba(test_df["comment"])[:, 1]
prec, rec, thresholds = precision_recall_curve(test_df["toxic"], test_scores)


# Build the graph with PrecisionRecallDisplay.from_estimator(model, X_test, y_test), the
# replacement for plot_precision_recall_curve (deprecated in scikit-learn 1.0, removed in 1.2).
# NOTE: the top-of-file import of plot_precision_recall_curve also has to go before
# upgrading to scikit-learn >= 1.2.
from sklearn.metrics import PrecisionRecallDisplay

PrecisionRecallDisplay.from_estimator(model_pipeline, test_df["comment"], test_df["toxic"])

d:\Dev\pandas_data_vis\env\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning:

Function plot_precision_recall_curve is deprecated; Function `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: PrecisionRecallDisplay.from_predictions or PrecisionRecallDisplay.from_estimator.

<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x1a5c23b4ac8>


# prec and rec are longer than thresholds, so their last elements are left out
# to obtain three columns of equal length for the dataframe.
data = dict(precision=prec[:-1], recall=rec[:-1], thresholds=thresholds)

# Collect the columns in a dataframe
df_tresholds = pd.DataFrame(data)

# Interactive precision-recall curve; the threshold is added to the hover information
plot_options = {
    "x": "recall",
    "y": "precision",
    "hover_data": ["thresholds"],
    "title": "Precision-Recall Curve",
    "width": 1200,
    "height": 500,
}
fig = px.line(df_tresholds, **plot_options)

fig.show()


# Positions on the curve where precision exceeds 0.95 while recall stays above 0.46
high_precision_mask = (prec > 0.95) & (rec > 0.46)
np.nonzero(high_precision_mask)

(array([405], dtype=int64),)


# Show the operating point(s) with precision > 0.95 and recall > 0.46.
# The rows are selected by condition instead of by a position copied from an earlier run's
# output: such a position refers to a different (or missing) row as soon as the split changes.
df_tresholds[(df_tresholds["precision"] > 0.95) & (df_tresholds["recall"] > 0.46)]

precision     0.952381
recall        0.465116
thresholds    0.600599
Name: 405, dtype: float64


# Calculate the precision_score again, this time classifying with the stricter threshold.
# The threshold is derived from the curve instead of a position copied from an earlier run.
test_scores = model_pipeline.predict_proba(test_df["comment"])[:, 1]
# prec / rec carry one extra trailing element, so trim them to line up with thresholds
meets_target = (prec[:-1] > 0.95) & (rec[:-1] > 0.46)
if not meets_target.any():
    raise ValueError("no threshold reaches precision > 0.95 with recall > 0.46 on this test split")
# thresholds are returned in increasing order, so the first match is the lowest qualifying one
selected_threshold = thresholds[meets_target][0]
precision_score(y_true=test_df["toxic"], y_pred=test_scores >= selected_threshold)

0.9523809523809523


# Tune the regularisation parameter C with a grid search placed in the "model" step.
# NOTE(review): the TF-IDF step sits outside the search, so it is fitted on all training
# comments before the cross-validation folds are formed - confirm this is acceptable.
grid_pipeline = Pipeline([
    # tokenize_sentence is passed directly: remove_stop_words already defaults to True, and
    # unlike a lambda a named function keeps the fitted pipeline picklable.
    ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence)),
    ("model",
     GridSearchCV(
        LogisticRegression(random_state=10),
        # 3 candidate values for C.
        param_grid={'C': [0.2, 1, 2]},
        # during cross validation, we will split it into 3 folds. Fitting 3 folds for each of the 3 candidates
        cv=3,
        # display all information about training in the terminal
        verbose=4
        )
    )
])


# We pass the training sentences and labels to the Pipeline; fitting it runs the grid search.
grid_pipeline.fit(train_df["comment"], train_df["toxic"])

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 1/3] END .............................C=0.2;, score=0.734 total time=   0.2s
[CV 2/3] END .............................C=0.2;, score=0.730 total time=   0.2s
[CV 3/3] END .............................C=0.2;, score=0.733 total time=   0.1s
[CV 1/3] END ...............................C=1;, score=0.839 total time=   0.5s
[CV 2/3] END ...............................C=1;, score=0.844 total time=   0.5s
[CV 3/3] END ...............................C=1;, score=0.834 total time=   0.4s
[CV 1/3] END ...............................C=2;, score=0.856 total time=   0.6s
[CV 2/3] END ...............................C=2;, score=0.862 total time=   0.5s
[CV 3/3] END ...............................C=2;, score=0.851 total time=   0.5s

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000001A5C260AC18>)),
                ('model',
                 GridSearchCV(cv=3,
                              estimator=LogisticRegression(random_state=10),
                              param_grid={'C': [0.2, 1, 2]}, verbose=4))])


# Substitute parameter C = 2 (the candidate with the highest scores in the recorded
# grid-search run) and create a pipeline.
# tokenize_sentence is passed directly: remove_stop_words already defaults to True, and unlike
# a lambda a named function keeps the fitted pipeline picklable.
model_pipeline_c_1 = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence)),
    ("model", LogisticRegression(random_state=0, C=2)),
])


model_pipeline_c_1.fit(train_df["comment"], train_df["toxic"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000001A5C2B7FA68>)),
                ('model', LogisticRegression(C=2, random_state=0))])


# Trying a rude comment on the newly trained C = 2 pipeline
# (the original cell queried the earlier model_pipeline, so the new model was never checked)
model_pipeline_c_1.predict(["пошел нахер отсюда"])

array([1])


# Check the C = 2 pipeline on a friendly comment
positive_comment = "какая-то  фича, привет, всё нормально у тебя?"
model_pipeline_c_1.predict([positive_comment])

array([0])


# Compute the 3 arrays (precision, recall, thresholds) for the chart of the C = 2 pipeline.
# The arguments are passed positionally: newer scikit-learn releases deprecate the
# `probas_pred` keyword in favour of `y_score`, so neither keyword works on every version.
test_scores_c_1 = model_pipeline_c_1.predict_proba(test_df["comment"])[:, 1]
prec_c_1, rec_c_1, thresholds_c_1 = precision_recall_curve(test_df["toxic"], test_scores_c_1)


prec_c_1, rec_c_1, thresholds_c_1
# Leave out the last precision / recall elements so the three columns have equal length
data = dict(precision=prec_c_1[:-1], recall=rec_c_1[:-1], thresholds=thresholds_c_1)

# Replace the earlier threshold table with the one for the C = 2 pipeline
df_tresholds = pd.DataFrame(data)

# Interactive precision-recall curve; the threshold is added to the hover information
plot_options = {
    "x": "recall",
    "y": "precision",
    "hover_data": ["thresholds"],
    "title": "Precision-Recall Curve",
    "width": 1200,
    "height": 500,
}
fig = px.line(df_tresholds, **plot_options)

fig.show()

Product task - removing harmful and offensive comments.

Text preprocessing.

We will use the TF-IDF algorithm

Create a model

Pipeline

The model works properly in our examples, but we need to calculate the metrics.

	comment	toxic
0	Верблюдов-то за что? Дебилы, бл...\n	1
1	Хохлы, это отдушина затюканого россиянина, мол, вон, а у хохлов еще хуже. Если бы хохлов не было, кисель их бы придумал.\n	1
2	Собаке - собачья смерть\n	1
3	Страницу обнови, дебил. Это тоже не оскорбление, а доказанный факт - не-дебил про себя во множественном числе писать не будет. Или мы в тебя верим - это ты и твои воображаемые друзья?\n	1
4	тебя не убедил 6-страничный пдф в том, что Скрипалей отравила Россия? Анализировать и думать пытаешься? Ватник что ли?)\n	1