import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Let pandas display up to 500 characters per cell so long text values are not cut off
pd.options.display.max_colwidth = 500


# Load the raw annotations from disk and preview the first ten rows
df = pd.read_csv('./data/nlp-challenge.csv')
df.head(10)


# Dataset dimensions as (rows, columns)
df.shape

(36417, 3)


# Average number of annotations per text: total row count divided by the number of distinct 'Text' values
len(df) / df['Text'].nunique()

3.000494356101178


# Count the rows that exactly repeat an earlier row (all columns equal)
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0


# Number of missing values in each column
df.isnull().sum()

Annotator ID    0
Text            0
Answer          0
dtype: int64


# For every annotator, count how many distinct 'Answer' values they gave:
# 1 means they marked every text with the same answer, 2 means their answers varied
answers_by_annotator = df.groupby('Annotator ID')['Answer']
annotators_stat = answers_by_annotator.nunique().reset_index(name='Count unique Answers')
annotators_stat


# Number of annotations (rows with a non-null 'Text') submitted by each annotator.
# NOTE: despite the column name, .count() is a plain row count, not a count of distinct texts.
texts_by_annotator = df.groupby('Annotator ID')['Text']
answers_count = texts_by_annotator.count().reset_index(name='Count unique Texts')
answers_count


# Combine the two per-annotator tables into one, matching rows on 'Annotator ID'
annotators_stat = annotators_stat.merge(answers_count, on='Annotator ID')


# Annotators who gave the same answer on every annotation they made
single_answer_mask = annotators_stat['Count unique Answers'].eq(1)
annotators_stat[single_answer_mask]


# Keep only rows whose ('Text', 'Answer') pair occurs exactly once in the dataset,
# i.e. no other row carries the same text with the same answer
repeated_pair = df.duplicated(subset=['Text', 'Answer'], keep=False)
unique_rows = df[~repeated_pair]

# Wrap the selection in a new DataFrame and inspect ten randomly sampled rows
unique_df = pd.DataFrame(unique_rows)
unique_df.sample(10)


# Dimensions of that DataFrame as (rows, columns)
unique_df.shape

(7792, 3)


# Per annotator, count how many rows of unique_df they contributed
lone_rows_by_annotator = unique_df.groupby('Annotator ID')['Text']
uniq_answer_count = lone_rows_by_annotator.count().reset_index(name='Count unique text/annot')
uniq_answer_count


# Attach the per-annotator counts from uniq_answer_count to annotators_stat.
# The left join keeps annotators that have no row in uniq_answer_count; their count comes back as NaN.
annotators_stat = pd.merge(annotators_stat, uniq_answer_count, on='Annotator ID', how='left')
# Replace those NaN values with 0. The result is assigned back to the column instead of
# calling fillna(inplace=True) on the selected column: that form is a chained assignment,
# which warns on recent pandas and does not update the parent frame under Copy-on-Write.
annotators_stat['Count unique text/annot'] = annotators_stat['Count unique text/annot'].fillna(0)


annotators_stat.head()


# Keep only annotators with more than 20 annotations so that very small samples do not produce random outliers
enough_answers = annotators_stat['Count unique Texts'] > 20
annotators_more_20 = annotators_stat[enough_answers]
annotators_more_20.head()


# Share (in percent) of each selected annotator's annotations that are suspicious,
# i.e. 'Count unique text/annot' relative to 'Count unique Texts'
suspicious_share = annotators_more_20['Count unique text/annot'] * 100 / annotators_more_20['Count unique Texts']
Percentages = pd.DataFrame({
    'Annotator ID': annotators_more_20['Annotator ID'],
    'Percentage': suspicious_share,
})

Percentages


# Plot setup: use a wide, short figure
sns.set(rc={'figure.figsize': (10, 3)})

# One color per bar: red for annotators with at least 25% suspicious responses, blue otherwise
colors = ['red' if pct >= 25 else 'blue' for pct in Percentages['Percentage']]

# Bar plot of the percentage per annotator, colored by the list built above
# NOTE(review): recent seaborn releases reportedly warn when `palette` is passed without `hue` — confirm against the installed version
ax = sns.barplot(x='Annotator ID', y='Percentage', data=Percentages, palette=colors)

ax.set_xlabel('Annotator ID')
ax.set_ylabel('Percentage of suspicious responses')
plt.xticks(rotation=90)
plt.show()


# Collect the IDs of annotators with at least 25% suspicious responses
is_suspicious = Percentages['Percentage'] >= 25
Bot_IDs = Percentages.loc[is_suspicious, 'Annotator ID'].tolist()
print(Bot_IDs)

['A1MG8KNVSVZ365', 'A2CJFO19NY4T5R', 'A33Y36Y252Z30U', 'A3BJX6UUSOIKFN', 'A3OCJJMRKAIJZA', 'A9HQ3E0F2AGVO', 'AQIP3DSYXEXX5']


# Drop every annotation made by a flagged annotator and write the remaining rows to a new file
from_bot = df['Annotator ID'].isin(Bot_IDs)
new_df = df[~from_bot]

new_df.to_csv('./data/nlp-challenge-without-bots.csv', index=False)

	Annotator ID	Count unique Answers
0	A16184N1RO5OJV	2
1	A198H320N5MSXX	2
2	A19X8IA9EKC3XH	2
3	A1DP551UV06FN6	2
4	A1G187YBG0DVMQ	2
5	A1M5BJTQIXCM33	2
6	A1MG8KNVSVZ365	2
7	A1MJVTR0PCKBWW	2
8	A1R0689JPSQ3OF	2
9	A1SN0UU3V61IIM	1
10	A1YSYI926BBOHW	2
11	A27W025UEXS1G0	1
12	A292TFDMNVS0TP	2
13	A2A78DMGLC1S0Y	2
14	A2CJFO19NY4T5R	2
15	A2GM5BW75YCKKW	2
16	A2GO2OXS4VM1PR	1
17	A2HM35CWB7IIFM	2
18	A2JP9IKRHNLRPI	2
19	A2KHLJ2F58BEZK	2
20	A2LU259QPV1I4V	2
21	A2QTL039A5VV3I	1
22	A2R25RH05K0B68	2
23	A2R28HXAEFKBPC	2
24	A2VQBOJJ8HD6W9	2
25	A2WPHVMLLEV5ZB	2
26	A3124SRR191UIL	2
27	A33B85TN97HQ33	2
28	A33Y36Y252Z30U	2
29	A3BCKNE5CWHODZ	2
30	A3BISMR4GI02ZG	2
31	A3BJX6UUSOIKFN	2
32	A3HYCA7N5F6DL9	2
33	A3MV3PT4TOO69P	2
34	A3OCJJMRKAIJZA	2
35	A9HQ3E0F2AGVO	2
36	AAX9LTAOIBECD	2
37	AC95JAUAM2L2Z	2
38	AD1ILDUXZHASF	2
39	AG36U7IOG2LAP	2
40	AGRYG65WJ1CKJ	2
41	AJQGWGESKQT4Y	2
42	AKQAI78JTXXC9	2
43	AKSJ3C5O3V9RB	1
44	AKSLU0C30G3JT	2
45	AMYURTQIMAC8T	2
46	AOOF0H0EIEFFQ	2
47	AQIP3DSYXEXX5	2
48	AR9AU5FY1S3RO	2
49	ARW1TCHCLEK1W	2
50	AXY0D2AMLKE2A	1
51	AYTH0E5PUXWX8	2

	Annotator ID	Count unique Texts
0	A16184N1RO5OJV	119
1	A198H320N5MSXX	6
2	A19X8IA9EKC3XH	429
3	A1DP551UV06FN6	39
4	A1G187YBG0DVMQ	6
5	A1M5BJTQIXCM33	124
6	A1MG8KNVSVZ365	3561
7	A1MJVTR0PCKBWW	980
8	A1R0689JPSQ3OF	31
9	A1SN0UU3V61IIM	5
10	A1YSYI926BBOHW	84
11	A27W025UEXS1G0	1
12	A292TFDMNVS0TP	3
13	A2A78DMGLC1S0Y	100
14	A2CJFO19NY4T5R	138
15	A2GM5BW75YCKKW	714
16	A2GO2OXS4VM1PR	2
17	A2HM35CWB7IIFM	73
18	A2JP9IKRHNLRPI	3
19	A2KHLJ2F58BEZK	78
20	A2LU259QPV1I4V	2689
21	A2QTL039A5VV3I	1
22	A2R25RH05K0B68	144
23	A2R28HXAEFKBPC	7
24	A2VQBOJJ8HD6W9	5
25	A2WPHVMLLEV5ZB	334
26	A3124SRR191UIL	205
27	A33B85TN97HQ33	194
28	A33Y36Y252Z30U	99
29	A3BCKNE5CWHODZ	1443
30	A3BISMR4GI02ZG	3589
31	A3BJX6UUSOIKFN	1472
32	A3HYCA7N5F6DL9	67
33	A3MV3PT4TOO69P	5126
34	A3OCJJMRKAIJZA	5001
35	A9HQ3E0F2AGVO	1217
36	AAX9LTAOIBECD	18
37	AC95JAUAM2L2Z	35
38	AD1ILDUXZHASF	267
39	AG36U7IOG2LAP	250
40	AGRYG65WJ1CKJ	201
41	AJQGWGESKQT4Y	45
42	AKQAI78JTXXC9	20
43	AKSJ3C5O3V9RB	2
44	AKSLU0C30G3JT	11
45	AMYURTQIMAC8T	532
46	AOOF0H0EIEFFQ	565
47	AQIP3DSYXEXX5	3469
48	AR9AU5FY1S3RO	990
49	ARW1TCHCLEK1W	1042
50	AXY0D2AMLKE2A	17
51	AYTH0E5PUXWX8	864

	Annotator ID	Count unique Answers	Count unique Texts
9	A1SN0UU3V61IIM	1	5
11	A27W025UEXS1G0	1	1
16	A2GO2OXS4VM1PR	1	2
21	A2QTL039A5VV3I	1	1
43	AKSJ3C5O3V9RB	1	2
50	AXY0D2AMLKE2A	1	17

	Annotator ID	Text	Answer
16699	A3OCJJMRKAIJZA	@DarkCryptoLord @kucoincom Why do we still have/use CEXs? The liquidity should be sucked out from them like you distribute/take all left assets from a dead person. Whoever sents money to a dying person soon to be dead should know better. Hard talk here but dinosaurs also didn't make it at the end<span class='emoji-bytes' data-emoji-bytes='[240, 159, 152, 130]'></span>	True
20979	A3BISMR4GI02ZG	@Paisan26849860 @moneywraith @BitrueOfficial How much do they insure? 10 , 100, 1000They never give the full amount.Even real banks only insure account till like 100k	False
56	AQIP3DSYXEXX5	@sheldonbitmart @BitMartExchange hopefully you can buy back my rug pulled tokens on your platform aswell wtf am I gonna do wit 35 trillion tzki tokens at least give me the 3.18 you value it at.	False
25375	A3BISMR4GI02ZG	@EthereumBlue Thanks a million, I managed to withdraw 60% of my funds from Etherdelta before the hackers cleaned me out thanks to your video - no thanks to @etherdelta . Please send me you wallet address so I can show my appreciation. Respect.	False
30243	A3OCJJMRKAIJZA	The #DogeBonk community dodged another bullet with the recent #bitmarthack. $DOBO was offered a listing at #BitMart but decided not to accept. Good decision. #CryptocurrencyNews #BSCGem #BTC #Hacked https://t.co/blXz9NzTgv	True
15028	A3BCKNE5CWHODZ	@Mike28356315 @BitrueOfficial I think you'll regret later. This exchange rules, just a hiccup in a long journey. BTR is gonna be a smash hit success	True
5002	A3MV3PT4TOO69P	@1cyrilblanc1 @spiroseliot Also worth remembering that @bitfinex is victim as somebody has destroyed their business until we know otherwise	True
12566	AQIP3DSYXEXX5	@nutbutterzzz @justinsuntron @binance @cz_binance :) Peach has never showed to me or to anyone how much he still holds. However that is not important. No trolling allowed. Husky shows great similarities with Peach in terms of his style and everything else. Thanks for supporting me and you make me possible to buy Yeed for cheap	True
12406	A2LU259QPV1I4V	@crypto_bobby no they didn't even technically lose their money since exchange balances are IOUs that say kucoin is obligated to give you something. Kucoin lost money which means they may or may not honor IOUs. You lost your money as soon as you deposited	True
584	ARW1TCHCLEK1W	@Atiyda We love you too We'll continue improving to make Bitrue the #1 exchange worldwide, this is just a little setback on our journey	False

	Annotator ID	Count unique text/annot
0	A16184N1RO5OJV	10
1	A198H320N5MSXX	1
2	A19X8IA9EKC3XH	19
3	A1DP551UV06FN6	5
4	A1M5BJTQIXCM33	27
5	A1MG8KNVSVZ365	961
6	A1MJVTR0PCKBWW	81
7	A1R0689JPSQ3OF	4
8	A1YSYI926BBOHW	16
9	A292TFDMNVS0TP	3
10	A2A78DMGLC1S0Y	19
11	A2CJFO19NY4T5R	46
12	A2GM5BW75YCKKW	105
13	A2HM35CWB7IIFM	6
14	A2JP9IKRHNLRPI	1
15	A2KHLJ2F58BEZK	12
16	A2LU259QPV1I4V	241
17	A2R25RH05K0B68	15
18	A2VQBOJJ8HD6W9	2
19	A2WPHVMLLEV5ZB	53
20	A3124SRR191UIL	33
21	A33B85TN97HQ33	46
22	A33Y36Y252Z30U	32
23	A3BCKNE5CWHODZ	341
24	A3BISMR4GI02ZG	833
25	A3BJX6UUSOIKFN	605
26	A3HYCA7N5F6DL9	5
27	A3MV3PT4TOO69P	777
28	A3OCJJMRKAIJZA	1442
29	A9HQ3E0F2AGVO	384
30	AAX9LTAOIBECD	4
31	AC95JAUAM2L2Z	1
32	AD1ILDUXZHASF	22
33	AG36U7IOG2LAP	44
34	AGRYG65WJ1CKJ	36
35	AJQGWGESKQT4Y	6
36	AKQAI78JTXXC9	3
37	AKSLU0C30G3JT	2
38	AMYURTQIMAC8T	40
39	AOOF0H0EIEFFQ	52
40	AQIP3DSYXEXX5	1060
41	AR9AU5FY1S3RO	132
42	ARW1TCHCLEK1W	139
43	AXY0D2AMLKE2A	2
44	AYTH0E5PUXWX8	124

NLP task - Identifying bots among annotators

First hypothesis:

Second hypothesis:

	Annotator ID	Text	Answer
0	A1MG8KNVSVZ365	@cz_binance Thanks for being upfront about this CZ. Your fans still believe in you. Plug the holes that need to be plugged. You'll be fine. Good luck.	True
1	A3OCJJMRKAIJZA	@cz_binance Thanks for being upfront about this CZ. Your fans still believe in you. Plug the holes that need to be plugged. You'll be fine. Good luck.	True
2	AQIP3DSYXEXX5	@cz_binance Thanks for being upfront about this CZ. Your fans still believe in you. Plug the holes that need to be plugged. You'll be fine. Good luck.	True
3	A1MG8KNVSVZ365	@cz_binance The speed with which Binance has responded to their security incidentincluding public disclosureis impressive, and I admire the clarity of the message that they take responsibility and will make the users whole.	False
4	A3MV3PT4TOO69P	@cz_binance The speed with which Binance has responded to their security incidentincluding public disclosureis impressive, and I admire the clarity of the message that they take responsibility and will make the users whole.	True
5	AQIP3DSYXEXX5	@cz_binance The speed with which Binance has responded to their security incidentincluding public disclosureis impressive, and I admire the clarity of the message that they take responsibility and will make the users whole.	True
6	A1MG8KNVSVZ365	@ByzBox @cz_binance The big difference is that governments allow the banks to be corrupt, which makes governments corrupt too. Despite all of these entities being corrupt, Binance isn't protected by governments the way banks are. This scandal does affect Binance.	False
7	A3MV3PT4TOO69P	@ByzBox @cz_binance The big difference is that governments allow the banks to be corrupt, which makes governments corrupt too. Despite all of these entities being corrupt, Binance isn't protected by governments the way banks are. This scandal does affect Binance.	True
8	AGRYG65WJ1CKJ	@ByzBox @cz_binance The big difference is that governments allow the banks to be corrupt, which makes governments corrupt too. Despite all of these entities being corrupt, Binance isn't protected by governments the way banks are. This scandal does affect Binance.	False
9	A1MG8KNVSVZ365	@King_Tech__ You might have to read the terms and conditions... No use talking now since your funds are already in hotbit and no exchange will give a choice like that... Hope we are smart enough to understand that... Money is money either way doesn't come 4r free unless they want something	False

	Annotator ID	Count unique Answers	Count unique Texts	Count unique text/annot
0	A16184N1RO5OJV	2	119	10.0
1	A198H320N5MSXX	2	6	1.0
2	A19X8IA9EKC3XH	2	429	19.0
3	A1DP551UV06FN6	2	39	5.0
4	A1G187YBG0DVMQ	2	6	0.0

	Annotator ID	Percentage
0	A16184N1RO5OJV	8.403361
2	A19X8IA9EKC3XH	4.428904
3	A1DP551UV06FN6	12.820513
5	A1M5BJTQIXCM33	21.774194
6	A1MG8KNVSVZ365	26.986801
7	A1MJVTR0PCKBWW	8.265306
8	A1R0689JPSQ3OF	12.903226
10	A1YSYI926BBOHW	19.047619
13	A2A78DMGLC1S0Y	19.000000
14	A2CJFO19NY4T5R	33.333333
15	A2GM5BW75YCKKW	14.705882
17	A2HM35CWB7IIFM	8.219178
19	A2KHLJ2F58BEZK	15.384615
20	A2LU259QPV1I4V	8.962440
22	A2R25RH05K0B68	10.416667
25	A2WPHVMLLEV5ZB	15.868263
26	A3124SRR191UIL	16.097561
27	A33B85TN97HQ33	23.711340
28	A33Y36Y252Z30U	32.323232
29	A3BCKNE5CWHODZ	23.631324
30	A3BISMR4GI02ZG	23.209808
31	A3BJX6UUSOIKFN	41.100543
32	A3HYCA7N5F6DL9	7.462687
33	A3MV3PT4TOO69P	15.158018
34	A3OCJJMRKAIJZA	28.834233
35	A9HQ3E0F2AGVO	31.552999
37	AC95JAUAM2L2Z	2.857143
38	AD1ILDUXZHASF	8.239700
39	AG36U7IOG2LAP	17.600000
40	AGRYG65WJ1CKJ	17.910448
41	AJQGWGESKQT4Y	13.333333
45	AMYURTQIMAC8T	7.518797
46	AOOF0H0EIEFFQ	9.203540
47	AQIP3DSYXEXX5	30.556356
48	AR9AU5FY1S3RO	13.333333
49	ARW1TCHCLEK1W	13.339731
51	AYTH0E5PUXWX8	14.351852