Jit Bahadur Khamcha committed on
Commit
b0e7079
Parent: deda789
.gitignore ADDED
@@ -0,0 +1,8 @@
+ bert_model/*
+ **/__pycache__/
+ driver/*
+ try.py
+ nepaliBert.pkl
+ .ipynb_checkpoints/twitter-checkpoint.ipynb
+ chromedriver
+ geckodriver.log
BertSentimentAnalysisNepali.ipynb ADDED
@@ -0,0 +1,1068 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 32,
+    "id": "1ce0c43d",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "import numpy as np\n",
+     "import pandas as pd\n",
+     "from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM, AutoModel\n",
+     "from scipy.spatial.distance import cosine\n",
+     "import tokenizers\n",
+     "from sklearn.model_selection import train_test_split, GridSearchCV\n",
+     "from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score\n",
+     "from nltk.corpus import stopwords\n",
+     "import snowballstemmer\n",
+     "from sklearn.svm import SVC\n",
+     "from sklearn.naive_bayes import GaussianNB\n",
+     "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
+     "from sklearn.decomposition import PCA\n",
+     "from sklearn.preprocessing import StandardScaler\n",
+     "import os\n",
+     "import re\n",
+     "import json\n",
+     "import pickle"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 72,
+    "id": "1b519b36",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model = AutoModelForMaskedLM.from_pretrained(\"Shushant/nepaliBERT\", output_hidden_states = True, return_dict = True, output_attentions = True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 73,
+    "id": "7dc414c6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "tokenizers = AutoTokenizer.from_pretrained(\"Shushant/nepaliBERT\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "1871cd20",
+    "metadata": {
+     "scrolled": true
+    },
+    "outputs": [],
+    "source": [
+     "tokenizers.tokenize(\"के मौजुदा लोकतान्त्रिक व्यवस्था राज्य पुनःसंरचनासँग जोडिएका हिजोका सवालहरूलाई यथास्थितिमा छोडेर सबल होला?\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "00ca9f25",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# text = 'अनि तेस्रो चिन्ता मौसम परिवर्तनले हिमशिखरहरूमा परेका आघातसँगसँगै सिमानावारिपारि नदीले ल्याएका प्रकोपहरू कसरी सम्हाल्ने'\n",
+     "# marked_text = \" [CLS] \"+text+\" [SEP] \"\n",
+     "# tokenized_text = tokenizer.tokenize(marked_text)\n",
+     "# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+     "# segments_ids = [1] * len(indexed_tokens)\n",
+     "\n",
+     "# tokens_tensors = torch.tensor([indexed_tokens])\n",
+     "# segments_tensors = torch.tensor([segments_ids])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "88a853e8",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# with torch.no_grad():\n",
+     "#     outputs = model(tokens_tensors, segments_tensors)\n",
+     "#     hidden_states = outputs.hidden_states\n",
+     "#     # print(hidden_states[-1])\n",
+     "#     token_embeddings = hidden_states[-1]\n",
+     "#     token_embeddings = torch.squeeze(token_embeddings, dim = 0)\n",
+     "#     list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]\n",
+     "#     print(list_token_embeddings)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "6fc8c04e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "nepali_stemmer = snowballstemmer.NepaliStemmer()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "06bc3947",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "texts = ['तर','दुधमा तर बसेन|','तिम्रो घर आउन मन लाग्छ तर अल्छि लाग्छ']"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "5cd86297",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def bert_text_preparation(text, tokenizer):\n",
+     "    \"\"\"Prepare input tensors for BERT.\"\"\"\n",
+     "    marked_text = \" [CLS] \" + text + \" [SEP] \"\n",
+     "    tokenized_text = tokenizer.tokenize(marked_text)\n",
+     "    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+     "    segments_ids = [1] * len(indexed_tokens)\n",
+     "\n",
+     "    # Convert inputs to PyTorch tensors\n",
+     "    tokens_tensors = torch.tensor([indexed_tokens])\n",
+     "    segments_tensors = torch.tensor([segments_ids])\n",
+     "\n",
+     "    return tokenized_text, tokens_tensors, segments_tensors"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a70ff12b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_bert_embeddings(tokens_tensor, segments_tensors, model):\n",
+     "    # Gradient calculation is disabled:\n",
+     "    # the model is in inference mode\n",
+     "    with torch.no_grad():\n",
+     "        outputs = model(tokens_tensor, segments_tensors)\n",
+     "        # hidden_states holds the embeddings from every layer\n",
+     "        hidden_states = outputs.hidden_states\n",
+     "\n",
+     "    # Getting embeddings from the final BERT layer\n",
+     "    tokens_embeddings = hidden_states[-1]\n",
+     "    # Collapsing the tensor into 1 dimension\n",
+     "    tokens_embeddings = torch.squeeze(tokens_embeddings, dim = 0)\n",
+     "    # Converting torch tensors to lists\n",
+     "    list_token_embeddings = [token_embed.tolist() for token_embed in tokens_embeddings]\n",
+     "\n",
+     "    return list_token_embeddings"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9d4db1a5",
+    "metadata": {
+     "scrolled": false
+    },
+    "outputs": [],
+    "source": [
+     "target_word_embeddings = []\n",
+     "\n",
+     "for text in texts:\n",
+     "    tokenized_text, tokens_tensors, segments_tensors = bert_text_preparation(text, tokenizers)\n",
+     "    list_token_embeddings = get_bert_embeddings(tokens_tensors, segments_tensors, model)\n",
+     "    # list_token_embeddings has the embeddings of the given words\n",
+     "    word_index = tokenized_text.index('तर')\n",
+     "    word_embedding = list_token_embeddings[word_index]\n",
+     "    target_word_embeddings.append(word_embedding)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9c79f53c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "target_word_embeddings[0] == target_word_embeddings[1]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "eeb28025",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "0578cc53",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "len(target_word_embeddings)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 39,
+    "id": "e5144fea",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# list_of_distances = []\n",
+     "# for text1, embed1 in zip(texts, target_word_embeddings):\n",
+     "#     for text2, embed2 in zip(texts, target_word_embeddings):\n",
+     "#         cos_dist = 1 - cosine(embed1, embed2)\n",
+     "#         list_of_distances.append([text1, text2, cos_dist])\n",
+     "\n",
+     "# distances_df = pd.DataFrame(list_of_distances, columns = ['text1','text2','distance'])\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "07d31ba6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# df = pd.read_csv(\"finalData.csv\")\n",
+     "df = pd.read_csv('collected_labeled_data.csv')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "b92d7bd7",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df['label'].unique()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 25,
+    "id": "048ef9d1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# df.to_csv('collected_labeled_data.csv', index = False)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a91654b3",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "f6649f45",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 26,
+    "id": "ff995c8c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# train_X, test_X = train_test_split(df, test_size = 0.2)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 27,
+    "id": "012c49a5",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# train_X.to_csv('train.csv', index = False)\n",
+     "# test_X.to_csv('test.csv', index = False)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "id": "6103b035",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# def check_len(text):\n",
+     "#     txt = text.split(' ')[:20]\n",
+     "#     return ' '.join(txt)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 29,
+    "id": "b0aeeb8c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# df['text'] = df['text'].apply(check_len)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 30,
+    "id": "57168ec5",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# def get_word_embeddings(text):\n",
+     "#     tokenizer = tokenizers\n",
+     "#     tokenized_text, tokens_tensors, segments_tensors = bert_text_preparation(text, tokenizer)\n",
+     "#     list_token_embeddings = get_bert_embeddings(tokens_tensors, segments_tensors, model)\n",
+     "#     # list_token_embeddings has the embeddings of the given words\n",
+     "#     return list_token_embeddings"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 40,
+    "id": "36144615",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "stopwords = stopwords.words(\"nepali\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 41,
+    "id": "2163997b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "words = ['अक्सर','आदि','कसरी','अन्तर्गत','अर्थात','अर्थात्','अलग','आयो','उदाहरण','एकदम','राम्रो','बिरुद्ध','बिशेष','नराम्रो']"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 42,
+    "id": "67268e98",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "stopwords = list(set(stopwords).difference(set(words)))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 43,
+    "id": "e9ea0fe0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def remove_emojis(text):\n",
+     "    emoji_pattern = re.compile(\"[\"\n",
+     "        u\"\\U0001F600-\\U0001F64F\"  # emoticons\n",
+     "        u\"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n",
+     "        u\"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n",
+     "        u\"\\U0001F1E0-\\U0001F1FF\"  # flags (iOS)\n",
+     "        u\"\\U00002500-\\U00002BEF\"  # box drawing & misc symbols\n",
+     "        u\"\\U00002702-\\U000027B0\"\n",
+     "        u\"\\U000024C2-\\U0001F251\"\n",
+     "        u\"\\U0001f926-\\U0001f937\"\n",
+     "        u\"\\U00010000-\\U0010ffff\"\n",
+     "        u\"\\u2640-\\u2642\"\n",
+     "        u\"\\u2600-\\u2B55\"\n",
+     "        u\"\\u200d\"\n",
+     "        u\"\\u23cf\"\n",
+     "        u\"\\u23e9\"\n",
+     "        u\"\\u231a\"\n",
+     "        u\"\\ufe0f\"  # dingbats\n",
+     "        u\"\\u3030\"\n",
+     "        \"]+\", re.UNICODE)\n",
+     "    text = emoji_pattern.sub(r'', text)\n",
+     "    return text"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 63,
+    "id": "af7b34a1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def clean_text(text):\n",
+     "    text = remove_emojis(text)\n",
+     "    text = text.split(' ')\n",
+     "    clean_text_list = []\n",
+     "    for word in text:\n",
+     "        if word not in stopwords:\n",
+     "            clean_text_list.append(word)\n",
+     "    clean_text = ' '.join(clean_text_list)\n",
+     "    stem_words = nepali_stemmer.stemWords(clean_text.split())\n",
+     "#     stem_text = ' '.join(stem_words)\n",
+     "#     txt = re.sub(r\"[|a-zA-Z.'#0-9@,:?'\\u200b\\u200c\\u200d!/&~-]\",'',stem_text)\n",
+     "    return ' '.join(stem_words)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 64,
+    "id": "05ec9fd9",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "'घाम जति लग् हामी तेती राम्रो apple'"
+       ]
+      },
+      "execution_count": 64,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "clean_text(\"घाम जति लग्यो हामीलाई तेती राम्रो हुन्छ apple \")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 65,
+    "id": "05c48277",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "['घाम', 'जति', 'लग्', 'हामी', 'तेती', 'राम्रो', '', 'apple']"
+       ]
+      },
+      "execution_count": 65,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "nepali_stemmer.stemWords(\"घाम जति लग्यो हामीलाई तेती राम्रो हुन्छ apple \".split())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 66,
+    "id": "61c1b1dd",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df['text'] = df['text'].apply(clean_text)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 67,
+    "id": "d3b2275f",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>text</th>\n",
+        "      <th>label</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>बजार जसरी ट्रेन्ड चेन्ज गर् हेर् प्रोफिट बूकिङ...</td>\n",
+        "      <td>2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...</td>\n",
+        "      <td>1</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>होइन सानि बैंक bonus घोसणा २ महिना (book clos...</td>\n",
+        "      <td>2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>खैँ MBJC कित्ता रू,10/- बढेर आज रू,1100/- 10क...</td>\n",
+        "      <td>2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>राम्रो</td>\n",
+        "      <td>1</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        "                                                text  label\n",
+        "0  बजार जसरी ट्रेन्ड चेन्ज गर् हेर् प्रोफिट बूकिङ...      2\n",
+        "1  1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...      1\n",
+        "2   होइन सानि बैंक bonus घोसणा २ महिना (book clos...      2\n",
+        "3   खैँ MBJC कित्ता रू,10/- बढेर आज रू,1100/- 10क...      2\n",
+        "4                                             राम्रो      1"
+       ]
+      },
+      "execution_count": 67,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "df.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 74,
+    "id": "b76b3d7e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_bert_embedding_sentence(input_sentence):\n",
+     "    md = model\n",
+     "    tokenizer = tokenizers\n",
+     "    marked_text = \" [CLS] \" + input_sentence + \" [SEP] \"\n",
+     "    tokenized_text = tokenizer.tokenize(marked_text)\n",
+     "\n",
+     "    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
+     "    segments_ids = [1] * len(indexed_tokens)\n",
+     "\n",
+     "    # Convert inputs to PyTorch tensors\n",
+     "    tokens_tensors = torch.tensor([indexed_tokens])\n",
+     "    segments_tensors = torch.tensor([segments_ids])\n",
+     "\n",
+     "    with torch.no_grad():\n",
+     "        outputs = md(tokens_tensors, segments_tensors)\n",
+     "        # hidden_states has shape [13 x 1 x seq_len x 768]\n",
+     "        hidden_states = outputs.hidden_states\n",
+     "\n",
+     "    # token_vecs is a tensor of shape [seq_len x 768],\n",
+     "    # taken from the second-to-last layer\n",
+     "    token_vecs = hidden_states[-2][0]\n",
+     "\n",
+     "    # Average all token vectors into one sentence embedding\n",
+     "    sentence_embedding = torch.mean(token_vecs, dim=0)\n",
+     "    return sentence_embedding.numpy()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 58,
+    "id": "1da99701",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# get_bert_embedding_sentence(\"नेपाल को ससकृती ध्वस्त पार्ने योजना\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 69,
+    "id": "d08f787c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Drop rows with label == 2 to keep a binary task\n",
+     "df = df.drop(df[df['label'] == 2].index)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 70,
+    "id": "9c8990f7",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df.dropna(inplace = True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 75,
+    "id": "ba7e75a3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df['word_embeddings'] = df['text'].apply(get_bert_embedding_sentence)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 76,
+    "id": "edad3099",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "(6056, 3)"
+       ]
+      },
+      "execution_count": 76,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "df.shape"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 77,
+    "id": "4760c1d1",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>text</th>\n",
+        "      <th>label</th>\n",
+        "      <th>word_embeddings</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...</td>\n",
+        "      <td>1</td>\n",
+        "      <td>[-0.2517209, 0.80447733, -0.30090085, 0.363934...</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>राम्रो</td>\n",
+        "      <td>1</td>\n",
+        "      <td>[-0.4275645, 0.90052205, -0.6469192, 0.3758416...</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6</th>\n",
+        "      <td>जानकारी धन्यवाद रामहरी ब्रदर</td>\n",
+        "      <td>1</td>\n",
+        "      <td>[0.24045938, 0.72639877, -0.11193645, 0.146293...</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>18</th>\n",
+        "      <td>भारत-मधेस नेपाल-चीन सम्बन्ध विग्रन्छ, मधेसी ने...</td>\n",
+        "      <td>0</td>\n",
+        "      <td>[0.15390012, 0.67477095, -0.1543702, -0.212426...</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>25</th>\n",
+        "      <td>लेखनाथ न्यौपा खुलासा,महाधिबेशन एमसीसी गर् जुत्...</td>\n",
+        "      <td>0</td>\n",
+        "      <td>[-0.07738958, 1.039313, -0.1071973, -0.0086015...</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        "                                                 text  label  \\\n",
+        "1   1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ...      1   \n",
+        "4                                              राम्रो      1   \n",
+        "6                        जानकारी धन्यवाद रामहरी ब्रदर      1   \n",
+        "18  भारत-मधेस नेपाल-चीन सम्बन्ध विग्रन्छ, मधेसी ने...      0   \n",
+        "25  लेखनाथ न्यौपा खुलासा,महाधिबेशन एमसीसी गर् जुत्...      0   \n",
+        "\n",
+        "                                      word_embeddings  \n",
+        "1   [-0.2517209, 0.80447733, -0.30090085, 0.363934...  \n",
+        "4   [-0.4275645, 0.90052205, -0.6469192, 0.3758416...  \n",
+        "6   [0.24045938, 0.72639877, -0.11193645, 0.146293...  \n",
+        "18  [0.15390012, 0.67477095, -0.1543702, -0.212426...  \n",
+        "25  [-0.07738958, 1.039313, -0.1071973, -0.0086015...  "
+       ]
+      },
+      "execution_count": 77,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "df.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 78,
+    "id": "bc3840ee",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# df.to_csv('embedding_data.csv', index = False)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 79,
+    "id": "2da7b924",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "X, y = df['word_embeddings'], df['label']"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 80,
+    "id": "6bc72bb6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# scaler = StandardScaler()\n",
+     "# pca = PCA(n_components = 768)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 81,
+    "id": "99ad87ec",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# scaled_X = scaler.fit_transform(X.tolist())\n",
+     "# pca_X = pca.fit_transform(scaled_X)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 82,
+    "id": "9689b1a4",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 420)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 83,
+    "id": "828e1a7a",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "svc = SVC()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 84,
+    "id": "d6524c9d",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# train_X = [i[0] for i in train_X]\n",
+     "# test_X = [i[0] for i in test_X]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 85,
+    "id": "f8311883",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# train_X[0][0].shape"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 86,
+    "id": "2af91c5f",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "SVC()"
+       ]
+      },
+      "execution_count": 86,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "svc.fit(train_X.tolist(), train_y)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 87,
+    "id": "16d5e606",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "svc_pred = svc.predict(test_X.tolist())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 88,
+    "id": "fdd814fe",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "[[424  91]\n",
+       " [ 79 618]]\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(confusion_matrix(test_y, svc_pred))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 89,
+    "id": "c87a1d85",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "              precision    recall  f1-score   support\n",
+       "\n",
+       "           0       0.84      0.82      0.83       515\n",
+       "           1       0.87      0.89      0.88       697\n",
+       "\n",
+       "    accuracy                           0.86      1212\n",
+       "   macro avg       0.86      0.85      0.86      1212\n",
+       "weighted avg       0.86      0.86      0.86      1212\n",
+       "\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(classification_report(test_y, svc_pred))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 90,
+    "id": "78fe89bc",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "0.8597359735973598"
+       ]
+      },
+      "execution_count": 90,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "accuracy_score(test_y, svc_pred)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 91,
+    "id": "87c34455",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "0.8790896159317211"
+       ]
+      },
+      "execution_count": 91,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "f1_score(test_y, svc_pred)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 92,
+    "id": "fa889bcb",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "नराम्रो is negative sentiment\n"
+      ]
+     }
+    ],
+    "source": [
+     "sent = \"नराम्रो\"\n",
+     "predicted_label = svc.predict(np.array(get_bert_embedding_sentence(sent).tolist()).reshape(1,-1))[0]\n",
+     "if predicted_label == 0:\n",
+     "    print(f'{sent} is negative sentiment')\n",
+     "else:\n",
+     "    print(f'{sent} is positive sentiment')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 24,
+    "id": "2c5d51e6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pickle.dump(svc, open('scv_sentiment','wb'))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 18,
+    "id": "00640092",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# pickle.dump(svc, open('svc_sentiment','wb'))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 22,
+    "id": "cdc460bd",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load the classifier dumped above (file name is 'scv_sentiment')\n",
+     "svc_sentiment = pickle.load(open('scv_sentiment','rb'))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 113,
+    "id": "0791ecca",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "0"
+       ]
+      },
+      "execution_count": 113,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "svc.predict(np.array(get_bert_embedding_sentence(\"देश बिग्रियो\").tolist()).reshape(1,-1))[0]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "32c10466",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.10"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
README.md CHANGED
@@ -9,4 +9,12 @@ app_file: app.py
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Requirements installation:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ To run the app, run the following command.
+ ```bash
+ streamlit run app.py
+ ```
app.py ADDED
@@ -0,0 +1,98 @@
+ from dataclasses import asdict
2
+ from stat import FILE_ATTRIBUTE_NO_SCRUB_DATA
3
+ import streamlit as st
4
+ import pickle
5
+ import torch
6
+ from googletrans import Translator
7
+ from langdetect import detect
8
+
9
+ from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM
10
+ from scipy.spatial.distance import cosine
11
+ import tokenizers
12
+ from sklearn.model_selection import train_test_split,GridSearchCV
13
+ from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
14
+ from nltk.corpus import stopwords
15
+
16
+ from sklearn.svm import SVC
17
+ from sklearn.naive_bayes import GaussianNB
18
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
19
+ from sklearn.decomposition import PCA
20
+ from sklearn.preprocessing import StandardScaler
21
+ from nepali_unicode_converter.convert import Converter
22
+ from textblob import TextBlob
23
+
24
+
25
+ # model = AutoModelForMaskedLM.from_pretrained("Shushant/nepaliBERT", output_hidden_states = True, return_dict = True, output_attentions = True)
26
+
27
+ # tokenizers = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")
28
+ # pickle.dump(model, open('nepaliBert.pkl','wb'))
29
+ # pickle.dump(tokenizers, open('tokenizers.pkl','wb'))
30
+ model = pickle.load(open('bert_model/model','rb'))
31
+ tokenizers = pickle.load(open('bert_model/tokenizer','rb'))
32
+ # if torch.cuda.is_available():
33
+
34
+ # dev = "cuda:0"
35
+ # else:
36
+
37
+ # dev = "cpu"
38
+
39
+ # print(dev)
40
+ device = torch.device("cpu")
41
+
42
+ st.header("Nepali sentiment analysis")
43
+ st.subheader("This app gives the sentiment analysis of Nepali text.")
44
+
45
+
46
+
47
+
48
+ def get_bert_embedding_sentence(input_sentence):
49
+ md = model
50
+ tokenizer = tokenizers
51
+ marked_text = " [CLS] " + input_sentence + " [SEP] "
52
+ tokenized_text = tokenizer.tokenize(marked_text)
53
+
54
+ indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
55
+ segments_ids = [1] * len(indexed_tokens)
56
+
57
+
58
+ tokens_tensors = torch.tensor([indexed_tokens])
59
+ segments_tensors = torch.tensor([segments_ids])
60
+
61
+ with torch.no_grad():
62
+ outputs = md(tokens_tensors, segments_tensors)
63
+ hidden_states = outputs.hidden_states
64
+
65
+ token_vecs = hidden_states[-2][0]
66
+
67
+ sentence_embedding = torch.mean(token_vecs, dim=0)
68
+
69
+ return sentence_embedding.numpy()
70
+ lang_list = ["hi","ne","mr"]
71
+ svc_sentiment = pickle.load(open('scv_sentiment','rb'))
72
+ text = st.text_input("Please input your nepali sentence here:")
73
+ translator = Translator()
74
+ converter = Converter()
75
+ if text:
76
+ st.write("Your input text is: ", text)
77
+ if detect(text) not in lang_list:
78
+ if detect(text) != "en":
79
+ text = text.lower()
80
+ result = converter.convert(text)
81
+ st.write(result)
82
+ embedding = get_bert_embedding_sentence(result)
83
+ svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
84
+ if svc_pred == 0:
85
+ st.write("Sentiment is: NEGATIVE ")
86
+ else:
87
+ st.write("Sentiment is: POSITIVE ")
88
+ elif detect(text)=='en':
89
+ st.write("Sorry our app can't understand english text")
90
+
91
+ else:
92
+ embedding = get_bert_embedding_sentence(text)
93
+ svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
94
+ if svc_pred == 0:
95
+ st.write("Sentiment is: NEGATIVE ")
96
+ else:
97
+ st.write("Sentiment is: POSITIVE ")
98
+
collected_labeled_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,81 @@
+ altair==5.3.0
+ attrs==23.2.0
+ blinker==1.7.0
+ cachetools==5.3.3
+ certifi==2022.6.15
+ chardet==3.0.4
+ charset-normalizer==2.1.0
+ click==8.1.3
+ colorama==0.4.5
+ exceptiongroup==1.2.0
+ filelock==3.7.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ googletrans==3.0.0
+ h11==0.9.0
+ h2==3.2.0
+ hpack==3.0.0
+ hstspreload==2024.4.1
+ httpcore==0.9.1
+ httpx==0.13.3
+ huggingface-hub==0.0.12
+ hyperframe==5.2.0
+ idna==2.10
+ Jinja2==3.1.3
+ joblib==1.1.0
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ langdetect==1.0.9
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ nepali-unicode-converter==1.0.3
+ nltk==3.8.1
+ numpy==1.23.1
+ outcome==1.3.0.post0
+ packaging==21.3
+ pandas==1.4.3
+ pillow==10.3.0
+ plotly==5.21.0
+ protobuf==4.25.3
+ pyarrow==15.0.2
+ pydeck==0.8.1b0
+ Pygments==2.17.2
+ pyparsing==3.0.9
+ PySocks==1.7.1
+ python-dateutil==2.8.2
+ pytz==2022.1
+ PyYAML==6.0
+ referencing==0.34.0
+ regex==2022.7.25
+ requests==2.28.1
+ rfc3986==1.5.0
+ rich==13.7.1
+ rpds-py==0.18.0
+ sacremoses==0.0.53
+ scikit-learn==1.1.2
+ scipy==1.8.1
+ selenium==4.19.0
+ sentencepiece==0.1.96
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ sortedcontainers==2.4.0
+ streamlit==1.33.0
+ tenacity==8.2.3
+ textblob==0.18.0.post0
+ threadpoolctl==3.1.0
+ tokenizers==0.10.3
+ toml==0.10.2
+ toolz==0.12.1
+ torch==1.11.0
+ tornado==6.4
+ tqdm==4.64.0
+ transformers==4.9.2
+ trio==0.25.0
+ trio-websocket==0.11.1
+ typing_extensions==4.11.0
+ urllib3==1.26.11
+ watchdog==4.0.0
+ wsproto==1.2.0
scrap_data.py ADDED
@@ -0,0 +1,257 @@
+ import os
+ import re
+ import time
+ import requests
+ import ast
+ import pickle
+ import json
+ import torch
+ import pandas as pd
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.support.ui import WebDriverWait
+ # Service is required by the Selenium 4 driver API pinned in requirements.txt
+ from selenium.webdriver.chrome.service import Service
+ from langdetect import detect
+ from nepali_unicode_converter.convert import Converter
+ from selenium.webdriver.common.keys import Keys
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.action_chains import ActionChains
+
+ # dataset = pd.read_csv("/media/gpu/157/Nepali_sentiment_Analysis/collected_labeled_data.csv")
+ review_url = "https://my.daraz.com.np/pdp/review/getReviewList?itemId=_id_&pageSize=5&filter=0&sort=0&pageNo=1"
+
+ model = pickle.load(open('bert_model/model','rb'))
+ tokenizers = pickle.load(open('tokenizers.pkl','rb'))
+ svc_sentiment = pickle.load(open('scv_sentiment','rb'))
+ chrome_options = Options()
+ chrome_options.add_argument("--headless")
+
+
+ def remove_emojis(text):
+     emoji_pattern = re.compile("["
+         u"\U0001F600-\U0001F64F"  # emoticons
+         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+         u"\U0001F680-\U0001F6FF"  # transport & map symbols
+         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+         u"\U00002500-\U00002BEF"  # box drawing & misc symbols
+         u"\U00002702-\U000027B0"
+         u"\U000024C2-\U0001F251"
+         u"\U0001f926-\U0001f937"
+         u"\U00010000-\U0010ffff"
+         u"\u2640-\u2642"
+         u"\u2600-\u2B55"
+         u"\u200d"
+         u"\u23cf"
+         u"\u23e9"
+         u"\u231a"
+         u"\ufe0f"  # dingbats
+         u"\u3030"
+         "]+", re.UNICODE)
+     text = emoji_pattern.sub(r'', text)
+     return text
+
+ def get_bert_embedding_sentence(input_sentence):
+     md = model
+     tokenizer = tokenizers
+     marked_text = " [CLS] " + input_sentence + " [SEP] "
+     tokenized_text = tokenizer.tokenize(marked_text)
+
+     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+     segments_ids = [1] * len(indexed_tokens)
+
+     tokens_tensors = torch.tensor([indexed_tokens])
+     segments_tensors = torch.tensor([segments_ids])
+
+     with torch.no_grad():
+         outputs = md(tokens_tensors, segments_tensors)
+         hidden_states = outputs.hidden_states
+
+     # Average the token vectors of the second-to-last layer
+     token_vecs = hidden_states[-2][0]
+     sentence_embedding = torch.mean(token_vecs, dim=0)
+     return sentence_embedding.numpy()
+
+ def scrap_data():
+     # Requires the `dataset` DataFrame loaded at the top (currently commented out)
+     positive_sentiment = dataset.loc[dataset['label'] == 1]
+     negative_sentiment = dataset.loc[dataset['label'] == 0]
+     return positive_sentiment, negative_sentiment
+
+ def comment_sentiment(text):
+     lang_list = ["hi","ne","mr"]
+     converter = Converter()
+     svc_pred = None  # returned unchanged if the text is not detected as Nepali
+     if detect(text) == "ne":
+         embedding = get_bert_embedding_sentence(text)
+         svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+     """
+     if detect(text) not in lang_list:
+         result = converter.convert(text)
+         embedding = get_bert_embedding_sentence(result)
+         svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+     else:
+         embedding = get_bert_embedding_sentence(text)
+         svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+     """
+     return svc_pred
+
+ def scrape_comment(url):
+     lang_list = ["hi","ne","mr"]
+     converter = Converter()
+     id = url.split("-")[-2].replace("i","")
+     api_url = review_url.replace("_id_", id)
+     print("---------------------------------")
+     response = requests.get(api_url).text
+     print(response)
+     response = json.loads(response)
+     df = pd.DataFrame(columns=["text","label"])
+     reviews = response["model"]["items"]
+     predicted_label = []
+     comment_text = []
+
+     for review in reviews:
+         text = review["reviewContent"]
+         try:
+             if detect(text) not in lang_list:
+                 result = converter.convert(text)
+                 embedding = get_bert_embedding_sentence(result)
+                 svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+                 predicted_label.append(svc_pred)
+                 comment_text.append(review["reviewContent"])
+             else:
+                 embedding = get_bert_embedding_sentence(text)
+                 svc_pred = svc_sentiment.predict(embedding.reshape(1,-1))[0]
+                 predicted_label.append(svc_pred)
+                 comment_text.append(review["reviewContent"])
+         except Exception as e:
+             print(e)
+     df['text'] = comment_text
+     df['label'] = predicted_label
+     positive_sentiment = df.loc[df['label'] == 1]
+     negative_sentiment = df.loc[df['label'] == 0]
+     return positive_sentiment, negative_sentiment
+
+ # def scrap_twitter(url):
+ #     tweets = driver.find_elements(By.XPATH,'//*[@id="id__nspdargek9"]/span/text()')
+ #     print(tweets)
+
+ def scrape_twitter(url):
+     '''
+     Scrape the tweet thread at the given URL and classify each reply.
+     '''
+     driver = webdriver.Chrome(service=Service("driver/chromedriver"), options=chrome_options)
+
+     # driver.get(f"https://twitter.com/{username}/status/{tweet_id}")
+     driver.get(url)
+     time.sleep(5)  # adjust to your PC and internet connection
+
+     tweets = []
+     result = False
+     old_height = driver.execute_script("return document.body.scrollHeight")
+
+     # set initial all_tweets to start the loop
+     all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
+
+     while not result:
+         for item in all_tweets[1:]:  # skip tweets already scraped
+             try:
+                 text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
+             except:
+                 text = '[empty]'
+             # append new tweet replies to the tweets list
+             tweets.append(text)
+
+         # scroll down the page
+         driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
+         time.sleep(2)
+
+         try:
+             try:
+                 button = driver.find_element(By.CSS_SELECTOR, "div.css-901oao.r-1cvl2hr.r-37j5jr.r-a023e6.r-16dba41.r-rjixqe.r-bcqeeo.r-q4m81j.r-qvutc0")
+             except:
+                 button = driver.find_element(By.CSS_SELECTOR, "div.css-1dbjc4n.r-1ndi9ce")  # there are two kinds of buttons
+             ActionChains(driver).move_to_element(button).click(button).perform()
+             time.sleep(2)
+             driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
+             time.sleep(2)
+         except:
+             pass
+
+         new_height = driver.execute_script("return document.body.scrollHeight")
+         if new_height == old_height:
+             result = True
+         old_height = new_height
+
+         # update all_tweets to keep looping
+         all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
+     driver.close()
+     text = []
+     predicted_label = []
+     for comments in tweets:
+         try:
+             result = comment_sentiment(comments)
+             comments = remove_emojis(comments)
+             text.append(comments)
+             predicted_label.append(result)
+         except Exception as e:
+             pass
+     df = pd.DataFrame(columns=["text","label"])
+     df['text'] = text
+     df['label'] = predicted_label
+     positive_sentiment = df.loc[df['label'] == 1]
+     negative_sentiment = df.loc[df['label'] == 0]
+     return positive_sentiment, negative_sentiment
+
+
+ def scrape_youtube(url):
+     driver = webdriver.Chrome(service=Service("driver/chromedriver"), options=chrome_options)
+     data = []
+
+     wait = WebDriverWait(driver, 15)
+     driver.get(url)
+     predicted_label = []
+
+     # scroll to the bottom a few times to load more comments
+     for item in range(5):
+         wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
+         time.sleep(5)
+     for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content"))):
+         data.append(comment.text)
+
+     text = []
+     for comments in data:
+         try:
+             result = comment_sentiment(comments)
+             comments = remove_emojis(comments)
+             text.append(comments)
+             predicted_label.append(result)
+         except Exception as e:
+             # raise
+             pass
+     driver.close()
+     df = pd.DataFrame(columns=["text","label"])
+     df['text'] = text
+     df['label'] = predicted_label
+     positive_sentiment = df.loc[df['label'] == 1]
+     negative_sentiment = df.loc[df['label'] == 0]
+     return positive_sentiment, negative_sentiment
+
+ if __name__ == "__main__":
+     url = "https://www.youtube.com/watch?v=uD58-EHwaeI"
+     positive_sentiment, negative_sentiment = scrape_youtube(url=url)
+     print(positive_sentiment, negative_sentiment)
+
sentimential_analysis_2.jpg ADDED
tokenizers.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21ddc53a957b46777cea5801e25169318a868e8b933773092e407134a4d7eb98
+ size 764284