import pickle
import streamlit as st
import torch
from googletrans import Translator
from langdetect import detect
from transformers import AutoTokenizer, AutoModelForMaskedLM

# NOTE: the scipy/sklearn/nltk/textblob imports below are not used directly by
# this app; they appear to be left over from the offline pipeline that trained
# the pickled sentiment classifier loaded further down.
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from nltk.corpus import stopwords

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from nepali_unicode_converter.convert import Converter
from textblob import TextBlob
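# Assumed runtime dependencies (pip package names are assumptions, not verified):
# streamlit, torch, transformers, googletrans, langdetect, scikit-learn, scipy,
# nltk, textblob, nepali-unicode-converter.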


# How the pickled model and tokenizer were presumably exported (kept for reference):
# model = AutoModelForMaskedLM.from_pretrained("Shushant/nepaliBERT", output_hidden_states=True, return_dict=True, output_attentions=True)
# tokenizers = AutoTokenizer.from_pretrained("Shushant/nepaliBERT")
# pickle.dump(model, open('nepaliBert.pkl', 'wb'))
# pickle.dump(tokenizers, open('tokenizers.pkl', 'wb'))

model = pickle.load(open('bert_model/model', 'rb'))
tokenizers = pickle.load(open('bert_model/tokenizer', 'rb'))
# Inference runs on CPU; the commented-out check below would select a GPU when available.
# dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device("cpu")

st.header("Nepali sentiment analysis")
st.subheader("This app gives the sentiment analysis of Nepali text.")




def get_bert_embedding_sentence(input_sentence):
    """Return a sentence embedding: the mean of the token vectors from the
    second-to-last hidden layer of the BERT model."""
    md = model
    tokenizer = tokenizers

    # Add BERT's special tokens and map the tokens to vocabulary ids.
    marked_text = " [CLS] " + input_sentence + " [SEP] "
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)

    tokens_tensors = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Forward pass without gradient tracking; the model was saved with
    # output_hidden_states=True, so outputs.hidden_states is populated.
    with torch.no_grad():
        outputs = md(tokens_tensors, segments_tensors)
        hidden_states = outputs.hidden_states

    # Mean-pool the token vectors of the second-to-last layer.
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding.numpy()
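
# For reference: the returned embedding is a 1-D numpy vector whose length equals
# the model's hidden size (typically 768 for BERT-base checkpoints), and it is what
# the pickled SVC classifier below consumes, e.g. (hypothetical input sentence):
#   vec = get_bert_embedding_sentence("खाना मिठो छ")
#   vec.reshape(1, -1)  # shape expected by the classifier's predict()
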
# langdetect commonly labels Devanagari text as Hindi ("hi"), Nepali ("ne"),
# or Marathi ("mr"), so all three are treated as already-Devanagari input.
lang_list = ["hi", "ne", "mr"]

# Pre-trained SVC sentiment classifier (pickled), applied to the BERT sentence embeddings.
svc_sentiment = pickle.load(open('scv_sentiment', 'rb'))

text = st.text_input("Please input your Nepali sentence here:")
translator = Translator()  # instantiated but not currently used
converter = Converter()    # transliterates romanized Nepali to Devanagari
if text:
    st.write("Your input text is: ", text)
    lang = detect(text)

    if lang in lang_list:
        # Already Devanagari script: embed the text directly.
        embedding = get_bert_embedding_sentence(text)
    elif lang != "en":
        # Assume romanized Nepali: transliterate to Devanagari first.
        text = text.lower()
        result = converter.convert(text)
        st.write(result)
        embedding = get_bert_embedding_sentence(result)
    else:
        st.write("Sorry, our app can't understand English text.")
        embedding = None

    if embedding is not None:
        svc_pred = svc_sentiment.predict(embedding.reshape(1, -1))[0]
        if svc_pred == 0:
            st.write("Sentiment is: NEGATIVE")
        else:
            st.write("Sentiment is: POSITIVE")