File size: 4,365 Bytes
ea90e06
e683309
ea90e06
 
a09216c
407249a
365710f
07344b9
 
 
fca740f
a99b14a
6a4d258
5d2b0fe
12fc412
1dae260
5f8bbd6
12fc412
a99b14a
12fc412
538d7ca
080e0b4
5d2b0fe
7e76459
080e0b4
f096fbb
080e0b4
 
 
 
 
635e801
080e0b4
 
 
b81dada
f096fbb
a8e534e
 
 
eccd375
830785a
b81dada
e9c1ddf
79c67dd
a8e534e
 
 
05d75cd
61fabcb
 
 
 
 
 
 
 
2291a30
79c67dd
c3cfbed
a8e534e
83a58a9
 
 
 
b9b338b
ea2e94d
83a58a9
 
a8e534e
 
c3cfbed
a8e534e
 
 
d914ade
 
ea2e94d
 
d914ade
a8e534e
be34b4b
a8e534e
1549b96
a9ed1fc
 
 
1549b96
 
a9ed1fc
1549b96
 
 
 
 
 
a9ed1fc
1549b96
a8e534e
 
 
 
704ac1c
 
a8e534e
 
286e8f3
 
a8e534e
 
 
22fefa5
 
830785a
7e76459
 
 
 
eccd375
cf44ec1
07344b9
f58aaf4
1dae260
eccd375
07344b9
1dae260
07344b9
f58aaf4
5f846a8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import streamlit as st
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import string
from time import time
from PIL import Image


# --- Page setup: logo, title, input widget, and model loading -------------
# Show the app logo as a fixed 150x150 thumbnail.
image = Image.open('./Logo_APP.png')
n_image  = image.resize((150, 150))
st.image(n_image)

# Title (Arabic): "Linguistic assistant for predicting and correcting
# collocations and co-occurrences".
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")
default_value = "أستاذ التعليم"

# sent holds the user's input phrase, pre-filled with a default example.
sent = st.text_area('المدخل',default_value)

# Tokenizer comes from the base AraBART checkpoint while the masked-LM
# weights come from a fine-tuned variant — presumably vocabulary-compatible
# (TODO confirm the fine-tuned model shares AraBART's tokenizer).
# NOTE(review): Streamlit re-executes the whole script on every interaction,
# so tokenizer/model are reloaded on each rerun; consider caching them.
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)

model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
# fill-mask pipeline returning the 10 most likely replacements for <mask>.
pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)

def next_word(text, pipe):
    """Collect fill-mask suggestions for *text*, skipping punctuation tokens.

    Parameters
    ----------
    text : str
        Input text containing a ``<mask>`` placeholder for the pipeline.
    pipe : callable
        A fill-mask pipeline; ``pipe(text)`` must yield dicts carrying
        ``'token_str'`` and ``'score'`` keys.

    Returns
    -------
    dict
        Two parallel lists keyed by the Arabic UI column labels:
        suggested token strings and their scores.
    """
    # Hoist the punctuation set out of the loop: the original rebuilt a
    # list from string.punctuation for every character of every token,
    # paying an O(len(punctuation)) membership test each time.
    punctuation = set(string.punctuation)
    res_dict = {
      'الكلمة المقترحة':[],
      'العلامة':[],
    }
    for prediction in pipe(text):
        token = prediction['token_str']
        # Keep the suggestion only if no character is punctuation.
        if not any(c in punctuation for c in token):
            res_dict['الكلمة المقترحة'].append(token)
            res_dict['العلامة'].append(prediction['score'])
    return res_dict

# Plain language-model lookup: append a <mask> token to the user's phrase
# and show the pipeline's top suggestions in a table.
if st.button('بحث', disabled=False):
    masked_query = f'{sent} <mask>'
    suggestions = next_word(masked_query, pipe)
    st.dataframe(pd.DataFrame.from_dict(suggestions))
# --- knowledge-graph assisted mode below ---

if (st.checkbox('الاستعانة بالرسم البياني المعرفي الاحتمالي', value=False)):
    # Knowledge-graph assisted mode: expand the query through a
    # co-occurrence graph (BM25 scores) and display its suggestions next
    # to the language model's.
    a = time()
    VocMap = './voc.csv'
    ScoreMap = './BM25.csv'

    #@st.cache
    def reading_df(path1, path2):
        """Load the vocabulary and score tables.

        Returns
        -------
        (df_voc, df_graph, df_gr) :
            df_voc   -- vocabulary table, one word per row;
            df_graph -- scores indexed by the (ID1, ID2) pair;
            df_gr    -- scores indexed by ID1 only (ID2 stays a column).
        """
        df_voc = pd.read_csv(path1, delimiter='\t')
        # Read the score file ONCE and derive both indexed views from it.
        # The original read the same file twice, and the second read used
        # the global ScoreMap instead of the path2 parameter.
        raw_scores = pd.read_csv(path2, delimiter='\t')
        df_graph = raw_scores.set_index(['ID1', 'ID2'])
        df_gr = raw_scores.set_index(['ID1'])
        return df_voc, df_graph, df_gr

    df3, df_g, df_in = reading_df(VocMap, ScoreMap)

    def Query2id(voc, query):
        """Map each whitespace-separated word of *query* to its row index
        in the vocabulary table; unknown words are reported and skipped."""
        res = []
        for word in query.split():
            try:
                res.append(voc.index[voc['word'] == word].values[0])
            except (IndexError, KeyError) as e:
                st.write('Token not found')
                continue
        return res

    id_list = Query2id(df3, sent)

    def setQueriesVoc(df, id_list):
        """Collect the distinct ID2 neighbours of every query id."""
        res = []
        for e in id_list:
            try:
                res.extend(list(df.loc[e]['ID2'].values))
            except (KeyError, AttributeError) as f:
                st.write('Token not found')
                continue
        return list(set(res))

    L = setQueriesVoc(df_in, id_list)

    # NOTE(review): st.cache is deprecated in recent Streamlit releases;
    # migrate to st.cache_data when the runtime is upgraded.
    @st.cache
    def compute_score(L_terms, id_l):
        """Sum, for every candidate term, its graph score against each
        query id; returns {candidate word: accumulated score}."""
        tmt = {}
        for nc in L_terms:
            score = 0.0
            for ni in id_l:
                try:
                    score = score + df_g.loc[(ni, nc), 'score']
                except KeyError:
                    # No edge between this query id and the candidate.
                    continue
            key = df3.loc[nc].values[0]
            tmt[key] = score
        return tmt

    tmt = compute_score(L, id_list)
    tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
    # Hoist min/max out of the loop: the original recomputed both on every
    # iteration (O(n^2)) and raised ZeroDivisionError when all candidates
    # shared the same score (and ValueError when tmt was empty).
    scores = list(tmt.values())
    lo = min(scores) if scores else 0.0
    span = (max(scores) - lo) if scores else 0.0
    dict_res = {'الكلمة المقترحة': [],
    'العلامة': []}
    # Keep only the 10 best-scoring candidates (replaces the manual
    # counter-and-break loop of the original).
    for key, value in tmexp[:10]:
        normalized = ((value - lo) / span) if span else 0.0
        new_score = normalized - 0.0001
        dict_res['العلامة'].append(str(new_score)[:6])
        dict_res['الكلمة المقترحة'].append(key)
    res_df = pd.DataFrame.from_dict(dict_res)
    res_df.index += 1
    b = time()
    exec_time = (b - a)

    # Re-run the language model so both result tables appear together.
    text_st = sent + ' <mask>'
    dict_next_words = next_word(text_st, pipe)
    df = pd.DataFrame.from_dict(dict_next_words)
    df.index += 1
    str_time = str(exec_time)[:3]

    st.markdown("""---""")
    st.header("الكلمات المقترحة باستعمال النموذج اللغوي")
    st.dataframe(df)
    st.markdown("""---""")
    st.header("الكلمات المقترحة باستعمال الرسم البياني")
    st.dataframe(res_df)
    st.markdown("""---""")
    st.write(f'{str_time} s :الوقت المستغرق باستعمال الرسم البياني')