import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import string
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")
default_value = "بيعت الأسلحة في السوق"
# sent is the variable holding the user's input
sent = st.text_area('مدخل',default_value)
# Load the AraBART tokenizer and the fine-tuned masked-LM checkpoint.
# Padding/truncation arguments are applied per call by the pipeline, so they
# are not needed at load time (pad_to_max_length is deprecated and dropped).
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART")
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
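
# The app exposes two ranking modes: masked-LM next-word prediction
# (the 'بحث' / "search" button below) and a graph-based BM25 co-occurrence
# ranking (the 'استعمال الرسم البياني' / "use the graph" button).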

#@st.cache
if st.button('بحث', disabled=False):  # 'بحث' = "search"

    def next_word(text, pipe):
        # Collect the pipeline's suggestions, skipping pure-punctuation tokens.
        res_dict = {
            'الكلمة المقترحة': [],  # "suggested word"
            'العلامة': [],          # "score"
        }
        for e in pipe(text):
            if all(c not in string.punctuation for c in e['token_str']):
                res_dict['الكلمة المقترحة'].append(e['token_str'])
                res_dict['العلامة'].append(e['score'])
        return res_dict

    # Append a <mask> token so the model predicts the next word.
    text_st = sent + ' <mask>'
    pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
    dict_next_words = next_word(text_st, pipe)
    df = pd.DataFrame.from_dict(dict_next_words)
    df.reset_index(drop=True, inplace=True)
    st.dataframe(df)
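
# For reference, a sketch of the fill-mask pipeline's raw output for one
# input (token strings and scores are illustrative, not real model output):
#
#   pipe("بيعت الأسلحة في <mask>")
#   # -> [{'score': 0.12, 'token': 1234, 'token_str': '...',
#   #      'sequence': '...'}, ...]  # top_k entries, highest score first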

if st.button('استعمال الرسم البياني', disabled=False):  # "use the graph"
    # Paths to the vocabulary table and the precomputed BM25 score table.
    VocMap = './voc.csv'
    ScoreMap = './BM25.csv'
    #@st.cache
    def reading_df(path1, path2):
        # Vocabulary: one row per word; the row index serves as the word ID.
        df_voc = pd.read_csv(path1, delimiter='\t')
        # Two views of the same score table: one keyed by the (ID1, ID2)
        # pair, one keyed by ID1 alone for neighbour lookups.
        df_graph = pd.read_csv(path2, delimiter='\t')
        df_graph.set_index(['ID1', 'ID2'], inplace=True)
        df_gr = pd.read_csv(path2, delimiter='\t')  # was the global ScoreMap; use the parameter
        df_gr.set_index(['ID1'], inplace=True)
        return df_voc, df_graph, df_gr

    df3, df_g, df_in = reading_df(VocMap, ScoreMap)
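
    # Assumed file layout, inferred from how the frames are indexed and
    # queried here (not confirmed elsewhere): voc.csv has a 'word' column
    # whose row position is the word ID; BM25.csv has ID1, ID2 and score
    # columns, one row per scored word pair.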

    #@st.cache
    def Query2id(voc, query):
        # Map each whitespace-separated word of the query to its vocabulary ID.
        return [voc.index[voc['word'] == word].values[0] for word in query.split()]

    id_list = Query2id(df3, sent)

    #@st.cache
    def setQueriesVoc(df, id_list):
        # Collect the ID2 neighbours of every query word, de-duplicated.
        res = []
        for e in id_list:
            res.extend(list(df.loc[e]['ID2'].values))
        return list(set(res))

    L = setQueriesVoc(df_in, id_list)
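
    # L now holds the de-duplicated IDs of every candidate word that
    # co-occurs with at least one query word.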

    @st.cache
    def compute_score(L_terms, id_l):
        # Sum each candidate's BM25 score over all query words; pairs missing
        # from the table are skipped. Uses df_g and df3 from the enclosing scope.
        tmt = {}
        for nc in L_terms:
            score = 0.0
            for ni in id_l:
                try:
                    score += df_g.loc[(ni, nc), 'score']
                except KeyError:
                    continue
            key = df3.loc[nc].values[0]
            tmt[key] = score
        return tmt

    tmt = compute_score(L, id_list)
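
    # In effect each candidate c is ranked by score(c) = Σ_q BM25(q, c) over
    # the query-word IDs q; pairs absent from the score table contribute 0.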

    # Min-max normalise the scores and keep the ten best candidates.
    tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
    scores = list(tmt.values())
    mn, mx = min(scores), max(scores)
    denom = (mx - mn) or 1.0  # avoid division by zero when every score ties
    dict_res = {
        'الكلمة المقترحة': [],  # "suggested word"
        'العلامة': [],          # "score"
    }
    for key, value in tmexp[:10]:
        # The small offset keeps the top score just below 1.0; the string is
        # truncated to six characters for display.
        new_score = ((value - mn) / denom) - 0.0001
        dict_res['العلامة'].append(str(new_score)[:6])
        dict_res['الكلمة المقترحة'].append(key)

    res_df = pd.DataFrame.from_dict(dict_res)
    res_df.index += 1
    st.dataframe(res_df)
    #st.table(df)