import string
import unicodedata

import pandas as pd
import streamlit as st
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
# Page header and the free-text input box. `sent` is the raw text typed by
# the user (pre-filled with `default_value`).
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")

default_value = "بيعت الأسلحة في السوق"
sent = st.text_area('مدخل', default_value)


@st.cache(allow_output_mutation=True)
def _load_tokenizer_and_model():
    """Load the AraBART tokenizer and the fine-tuned masked-LM checkpoint.

    Streamlit re-executes the whole script on every widget interaction, so
    the load is cached to avoid re-instantiating the tokenizer and model on
    each rerun. ``allow_output_mutation=True`` tells Streamlit not to hash
    the returned objects.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE(review): max_length / padding / truncation are normally passed
    # when calling the tokenizer, and `pad_to_max_length` is the deprecated
    # spelling of `padding`. They are kept unchanged here -- confirm whether
    # they have any effect at load time.
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        "moussaKam/AraBART",
        max_length=128,
        padding=True,
        pad_to_max_length=True,
        truncation=True,
    )
    loaded_model = AutoModelForMaskedLM.from_pretrained(
        "Hamda/test-1-finetuned-AraBART"
    )
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_tokenizer_and_model()

if st.button('بحث', disabled=False):
    # Model-based suggestions: append a mask token to the user's text and
    # show the fill-mask predictions in a table.

    def next_word(text, pipe):
        """Collect the predictions of *pipe* for *text*, skipping punctuation.

        Args:
            text: Input string passed to *pipe* unchanged.
            pipe: Callable returning an iterable of dicts that each carry a
                ``'token_str'`` and a ``'score'`` entry.

        Returns:
            A dict with two parallel lists: the kept token strings under
            ``'الكلمة المقترحة'`` and their scores under ``'العلامة'``.
        """
        res_dict = {
            'الكلمة المقترحة': [],
            'العلامة': [],
        }
        for e in pipe(text):
            token = e['token_str']
            # An empty / whitespace-only prediction is not a usable word.
            if not token.strip():
                continue
            # `string.punctuation` is ASCII-only; the Unicode "P*" general
            # categories additionally catch Arabic marks such as '،' or '؟'.
            has_punctuation = any(
                c in string.punctuation
                or unicodedata.category(c).startswith('P')
                for c in token
            )
            if has_punctuation:
                continue
            res_dict['الكلمة المقترحة'].append(token)
            res_dict['العلامة'].append(e['score'])
        return res_dict

    text_st = sent + ' <mask>'
    pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
    dict_next_words = next_word(text_st, pipe)
    df = pd.DataFrame.from_dict(dict_next_words)
    df.reset_index(drop=True, inplace=True)
    st.dataframe(df)

if st.button('استعمال الرسم البياني', disabled=False):
    # Graph-based suggestions: look up the words of `sent` in the vocabulary
    # table, gather every term linked to them in the score table, sum the
    # link scores per candidate and display the ten best candidates.

    VocMap = './voc.csv'
    ScoreMap = './BM25.csv'

    def reading_df(path1, path2):
        """Load the vocabulary table and two indexed views of the score table.

        Args:
            path1: Path to the tab-separated vocabulary file.
            path2: Path to the tab-separated score file; it must provide the
                columns ``ID1`` and ``ID2``.

        Returns:
            A tuple ``(df_voc, df_graph, df_gr)``: the vocabulary frame, the
            score frame indexed by ``(ID1, ID2)`` and the score frame indexed
            by ``ID1`` only.
        """
        df_voc = pd.read_csv(path1, delimiter='\t')
        # Parse the score file once and derive both views from it, so the
        # function depends only on its arguments and not on a global path.
        df_scores = pd.read_csv(path2, delimiter='\t')
        df_graph = df_scores.set_index(['ID1', 'ID2'])
        df_gr = df_scores.set_index(['ID1'])
        return df_voc, df_graph, df_gr

    def Query2id(voc, query):
        """Map the whitespace-separated words of *query* to row labels of *voc*.

        Args:
            voc: DataFrame with a ``word`` column.
            query: Text to look up.

        Returns:
            For every word found in ``voc['word']``, the index label of its
            first matching row. Words that are not found are skipped, so the
            list may be shorter than the query, or empty.
        """
        ids = []
        for word in query.split():
            matches = voc.index[voc['word'] == word].values
            if len(matches):
                ids.append(matches[0])
        return ids

    def setQueriesVoc(df, id_list):
        """Return the distinct ``ID2`` values of the rows labelled by *id_list*.

        Args:
            df: DataFrame indexed by id, with an ``ID2`` column.
            id_list: Index labels to look up; labels absent from ``df.index``
                are ignored.

        Returns:
            A list of unique ``ID2`` values, in no particular order.
        """
        res = set()
        for e in id_list:
            if e not in df.index:
                continue
            # A list indexer always yields a Series, even when the label
            # matches a single row (a scalar label would yield a scalar).
            res.update(df.loc[[e], 'ID2'].values)
        return list(res)

    @st.cache
    def compute_score(L_terms, id_l):
        """Sum, for each candidate id, its link scores to the query ids.

        Reads the module-level frames ``df_g`` (indexed by ``(ID1, ID2)``,
        with a ``score`` column) and ``df3`` (the vocabulary table).

        Args:
            L_terms: Candidate ids.
            id_l: Query ids.

        Returns:
            A dict mapping the first-column value of the candidate's row in
            ``df3`` (presumably the word itself -- verify against the layout
            of voc.csv) to the summed score.
        """
        tmt = {}
        for nc in L_terms:
            score = 0.0
            for ni in id_l:
                try:
                    score = score + df_g.loc[(ni, nc), 'score']
                except KeyError:
                    # No (query, candidate) pair in the score table.
                    continue
            key = df3.loc[nc].values[0]
            tmt[key] = score
        return tmt

    df3, df_g, df_in = reading_df(VocMap, ScoreMap)
    id_list = Query2id(df3, sent)

    if not id_list:
        # None of the typed words exists in the vocabulary: nothing to rank.
        st.warning('لم يتم العثور على أي كلمة من المدخل في قائمة المفردات.')
    else:
        L = setQueriesVoc(df_in, id_list)
        tmt = compute_score(L, id_list)
        tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)

        dict_res = {'الكلمة المقترحة': [],
                    'العلامة': []}
        if tmexp:
            # `tmexp` is sorted in descending order, so its ends are the
            # extremes; compute them once instead of once per row.
            highest = tmexp[0][1]
            lowest = tmexp[-1][1]
            span = highest - lowest
            for key, value in tmexp[:10]:
                # Min-max scaling; when every candidate has the same score
                # the span is zero and all of them count as top-ranked.
                scaled = (value - lowest) / span if span else 1.0
                # Same 0.0001 offset as before (the best score shows as
                # 0.9999), clamped so the lowest score never shows negative.
                new_score = max(scaled - 0.0001, 0.0)
                # Fixed-point formatting: slicing str(float) breaks on
                # values rendered in scientific notation.
                dict_res['العلامة'].append(f"{new_score:.4f}")
                dict_res['الكلمة المقترحة'].append(key)

        res_df = pd.DataFrame.from_dict(dict_res)
        # Display ranks starting at 1 instead of 0.
        res_df.index += 1
        st.dataframe(res_df)