# VN_laws_qa/app.py
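"""Streamlit demo for closed-domain Vietnamese legal question answering.

Pipeline: BM25 retrieves candidate passages, a BertCondenser bi-encoder
re-ranks them by cosine similarity, and a fine-tuned T5 model generates an
answer from each of the top-ranked passages.
"""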
#basics
import time
import pandas as pd
import numpy as np
import pickle
from PIL import Image

#DL
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

#streamlit
import streamlit as st
import SessionState
from load_css import local_css
local_css("./style.css")

#text preprocessing
import re
from pyvi import ViTokenizer
from rank_bm25 import BM25Okapi  # BM25Okapi must be importable to unpickle the index

#helper functions
from inspect import getsourcefile
import os.path as path
import sys

# Put the project root on sys.path so that src/ is importable.
current_dir = path.dirname(path.abspath(getsourcefile(lambda: 0)))
sys.path.insert(0, current_dir[:current_dir.rfind(path.sep)])
import src.clean_dataset as clean
@st.cache(allow_output_mutation=True)
def preprocess(sentence):
    """Lowercase, strip HTML tags and URLs, then word-segment with pyvi."""
    sentence = str(sentence).lower()
    sentence = sentence.replace('{html}', "")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '', cleantext)
    word_list = rem_url.split()
    return ViTokenizer.tokenize(" ".join(word_list))
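# ViTokenizer joins the syllables of multi-syllable words with underscores;
# deploy() strips those underscores again before passages are displayed.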
DEFAULT = '< PICK A VALUE >'

def selectbox_with_default(text, values, default=DEFAULT, sidebar=False):
    func = st.sidebar.selectbox if sidebar else st.selectbox
    return func(text, np.insert(np.array(values, object), 0, default))
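#models: T5 answer generator and BertCondenser retrieval bi-encoder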
@st.cache(allow_output_mutation=True)  # load the models only once per session
def neuralqa():
    model = T5ForConditionalGeneration.from_pretrained("wanderer2k1/T5-LawsQA")
    tokenizer = T5TokenizerFast.from_pretrained("wanderer2k1/T5-LawsQA")
    bi_encoder = SentenceTransformer('wanderer2k1/BertCondenser_LawsQA')
    return tokenizer, model, bi_encoder
def hf_run_model(tokenizer, model, input_string, **generator_args):
    # Default decoding settings; caller-supplied keyword arguments override them.
    args = {
        "max_length": 256,
        "num_beams": 4,
        "length_penalty": 0.1,
        "no_repeat_ngram_size": 8,
        "early_stopping": True,
    }
    args.update(generator_args)
    input_string = "generate questions: " + input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = model.generate(input_ids, **args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    output = [item.split("<sep>") for item in output]
    return output
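# hf_run_model returns a list of lists (one decoded sequence per input,
# each split on "<sep>"), so callers take element [0][0] for the answer.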
#%%
sys.path.pop(0)

# 1. Load the complete transformed and processed dataset.
#    Despite the .pkl extension, the file is read as a tab-separated table.
df = pd.read_csv('./data/corpus.pkl', sep='\t')
passages = df['text'].values.tolist()
passage_id = df['title'].values.tolist()

# 2. Load the pre-computed corpus embeddings for neural QA.
with open("./data/embedded_corpus_BertCondenser_tuples.pkl", 'rb') as inp:
    embedded_passages = pickle.load(inp)
embedded_passages = torch.Tensor(embedded_passages)

# 3. Load the fitted BM25 index.
with open("models/BM25_pyvi_segmented_splitted.pkl", 'rb') as inp:
    bm25 = pickle.load(inp)
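# Note: passages, embedded_passages and the BM25 index are parallel --
# index i must refer to the same passage in all three.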
#%%
session = SessionState.get(run_id=0)

#%%
#title / start page
st.title('Closed Domain (Vietnamese Laws) QA System')
sdg = Image.open('./logo.jpg')
st.sidebar.image(sdg, width=300)
st.sidebar.title('Settings')
st.caption("by HoangNV - on a custom laws QA data set")
returns = st.sidebar.slider('Maximum number of answer suggestions:', 1, 3, 2)
def deploy(question):
    """Retrieve passages for the question and generate an answer from each."""
    tokenizer, model, bi_encoder = neuralqa()
    top_k = returns  # number of passages to keep after re-ranking

    # Stage 1: lexical retrieval -- score the corpus with BM25 and keep
    # the 50 highest-scoring passages as candidates.
    tokenized_query = preprocess(question).split()
    query = ' '.join(tokenized_query)
    emb_query = bi_encoder.encode(query)
    scores = bm25.get_scores(tokenized_query)
    n_cand = min(50, len(scores))
    top_score_ids = np.argpartition(scores, -n_cand)[-n_cand:]

    # Stage 2: semantic re-ranking -- compare the query embedding against
    # the pre-computed embeddings of the BM25 candidates.
    emb_candidates = embedded_passages[torch.as_tensor(top_score_ids)]
    cosine_sim = cos_sim(emb_query, emb_candidates)
    doc_inds = np.argpartition(cosine_sim.numpy()[0], -top_k)[-top_k:]
    top_score_ids = top_score_ids.take(doc_inds)

    matches = []
    ids = []
    answers = []
    for doc_ind in top_score_ids:
        # Undo the pyvi word segmentation before showing passages to the user.
        matches.append(passages[doc_ind].replace('_', ' '))
        ids.append(passage_id[doc_ind].replace('_', ' '))

    # Stage 3: answer generation -- prompt the T5 model with the question and
    # each retrieved passage ("Answer the question: ... In the context: ...").
    for context in matches:
        q = "Trả lời câu hỏi: " + query + " Trong ngữ cảnh: " + context
        a = hf_run_model(tokenizer, model, q)[0][0]
        answers.append(a)

    df_results = pd.DataFrame({
        'Title': ids,
        'Answer': answers,
        'Retrieved': matches,
    })
    st.header("Results:")
    st.table(df_results)
    del tokenizer, model, bi_encoder
#%%
question = st.text_input('Type in your legal question (be as specific as possible):')
if len(question) != 0:
    t0 = time.time()
    with st.spinner('Finding best answers...'):
        deploy(question)
    st.write("Answered in %.1f s" % (time.time() - t0))

# Vertical spacing before the reset button.
for _ in range(6):
    st.write(' ')

if st.button("Run again!"):
    session.run_id += 1  # the button click itself triggers a Streamlit rerun