Spaces:

wanderer2k1
/

VN_laws_qa

Sleeping

App Files Files Community

wanderer2k1 commited on Aug 14, 2023

Commit

9833a80

1 Parent(s): baa3568

f

Browse files

Files changed (14) hide show

README.txt +38 -0
SessionState.py +117 -0
app.py +181 -0
data/corpus.pkl +3 -0
data/embedded_corpus_BertCondenser_tuples.pkl +3 -0
load_css.py +5 -0
logo.jpg +0 -0
models/BM25_pyvi_segmented_splitted.pkl +3 -0
requirements.txt +14 -0
src/__init__.py +8 -0
src/__pycache__/__init__.cpython-39.pyc +0 -0
src/__pycache__/clean_dataset.cpython-39.pyc +0 -0
src/clean_dataset.py +76 -0
style.css +30 -0

README.txt ADDED Viewed

	@@ -0,0 +1,38 @@

+#Setup
+1. Cài Python 3.9.13: https://www.python.org/ftp/python/3.9.13/python-3.9.13-amd64.exe
+Lưu ý: khi cài đặt, nhớ tick chọn "Add Python 3.9 to PATH".
+2. Mở command line dẫn đến thư mục này, nhập:
+	python -m venv venv
+	venv/Scripts/activate
+	python -m pip install -r requirements.txt
+3. Tải file dữ liệu về từ link: https://drive.google.com/file/d/1s2-Yi1R8pEgGOPNwbJEsSY-Ltum1UNLH/view?usp=sharing
+Giải nén file ở thư mục này, tên thư mục sau giải nén là "data". Lưu ý: các file dữ liệu ở ngay trong thư mục data, tránh sau khi giải nén thêm folder data bên trong folder data.
+4. Tải file models về từ link: https://drive.google.com/file/d/1aHBXKINBuLEDLPYF-GMUTwQBDDF-FNSj/view?usp=sharing
+ Giải nén file ở thư mục này, tên thư mục sau giải nén là "models".
+5. Để chạy chương trình, mở command line dẫn đến thư mục này, nhập:
+	venv/Scripts/activate
+	streamlit run app.py
+* Lưu ý: ở lần query đầu tiên, hệ thống sẽ tải các models về từ repo cá nhân, dung lượng khoảng 3GB nên mất nhiều thời gian.
+# Cấu trúc thư mục:
+.
+|
+|_Notebooks: Các .ipynb notebooks đã xử lý dữ liệu, huấn luyện mô hình và đánh giá mô hình. Đã chạy trên Google colab.
+||_Prepare_data: Các .ipynb notebook xử lý dữ liệu cho huấn luyện và đánh giá mô hình.
+||_Training: Các .ipynb notebook huấn luyện mô hình.
+||_Evaluation: Các .ipynb notebook đánh giá mô hình.
+|
+|_src: Script python chứa hàm chạy chương trình.
+|
+|_streamlit: Chứa script chạy webapp, file css style webapp, các lớp liên quan đến web app.
+|
+|_requirements.txt: file chứa các thư viện python cần cài đặt.
+|
+|_README.txt

SessionState.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""Hack to add per-session state to Streamlit.
+Usage
+-----
+>>> import SessionState
+>>>
+>>> session_state = SessionState.get(user_name='', favorite_color='black')
+>>> session_state.user_name
+''
+>>> session_state.user_name = 'Mary'
+>>> session_state.favorite_color
+'black'
+Since you set user_name above, next time your script runs this will be the
+result:
+>>> session_state = get(user_name='', favorite_color='black')
+>>> session_state.user_name
+'Mary'
+"""
+try:
+    import streamlit.ReportThread as ReportThread
+    from streamlit.server.Server import Server
+except Exception:
+    # Streamlit >= 0.65.0
+    import streamlit.report_thread as ReportThread
+    from streamlit.server.server import Server
+class SessionState(object):
+    def __init__(self, **kwargs):
+        """A new SessionState object.
+        Parameters
+        ----------
+        **kwargs : any
+            Default values for the session state.
+        Example
+        -------
+        >>> session_state = SessionState(user_name='', favorite_color='black')
+        >>> session_state.user_name = 'Mary'
+        ''
+        >>> session_state.favorite_color
+        'black'
+        """
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+def get(**kwargs):
+    """Gets a SessionState object for the current session.
+    Creates a new object if necessary.
+    Parameters
+    ----------
+    **kwargs : any
+        Default values you want to add to the session state, if we're creating a
+        new one.
+    Example
+    -------
+    >>> session_state = get(user_name='', favorite_color='black')
+    >>> session_state.user_name
+    ''
+    >>> session_state.user_name = 'Mary'
+    >>> session_state.favorite_color
+    'black'
+    Since you set user_name above, next time your script runs this will be the
+    result:
+    >>> session_state = get(user_name='', favorite_color='black')
+    >>> session_state.user_name
+    'Mary'
+    """
+    # Hack to get the session object from Streamlit.
+    ctx = ReportThread.get_report_ctx()
+    this_session = None
+    current_server = Server.get_current()
+    if hasattr(current_server, '_session_infos'):
+        # Streamlit < 0.56
+        session_infos = Server.get_current()._session_infos.values()
+    else:
+        session_infos = Server.get_current()._session_info_by_id.values()
+    for session_info in session_infos:
+        s = session_info.session
+        if (
+            # Streamlit < 0.54.0
+            (hasattr(s, '_main_dg') and s._main_dg == ctx.main_dg)
+            or
+            # Streamlit >= 0.54.0
+            (not hasattr(s, '_main_dg') and s.enqueue == ctx.enqueue)
+            or
+            # Streamlit >= 0.65.2
+            (not hasattr(s, '_main_dg') and s._uploaded_file_mgr == ctx.uploaded_file_mgr)
+        ):
+            this_session = s
+    if this_session is None:
+        raise RuntimeError(
+            "Oh noes. Couldn't get your Streamlit Session object. "
+            'Are you doing something fancy with threads?')
+    # Got the session object! Now let's attach some state into it.
+    if not hasattr(this_session, '_custom_session_state'):
+        this_session._custom_session_state = SessionState(**kwargs)
+    return this_session._custom_session_state

app.py ADDED Viewed

	@@ -0,0 +1,181 @@

+#basics
+import time
+import pandas as pd
+import numpy as np
+import pickle
+from PIL import Image
+#DL
+import torch
+from transformers import T5ForConditionalGeneration, T5TokenizerFast
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.util import cos_sim
+#streamlit
+import streamlit as st
+import SessionState
+from load_css import local_css
+local_css("./style.css")
+#text preprocess
+import re
+from pyvi import ViTokenizer
+from rank_bm25 import BM25Okapi
+#helper functions
+from inspect import getsourcefile
+import os.path as path, sys
+from pathlib import Path
+current_dir = path.dirname(path.abspath(getsourcefile(lambda:0)))
+sys.path.insert(0, current_dir[:current_dir.rfind(path.sep)])
+import src.clean_dataset as clean
+@st.cache(allow_output_mutation=True)
+def preprocess(sentence):
+  sentence=str(sentence)
+  sentence = sentence.lower()
+  sentence=sentence.replace('{html}',"")
+  cleanr = re.compile('<.*?>')
+  cleantext = re.sub(cleanr, '', sentence)
+  rem_url=re.sub(r'http\S+', '',cleantext)
+  word_list = rem_url.split()
+  preped = ViTokenizer.tokenize(" ".join(word_list))
+  return preped
+DEFAULT = '< PICK A VALUE >'
+def selectbox_with_default(text, values, default=DEFAULT, sidebar=False):
+    func = st.sidebar.selectbox if sidebar else st.selectbox
+    return func(text, np.insert(np.array(values, object), 0, default))
+def neuralqa():
+    model = T5ForConditionalGeneration.from_pretrained("wanderer2k1/T5-LawsQA")
+    tokenizer = T5TokenizerFast.from_pretrained("wanderer2k1/T5-LawsQA")
+    bi_encoder = SentenceTransformer('wanderer2k1/BertCondenser_LawsQA')
+    return tokenizer, model, bi_encoder
+def hf_run_model(tokenizer, model, input_string, **generator_args):
+  generator_args = {
+  "max_length": 256,
+  "temperature":0.0,
+  "num_beams": 4,
+  "length_penalty": 0.1,
+  "no_repeat_ngram_size": 8,
+  "early_stopping": True,
+  }
+  input_string = "generate questions: " + input_string + " </s>"
+  input_ids = tokenizer.encode(input_string, return_tensors="pt")
+  res = model.generate(input_ids, **generator_args)
+  output = tokenizer.batch_decode(res, skip_special_tokens=True)
+  output = [item.split("<sep>") for item in output]
+  return output
+#%%
+# Undo the sys.path.insert(0, ...) performed before importing src.clean_dataset.
+sys.path.pop(0)
+# 1. Load the passage corpus: 'text' holds the passages, 'title' their ids.
+# NOTE(review): the file has a .pkl extension but is parsed as tab-separated
+# text here - confirm the on-disk format.
+df = pd.read_csv('./data/corpus.pkl', sep = '\t')
+passages = df['text'].values.tolist()
+passage_id = df['title'].values.tolist()
+# 2. Load the precomputed corpus embeddings used for neural re-ranking.
+# NOTE: pickle.load can execute code embedded in the file; only load files
+# from a trusted source.
+with open("./data/embedded_corpus_BertCondenser_tuples.pkl", 'rb') as inp:
+    embedded_passages = pickle.load(inp)
+embedded_passages = torch.Tensor(embedded_passages)
+# 3. Load the pickled BM25 index (same pickle trust caveat as above).
+with open("models/BM25_pyvi_segmented_splitted.pkl", 'rb') as inp:
+    bm25 = pickle.load(inp)
+#%%
+# Per-session state; run_id is incremented by the "Run again!" button below.
+session = SessionState.get(run_id=0)
+#%%
+# Page title, sidebar logo and settings.
+st.title('Closed Domain (Vietnamese Laws) QA System')
+sdg = Image.open('./logo.jpg')
+st.sidebar.image(sdg, width=300)
+st.sidebar.title('Settings')
+st.caption("by HoangNV - on custom laws QA data set")
+# Slider from 1 to 3 (default 2); deploy() reads this as the number of results.
+returns = st.sidebar.slider('Maximal number of answer suggestions:', 1, 3, 2)
+def deploy(question):
+    tokenizer, model, bi_encoder = neuralqa()
+    top_k = returns  # Number of passages we want to retrieve with the bi-encoder
+    tokenized_query = preprocess(question).split()
+    query = ' '.join(tokenized_query)
+    emb_query = bi_encoder.encode(query)
+    scores = bm25.get_scores(tokenized_query)
+    top_score_ids = np.argpartition(scores, -50)[-50:]
+    emb_candidates = torch.Tensor()
+    for i in top_score_ids:
+        emb_candidates = torch.cat([emb_candidates,embedded_passages[i:i+1]], axis = 0)
+    cosine_sim = cos_sim(emb_query, emb_candidates)
+    doc_inds = np.argpartition(cosine_sim.numpy()[0], -top_k)[-top_k:]
+    top_score_ids = top_score_ids.take(doc_inds)
+    matches = []
+    ids = []
+    answers = []
+    for doc_ind in top_score_ids:
+        doc = passages[doc_ind].replace('_',' ')
+        matches.append(doc)#' '.join(doc).replace('_',' '))
+        ids.append(passage_id[doc_ind].replace('_',' '))#' '.join(doc[:30].split()[:3]))
+    # i=0
+    for context in matches:
+        q = "Trả lời câu hỏi: "+query + " Trong ngữ cảnh: "+context#tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(context))
+        a = hf_run_model(tokenizer, model, q)[0][0]
+        answers.append(a)
+    # generate result df
+    df_results = pd.DataFrame(
+        {'Title': ids,
+            'Answer': answers,
+            'Retrieved': matches,
+        })
+    # st.header("Retrieved Answers:")
+    # df_results.set_index('title', inplace=True)
+    st.header("Results:")
+    st.table(df_results)
+    del tokenizer, model, bi_encoder#, question_embedding
+#%%
+# Main input box; the pipeline runs only when the box is non-empty.
+question = st.text_input('Type in your legal question (be as specific as possible):')
+if len(question) != 0:
+    t0 = time.time()
+    with st.spinner('Finding best answers...'):
+        deploy(question)
+        # Show the elapsed wall-clock time in seconds below the results.
+        st.write(str(time.time()-t0))
+# Whitespace-only writes; presumably vertical spacing above the button.
+st.write('           ')
+st.write('           ')
+st.write('           ')
+st.write('           ')
+st.write('           ')
+st.write('           ')
+# Clicking the button bumps the per-session run counter.
+if st.button("Run again!"):
+  session.run_id += 1
+#%%
+# NOTE(review): p is not used anywhere in the visible part of this file.
+p = Path('.')

data/corpus.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e987b70b6d1d60dbee9db5999dc05faf72de806f77c66ad28002fc22b115c664
+size 136262181

data/embedded_corpus_BertCondenser_tuples.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57fd74e8ba28cf5ed8a9b9785eb06b0dd1dd1b2147bf180a9c9987aedd1d5a67
+size 308422821

load_css.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import streamlit as st
+def local_css(file_name):
+    with open(file_name) as f:
+        st.markdown('<style>{}</style>'.format(f.read()), unsafe_allow_html=True)

logo.jpg ADDED Viewed

models/BM25_pyvi_segmented_splitted.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f782ece67739ffc40cd64cd443918b857e2ab26cf13003e8bfee6c620da0f66d
+size 93633728

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+streamlit==1.3.1
+sentence_transformers==2.2.2
+numpy==1.20.1
+transformers==4.28.0
+pandas==1.2.3
+textwrap3==0.9.2
+torch==1.8.0
+joblib==1.0.1
+Pillow==8.1.2
+protobuf==3.20.*
+altair<5
+rank-bm25
+nltk
+pyvi

src/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 5 2020
+@author: jonas
+"""

src/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (202 Bytes). View file

src/__pycache__/clean_dataset.cpython-39.pyc ADDED Viewed

Binary file (1.91 kB). View file

src/clean_dataset.py ADDED Viewed

	@@ -0,0 +1,76 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on
+@author:
+@title: clean_dataset
+@descriptions: set of functions that enable splitting and cleaning.
+"""
+#%%
+import pandas as pd
+import numpy as np
+import string
+from itertools import chain
+from textwrap3 import wrap
+import re
+def split_at_length(dataframe, column, length, title = True):
+    wrapped = []
+    for i in dataframe[column]:
+        wrapped.append(wrap(str(i), length))
+    dataframe = dataframe.assign(wrapped=wrapped)
+    dataframe['wrapped'] = dataframe['wrapped'].apply(lambda x: '; '.join(map(str, x)))
+    if title == True:
+        splitted = pd.concat([pd.Series(row['title'], row['wrapped'].split("; "), )
+                            for _, row in dataframe.iterrows()]).reset_index()
+        splitted = splitted.rename(columns={"index": "text", 0: "title"})
+    else:
+        splitted = []
+    return dataframe, splitted
+def basic(s):
+    """
+    :param s: string to be processed
+    :return: processed string: see comments in the source code for more info
+    """
+    # Text Lowercase
+    s = s.lower()
+    # Remove punctuation
+    translator = str.maketrans(' ', ' ', string.punctuation)
+    s = s.translate(translator)
+    # Remove URLs
+    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+    s = re.sub(r"http\S+", " ", s)
+    # Remove new line characters
+    s = re.sub('\n', ' ', s)
+    # Remove distracting single quotes
+    s = re.sub("\'", " ", s)
+    # Remove all remaining numbers and non alphanumeric characters
+    s = re.sub(r'\d+', ' ', s)
+    s = re.sub(r'\W+', ' ', s)
+    # define custom words to replace:
+    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
+    return s.strip()
+def remove_linebreaks(s):
+    """
+    :param s: string to be processed
+    :return: processed string: see comments in the source code for more info
+    """
+    # Remove new line characters
+    s = re.sub('\n', ' ', s)
+    return s.strip()

style.css ADDED Viewed

	@@ -0,0 +1,30 @@

+/* Base box: rounded corners, white text, inner padding, bottom margin. */
+.highlight {
+  border-radius: 0.4rem;
+  color: white;
+  padding: 0.5rem;
+  margin-bottom: 1rem;
+}
+/* Bold text with a left indent. */
+.bold {
+  padding-left: 1rem;
+  font-weight: 700;
+}
+/* Background colour helpers. */
+/* NOTE(review): .blue sets lightcoral, not a blue tone - confirm intent. */
+.blue {
+  background-color: lightcoral;
+}
+.green {
+  background-color: green;
+}
+.red {
+  background-color: red;
+}
+.IndianRed {
+  background-color: IndianRed;
+}
+.lightgreen {
+  background-color: lightgreen;
+}
+.turquoise {
+  background-color: turquoise;
+}