Spaces: Runtime error
update
app.py CHANGED
@@ -6,7 +6,8 @@ import streamlit as st
 import torch
 
 import math
-import os
+import os
+import re
 
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
 
@@ -14,7 +15,7 @@ os.environ['KMP_DUPLICATE_LIB_OK']='True'
 @st.cache(allow_output_mutation=True)
 def load_model_and_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("kaisugi/scitoricsbert")
-    model = AutoModel.from_pretrained("kaisugi/scitoricsbert")
+    model = AutoModel.from_pretrained("kaisugi/scitoricsbert", output_attentions=True)
     model.eval()
 
     return model, tokenizer
@@ -63,7 +64,56 @@ def load_sentence_embeddings_and_index():
 
 
 @st.cache(allow_output_mutation=True)
-def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df):
+def formulaic_phrase_extraction(sentences, model, tokenizer):
+    THRESHOLD = 0.01
+    LAYER = 10
+
+    output_sentences = []
+
+    with torch.no_grad():
+        inputs = tokenizer.batch_encode_plus(
+            sentences,
+            padding=True,
+            truncation=True,
+            max_length=512,
+            return_tensors='pt'
+        )
+        outputs = model(**inputs)
+        attention = outputs[-1]
+
+        cls_attentions = torch.mean(attention[LAYER][0], dim=0)
+
+        for sentence, cls_attention in zip(sentences, cls_attentions):
+            check_bool_arr = list((cls_attention > THRESHOLD).numpy())[1:-1]
+            tokens = tokenizer.tokenize(sentence)
+
+            cur_tokens = tokens.copy()
+
+            while True:
+                flg = False
+
+                for idx, token in enumerate(cur_tokens):
+                    if token.startswith("##"):
+                        flg = True
+                        back_token = token.replace("##", "")
+                        front_token = cur_tokens.pop(idx-1)
+                        cur_tokens[idx-1] = front_token + back_token
+
+                        back_bool_val = check_bool_arr[idx]
+                        front_bool_val = check_bool_arr.pop(idx-1)
+                        check_bool_arr[idx-1] = front_bool_val and back_bool_val
+
+                if not flg:
+                    break
+
+            result = " ".join([f'<font color="coral">{original_word}</font>' if b else original_word for (b, original_word) in zip(check_bool_arr, sentence.split())])
+            output_sentences.append(result)
+
+    return output_sentences
+
+
+@st.cache(allow_output_mutation=True)
+def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df, exclude_word_list, phrase_annotated=True):
     with torch.no_grad():
         inputs = tokenizer.encode_plus(
             input_text,
@@ -79,19 +129,28 @@ def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df):
 
     _, ids = index.search(x=np.array([query_embeddings]), k=top_k)
     retrieved_sentences = []
-
+    retrieved_paper_ids = []
 
     for id in ids[0]:
-
-
+        cur_sentence = sentence_df.loc[id, "sentence"]
+        cur_link = f"https://aclanthology.org/{sentence_df.loc[id, 'file_id']}"
 
-
+        if len(exclude_word_list) == 0:
+            retrieved_sentences.append(cur_sentence)
+            retrieved_paper_ids.append(cur_link)
 
-
-
-
-
-
+        else:
+            exclude_word_list_regex = '|'.join(exclude_word_list)
+            pat = re.compile(f'{exclude_word_list_regex}')
+
+            if not bool(pat.search(cur_sentence)):
+                retrieved_sentences.append(cur_sentence)
+                retrieved_paper_ids.append(cur_link)
+
+    if phrase_annotated:
+        retrieved_sentences = formulaic_phrase_extraction(retrieved_sentences, model, tokenizer)
+
+    return retrieved_sentences, retrieved_paper_ids
 
 
 if __name__ == "__main__":
@@ -102,11 +161,23 @@ if __name__ == "__main__":
 
     st.markdown("## AI-based Paraphrasing for Academic Writing")
 
-    input_text = st.text_area("text input", "
-    top_k = st.number_input('top_k (upperbound)', min_value=1, value=
-    input_words = st.text_input("exclude words (comma separated)", "
+    input_text = st.text_area("text input", "Our model shows good results.", placeholder="Write something here...")
+    top_k = st.number_input('top_k (upperbound)', min_value=1, value=30, step=1)
+    input_words = st.text_input("exclude words (comma separated)", "good, result")
+
+    agree = st.checkbox('Include phrase annotation')
 
     if st.button('search'):
         exclude_word_list = [s.strip() for s in input_words.split(",") if s.strip() != ""]
-
-
+        retrieved_sentences, retrieved_paper_ids = get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df, exclude_word_list, phrase_annotated=agree)
+
+        result_table_markdown = "| sentence | source link |\n|:---|:---|\n"
+
+        for (retrieved_sentence, retrieved_paper_id) in zip(retrieved_sentences, retrieved_paper_ids):
+            result_table_markdown += f"| {retrieved_sentence} | {retrieved_paper_id} |\n"
+
+        st.markdown(result_table_markdown, unsafe_allow_html=True)
+
+    st.markdown("---\n#### How this works")
+
+    st.markdown("This app uses ScitoricsBERT [(Sugimoto and Aizawa, 2022)](https://aclanthology.org/2022.sdp-1.7/), a functional sentence representation model, to retrieve sentences that are functionally similar to the input. It also extracts phrasal patterns that accord with the function, by leveraging the attention patterns within ScitoricsBERT.")
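
For orientation: the new formulaic_phrase_extraction flags a token as part of a formulaic phrase when the head-averaged attention it receives in one layer exceeds a fixed threshold. Below is a minimal single-sentence sketch of that idea, assuming the standard Hugging Face transformers API; the layer index (10) and threshold (0.01) mirror the LAYER and THRESHOLD constants in the diff, and reading the [CLS] row of the averaged attention matrix is one interpretation of the batched code above, not the app's exact code.

import torch
from transformers import AutoModel, AutoTokenizer

# Sketch only: mark tokens that draw high attention in layer 10 of
# ScitoricsBERT, averaged over attention heads.
tokenizer = AutoTokenizer.from_pretrained("kaisugi/scitoricsbert")
model = AutoModel.from_pretrained("kaisugi/scitoricsbert", output_attentions=True)
model.eval()

sentence = "Our model shows good results."

with torch.no_grad():
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    # outputs.attentions holds one (batch, heads, seq, seq) tensor per layer
    layer_attn = outputs.attentions[10][0]   # layer 10, first batch item
    cls_row = layer_attn.mean(dim=0)[0]      # average heads, take the [CLS] row

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, weight in zip(tokens, cls_row):
    marker = "*" if weight.item() > 0.01 else " "
    print(f"{marker} {token}\t{weight.item():.4f}")

The app's batched version averages heads the same way, then drops the [CLS]/[SEP] positions (the [1:-1] slice) before mapping flags back onto whole words.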
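The subword bookkeeping in that function is the subtle part: WordPiece splits rare words into pieces ("out", "##per", "##forms"), and a whole word should stay highlighted only if every piece that forms it was flagged. Here is a hypothetical standalone rendering of the merge step, restructured to restart the scan after each merge instead of mutating the list mid-iteration:

# Fold "##" continuation pieces into the preceding token and AND their flags,
# so a merged word counts as highlighted only if all of its pieces passed
# the attention threshold.
tokens = ["our", "model", "out", "##per", "##forms", "the", "baseline"]
flags = [True, False, True, True, False, True, False]

while True:
    merged = False
    for idx, token in enumerate(tokens):
        if token.startswith("##"):
            merged = True
            tokens[idx - 1] += token[2:]   # fold piece into previous token
            del tokens[idx]
            flags[idx - 1] = flags[idx - 1] and flags[idx]
            del flags[idx]
            break  # restart the scan after mutating the lists
    if not merged:
        break

print(list(zip(tokens, flags)))
# [('our', True), ('model', False), ('outperforms', False), ('the', True), ('baseline', False)]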