kaisugi committed on
Commit
e1a3f25
·
1 Parent(s): b6363d9
Files changed (1) hide show
  1. app.py +37 -40
app.py CHANGED
@@ -27,14 +27,41 @@ def load_sentence_data():
27
 
28
 
29
  @st.cache(allow_output_mutation=True)
30
- def load_sentence_embeddings():
31
  npz_comp = np.load("sentence_embeddings_789k.npz")
32
  sentence_embeddings = npz_comp["arr_0"]
33
 
34
- return sentence_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
35
 
 
 
 
36
 
37
- @st.cache
 
 
 
 
 
 
 
 
 
 
 
 
38
  def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df):
39
  with torch.no_grad():
40
  inputs = tokenizer.encode_plus(
@@ -58,47 +85,17 @@ def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_d
58
  return pd.DataFrame({"sentences": retrieved_sentences})
59
 
60
 
61
- def main(model, tokenizer, sentence_df, index):
62
- st.markdown("## AI-based Paraphrasing for Academic Writing")
63
-
64
- input_text = st.text_area("text input", "Model have good results.", placeholder="Write something here...")
65
- top_k = st.number_input('top_k', min_value=1, value=10, step=1)
66
-
67
- df = get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df)
68
- st.table(df)
69
-
70
-
71
  if __name__ == "__main__":
72
  model, tokenizer = load_model_and_tokenizer()
73
  sentence_df = load_sentence_data()
74
- sentence_embeddings = load_sentence_embeddings()
75
-
76
- faiss.normalize_L2(sentence_embeddings)
77
-
78
- D = 768
79
- N = 789188
80
- Xt = sentence_embeddings[:39000]
81
- X = sentence_embeddings
82
-
83
- # Param of PQ
84
- M = 16 # The number of sub-vector. Typically this is 8, 16, 32, etc.
85
- nbits = 8 # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
86
- # Param of IVF
87
- nlist = 1000 # The number of cells (space partition). Typical value is sqrt(N)
88
- # Param of HNSW
89
- hnsw_m = 32 # The number of neighbors for HNSW. This is typically 32
90
-
91
- # Setup
92
- quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
93
- index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)
94
 
95
- # Train
96
- index.train(Xt)
97
 
98
- # Add
99
- index.add(X)
100
 
101
- # Search
102
- index.nprobe = 8 # Runtime param. The number of cells that are visited for search.
103
 
104
- main(model, tokenizer, sentence_df, index)
 
 
 
27
 
28
 
29
  @st.cache(allow_output_mutation=True)
30
+ def load_sentence_embeddings_and_index():
31
  npz_comp = np.load("sentence_embeddings_789k.npz")
32
  sentence_embeddings = npz_comp["arr_0"]
33
 
34
+ faiss.normalize_L2(sentence_embeddings)
35
+ D = 768
36
+ N = 789188
37
+ Xt = sentence_embeddings[:39000]
38
+ X = sentence_embeddings
39
+
40
+ # Param of PQ
41
+ M = 16 # The number of sub-vector. Typically this is 8, 16, 32, etc.
42
+ nbits = 8 # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
43
+ # Param of IVF
44
+ nlist = 888 # The number of cells (space partition). Typical value is sqrt(N)
45
+ # Param of HNSW
46
+ hnsw_m = 32 # The number of neighbors for HNSW. This is typically 32
47
 
48
+ # Setup
49
+ quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
50
+ index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)
51
 
52
+ # Train
53
+ index.train(Xt)
54
+
55
+ # Add
56
+ index.add(X)
57
+
58
+ # Search
59
+ index.nprobe = 8 # Runtime param. The number of cells that are visited for search.
60
+
61
+ return sentence_embeddings, index
62
+
63
+
64
+ @st.cache(allow_output_mutation=True)
65
  def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df):
66
  with torch.no_grad():
67
  inputs = tokenizer.encode_plus(
 
85
  return pd.DataFrame({"sentences": retrieved_sentences})
86
 
87
 
 
 
 
 
 
 
 
 
 
 
88
  if __name__ == "__main__":
89
  model, tokenizer = load_model_and_tokenizer()
90
  sentence_df = load_sentence_data()
91
+ sentence_embeddings, index = load_sentence_embeddings_and_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
 
 
93
 
94
+ st.markdown("## AI-based Paraphrasing for Academic Writing")
 
95
 
96
+ input_text = st.text_area("text input", "Model have good results.", placeholder="Write something here...")
97
+ top_k = st.number_input('top_k', min_value=1, value=10, step=1)
98
 
99
+ if st.button('search'):
100
+ df = get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df)
101
+ st.table(df)