Neural-Chat-Southampton

Runtime error

App Files Community

Warlord-K committed on Oct 13, 2023

Commit

01df155

1 Parent(s): f1ca1a4

Proper Chatbot Interface

Browse files

Files changed (1) hide show

app.py +30 -8

app.py CHANGED Viewed

@@ -21,11 +21,15 @@ MAX_INPUT_TOKEN_LENGTH = 4000
 EMBED_DIM = 1024
 K = 10
 EF = 100
 SEARCH_INDEX = "search_index.bin"
 EMBEDDINGS_FILE = "embeddings.npy"
 DOCUMENT_DATASET = "chunked_data.parquet"
 COSINE_THRESHOLD = 0.7
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Running on device:", torch_device)
 print("CPU threads:", torch.get_num_threads())
@@ -36,6 +40,11 @@ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2", max_length
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
 def create_qa_prompt(query, relevant_chunks):
     stuffed_context = " ".join(relevant_chunks)
@@ -112,7 +121,7 @@ def get_completion(
     return response["choices"][0]["message"]["content"] if not stream else response
-# load the index for the PEFT docs
 def load_hnsw_index(index_file):
     # Load the HNSW index from the specified file
     index = hnswlib.Index(space="ip", dim=EMBED_DIM)
@@ -120,7 +129,7 @@ def load_hnsw_index(index_file):
     return index
-# create the index for the PEFT docs from numpy embeddings
 # avoid the arch mismatches when creating search index
 def create_hnsw_index(embeddings_file, M=16, efC=100):
     embeddings = np.load(embeddings_file)
@@ -181,7 +190,7 @@ DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = 4000
 DESCRIPTION = """
-# PEFT Docs QA Chatbot 🤗
 """
 LICENSE = """
@@ -285,6 +294,18 @@ def check_input_token_length(message: str, chat_history: list[tuple[str, str]],
             f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
         )
 search_index = create_hnsw_index(EMBEDDINGS_FILE)  # load_hnsw_index(SEARCH_INDEX)
 data_df = pd.read_parquet(DOCUMENT_DATASET).reset_index()
@@ -341,11 +362,12 @@ with gr.Blocks(css="style.css") as demo:
     gr.Examples(
         examples=[
-            "What is 🤗 PEFT?",
-            "How do I create a LoraConfig?",
-            "What are the different tuners supported?",
-            "How do I use LoRA with custom models?",
-            "What are the different real-world applications that I can use PEFT for?",
         ],
         inputs=textbox,
         outputs=[textbox, chatbot],

 EMBED_DIM = 1024
 K = 10
 EF = 100
+TEXT_FILE = 'data.txt'
 SEARCH_INDEX = "search_index.bin"
 EMBEDDINGS_FILE = "embeddings.npy"
 DOCUMENT_DATASET = "chunked_data.parquet"
 COSINE_THRESHOLD = 0.7
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Running on device:", torch_device)
 print("CPU threads:", torch.get_num_threads())
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
+def read_text_from_file(file_path):
+    with open(file_path, "r") as text_file:
+        text = text_file.read()
+    texts = text.split("&&")
+    return [t.strip() for t in texts]
 def create_qa_prompt(query, relevant_chunks):
     stuffed_context = " ".join(relevant_chunks)
     return response["choices"][0]["message"]["content"] if not stream else response
+# load the index for the data
 def load_hnsw_index(index_file):
     # Load the HNSW index from the specified file
     index = hnswlib.Index(space="ip", dim=EMBED_DIM)
     return index
+# create the index for the data from numpy embeddings
 # avoid the arch mismatches when creating search index
 def create_hnsw_index(embeddings_file, M=16, efC=100):
     embeddings = np.load(embeddings_file)
 MAX_INPUT_TOKEN_LENGTH = 4000
 DESCRIPTION = """
+# AVA Southampton Chatbot 🤗
 """
 LICENSE = """
             f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
         )
+if not os.path.exists(TEXT_FILE):
+    os.system(f"wget -O {TEXT_FILE} https://huggingface.co/spaces/Slycat/Southampton-Similarity/resolve/main/Southampton.txt")
+if not os.path.exists(EMBEDDINGS_FILE):
+    texts = read_text_from_file(TEXT_FILE)
+    embeddings = biencoder.encode(texts, normalize_embeddings=True)
+    np.save(EMBEDDINGS_FILE,embeddings)
+if not os.path.exists(DOCUMENT_DATASET):
+    texts = read_text_from_file(TEXT_FILE)
+    df = pd.DataFrame(texts, columns = ["chunk_content"])
+    df.to_parquet(DOCUMENT_DATASET,index=False)
 search_index = create_hnsw_index(EMBEDDINGS_FILE)  # load_hnsw_index(SEARCH_INDEX)
 data_df = pd.read_parquet(DOCUMENT_DATASET).reset_index()
     gr.Examples(
         examples=[
+            "What is University of Southampton?",
+            "Is University of Southampton Good?",
+            "What is sports facility at southampton university?",
+            "How big is the Southampton campus?",
+            "What are the rankings of southampton university?",
+            "What research facilities does the Southampton university offer?"
         ],
         inputs=textbox,
         outputs=[textbox, chatbot],