orionweller committed on
Commit 53b3bb9 · 1 Parent(s): d89580e
app.py CHANGED
@@ -34,7 +34,7 @@ corpus_lookups = {}
 queries = {}
 q_lookups = {}
 qrels = {}
-datasets = ["scifact"] # others are too large for the Space unfortunately :(
+datasets = ["scifact"]
 current_dataset = "scifact"
 
 def pool(last_hidden_states, attention_mask):
@@ -68,61 +68,45 @@ def load_model():
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "right"
 
-    base_model_instance = AutoModel.from_pretrained(BASE_MODEL)
+    base_model_instance = AutoModel.from_pretrained(BASE_MODEL, device_map="auto", torch_dtype=torch.float16)
     model = PeftModel.from_pretrained(base_model_instance, CUR_MODEL)
-    model = model.merge_and_unload()
     model.eval()
 
-def save_faiss_index(index, dataset_name):
-    index_path = f"{dataset_name}/faiss_index.bin"
-    faiss.write_index(index, index_path)
-    logger.info(f"Saved FAISS index for {dataset_name} to {index_path}")
-
 def load_faiss_index(dataset_name):
     index_path = f"{dataset_name}/faiss_index.bin"
     if os.path.exists(index_path):
         logger.info(f"Loading existing FAISS index for {dataset_name} from {index_path}")
-        return faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
+        return faiss.read_index(index_path, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
     return None
 
-def load_corpus_embeddings(dataset_name):
-    global retrievers, corpus_lookups
-    corpus_path = f"{dataset_name}/corpus_emb.*.pkl"
-    index_files = glob.glob(corpus_path)
-    logger.info(f'Loading {len(index_files)} files into index for {dataset_name}.')
-
-    # Try to load existing FAISS index
+def search_queries(dataset_name, q_reps, depth=1000):
     faiss_index = load_faiss_index(dataset_name)
-
     if faiss_index is None:
-        # If no existing index, create a new one
-        p_reps_0, p_lookup_0 = pickle_load(index_files[0])
-        retrievers[dataset_name] = FaissFlatSearcher(p_reps_0)
-
-        shards = [(p_reps_0, p_lookup_0)] + [pickle_load(f) for f in index_files[1:]]
-        corpus_lookups[dataset_name] = []
-
-        for p_reps, p_lookup in tqdm.tqdm(shards, desc=f'Loading shards into index for {dataset_name}', total=len(index_files)):
-            retrievers[dataset_name].add(p_reps)
-            corpus_lookups[dataset_name] += p_lookup
-
-        # Save the newly created index
-        save_faiss_index(retrievers[dataset_name].index, dataset_name)
-    else:
-        # Use the loaded index
-        retrievers[dataset_name] = FaissFlatSearcher(faiss_index)
-
-    # Load corpus lookups
-    corpus_lookups[dataset_name] = []
-    for file in index_files:
-        _, p_lookup = pickle_load(file)
-        corpus_lookups[dataset_name] += p_lookup
-
+        raise ValueError(f"No FAISS index found for dataset {dataset_name}")
+
+    # Ensure q_reps is a 2D numpy array of the correct type
+    q_reps = np.ascontiguousarray(q_reps.astype('float32'))
+
+    # Perform the search
+    all_scores, all_indices = faiss_index.search(q_reps, depth)
+
+    psg_indices = [[str(corpus_lookups[dataset_name][x]) for x in q_dd] for q_dd in all_indices]
+
+    # Clean up
+    del faiss_index
+
+    return all_scores, np.array(psg_indices)
 
-def pickle_load(path):
-    with open(path, 'rb') as f:
-        reps, lookup = pickle.load(f)
-    return np.array(reps), lookup
+def load_corpus_lookups(dataset_name):
+    global corpus_lookups
+    corpus_path = f"{dataset_name}/corpus_emb.*.pkl"
+    index_files = glob.glob(corpus_path)
+
+    corpus_lookups[dataset_name] = []
+    for file in index_files:
+        with open(file, 'rb') as f:
+            _, p_lookup = pickle.load(f)
+        corpus_lookups[dataset_name] += p_lookup
 
 def load_queries(dataset_name):
     global queries, q_lookups, qrels
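The hunk above swaps the in-memory FaissFlatSearcher build for a prebuilt index opened memory-mapped and read-only, mirroring the flags in the new `load_faiss_index`. A toy sketch of that flow, with made-up dimensions and a placeholder file name:

```python
# Build a small flat index once, then reopen it memory-mapped so search
# does not pull every vector into RAM. Dimensions and path are illustrative.
import faiss
import numpy as np

dim = 8
vecs = np.random.rand(100, dim).astype("float32")
index = faiss.IndexFlatIP(dim)          # inner-product flat index
index.add(vecs)
faiss.write_index(index, "faiss_index.bin")

mmap_index = faiss.read_index("faiss_index.bin",
                              faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
q = np.ascontiguousarray(vecs[:2])       # queries: contiguous float32, shape (n, dim)
scores, ids = mmap_index.search(q, 5)    # top-5 integer ids per query
```

The integer ids are then mapped back to document ids through the `corpus_lookups` lists, exactly as `search_queries` does with `psg_indices`.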
@@ -143,7 +127,6 @@ def load_queries(dataset_name):
 @spaces.GPU
 def encode_queries(dataset_name, postfix):
     global queries, tokenizer, model
-    model = model.cuda()
    input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[dataset_name]]
 
     encoded_embeds = []
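Dropping `model = model.cuda()` follows from the earlier hunk: with `device_map="auto"`, accelerate places the weights at load time, so no manual device shuffling is needed. A minimal sketch of that loading pattern, assuming hypothetical checkpoint names rather than the Space's actual config:

```python
# fp16 base model + unmerged LoRA adapter, placed by accelerate.
# Both repo ids below are placeholders, not the Space's real BASE_MODEL/CUR_MODEL.
import torch
from transformers import AutoModel
from peft import PeftModel

BASE_MODEL = "org/some-base-encoder"   # hypothetical
CUR_MODEL = "org/some-lora-adapter"    # hypothetical

base = AutoModel.from_pretrained(BASE_MODEL, device_map="auto",
                                 torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, CUR_MODEL)  # adapter kept unmerged
model.eval()
```

Note the diff also removes `merge_and_unload()`; keeping the adapter unmerged avoids the extra memory spike of materializing merged weights, which matters on a constrained Space.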
@@ -161,15 +144,8 @@ def encode_queries(dataset_name, postfix):
         embeds = F.normalize(embeds, p=2, dim=-1)
         encoded_embeds.append(embeds.cpu().numpy())
 
-
-    # remove model from GPU
-    model = model.cpu()
     return np.concatenate(encoded_embeds, axis=0)
 
-def search_queries(dataset_name, q_reps, depth=1000):
-    all_scores, all_indices = retrievers[dataset_name].search(q_reps, depth)
-    psg_indices = [[str(corpus_lookups[dataset_name][x]) for x in q_dd] for q_dd in all_indices]
-    return all_scores, np.array(psg_indices)
 
 def evaluate(qrels, results, k_values):
     evaluator = pytrec_eval.RelevanceEvaluator(
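The `F.normalize(..., p=2, dim=-1)` step above is what makes inner-product search over the flat index behave like cosine similarity. A self-contained sketch of the pooling-plus-normalization pattern, assuming mask-aware mean pooling (the app's own `pool()` may use a different strategy, e.g. last-token pooling):

```python
# Mask-aware mean pooling over transformer outputs, then L2 normalization
# so that dot products between embeddings equal cosine similarities.
import torch
import torch.nn.functional as F

def mean_pool(last_hidden_states, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq, 1)
    summed = (last_hidden_states * mask).sum(dim=1)  # zero out padding positions
    counts = mask.sum(dim=1).clamp(min=1e-9)         # tokens per example
    return summed / counts

hidden = torch.randn(2, 5, 16)   # fake (batch, seq, hidden) activations
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
embeds = F.normalize(mean_pool(hidden, mask), p=2, dim=-1)  # unit-length rows
```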
@@ -187,8 +163,8 @@ def evaluate(qrels, results, k_values):
 def run_evaluation(dataset, postfix):
     global current_dataset
 
-    if dataset not in retrievers or dataset not in queries:
-        load_corpus_embeddings(dataset)
+    if dataset not in corpus_lookups or dataset not in queries:
+        load_corpus_lookups(dataset)
         load_queries(dataset)
 
     current_dataset = dataset
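`evaluate()` is a thin wrapper around pytrec_eval. A tiny standalone example of that API with toy qrels and a toy run; the measure names here are chosen for illustration, while the app's `k_values` drive the real measure set:

```python
# qrels: query id -> {doc id: relevance}; results: query id -> {doc id: score}.
import pytrec_eval

qrels = {"q1": {"d1": 1, "d2": 0}}
results = {"q1": {"d1": 12.3, "d2": 7.1, "d3": 3.2}}

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10", "recall.100"})
metrics = evaluator.evaluate(results)              # per-query measure dict
print(metrics["q1"]["ndcg_cut_10"], metrics["q1"]["recall_100"])
```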
@@ -208,16 +184,14 @@ def run_evaluation(dataset, postfix):
 
 def gradio_interface(dataset, postfix):
     if 'model' not in globals() or model is None:
-        # Load model and initial datasets
         load_model()
         for dataset in datasets:
             print(f"Loading dataset: {dataset}")
-            load_corpus_embeddings(dataset)
+            load_corpus_lookups(dataset)
             load_queries(dataset)
 
     return run_evaluation(dataset, postfix)
 
-
 # Create Gradio interface
 iface = gr.Interface(
     fn=gradio_interface,
@@ -230,7 +204,7 @@ iface = gr.Interface(
     description="Select a dataset and enter a prompt to evaluate the model's performance. Note: it takes about **ten seconds** to evaluate.",
     examples=[
         ["scifact", ""],
-        ["scifact", "When judging the relevance of a document, focus on the pragmatics of the query and consider irrelevant any documents for which the user would have used a different query."]
+        ["scifact", "Think carefully about these conditions when determining relevance."]
     ],
     cache_examples=True,
 )
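For reference, the overall UI shape: a bare-bones `gr.Interface` sketch with stand-in components and a placeholder function, while the real app wires `gradio_interface`/`run_evaluation` as shown in the diff:

```python
# Minimal Gradio sketch; component choices here are assumptions.
import gradio as gr

def fake_eval(dataset, postfix):
    return {"dataset": dataset, "prompt": postfix, "ndcg@10": 0.0}  # placeholder output

iface = gr.Interface(
    fn=fake_eval,
    inputs=[gr.Dropdown(choices=["scifact"], value="scifact"),
            gr.Textbox(label="Prompt postfix")],
    outputs=gr.JSON(),
    cache_examples=False,
)

if __name__ == "__main__":
    iface.launch()
```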
scifact/corpus_emb.0.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0bb98e68350983519732b0b39e8f98ec0225abd2c68775e7317da9b17f0db1dd
-size 21247618

scifact/corpus_emb.1.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3dd3501342754aeb2ffb895480868e0976895bded3e5accbd8e5b6fa404e5484
-size 21247619

scifact/corpus_emb.2.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0e1a98c698cbe367bc1abc789da76794a8e79e92743059b26faafbd34808aa15
-size 21247619

scifact/corpus_emb.3.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:911c8d6654bfb14a3d68363c96a70462348cfbbf35a591e020877ed28591339c
-size 21231225