Commit 51a31d4 · Parent(s): 51dabd6
refactor evaluation
main.py
CHANGED
@@ -1,23 +1,38 @@
-from …
-from src.utils.log import get_logger
+from datasets import DatasetDict, load_dataset
 
+from src.retrievers.fais_retriever import FAISRetriever
+from src.utils.log import get_logger
+from src.evaluation import evaluate
+from typing import cast
 
 logger = get_logger()
 
 
 if __name__ == '__main__':
+    dataset_name = "GroNLP/ik-nlp-22_slp"
+    paragraphs = load_dataset(dataset_name, "paragraphs")
+    questions = cast(DatasetDict, load_dataset(dataset_name, "questions"))
+
+    questions_test = questions["test"]
+
+    logger.info(questions)
+
     # Initialize retriever
     r = FAISRetriever()
 
-    # Retrieve example
-    …
-    …
+    # # Retrieve example
+    example_q = "What is the perplexity of a language model?"
+    scores, result = r.retrieve(example_q)
+
+    logger.info(
+        f"Example q: {example_q} answer: {result['text'][0]}")
 
     for i, score in enumerate(scores):
         logger.info(f"Result {i+1} (score: {score:.02f}):")
         logger.info(result['text'][i])
 
     # Compute overall performance
-    exact_match, f1_score = …
+    exact_match, f1_score = evaluate(
+        r, questions_test["question"], questions_test["answer"])
     logger.info(f"Exact match: {exact_match:.02f}\n"
                 f"F1-score: {f1_score:.02f}")
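A note on the new dataset handling: `load_dataset` is typed as returning a union of dataset containers, so the script casts to `DatasetDict` before indexing the test split. A minimal sketch of the pattern (illustrative, not part of the commit):

from typing import cast

from datasets import DatasetDict, load_dataset

# load_dataset's return type is a union (Dataset, DatasetDict, ...), so the
# cast keeps type checkers happy when indexing splits by name.
questions = cast(DatasetDict, load_dataset("GroNLP/ik-nlp-22_slp", "questions"))
questions_test = questions["test"]

# Column access on a split returns the whole column as a Python list,
# which is what evaluate() iterates over.
print(questions_test["question"][:3])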
src/evaluation.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import Callable, List
+from typing import Any, Callable, List
+from src.retrievers.base_retriever import Retriever
 
 from src.utils.string_utils import (lower, remove_articles, remove_punc,
                                     white_space_fix)
@@ -63,3 +64,29 @@ def f1(prediction: str, answer: str) -> float:
     rec = len(common_tokens) / len(answer_tokens)
 
     return 2 * (prec * rec) / (prec + rec)
+
+
+def evaluate(retriever: Retriever, questions: Any, answers: Any):
+    """Evaluates the entire model by computing F1-score and exact match on the
+    entire dataset.
+
+    Returns:
+        float: overall exact match
+        float: overall F1-score
+    """
+
+    predictions = []
+    scores = 0
+
+    # Currently just takes the first answer and does not look at scores yet
+    for question in questions:
+        score, result = retriever.retrieve(question, 1)
+        scores += score[0]
+        predictions.append(result['text'][0])
+
+    exact_matches = [exact_match(
+        predictions[i], answers[i]) for i in range(len(answers))]
+    f1_scores = [f1(
+        predictions[i], answers[i]) for i in range(len(answers))]
+
+    return sum(exact_matches) / len(exact_matches), sum(f1_scores) / len(f1_scores)
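To make the two metrics concrete: `exact_match` compares fully normalized strings, while `f1` measures token overlap between prediction and answer. A worked toy example, with a simplified inline normalizer standing in for the `string_utils` helpers (the real ones also strip articles and punctuation, which this sketch omits):

def normalize(s: str) -> str:
    # Simplified stand-in for lower/remove_articles/remove_punc/white_space_fix.
    return " ".join(s.lower().split())

prediction = "Eiffel Tower in Paris"
answer = "eiffel tower"

# Exact match after normalization: strings differ -> 0.0
em = float(normalize(prediction) == normalize(answer))

# Token-level F1: common tokens are {"eiffel", "tower"}
pred_tokens = normalize(prediction).split()      # 4 tokens
answer_tokens = normalize(answer).split()        # 2 tokens
common = set(pred_tokens) & set(answer_tokens)   # 2 tokens
prec = len(common) / len(pred_tokens)            # 2/4 = 0.5
rec = len(common) / len(answer_tokens)           # 2/2 = 1.0
f1 = 2 * prec * rec / (prec + rec)               # 2/3 ≈ 0.67

print(em, round(f1, 2))  # 0.0 0.67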
src/retrievers/base_retriever.py
ADDED
@@ -0,0 +1,3 @@
+class Retriever():
+    def retrieve(self, query: str, k: int):
+        pass
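`Retriever` is a bare interface, but the rest of the commit pins down its implicit contract: `retrieve` must return a `(scores, results)` pair where `results['text'][i]` is the i-th retrieved passage, since that is what both `main.py` and `evaluate` index into. An illustrative subclass (the keyword-overlap scoring is a sketch, not repo code):

from src.retrievers.base_retriever import Retriever


class KeywordRetriever(Retriever):
    """Illustrative subclass: ranks stored texts by naive keyword overlap."""

    def __init__(self, texts):
        self.texts = texts

    def retrieve(self, query: str, k: int):
        words = set(query.lower().split())
        ranked = sorted(
            self.texts,
            key=lambda t: len(words & set(t.lower().split())),
            reverse=True,
        )[:k]
        scores = [float(len(words & set(t.lower().split()))) for t in ranked]
        # Mirror the (scores, results) shape the rest of the code expects.
        return scores, {"text": ranked}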
src/{es_retriever.py → retrievers/es_retriever.py}
RENAMED
@@ -1,8 +1,10 @@
-class ESRetriever:
-    def __init__(self, dataset_name: str = "GroNLP/ik-nlp-22_slp"):
-        self.dataset_name = dataset_name
+from src.utils.log import get_logger
 
-…
+logger = get_logger()
+
+
+class ESRetriever(Retriever):
+    def __init__(self, data_set):
         pass
 
     def retrieve(self, query: str, k: int):
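`ESRetriever` remains a stub in this commit. For orientation, a hedged sketch of what an Elasticsearch-backed `retrieve` might eventually look like with the official Python client; the index name, document field, and client setup here are assumptions, not repo code:

from elasticsearch import Elasticsearch

from src.retrievers.base_retriever import Retriever


class SketchESRetriever(Retriever):
    """Illustrative only: BM25 full-text search over an assumed 'paragraphs' index."""

    def __init__(self, host: str = "http://localhost:9200"):
        self.es = Elasticsearch(host)

    def retrieve(self, query: str, k: int):
        resp = self.es.search(
            index="paragraphs",                # assumed index name
            query={"match": {"text": query}},  # assumed document field
            size=k,
        )
        hits = resp["hits"]["hits"]
        scores = [hit["_score"] for hit in hits]
        # Mirror the (scores, results) shape used by FAISRetriever.
        return scores, {"text": [hit["_source"]["text"] for hit in hits]}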
src/{fais_retriever.py → retrievers/fais_retriever.py}
RENAMED
@@ -1,19 +1,27 @@
-# Hacky fix for FAISS error on macOS
-# See https://stackoverflow.com/a/63374568/4545692
 import os
 import os.path
 
 import torch
 from datasets import load_dataset
-from transformers import (
-    …
+from transformers import (
+    DPRContextEncoder,
+    DPRContextEncoderTokenizer,
+    DPRQuestionEncoder,
+    DPRQuestionEncoderTokenizer,
+)
 
-from src.…
+from src.retrievers.base_retriever import Retriever
+from src.utils.log import get_logger
 
 os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
+# Hacky fix for FAISS error on macOS
+# See https://stackoverflow.com/a/63374568/4545692
+
 
+logger = get_logger()
 
-class FAISRetriever:
+
+class FAISRetriever(Retriever):
     """A class used to retrieve relevant documents based on some query.
     based on https://huggingface.co/docs/datasets/faiss_es#faiss.
     """
@@ -65,7 +73,7 @@ class FAISRetriever:
         # Load dataset
         ds = load_dataset(dataset_name, name="paragraphs")[
             "train"]  # type: ignore
-
+        logger.info(ds)
 
         if os.path.exists(embedding_path):
             # If we already have FAISS embeddings, load them from disk
@@ -115,32 +123,3 @@ class FAISRetriever:
         )
 
         return scores, results
-
-    def evaluate(self):
-        """Evaluates the entire model by computing F1-score and exact match on the
-        entire dataset.
-
-        Returns:
-            float: overall exact match
-            float: overall F1-score
-        """
-        questions_ds = load_dataset(
-            self.dataset_name, name="questions")['test']
-        questions = questions_ds['question']
-        answers = questions_ds['answer']
-
-        predictions = []
-        scores = 0
-
-        # Currently just takes the first answer and does not look at scores yet
-        for question in questions:
-            score, result = self.retrieve(question, 1)
-            scores += score[0]
-            predictions.append(result['text'][0])
-
-        exact_matches = [exact_match(
-            predictions[i], answers[i]) for i in range(len(answers))]
-        f1_scores = [f1(
-            predictions[i], answers[i]) for i in range(len(answers))]
-
-        return sum(exact_matches) / len(exact_matches), sum(f1_scores) / len(f1_scores)
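The class docstring points at the Hugging Face FAISS guide, and the newly added DPR imports follow the same recipe: encode passages with the context encoder, build a FAISS index over the embeddings column, then encode the question and ask for nearest examples. A condensed sketch of that flow (model names as in the HF docs; the dataset and its 'text' column follow this repo, everything else is illustrative):

from datasets import load_dataset
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoder,
    DPRQuestionEncoderTokenizer,
)

ctx_tok = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_enc = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
q_tok = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_enc = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")

ds = load_dataset("GroNLP/ik-nlp-22_slp", name="paragraphs")["train"]

# Embed every paragraph with the context encoder and index with FAISS.
ds = ds.map(lambda ex: {
    "embeddings": ctx_enc(
        **ctx_tok(ex["text"], return_tensors="pt", truncation=True)
    )[0][0].detach().numpy()
})
ds.add_faiss_index(column="embeddings")

# Embed the question with the question encoder; fetch the k nearest paragraphs.
question = "What is the perplexity of a language model?"
q_emb = q_enc(**q_tok(question, return_tensors="pt"))[0][0].detach().numpy()
scores, results = ds.get_nearest_examples("embeddings", q_emb, k=5)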