Robert committed on
Commit 2827202 (1 parent: 83870cc)

Added a way to evaluate overall performance of our model based on exact match and F1-score.

base_model/evaluate.py ADDED
@@ -0,0 +1,66 @@
+ def normalize_text(s: str) -> str:
+     """Preprocesses the sentence string by normalizing.
+
+     Args:
+         s (str): the sentence
+
+     Returns:
+         str: the normalized sentence
+     """
+     import string, re
+
+     def remove_articles(text):
+         regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+         return re.sub(regex, " ", text)
+
+     def white_space_fix(text):
+         return " ".join(text.split())
+
+     def remove_punc(text):
+         exclude = set(string.punctuation)
+         return "".join(ch for ch in text if ch not in exclude)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ def compute_exact_match(prediction: str, answer: str) -> int:
+     """Computes exact match for sentences.
+
+     Args:
+         prediction (str): the predicted answer
+         answer (str): the gold answer
+
+     Returns:
+         int: 1 for an exact match, 0 otherwise
+     """
+     return int(normalize_text(prediction) == normalize_text(answer))
+
+
+ def compute_f1(prediction: str, answer: str) -> float:
+     """Computes the F1-score on token overlap for sentences.
+
+     Args:
+         prediction (str): the predicted answer
+         answer (str): the gold answer
+
+     Returns:
+         float: the F1-score
+     """
+     pred_tokens = normalize_text(prediction).split()
+     answer_tokens = normalize_text(answer).split()
+
+     if len(pred_tokens) == 0 or len(answer_tokens) == 0:
+         return int(pred_tokens == answer_tokens)
+
+     common_tokens = set(pred_tokens) & set(answer_tokens)
+
+     if len(common_tokens) == 0:
+         return 0
+
+     prec = len(common_tokens) / len(pred_tokens)
+     rec = len(common_tokens) / len(answer_tokens)
+
+     return 2 * (prec * rec) / (prec + rec)
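As a quick sanity check of these metrics (the import path is assumed from the file layout above; the prediction/answer pair is made up for illustration):

```python
from base_model.evaluate import compute_exact_match, compute_f1

pred, gold = "a convolutional neural network", "a recurrent neural network"

# Normalization lowercases, strips punctuation and articles, so the token
# lists are [convolutional, neural, network] vs. [recurrent, neural, network].
print(compute_exact_match(pred, gold))   # 0: the normalized strings differ
# 2 of 3 tokens overlap, so precision = recall = 2/3 and F1 = 2/3
print(f"{compute_f1(pred, gold):.02f}")  # 0.67
```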
base_model/main.py CHANGED
@@ -13,3 +13,8 @@ if __name__ == '__main__':
          print(f"Result {i+1} (score: {score:.02f}):")
          print(result['text'][i])
          print()  # Newline
+
+     # Compute overall performance
+     exact_match, f1_score, total = r.evaluate()
+     print(f"Exact match: {exact_match} / {total}\n"
+           f"F1-score: {f1_score:.02f}")
base_model/retriever.py CHANGED
@@ -7,6 +7,9 @@ from transformers import (
  from datasets import load_dataset
  import torch
  import os.path
+ import numpy
+
+ import evaluate

  # Hacky fix for FAISS error on macOS
  # See https://stackoverflow.com/a/63374568/4545692
@@ -49,6 +52,7 @@ class Retriever:
          # Dataset building
          self.dataset = self.__init_dataset(dataset)

+
      def __init_dataset(self,
                         dataset: str,
                         fname: str = "./models/paragraphs_embedding.faiss"):
@@ -65,6 +69,7 @@ class Retriever:
          """
          # Load dataset
          ds = load_dataset(dataset, name="paragraphs")["train"]
+         print(ds)

          if os.path.exists(fname):
              # If we already have FAISS embeddings, load them from disk
@@ -112,4 +117,32 @@ class Retriever:
          scores, results = self.dataset.get_nearest_examples(
              "embeddings", question_embedding, k=k
          )
+
          return scores, results
+
+     def evaluate(self):
+         """Evaluates the entire model by computing F1-score and exact match on the
+         entire dataset.
+
+         Returns:
+             int: overall exact match
+             float: overall F1-score
+             int: total number of questions handled
+         """
+         questions_ds = load_dataset("GroNLP/ik-nlp-22_slp", name="questions")['test']
+         questions = questions_ds['question']
+         answers = questions_ds['answer']
+
+         predictions = []
+         scores = 0
+
+         # Currently just takes the first answer and does not look at scores yet
+         for question in questions:
+             score, result = self.retrieve(question, 1)
+             scores += score[0]
+             predictions.append(result['text'][0])
+
+         exact_match = sum(evaluate.compute_exact_match(predictions[i], answers[i]) for i in range(len(answers)))
+         f1_score = sum(evaluate.compute_f1(predictions[i], answers[i]) for i in range(len(answers))) / len(answers)
+
+         return exact_match, f1_score, len(answers)
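Assuming the `Retriever` constructor takes the dataset name the same way `__init_dataset` does (the argument shown is an assumption, not confirmed by this diff), the new method can be exercised on its own:

```python
from base_model.retriever import Retriever

# Constructor argument is assumed; only __init_dataset's signature is visible here.
r = Retriever(dataset="GroNLP/ik-nlp-22_slp")

exact_match, f1_score, total = r.evaluate()
print(f"{exact_match} exact matches out of {total}, mean F1 = {f1_score:.02f}")
```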