Spaces:

RugNlpFlashcards
/

Speech_Language_Processing_Jurafsky_Martin

Build error

App Files Files Community

Robert committed on Mar 18, 2022

Commit

7570c1d

•

2 Parent(s): 8fe5a80 aa426fb

Merge pull request #1

Browse files

Files changed (3) hide show

base_model/evaluate.py +18 -20
base_model/retriever.py +16 -13
base_model/string_utils.py +20 -0

base_model/evaluate.py CHANGED Viewed

@@ -1,29 +1,27 @@
-def normalize_text(s: str) -> str:
     """Preprocesses the sentence string by normalizing.
     Args:
         s (str): the sentence
     Returns:
-        string: normalized sentence
     """
-    import string, re
-    def remove_articles(text):
-        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
-        return re.sub(regex, " ", text)
-    def white_space_fix(text):
-        return " ".join(text.split())
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return "".join(ch for ch in text if ch not in exclude)
-    def lower(text):
-        return text.lower()
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
 def compute_exact_match(prediction: str, answer: str) -> int:
@@ -36,7 +34,7 @@ def compute_exact_match(prediction: str, answer: str) -> int:
     Returns:
         int: 1 for exact match, 0 for not
     """
-    return int(normalize_text(prediction) == normalize_text(answer))
 def compute_f1(prediction: str, answer: str) -> float:
@@ -49,8 +47,8 @@ def compute_f1(prediction: str, answer: str) -> float:
     Returns:
         boolean: the f1 score
     """
-    pred_tokens = normalize_text(prediction).split()
-    answer_tokens = normalize_text(answer).split()
     if len(pred_tokens) == 0 or len(answer_tokens) == 0:
         return int(pred_tokens == answer_tokens)

+from typing import Callable, List
+from base_model.string_utils import lower, remove_articles, remove_punc, white_space_fix
+def normalize_text(inp: str, preprocessing_functions: List[Callable[[str], str]]):
+    """Apply each preprocessing function to the text, in list order.
+    Args:
+        inp (str): the text to normalize
+        preprocessing_functions: callables applied one after another; each
+            receives the output of the previous one
+    Returns:
+        the text after the last function has run (``inp`` itself when the
+        list is empty)
+    """
+    result = inp
+    for step in preprocessing_functions:
+        result = step(result)
+    return result
+def normalize_text_default(inp: str) -> str:
     """Preprocesses the sentence string by normalizing.
+
+    Lowercases the text, strips punctuation, removes the articles
+    "a"/"an"/"the" and collapses whitespace, in that order.
+
     Args:
+        inp (str): the sentence
     Returns:
+        string: normalized with default parameters
     """
+    # Order matters: remove_articles only matches lowercase articles, so
+    # lower must run before it, and white_space_fix must run last to collapse
+    # the gaps left around removed punctuation and articles. This is the same
+    # order as the nested calls in the normalize_text it replaces.
+    steps = [lower, remove_punc, remove_articles, white_space_fix]
+    return normalize_text(inp, steps)
 def compute_exact_match(prediction: str, answer: str) -> int:
     Returns:
         int: 1 for exact match, 0 for not
     """
+    return int(normalize_text_default(prediction) == normalize_text_default(answer))
 def compute_f1(prediction: str, answer: str) -> float:
     Returns:
         boolean: the f1 score
     """
+    pred_tokens = normalize_text_default(prediction).split()
+    answer_tokens = normalize_text_default(answer).split()
     if len(pred_tokens) == 0 or len(answer_tokens) == 0:
         return int(pred_tokens == answer_tokens)

base_model/retriever.py CHANGED Viewed

@@ -22,7 +22,7 @@ class Retriever:
     based on https://huggingface.co/docs/datasets/faiss_es#faiss.
     """
-    def __init__(self, dataset: str = "GroNLP/ik-nlp-22_slp") -> None:
         """Initialize the retriever
         Args:
@@ -49,12 +49,12 @@ class Retriever:
         )
         # Dataset building
-        self.dataset = self.__init_dataset(dataset)
-    def __init_dataset(self,
-                       dataset: str,
-                       fname: str = "./models/paragraphs_embedding.faiss"):
         """Loads the dataset and adds FAISS embeddings.
         Args:
@@ -67,12 +67,12 @@ class Retriever:
             embeddings.
         """
         # Load dataset
-        ds = load_dataset(dataset, name="paragraphs")["train"]
         print(ds)
-        if os.path.exists(fname):
             # If we already have FAISS embeddings, load them from disk
-            ds.load_faiss_index('embeddings', fname)
             return ds
         else:
             # If there are no FAISS embeddings, generate them
@@ -91,7 +91,7 @@ class Retriever:
             # save dataset w/ embeddings
             os.makedirs("./models/", exist_ok=True)
-            ds_with_embeddings.save_faiss_index("embeddings", fname)
             return ds_with_embeddings
@@ -127,7 +127,8 @@ class Retriever:
             float: overall exact match
             float: overall F1-score
         """
-        questions_ds = load_dataset("GroNLP/ik-nlp-22_slp", name="questions")['test']
         questions = questions_ds['question']
         answers = questions_ds['answer']
@@ -140,7 +141,9 @@ class Retriever:
             scores += score[0]
             predictions.append(result['text'][0])
-        exact_matches = [evaluate.compute_exact_match(predictions[i], answers[i]) for i in range(len(answers))]
-        f1_scores = [evaluate.compute_f1(predictions[i], answers[i]) for i in range(len(answers))]
         return sum(exact_matches) / len(exact_matches), sum(f1_scores) / len(f1_scores)

     based on https://huggingface.co/docs/datasets/faiss_es#faiss.
     """
+    def __init__(self, dataset_name: str = "GroNLP/ik-nlp-22_slp") -> None:
         """Initialize the retriever
         Args:
         )
         # Dataset building
+        self.dataset_name = dataset_name
+        self.dataset = self._init_dataset(dataset_name)
+    def _init_dataset(self,
+                      dataset_name: str,
+                      embedding_path: str = "./models/paragraphs_embedding.faiss"):
         """Loads the dataset and adds FAISS embeddings.
         Args:
             embeddings.
         """
         # Load dataset
+        ds = load_dataset(dataset_name, name="paragraphs")["train"]
         print(ds)
+        if os.path.exists(embedding_path):
             # If we already have FAISS embeddings, load them from disk
+            ds.load_faiss_index('embeddings', embedding_path)
             return ds
         else:
             # If there are no FAISS embeddings, generate them
             # save dataset w/ embeddings
             os.makedirs("./models/", exist_ok=True)
+            ds_with_embeddings.save_faiss_index("embeddings", embedding_path)
             return ds_with_embeddings
             float: overall exact match
             float: overall F1-score
         """
+        questions_ds = load_dataset(
+            self.dataset_name, name="questions")['test']
         questions = questions_ds['question']
         answers = questions_ds['answer']
             scores += score[0]
             predictions.append(result['text'][0])
+        exact_matches = [evaluate.compute_exact_match(
+            predictions[i], answers[i]) for i in range(len(answers))]
+        f1_scores = [evaluate.compute_f1(
+            predictions[i], answers[i]) for i in range(len(answers))]
         return sum(exact_matches) / len(exact_matches), sum(f1_scores) / len(f1_scores)

base_model/string_utils.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import re
+import string
+def remove_articles(text):
+    """Replace every standalone "a", "an" or "the" in ``text`` with a space.
+
+    Matching is case-sensitive, so only lowercase articles are replaced.
+    """
+    return re.sub(r"\b(a|an|the)\b", " ", text, flags=re.UNICODE)
+def white_space_fix(text):
+    """Collapse each whitespace run to one space and trim both ends."""
+    words = text.split()
+    return " ".join(words)
+def remove_punc(text):
+    """Drop every character of ``string.punctuation`` from ``text``."""
+    deletion_table = str.maketrans("", "", string.punctuation)
+    return text.translate(deletion_table)
+def lower(text):
+    """Return ``text`` converted to lowercase."""
+    lowered = text.lower()
+    return lowered