GGroenendaal commited on
Commit
fa8dc75
β€’
2 Parent(s): 7570c1d 51a31d4

Merge branch 'esretriever' into main

Browse files
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ELASTIC_USERNAME=elastic
2
+ ELASTIC_PASSWORD=<password>
3
+
4
+ LOG_LEVEL=INFO
README.md CHANGED
@@ -73,3 +73,6 @@ poetry run python main.py
73
  > shows that MT systems perform worse when they are asked to translate sentences
74
  > that describe people with non-stereotypical gender roles, like "The doctor
75
  > asked the nurse to help her in the > operation".
 
 
 
 
73
  > shows that MT systems perform worse when they are asked to translate sentences
74
  > that describe people with non-stereotypical gender roles, like "The doctor
75
  > asked the nurse to help her in the > operation".
76
+
77
+
78
+ ## Setting up Elasticsearch
base_model/main.py DELETED
@@ -1,20 +0,0 @@
1
- from retriever import Retriever
2
-
3
-
4
- if __name__ == '__main__':
5
- # Initialize retriever
6
- r = Retriever()
7
-
8
- # Retrieve example
9
- scores, result = r.retrieve(
10
- "What is the perplexity of a language model?")
11
-
12
- for i, score in enumerate(scores):
13
- print(f"Result {i+1} (score: {score:.02f}):")
14
- print(result['text'][i])
15
- print() # Newline
16
-
17
- # Compute overall performance
18
- exact_match, f1_score = r.evaluate()
19
- print(f"Exact match: {exact_match:.02f}\n"
20
- f"F1-score: {f1_score:.02f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import cast

from datasets import DatasetDict, load_dataset

from src.evaluation import evaluate
from src.retrievers.fais_retriever import FAISRetriever
from src.utils.log import get_logger

logger = get_logger()


if __name__ == '__main__':
    # NOTE(review): the "paragraphs" config was previously loaded here but
    # never used (FAISRetriever loads it itself), so the redundant download
    # has been dropped.
    dataset_name = "GroNLP/ik-nlp-22_slp"
    questions = cast(DatasetDict, load_dataset(dataset_name, "questions"))

    questions_test = questions["test"]

    logger.info(questions)

    # Initialize retriever
    r = FAISRetriever()

    # Retrieve a single example to sanity-check the retriever
    example_q = "What is the perplexity of a language model?"
    scores, result = r.retrieve(example_q)

    logger.info(
        f"Example q: {example_q} answer: {result['text'][0]}")

    for i, score in enumerate(scores):
        logger.info(f"Result {i+1} (score: {score:.02f}):")
        logger.info(result['text'][i])

    # Compute overall performance on the test split
    exact_match, f1_score = evaluate(
        r, questions_test["question"], questions_test["answer"])
    logger.info(f"Exact match: {exact_match:.02f}\n"
                f"F1-score: {f1_score:.02f}")
poetry.lock CHANGED
@@ -149,6 +149,36 @@ python-versions = ">=2.7, !=3.0.*"
149
  [package.extras]
150
  graph = ["objgraph (>=1.7.2)"]
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  [[package]]
153
  name = "faiss-cpu"
154
  version = "1.7.2"
@@ -291,6 +321,32 @@ python-versions = "*"
291
  [package.dependencies]
292
  dill = ">=0.3.4"
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  [[package]]
295
  name = "numpy"
296
  version = "1.22.3"
@@ -380,6 +436,17 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
380
  [package.dependencies]
381
  six = ">=1.5"
382
 
 
 
 
 
 
 
 
 
 
 
 
383
  [[package]]
384
  name = "pytz"
385
  version = "2021.3"
@@ -480,6 +547,14 @@ category = "dev"
480
  optional = false
481
  python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
482
 
 
 
 
 
 
 
 
 
483
  [[package]]
484
  name = "torch"
485
  version = "1.11.0"
@@ -610,7 +685,7 @@ multidict = ">=4.0"
610
  [metadata]
611
  lock-version = "1.1"
612
  python-versions = "^3.8"
613
- content-hash = "227b922ee14abf36ca75bb238d239d712bed9213d54c567996566d465e465733"
614
 
615
  [metadata.files]
616
  aiohttp = [
@@ -727,6 +802,14 @@ dill = [
727
  {file = "dill-0.3.4-py2.py3-none-any.whl", hash = "sha256:7e40e4a70304fd9ceab3535d36e58791d9c4a776b38ec7f7ec9afc8d3dca4d4f"},
728
  {file = "dill-0.3.4.zip", hash = "sha256:9f9734205146b2b353ab3fec9af0070237b6ddae78452af83d2fca84d739e675"},
729
  ]
 
 
 
 
 
 
 
 
730
  faiss-cpu = [
731
  {file = "faiss-cpu-1.7.2.tar.gz", hash = "sha256:f7ea89de997f55764e3710afaf0a457b2529252f99ee63510d4d9348d5b419dd"},
732
  {file = "faiss_cpu-1.7.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b7461f989d757917a3e6dc81eb171d0b563eb98d23ebaf7fc6684d0093ba267e"},
@@ -918,12 +1001,40 @@ multiprocess = [
918
  {file = "multiprocess-0.70.12.2-py39-none-any.whl", hash = "sha256:6f812a1d3f198b7cacd63983f60e2dc1338bd4450893f90c435067b5a3127e6f"},
919
  {file = "multiprocess-0.70.12.2.zip", hash = "sha256:206bb9b97b73f87fec1ed15a19f8762950256aa84225450abc7150d02855a083"},
920
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
921
  numpy = [
922
  {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"},
923
  {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"},
924
  {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e"},
925
  {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4"},
926
- {file = "numpy-1.22.3-cp310-cp310-win32.whl", hash = "sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430"},
927
  {file = "numpy-1.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4"},
928
  {file = "numpy-1.22.3-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce"},
929
  {file = "numpy-1.22.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe"},
@@ -1015,6 +1126,10 @@ python-dateutil = [
1015
  {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
1016
  {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
1017
  ]
 
 
 
 
1018
  pytz = [
1019
  {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"},
1020
  {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
@@ -1189,6 +1304,10 @@ toml = [
1189
  {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
1190
  {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
1191
  ]
 
 
 
 
1192
  torch = [
1193
  {file = "torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:62052b50fffc29ca7afc0c04ef8206b6f1ca9d10629cb543077e12967e8d0398"},
1194
  {file = "torch-1.11.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:866bfba29ac98dec35d893d8e17eaec149d0ac7a53be7baae5c98069897db667"},
 
149
  [package.extras]
150
  graph = ["objgraph (>=1.7.2)"]
151
 
152
+ [[package]]
153
+ name = "elastic-transport"
154
+ version = "8.1.0"
155
+ description = "Transport classes and utilities shared among Python Elastic client libraries"
156
+ category = "main"
157
+ optional = false
158
+ python-versions = ">=3.6"
159
+
160
+ [package.dependencies]
161
+ certifi = "*"
162
+ urllib3 = ">=1.26.2,<2"
163
+
164
+ [package.extras]
165
+ develop = ["pytest", "pytest-cov", "pytest-mock", "pytest-asyncio", "mock", "requests", "aiohttp"]
166
+
167
+ [[package]]
168
+ name = "elasticsearch"
169
+ version = "8.1.0"
170
+ description = "Python client for Elasticsearch"
171
+ category = "main"
172
+ optional = false
173
+ python-versions = ">=3.6, <4"
174
+
175
+ [package.dependencies]
176
+ elastic-transport = ">=8,<9"
177
+
178
+ [package.extras]
179
+ async = ["aiohttp (>=3,<4)"]
180
+ requests = ["requests (>=2.4.0,<3.0.0)"]
181
+
182
  [[package]]
183
  name = "faiss-cpu"
184
  version = "1.7.2"
 
321
  [package.dependencies]
322
  dill = ">=0.3.4"
323
 
324
+ [[package]]
325
+ name = "mypy"
326
+ version = "0.941"
327
+ description = "Optional static typing for Python"
328
+ category = "dev"
329
+ optional = false
330
+ python-versions = ">=3.6"
331
+
332
+ [package.dependencies]
333
+ mypy-extensions = ">=0.4.3"
334
+ tomli = ">=1.1.0"
335
+ typing-extensions = ">=3.10"
336
+
337
+ [package.extras]
338
+ dmypy = ["psutil (>=4.0)"]
339
+ python2 = ["typed-ast (>=1.4.0,<2)"]
340
+ reports = ["lxml"]
341
+
342
+ [[package]]
343
+ name = "mypy-extensions"
344
+ version = "0.4.3"
345
+ description = "Experimental type system extensions for programs checked with the mypy typechecker."
346
+ category = "dev"
347
+ optional = false
348
+ python-versions = "*"
349
+
350
  [[package]]
351
  name = "numpy"
352
  version = "1.22.3"
 
436
  [package.dependencies]
437
  six = ">=1.5"
438
 
439
+ [[package]]
440
+ name = "python-dotenv"
441
+ version = "0.19.2"
442
+ description = "Read key-value pairs from a .env file and set them as environment variables"
443
+ category = "main"
444
+ optional = false
445
+ python-versions = ">=3.5"
446
+
447
+ [package.extras]
448
+ cli = ["click (>=5.0)"]
449
+
450
  [[package]]
451
  name = "pytz"
452
  version = "2021.3"
 
547
  optional = false
548
  python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
549
 
550
+ [[package]]
551
+ name = "tomli"
552
+ version = "2.0.1"
553
+ description = "A lil' TOML parser"
554
+ category = "dev"
555
+ optional = false
556
+ python-versions = ">=3.7"
557
+
558
  [[package]]
559
  name = "torch"
560
  version = "1.11.0"
 
685
  [metadata]
686
  lock-version = "1.1"
687
  python-versions = "^3.8"
688
+ content-hash = "7fadbb5aabac268ecd27c257e2c8f651d26896e78c9cc0ea7e61a8b6ec61c84c"
689
 
690
  [metadata.files]
691
  aiohttp = [
 
802
  {file = "dill-0.3.4-py2.py3-none-any.whl", hash = "sha256:7e40e4a70304fd9ceab3535d36e58791d9c4a776b38ec7f7ec9afc8d3dca4d4f"},
803
  {file = "dill-0.3.4.zip", hash = "sha256:9f9734205146b2b353ab3fec9af0070237b6ddae78452af83d2fca84d739e675"},
804
  ]
805
+ elastic-transport = [
806
+ {file = "elastic-transport-8.1.0.tar.gz", hash = "sha256:769ee4c7b28d270cdbce71359973b88129ac312b13be95b4f7479e35c49d9455"},
807
+ {file = "elastic_transport-8.1.0-py3-none-any.whl", hash = "sha256:0bb2ae3d13348e9e4587ca1f17cd813a528a7cc07f879505f56d69c81823b660"},
808
+ ]
809
+ elasticsearch = [
810
+ {file = "elasticsearch-8.1.0-py3-none-any.whl", hash = "sha256:11e36565dfdf649b7911c2d3cb1f15b99267acfb7f82e94e7613c0323a9936e9"},
811
+ {file = "elasticsearch-8.1.0.tar.gz", hash = "sha256:648d1c707a632279535356d2762cbc63ae728c4633211fe160f43f87a3e1cdcd"},
812
+ ]
813
  faiss-cpu = [
814
  {file = "faiss-cpu-1.7.2.tar.gz", hash = "sha256:f7ea89de997f55764e3710afaf0a457b2529252f99ee63510d4d9348d5b419dd"},
815
  {file = "faiss_cpu-1.7.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b7461f989d757917a3e6dc81eb171d0b563eb98d23ebaf7fc6684d0093ba267e"},
 
1001
  {file = "multiprocess-0.70.12.2-py39-none-any.whl", hash = "sha256:6f812a1d3f198b7cacd63983f60e2dc1338bd4450893f90c435067b5a3127e6f"},
1002
  {file = "multiprocess-0.70.12.2.zip", hash = "sha256:206bb9b97b73f87fec1ed15a19f8762950256aa84225450abc7150d02855a083"},
1003
  ]
1004
+ mypy = [
1005
+ {file = "mypy-0.941-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:98f61aad0bb54f797b17da5b82f419e6ce214de0aa7e92211ebee9e40eb04276"},
1006
+ {file = "mypy-0.941-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6a8e1f63357851444940351e98fb3252956a15f2cabe3d698316d7a2d1f1f896"},
1007
+ {file = "mypy-0.941-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b30d29251dff4c59b2e5a1fa1bab91ff3e117b4658cb90f76d97702b7a2ae699"},
1008
+ {file = "mypy-0.941-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8eaf55fdf99242a1c8c792247c455565447353914023878beadb79600aac4a2a"},
1009
+ {file = "mypy-0.941-cp310-cp310-win_amd64.whl", hash = "sha256:080097eee5393fd740f32c63f9343580aaa0fb1cda0128fd859dfcf081321c3d"},
1010
+ {file = "mypy-0.941-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f79137d012ff3227866222049af534f25354c07a0d6b9a171dba9f1d6a1fdef4"},
1011
+ {file = "mypy-0.941-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8e5974583a77d630a5868eee18f85ac3093caf76e018c510aeb802b9973304ce"},
1012
+ {file = "mypy-0.941-cp36-cp36m-win_amd64.whl", hash = "sha256:0dd441fbacf48e19dc0c5c42fafa72b8e1a0ba0a39309c1af9c84b9397d9b15a"},
1013
+ {file = "mypy-0.941-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0d3bcbe146247997e03bf030122000998b076b3ac6925b0b6563f46d1ce39b50"},
1014
+ {file = "mypy-0.941-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3bada0cf7b6965627954b3a128903a87cac79a79ccd83b6104912e723ef16c7b"},
1015
+ {file = "mypy-0.941-cp37-cp37m-win_amd64.whl", hash = "sha256:eea10982b798ff0ccc3b9e7e42628f932f552c5845066970e67cd6858655d52c"},
1016
+ {file = "mypy-0.941-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:108f3c7e14a038cf097d2444fa0155462362c6316e3ecb2d70f6dd99cd36084d"},
1017
+ {file = "mypy-0.941-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d61b73c01fc1de799226963f2639af831307fe1556b04b7c25e2b6c267a3bc76"},
1018
+ {file = "mypy-0.941-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:42c216a33d2bdba08098acaf5bae65b0c8196afeb535ef4b870919a788a27259"},
1019
+ {file = "mypy-0.941-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fc5ecff5a3bbfbe20091b1cad82815507f5ae9c380a3a9bf40f740c70ce30a9b"},
1020
+ {file = "mypy-0.941-cp38-cp38-win_amd64.whl", hash = "sha256:bf446223b2e0e4f0a4792938e8d885e8a896834aded5f51be5c3c69566495540"},
1021
+ {file = "mypy-0.941-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:745071762f32f65e77de6df699366d707fad6c132a660d1342077cbf671ef589"},
1022
+ {file = "mypy-0.941-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:465a6ce9ca6268cadfbc27a2a94ddf0412568a6b27640ced229270be4f5d394d"},
1023
+ {file = "mypy-0.941-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d051ce0946521eba48e19b25f27f98e5ce4dbc91fff296de76240c46b4464df0"},
1024
+ {file = "mypy-0.941-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:818cfc51c25a5dbfd0705f3ac1919fff6971eb0c02e6f1a1f6a017a42405a7c0"},
1025
+ {file = "mypy-0.941-cp39-cp39-win_amd64.whl", hash = "sha256:b2ce2788df0c066c2ff4ba7190fa84f18937527c477247e926abeb9b1168b8cc"},
1026
+ {file = "mypy-0.941-py3-none-any.whl", hash = "sha256:3cf77f138efb31727ee7197bc824c9d6d7039204ed96756cc0f9ca7d8e8fc2a4"},
1027
+ {file = "mypy-0.941.tar.gz", hash = "sha256:cbcc691d8b507d54cb2b8521f0a2a3d4daa477f62fe77f0abba41e5febb377b7"},
1028
+ ]
1029
+ mypy-extensions = [
1030
+ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
1031
+ {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
1032
+ ]
1033
  numpy = [
1034
  {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"},
1035
  {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"},
1036
  {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e"},
1037
  {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4"},
 
1038
  {file = "numpy-1.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4"},
1039
  {file = "numpy-1.22.3-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce"},
1040
  {file = "numpy-1.22.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe"},
 
1126
  {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
1127
  {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
1128
  ]
1129
+ python-dotenv = [
1130
+ {file = "python-dotenv-0.19.2.tar.gz", hash = "sha256:a5de49a31e953b45ff2d2fd434bbc2670e8db5273606c1e737cc6b93eff3655f"},
1131
+ {file = "python_dotenv-0.19.2-py2.py3-none-any.whl", hash = "sha256:32b2bdc1873fd3a3c346da1c6db83d0053c3c62f28f1f38516070c4c8971b1d3"},
1132
+ ]
1133
  pytz = [
1134
  {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"},
1135
  {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
 
1304
  {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
1305
  {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
1306
  ]
1307
+ tomli = [
1308
+ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
1309
+ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
1310
+ ]
1311
  torch = [
1312
  {file = "torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:62052b50fffc29ca7afc0c04ef8206b6f1ca9d10629cb543077e12967e8d0398"},
1313
  {file = "torch-1.11.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:866bfba29ac98dec35d893d8e17eaec149d0ac7a53be7baae5c98069897db667"},
pyproject.toml CHANGED
@@ -11,10 +11,28 @@ transformers = "^4.17.0"
11
  torch = "^1.11.0"
12
  datasets = "^1.18.4"
13
  faiss-cpu = "^1.7.2"
 
 
14
 
15
  [tool.poetry.dev-dependencies]
16
  flake8 = "^4.0.1"
17
  autopep8 = "^1.6.0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  [build-system]
20
  requires = ["poetry-core>=1.0.0"]
 
11
  torch = "^1.11.0"
12
  datasets = "^1.18.4"
13
  faiss-cpu = "^1.7.2"
14
+ python-dotenv = "^0.19.2"
15
+ elasticsearch = "^8.1.0"
16
 
17
  [tool.poetry.dev-dependencies]
18
  flake8 = "^4.0.1"
19
  autopep8 = "^1.6.0"
20
+ mypy = "^0.941"
21
+
22
+ [tool.mypy]
23
+ no_implicit_optional=true
24
+
25
+ [[tool.mypy.overrides]]
26
+ module = [
27
+ "transformers",
28
+ "datasets",
29
+ ]
30
+ ignore_missing_imports = true
31
+
32
+
33
+ [tool.isort]
34
+ profile = "black"
35
+
36
 
37
  [build-system]
38
  requires = ["poetry-core>=1.0.0"]
base_model/evaluate.py β†’ src/evaluation.py RENAMED
@@ -1,15 +1,17 @@
1
- from typing import Callable, List
 
2
 
3
- from base_model.string_utils import lower, remove_articles, remove_punc, white_space_fix
 
4
 
5
 
6
- def normalize_text(inp: str, preprocessing_functions: List[Callable[[str], str]]):
7
  for fun in preprocessing_functions:
8
  inp = fun(inp)
9
  return inp
10
 
11
 
12
- def normalize_text_default(inp: str) -> str:
13
  """Preprocesses the sentence string by normalizing.
14
 
15
  Args:
@@ -21,10 +23,10 @@ def normalize_text_default(inp: str) -> str:
21
 
22
  steps = [remove_articles, white_space_fix, remove_punc, lower]
23
 
24
- return normalize_text(inp, steps)
25
 
26
 
27
- def compute_exact_match(prediction: str, answer: str) -> int:
28
  """Computes exact match for sentences.
29
 
30
  Args:
@@ -34,10 +36,10 @@ def compute_exact_match(prediction: str, answer: str) -> int:
34
  Returns:
35
  int: 1 for exact match, 0 for not
36
  """
37
- return int(normalize_text_default(prediction) == normalize_text_default(answer))
38
 
39
 
40
- def compute_f1(prediction: str, answer: str) -> float:
41
  """Computes F1-score on token overlap for sentences.
42
 
43
  Args:
@@ -47,8 +49,8 @@ def compute_f1(prediction: str, answer: str) -> float:
47
  Returns:
48
  boolean: the f1 score
49
  """
50
- pred_tokens = normalize_text_default(prediction).split()
51
- answer_tokens = normalize_text_default(answer).split()
52
 
53
  if len(pred_tokens) == 0 or len(answer_tokens) == 0:
54
  return int(pred_tokens == answer_tokens)
@@ -62,3 +64,29 @@ def compute_f1(prediction: str, answer: str) -> float:
62
  rec = len(common_tokens) / len(answer_tokens)
63
 
64
  return 2 * (prec * rec) / (prec + rec)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Callable, List
2
+ from src.retrievers.base_retriever import Retriever
3
 
4
+ from src.utils.string_utils import (lower, remove_articles, remove_punc,
5
+ white_space_fix)
6
 
7
 
8
+ def _normalize_text(inp: str, preprocessing_functions: List[Callable[[str], str]]):
9
  for fun in preprocessing_functions:
10
  inp = fun(inp)
11
  return inp
12
 
13
 
14
+ def _normalize_text_default(inp: str) -> str:
15
  """Preprocesses the sentence string by normalizing.
16
 
17
  Args:
 
23
 
24
  steps = [remove_articles, white_space_fix, remove_punc, lower]
25
 
26
+ return _normalize_text(inp, steps)
27
 
28
 
29
+ def exact_match(prediction: str, answer: str) -> int:
30
  """Computes exact match for sentences.
31
 
32
  Args:
 
36
  Returns:
37
  int: 1 for exact match, 0 for not
38
  """
39
+ return int(_normalize_text_default(prediction) == _normalize_text_default(answer))
40
 
41
 
42
+ def f1(prediction: str, answer: str) -> float:
43
  """Computes F1-score on token overlap for sentences.
44
 
45
  Args:
 
49
  Returns:
50
  boolean: the f1 score
51
  """
52
+ pred_tokens = _normalize_text_default(prediction).split()
53
+ answer_tokens = _normalize_text_default(answer).split()
54
 
55
  if len(pred_tokens) == 0 or len(answer_tokens) == 0:
56
  return int(pred_tokens == answer_tokens)
 
64
  rec = len(common_tokens) / len(answer_tokens)
65
 
66
  return 2 * (prec * rec) / (prec + rec)
67
+
68
+
69
def evaluate(retriever: Retriever, questions: Any, answers: Any):
    """Evaluates the entire model by computing F1-score and exact match on the
    entire dataset.

    Args:
        retriever (Retriever): retriever used to produce one prediction per question
        questions (Any): iterable of question strings
        answers (Any): sequence of gold answer strings, parallel to questions

    Returns:
        float: overall exact match
        float: overall F1-score
    """
    predictions = []

    # Currently just takes the first retrieved document as the prediction
    # and does not look at the retrieval scores yet.
    for question in questions:
        _score, result = retriever.retrieve(question, 1)
        predictions.append(result['text'][0])

    # Guard against an empty evaluation set (would otherwise divide by zero).
    if not answers:
        return 0.0, 0.0

    exact_matches = [exact_match(pred, ans)
                     for pred, ans in zip(predictions, answers)]
    f1_scores = [f1(pred, ans)
                 for pred, ans in zip(predictions, answers)]

    return sum(exact_matches) / len(exact_matches), sum(f1_scores) / len(f1_scores)
{base_model β†’ src}/reader.py RENAMED
File without changes
src/retrievers/base_retriever.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
class Retriever:
    """Common interface for document retrievers.

    Concrete retrievers (e.g. the FAISS- or Elasticsearch-backed ones)
    override retrieve() and return the scores and documents for the
    top-k matches of a query.
    """

    def retrieve(self, query: str, k: int):
        """Retrieve the k most relevant documents for a query.

        Args:
            query (str): the question to search for
            k (int): the number of documents to return

        Raises:
            NotImplementedError: subclasses must override this method;
                the base class deliberately fails loudly instead of
                silently returning None.
        """
        raise NotImplementedError
src/retrievers/es_retriever.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Fix: the base class Retriever was referenced but never imported,
# which made importing this module raise a NameError.
from src.retrievers.base_retriever import Retriever
from src.utils.log import get_logger

logger = get_logger()


class ESRetriever(Retriever):
    """Retriever backed by an Elasticsearch index (stub, not implemented yet)."""

    def __init__(self, data_set):
        # TODO: connect to Elasticsearch (credentials come from the .env
        # file, see .env.example) and index data_set.
        pass

    def retrieve(self, query: str, k: int):
        # TODO: query the Elasticsearch index for the top-k documents.
        pass
base_model/retriever.py β†’ src/retrievers/fais_retriever.py RENAMED
@@ -1,23 +1,27 @@
 
 
 
 
 
1
  from transformers import (
2
  DPRContextEncoder,
3
  DPRContextEncoderTokenizer,
4
  DPRQuestionEncoder,
5
  DPRQuestionEncoderTokenizer,
6
  )
7
- from datasets import load_dataset
8
- import torch
9
- import os.path
10
 
11
- import evaluate
 
12
 
 
13
  # Hacky fix for FAISS error on macOS
14
  # See https://stackoverflow.com/a/63374568/4545692
15
- import os
16
 
17
- os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
 
18
 
19
 
20
- class Retriever:
21
  """A class used to retrieve relevant documents based on some query.
22
  based on https://huggingface.co/docs/datasets/faiss_es#faiss.
23
  """
@@ -67,12 +71,13 @@ class Retriever:
67
  embeddings.
68
  """
69
  # Load dataset
70
- ds = load_dataset(dataset_name, name="paragraphs")["train"]
71
- print(ds)
 
72
 
73
  if os.path.exists(embedding_path):
74
  # If we already have FAISS embeddings, load them from disk
75
- ds.load_faiss_index('embeddings', embedding_path)
76
  return ds
77
  else:
78
  # If there are no FAISS embeddings, generate them
@@ -85,7 +90,7 @@ class Retriever:
85
  return {"embeddings": enc}
86
 
87
  # Add FAISS embeddings
88
- ds_with_embeddings = ds.map(embed)
89
 
90
  ds_with_embeddings.add_faiss_index(column="embeddings")
91
 
@@ -118,32 +123,3 @@ class Retriever:
118
  )
119
 
120
  return scores, results
121
-
122
- def evaluate(self):
123
- """Evaluates the entire model by computing F1-score and exact match on the
124
- entire dataset.
125
-
126
- Returns:
127
- float: overall exact match
128
- float: overall F1-score
129
- """
130
- questions_ds = load_dataset(
131
- self.dataset_name, name="questions")['test']
132
- questions = questions_ds['question']
133
- answers = questions_ds['answer']
134
-
135
- predictions = []
136
- scores = 0
137
-
138
- # Currently just takes the first answer and does not look at scores yet
139
- for question in questions:
140
- score, result = self.retrieve(question, 1)
141
- scores += score[0]
142
- predictions.append(result['text'][0])
143
-
144
- exact_matches = [evaluate.compute_exact_match(
145
- predictions[i], answers[i]) for i in range(len(answers))]
146
- f1_scores = [evaluate.compute_f1(
147
- predictions[i], answers[i]) for i in range(len(answers))]
148
-
149
- return sum(exact_matches) / len(exact_matches), sum(f1_scores) / len(f1_scores)
 
1
+ import os
2
+ import os.path
3
+
4
+ import torch
5
+ from datasets import load_dataset
6
  from transformers import (
7
  DPRContextEncoder,
8
  DPRContextEncoderTokenizer,
9
  DPRQuestionEncoder,
10
  DPRQuestionEncoderTokenizer,
11
  )
 
 
 
12
 
13
+ from src.retrievers.base_retriever import Retriever
14
+ from src.utils.log import get_logger
15
 
16
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
17
  # Hacky fix for FAISS error on macOS
18
  # See https://stackoverflow.com/a/63374568/4545692
 
19
 
20
+
21
+ logger = get_logger()
22
 
23
 
24
+ class FAISRetriever(Retriever):
25
  """A class used to retrieve relevant documents based on some query.
26
  based on https://huggingface.co/docs/datasets/faiss_es#faiss.
27
  """
 
71
  embeddings.
72
  """
73
  # Load dataset
74
+ ds = load_dataset(dataset_name, name="paragraphs")[
75
+ "train"] # type: ignore
76
+ logger.info(ds)
77
 
78
  if os.path.exists(embedding_path):
79
  # If we already have FAISS embeddings, load them from disk
80
+ ds.load_faiss_index('embeddings', embedding_path) # type: ignore
81
  return ds
82
  else:
83
  # If there are no FAISS embeddings, generate them
 
90
  return {"embeddings": enc}
91
 
92
  # Add FAISS embeddings
93
+ ds_with_embeddings = ds.map(embed) # type: ignore
94
 
95
  ds_with_embeddings.add_faiss_index(column="embeddings")
96
 
 
123
  )
124
 
125
  return scores, results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils/log.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+
9
+ def get_logger():
10
+ # creates a default logger for the project
11
+ logger = logging.getLogger("Flashcards")
12
+
13
+ log_level = os.getenv("LOG_LEVEL", "INFO")
14
+ logger.setLevel(log_level)
15
+
16
+ # Log format
17
+ formatter = logging.Formatter(
18
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
19
+
20
+ # file handler
21
+ fh = logging.FileHandler("logs.log")
22
+ fh.setFormatter(formatter)
23
+
24
+ # stout
25
+ ch = logging.StreamHandler()
26
+ ch.setFormatter(formatter)
27
+
28
+ logger.addHandler(fh)
29
+ logger.addHandler(ch)
30
+
31
+ return logger
{base_model β†’ src/utils}/string_utils.py RENAMED
File without changes