import json | |
import logging | |
import unittest | |
from tqdm import tqdm | |
import beir.util | |
from bm25s.utils.corpus import JsonlCorpus | |
from bm25s.utils.beir import BASE_URL | |
class TestTopKSingleQuery(unittest.TestCase):
    """Check JsonlCorpus iteration against a plain line-by-line read of the file."""

    def test_utils_corpus(self):
        """Compare document ids yielded by JsonlCorpus with ids parsed directly.

        The "scifact" dataset is fetched with ``beir.util.download_and_unzip``
        into the ``datasets`` directory, and its ``corpus.jsonl`` is read two
        ways: by iterating a ``JsonlCorpus`` and by parsing each line of the
        file with ``json.loads``. Both must produce the same ``_id`` values in
        the same order.
        """
        save_dir = "datasets"
        dataset = "scifact"
        data_path = beir.util.download_and_unzip(BASE_URL.format(dataset), save_dir)
        corpus_path = f"{data_path}/corpus.jsonl"

        # Ids as seen through the class under test.
        corpus = JsonlCorpus(corpus_path)
        corpus_ids = [doc["_id"] for doc in tqdm(corpus)]

        # Reference ids: parse the file directly, one JSON document per line.
        # The encoding is given explicitly so the reference read does not
        # depend on the platform's locale default (e.g. cp1252 on Windows).
        with open(corpus_path, "r", encoding="utf-8") as f:
            expected_ids = [json.loads(line)["_id"] for line in f]

        self.assertListEqual(corpus_ids, expected_ids)