Ramon Meffert
Add evaluation
492106d
raw
history blame
No virus
1.66 kB
from typing import Dict, List
def context_to_reader_input(result: Dict[str, List[str]]) \
        -> Dict[str, List[str]]:
    """Convert the retriever's column-wise output into the reader's format.

    For every entry the most specific available heading is chosen as the
    title: the subsection if it is set, otherwise the section, otherwise
    the chapter. A heading counts as unset when it equals the string 'nan'.

    Args:
        result (Dict[str, List[str]]): The result from the retriever. The
            keys read here are 'n_chapter', 'subsection', 'section',
            'chapter' and 'text'.

    Returns:
        Dict[str, List[str]]: A dictionary with the keys 'titles', 'texts'
            and 'scores', each holding one item per entry.
    """
    titles, texts, scores = [], [], []

    # The length of an arbitrary column is taken as the number of entries
    # (the columns are expected to all have the same length).
    for idx in range(len(result['n_chapter'])):
        # Walk from the most specific heading level to the least specific
        # one and keep the first that is not the 'nan' placeholder.
        for level in ('subsection', 'section'):
            heading = result[level][idx]
            if heading != 'nan':
                break
        else:
            # Neither subsection nor section is set: use the chapter.
            heading = result['chapter'][idx]

        passage = result['text'][idx]
        titles.append(heading)
        texts.append(passage)
        # NOTE(review): 'scores' receives the passage text, not a numeric
        # score -- this mirrors the existing behaviour; confirm whether the
        # retriever's scores were meant to be stored here instead.
        scores.append(passage)

    return {'titles': titles, 'texts': texts, 'scores': scores}
def remove_formulas(ds):
    """Blank the 'text' field when its average word length is <= 3.5.

    The text is split on whitespace and the mean length of the resulting
    words is computed. If that mean is at most 3.5 characters the text is
    replaced with an empty string. This essentially means that most of the
    formulas are removed.

    Text that contains no words at all (empty or whitespace-only) has no
    average word length and is left unchanged.

    To-do:
        - more-preprocessing
        - a summarization model perhaps

    Args:
        ds: mapping whose 'text' value is a string. NOTE(review): because
            the value is split directly, this is a single record rather
            than a whole dataset -- presumably one example handed over by
            the dataset's `map`; confirm against the caller.

    Returns:
        ds: the same object that was passed in, modified in place when the
            text was blanked.
    """
    words = ds['text'].split()
    if not words:
        # No words means no average; return early instead of dividing by
        # zero below.
        return ds

    average = sum(len(word) for word in words) / len(words)
    if average <= 3.5:
        ds['text'] = ''
    return ds