File size: 1,655 Bytes
ab5dfc2
 
 
325e3c6
ab5dfc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492106d
 
ab5dfc2
 
 
 
 
 
 
 
 
 
 
 
 
492106d
ab5dfc2
 
b7158e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325e3c6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from typing import Dict, List


def context_to_reader_input(result: Dict[str, List[str]]) \
        -> Dict[str, List[str]]:
    """Convert the retriever output into the layout expected by the reader.

    For every entry the most specific available heading is used as title:
    the subsection if it is not the string 'nan', otherwise the section if
    that is not 'nan', otherwise the chapter.

    Args:
        result (Dict[str, List[str]]): The result from the retriever. The
            keys 'n_chapter', 'subsection', 'section', 'chapter' and 'text'
            are read; all lists are indexed in parallel.

    Returns:
        Dict[str, List[str]]: A dict with the keys 'titles', 'texts' and
            'scores', each holding one item per retrieved entry.
    """
    titles: List[str] = []
    texts: List[str] = []
    scores: List[str] = []

    # The length of 'n_chapter' is taken as the number of retrieved entries;
    # every other column is assumed to have the same length.
    for idx in range(len(result['n_chapter'])):
        subsection = result['subsection'][idx]
        if subsection != 'nan':
            heading = subsection
        else:
            section = result['section'][idx]
            if section != 'nan':
                heading = section
            else:
                heading = result['chapter'][idx]

        passage = result['text'][idx]
        titles.append(heading)
        texts.append(passage)
        # NOTE(review): 'scores' receives the passage text, not a score.
        # This mirrors the existing behaviour; it looks unintended -- confirm
        # where the retriever scores are supposed to come from.
        scores.append(passage)

    return {
        'titles': titles,
        'texts': texts,
        'scores': scores
    }


def remove_formulas(ds):
    """Blank out the 'text' entry of ds when it looks like a formula.

    The text is split on whitespace and the average word length is computed.
    If that average is <= 3.5 the text is replaced with an empty string. This
    essentially means that most of the formulas are removed.

    Text without any words (empty or whitespace-only) is left untouched
    instead of raising a ZeroDivisionError.

    To-do:
    - more-preprocessing
    - a summarization model perhaps

    Args:
        ds: mapping-like object whose 'text' entry is a single string (it is
            split on whitespace). NOTE(review): presumably one example of the
            HuggingFace dataset that contains the information for the
            retriever -- confirm against the caller.

    Returns:
        ds: the same object, with 'text' replaced by '' when the average
            word length is <= 3.5.
    """
    words = ds['text'].split()

    # No words means there is no average to compute; nothing to remove.
    if not words:
        return ds

    average = sum(len(word) for word in words) / len(words)
    if average <= 3.5:
        ds['text'] = ''
    return ds