Spaces:
Sleeping
Sleeping
import re | |
def drop_non_relevant_text(text_list, max_word_len=14, max_long_ratio=0.5):
    """Drop paragraphs that consist mostly of overly long "words".

    Intended to discard garbage produced when reading PDFs, where text
    comes out as long runs of characters without spaces. The default
    length threshold was chosen by the original author from the
    distribution of word lengths in Spanish.

    Args:
        text_list: Iterable of paragraph strings.
        max_word_len: A space-separated token is counted as "too long"
            when its length is strictly greater than this value.
        max_long_ratio: A paragraph is kept only when the fraction of
            too-long tokens is strictly below this value, so with the
            default a paragraph that is exactly 50% long tokens is dropped.

    Returns:
        A list with the kept paragraphs, each stripped of leading and
        trailing whitespace, in their original order.
    """
    relevant_sentences = []
    for paragraph in text_list:
        # Split on single spaces (not on arbitrary whitespace): consecutive
        # spaces therefore yield empty tokens that count towards the total.
        # str.split(' ') always returns at least one token, so the division
        # below can never be by zero.
        words = paragraph.split(' ')
        long_words = sum(len(word) > max_word_len for word in words)
        if long_words / len(words) < max_long_ratio:
            relevant_sentences.append(paragraph.strip())
    return relevant_sentences
def preprocess_text(text):
    """Clean raw text and split it into sentence-like fragments.

    Steps, in order:
      1. Delete every hyphen and every newline character. Newlines are
         removed, not replaced by a space, so ``"pala-\\nbra"`` becomes
         ``"palabra"`` but ``"hola\\nmundo"`` also becomes ``"holamundo"``.
      2. Collapse runs of spaces into a single space. This runs after the
         deletions so that inputs such as ``"a - b"`` do not end up with a
         double space.
      3. Strip leading and trailing whitespace.
      4. Split on ``'.'`` and discard fragments of length 0 or 1.

    Args:
        text: The raw text to clean.

    Returns:
        A list of fragments between periods. Fragments are not stripped
        individually, so a fragment that followed ``". "`` keeps its
        leading space.
    """
    text = text.replace('-', '').replace('\n', '')
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    return [fragment for fragment in text.split('.') if len(fragment) > 1]