JCRios committed on
Commit
05f6e62
1 Parent(s): 1bd94ad

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +26 -0
utils.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def drop_non_relevant_text(text_list, *, max_word_len=14, max_long_ratio=0.5):
    """Drop paragraphs that consist largely of implausibly long "words".

    Intended to discard garbage produced while reading PDFs. The default
    limits were picked by the original author from the length distribution
    of Spanish words.

    Each paragraph is split on single spaces; a token counts as "long" when
    it has more than ``max_word_len`` characters. A paragraph is kept only
    if the fraction of long tokens is strictly below ``max_long_ratio``, so
    with the defaults a paragraph where exactly half the tokens are long is
    dropped.

    Args:
        text_list: Iterable of paragraph strings.
        max_word_len: Longest token length still considered a normal word.
        max_long_ratio: Fraction of long tokens at or above which the
            paragraph is discarded.

    Returns:
        A list with the kept paragraphs, in their original order, each
        stripped of leading and trailing whitespace.
    """
    relevant_paragraphs = []
    for paragraph in text_list:
        # split(' ') always yields at least one token (even for ''), so the
        # division below can never be by zero. Empty tokens produced by
        # consecutive spaces are counted in the denominator.
        tokens = paragraph.split(' ')
        long_tokens = sum(1 for token in tokens if len(token) > max_word_len)
        if long_tokens / len(tokens) < max_long_ratio:
            relevant_paragraphs.append(paragraph.strip())
    return relevant_paragraphs
19
+
20
def preprocess_text(text):
    """Clean raw text and split it into sentence fragments.

    Steps, in order: trim surrounding whitespace, collapse runs of spaces
    into a single space, delete every hyphen, delete every newline, then
    split on '.'.

    Returns:
        A list of the fragments between periods that are longer than one
        character. Fragments are not stripped, so a fragment that follows
        ". " keeps its leading space.
    """
    cleaned = re.sub(' +', ' ', text.strip())
    # Literal single-character deletions; one pass removes all occurrences.
    for unwanted in ('-', '\n'):
        cleaned = cleaned.replace(unwanted, '')
    return [fragment for fragment in cleaned.split('.') if len(fragment) > 1]