zhenyundeng committed on
Commit
6bf7515
·
1 Parent(s): 8532c4b
Files changed (1) hide show
  1. html2lines.py +72 -0
html2lines.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE(review): `config` is not referenced anywhere in the visible code, and
# `distutils` was removed from the standard library in Python 3.12, so this
# import makes the module fail to load there. Kept because other parts of the
# project may rely on it -- confirm and drop if unused.
from distutils.command.config import config
import os
import sys
from time import sleep

import requests
import spacy
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG

# The spaCy model has to be installed before this module is imported:
#     python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# NOTE(review): DEFAULT_CONFIG looks like a ConfigParser instance, so this
# attribute assignment probably has no effect on trafilatura's size limit
# (options are normally changed with
# DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "50000")). Left unchanged
# because making the 50000 limit effective would alter which pages get
# downloaded -- TODO confirm the intended behaviour.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
14
+
15
def get_page(url, max_attempts=3, retry_delay=3):
    """Download *url* with trafilatura, retrying when the fetch fails.

    Args:
        url: Address of the page to download.
        max_attempts: Total number of download attempts before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The value returned by ``trafilatura.fetch_url`` on the first
        successful attempt, or ``None`` when every attempt failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort download: any error counts as a failed attempt.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C and
            # SystemExit working while the retry loop is running.
            page = None

        # Explicit check instead of ``assert``, which is stripped under -O.
        if page is not None:
            print("Fetched "+url, file=sys.stderr)
            return page

        # No point in waiting once the last attempt has already failed.
        if attempt < max_attempts:
            sleep(retry_delay)

    return None
26
+
27
def url2lines(url):
    """Fetch *url* and return its extracted text split into lines.

    Returns an empty list when the page could not be downloaded.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)
35
+
36
def line_correction(lines, max_size=100, max_chars=5000):
    """Drop very short lines and re-chunk overly long ones by sentence.

    Args:
        lines: Iterable of text lines.
        max_size: Lines of at most this many characters are kept unchanged.
            Longer lines are split into sentences with spaCy; consecutive
            sentences are joined with single spaces and emitted as one
            output line as soon as the chunk grows past ``max_size``.
        max_chars: For performance, only this many leading characters of a
            long line are passed to spaCy; the rest of that line is dropped.

    Returns:
        A new list of lines. Lines shorter than 4 characters are omitted.
    """
    out_lines = []
    for line in lines:
        if len(line) < 4:
            continue

        if len(line) <= max_size:
            out_lines.append(line)
            continue

        doc = nlp(line[:max_chars])
        chunk = ""
        for sent in doc.sents:
            sentence = str(sent).strip()
            if not sentence:
                # Whitespace-only sentences would only add stray spaces.
                continue
            chunk = chunk + " " + sentence if chunk else sentence
            if len(chunk) > max_size:
                out_lines.append(chunk)
                chunk = ""

        # Flush whatever is left after the last sentence.
        if chunk:
            out_lines.append(chunk)

    return out_lines
59
+
60
def html2lines(page):
    """Extract the text of an HTML document and return it as a list of lines.

    Args:
        page: HTML source of the page, or ``None``.

    Returns:
        The text produced by ``trafilatura.extract`` split on newlines, or an
        empty list when ``page`` is ``None`` or blank, or when nothing could
        be extracted. The lines are returned as-is, so callers are expected
        to reformat them afterwards.
    """
    # Test for None first: calling .strip() on None raises AttributeError.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Clear trafilatura's caches after each extraction.
    reset_caches()

    if text is None:
        return []

    return text.split("\n")