Spaces:
Build error
Build error
File size: 1,856 Bytes
6bf7515 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# NOTE(review): `config` does not appear to be used in the visible part of this file, and
# `distutils` was removed from the standard library in Python 3.12 (PEP 632), so this
# import fails on 3.12+ — confirm it can be dropped.
from distutils.command.config import config
import requests
from time import sleep
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG
import spacy
import os
# The spaCy model loaded below must already be installed; the disabled line shows the
# command that downloads it.
# os.system("python -m spacy download en_core_web_sm")
# Module-level spaCy pipeline, used by line_correction() to split long lines into sentences.
nlp = spacy.load('en_core_web_sm')
import sys
# NOTE(review): this sets a plain Python attribute on DEFAULT_CONFIG. If DEFAULT_CONFIG is a
# configparser.ConfigParser, the 'MAX_FILE_SIZE' option itself is presumably left unchanged
# by this assignment — TODO confirm the intended effect.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
def get_page(url, retries=3, delay=3):
    """Download *url* with trafilatura, retrying on failure.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of fetch attempts (default 3).
        delay: Seconds to wait between two consecutive attempts (default 3).

    Returns:
        Whatever ``trafilatura.fetch_url`` returned on the first successful
        attempt, or ``None`` if every attempt failed.
    """
    for attempt in range(retries):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception as err:
            # Best-effort fetch: report the problem and retry. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            print("Failed to fetch " + url + ": " + repr(err), file=sys.stderr)
            page = None

        # An explicit check instead of ``assert``: asserts are stripped under
        # ``python -O``, which would turn a failed fetch into a "success".
        if page is not None:
            print("Fetched "+url, file=sys.stderr)
            return page

        # No point in waiting once the last attempt has failed.
        if attempt < retries - 1:
            sleep(delay)
    return None
def url2lines(url):
    """Fetch *url* and return its extracted text as a list of lines.

    Returns an empty list when get_page() yields ``None`` for the URL.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)
def line_correction(lines, max_size=100):
    """Normalize line lengths: drop tiny lines and re-chunk overly long ones.

    Args:
        lines: Iterable of text lines.
        max_size: Length above which a line is split into sentence groups.

    Returns:
        A new list where lines shorter than 4 characters are removed, lines of
        at most ``max_size`` characters are kept unchanged, and longer lines
        are replaced by groups of consecutive sentences; a group is emitted as
        soon as it grows beyond ``max_size`` characters.
    """
    corrected = []
    for text in lines:
        length = len(text)
        if length < 4:
            continue
        if length <= max_size:
            corrected.append(text)
            continue

        # Sentence-split with spaCy; for performance only the first 5k
        # characters of the line are analysed.
        parsed = nlp(text[:5000])
        buffer = ""
        for sentence in parsed.sents:
            piece = str(sentence).strip()
            buffer = f"{buffer} {piece}" if buffer else piece
            if len(buffer) > max_size:
                corrected.append(buffer)
                buffer = ""

        # Flush whatever sentences are left over after the last full group.
        if buffer:
            corrected.append(buffer)
    return corrected
def html2lines(page):
    """Extract the main text of an HTML document and split it into lines.

    Args:
        page: HTML content of a page, or ``None``.

    Returns:
        The lines of the text extracted by ``trafilatura.extract``. An empty
        list is returned when *page* is ``None`` or blank, or when extraction
        yields nothing.
    """
    # Test for None first: calling .strip() on None would raise AttributeError.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Reset trafilatura's caches right after each extraction, whether or not
    # it produced any text.
    reset_caches()

    if text is None:
        return []
    # The whole page is returned line by line; callers reformat it later.
    return text.split("\n")