Spaces:
Build error
Build error
import requests | |
from bs4 import BeautifulSoup | |
import re | |
from urllib.parse import urlparse | |
import gradio as gr | |
import json | |
def extract_wikipedia_text(raw_text, language):
    """Group scraped page elements into per-section text blocks.

    Args:
        raw_text: Iterable of parsed elements in document order. Each element
            must expose ``.name`` (its tag name) and ``.text``. An element
            whose name is ``"span"`` is treated as a section headline; every
            other element is treated as body text.
        language: Language code used to build the ``text-<language>`` key of
            each returned dict.

    Returns:
        A list of ``{"text-<language>": <section text>}`` dicts in document
        order. The text of a section is the cleaned text of its body elements
        joined by single spaces. Headline text itself is not included, and
        sections without any non-empty body text are skipped.
    """
    key = f"text-{language}"
    contents = []
    fragments = []

    for element in raw_text:
        if element.name == "span":
            # A headline closes the section collected so far.
            if fragments:
                contents.append({key: " ".join(fragments)})
                fragments = []
            continue

        clean_text = preprocessing(element.text)
        if clean_text:
            fragments.append(clean_text)

    # Flush the text that follows the last headline. Without this, the final
    # section (or the whole page when no headline matched) is silently lost.
    if fragments:
        contents.append({key: " ".join(fragments)})

    return contents
def preprocessing(text):
    """Strip citation markers and newlines from a piece of scraped text.

    Args:
        text: Raw text of a single page element.

    Returns:
        ``text`` with every ``[...]`` group that sits on one line removed,
        surrounding whitespace stripped, and remaining newline characters
        deleted (not replaced by a space, so the text on both sides of a
        newline is joined directly).
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    clean_text = re.sub(r"\[.*?\]", "", text).strip()
    return clean_text.replace("\n", "")
def scrape(url):
    """Download a Wikipedia article and export its text as a JSON file.

    The language code is taken from the first label of the URL's host name
    (e.g. ``en`` for ``en.wikipedia.org``). Headlines and paragraphs are
    selected from the page and grouped by ``extract_wikipedia_text``.

    Args:
        url: Full URL of the article, including the scheme.

    Returns:
        A ``(json_output, filename)`` tuple. ``json_output`` is a dict with
        the keys ``"source"``, ``"title-<language>"`` and ``"pages"``;
        ``filename`` is the name of the JSON file the dict was written to,
        relative to the current working directory.

    Raises:
        gr.Error: If the page cannot be downloaded or has no
            ``h1#firstHeading`` element.
    """
    language = urlparse(url).netloc.split(".")[0]

    try:
        # The timeout keeps a stalled connection from blocking the app forever.
        page = requests.get(
            url, headers={"user-agent": "Mozilla/5.0"}, timeout=30
        )
        page.raise_for_status()
    except requests.RequestException as err:
        # Surface the failure to the user instead of continuing with an
        # undefined ``soup`` as the previous bare ``except`` did.
        raise gr.Error(f"Could not fetch {url!r}: {err}") from err

    soup = BeautifulSoup(page.content, "html.parser")

    heading = soup.find("h1", {"id": "firstHeading"})
    if heading is None:
        raise gr.Error(f"No article title found at {url!r}.")
    title = heading.get_text().strip()

    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)

    json_output = {"source": url, f"title-{language}": title, "pages": contents}

    # Ignore a trailing slash so the file is not named just ".json".
    filename = f"{url.rstrip('/').split('/')[-1]}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_output, f)

    return json_output, filename
# Cap the height of the JSON preview and let it scroll.
style_sheet = "#json-output { max-height: 400px; overflow-y: auto; }"

# Page header passed to gr.Markdown below.
# NOTE(review): the glyph after "Scraper" looks like a mis-encoded emoji; it
# is kept verbatim here -- confirm against the original file.
_HEADER_MARKDOWN = """
<center>
<h1>Wikipedia Scraper π</h1>
</center>
"""

# NOTE(review): the Row/Column nesting below is reconstructed; the captured
# source had lost its indentation -- confirm the intended layout.
with gr.Blocks(css=style_sheet) as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row():
        # URL input next to a column holding the two outputs.
        inp = gr.Textbox(placeholder="Wikipedia URL")
        with gr.Column():
            out = gr.JSON(elem_id="json-output")
            out_download = gr.File()

    btn = gr.Button("Scrape")
    # scrape() returns (dict, filename), matching the two output components.
    btn.click(
        fn=scrape,
        inputs=inp,
        outputs=[out, out_download],
    )

demo.launch(debug=True)