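"""Article scraping helpers: fetch pages with Playwright so JavaScript-rendered
content is captured, pull the main article text and metadata with trafilatura,
and flatten the result to plain text."""
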
import asyncio
import logging
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright


def get_page_title(url: str) -> str:
    """Fetch a page over HTTP and return the text of its <title> tag."""
    try:
        # timeout prevents the request from hanging indefinitely
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        # get_text() is safe even when the <title> tag has no string child
        return title_tag.get_text().strip() if title_tag else "Untitled"
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"


def get_article_title(article_url_arg: str) -> str:
    return get_page_title(article_url_arg)


def scrape_article(url: str) -> Optional[dict]:
    """Render a page in a headless browser and extract its article content."""

    async def fetch_html(url: str) -> str:
        # Playwright renders the page so JavaScript-generated content is captured
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                           "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
            page = await context.new_page()
            await page.goto(url)
            # Wait until network activity settles so dynamic content has loaded
            await page.wait_for_load_state("networkidle")
            content = await page.content()
            await browser.close()
            return content

    def extract_article_data(html: str) -> Optional[dict]:
        # trafilatura strips boilerplate and returns the main article text
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False,
                                         include_images=False)
        if not downloaded:
            logging.error("Content extraction failed.")
            return None
        metadata = trafilatura.extract_metadata(html)
        if not metadata:
            logging.error("Metadata extraction failed.")
            return None
        return {
            'title': metadata.title if metadata.title else 'N/A',
            'author': metadata.author if metadata.author else 'N/A',
            'content': downloaded,
            'date': metadata.date if metadata.date else 'N/A',
        }

    def convert_html_to_markdown(html: str) -> str:
        """Flatten extracted markup to plain text with paragraph breaks.

        Note: despite the name, this produces plain text, not true Markdown.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Append a newline to each paragraph so breaks survive get_text()
        for para in soup.find_all('p'):
            para.append('\n')
        return soup.get_text(separator='\n\n')

    async def fetch_and_extract_article(url: str) -> Optional[dict]:
        html = await fetch_html(url)
        logging.debug("HTML Content: %s", html[:500])
        article_data = extract_article_data(html)
        if article_data:
            article_data['content'] = convert_html_to_markdown(article_data['content'])
            return article_data
        return None

    return asyncio.run(fetch_and_extract_article(url))
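

# A minimal usage sketch. The URL below is a placeholder, not one from this
# project; swap in a real article URL to try the scraper.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    data = scrape_article("https://example.com/some-article")
    if data:
        print(f"Title: {data['title']}")
        print(f"Author: {data['author']}")
        print(f"Date: {data['date']}")
        print(data['content'][:500])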