Open_NotebookLM_TLDW / App_Function_Libraries /Article_Extractor_Lib.py
oceansweep's picture
?
ed28876
raw
history blame
4.12 kB
# Article_Extractor_Lib.py
#########################################
# Article Extraction Library
# This library is used to handle scraping and extraction of articles from web pages.
# Currently uses a combination of beautifulsoup4 and trafilatura to extract article text.
# Firecrawl would be a better option for this, but it is not yet implemented.
####
#
####################
# Function List
#
# 1. get_page_title(url)
# 2. get_artice_title(article_url_arg)
# 3. scrape_article(url)
#
####################
#
# Import necessary libraries
import logging
# 3rd-Party Imports
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import trafilatura
# Import Local
#
#######################################################################################################################
# Function Definitions
#
def get_page_title(url: str) -> str:
    """Download *url* and return the text of its HTML ``<title>`` element.

    Args:
        url: Address of the page to fetch.

    Returns:
        The title text with surrounding whitespace removed, or ``"Untitled"``
        when the request fails, the page has no ``<title>`` tag, or the tag
        contains no text.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        # A timeout surfaces as requests.Timeout, a RequestException subclass,
        # so it is handled by the except clause below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"

    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return "Untitled"

    # title_tag.string is None when the tag is empty or has nested markup,
    # which made the previous `.string.strip()` raise AttributeError.
    # get_text() always returns a str.
    title = title_tag.get_text().strip()
    return title or "Untitled"
def get_artice_title(article_url_arg: str) -> str:
    """Return the title of the page at *article_url_arg*.

    Thin wrapper around ``get_page_title``; the misspelled name is kept so
    existing callers keep working.

    Args:
        article_url_arg: Address of the article page.

    Returns:
        Whatever ``get_page_title`` returns for the given URL.
    """
    # Use beautifulsoup to get the page title - Really should be using ytdlp for this....
    article_title = get_page_title(article_url_arg)
    # The original computed the title but never returned it, so callers
    # always received None.
    return article_title


# Correctly spelled alias, matching the name advertised in the file header.
get_article_title = get_artice_title
def scrape_article(url):
    """Load *url* in headless Chromium and extract the article it contains.

    The page is rendered with Playwright, the resulting HTML is handed to
    trafilatura for content and metadata extraction, and the extracted
    content is then flattened to paragraph-separated text.

    Args:
        url: Address of the article page to scrape.

    Returns:
        A dict with the keys ``'title'``, ``'author'``, ``'content'`` and
        ``'date'`` (missing metadata fields are reported as ``'N/A'``), or
        ``None`` when trafilatura yields no content or no metadata.

    Note:
        The work is driven with ``asyncio.run``, which creates its own event
        loop; per the asyncio documentation it cannot be used from a thread
        that already has a running loop.
    """
    # Imported locally so the corrected annotations below do not depend on
    # a module-level import.
    from typing import Optional

    async def fetch_html(url: str) -> str:
        """Return the fully rendered HTML of *url*."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
                page = await context.new_page()
                await page.goto(url)
                await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
                return await page.content()
            finally:
                # Close the browser even when navigation or rendering fails;
                # previously an exception skipped this call.
                await browser.close()

    def extract_article_data(html: str) -> Optional[dict]:
        """Extract text and metadata from *html*; return None on failure."""
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        if not downloaded:
            print("Content extraction failed.")
            return None

        metadata = trafilatura.extract_metadata(html)
        if not metadata:
            print("Metadata extraction failed.")
            return None

        return {
            'title': metadata.title if metadata.title else 'N/A',
            'author': metadata.author if metadata.author else 'N/A',
            'content': downloaded,
            'date': metadata.date if metadata.date else 'N/A',
        }

    def convert_html_to_markdown(html: str) -> str:
        """Flatten *html* to text with blank lines between text nodes.

        NOTE(review): despite the name, no markdown syntax is produced; the
        markup is only reduced to text via ``get_text``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Convert each paragraph to markdown
        for para in soup.find_all('p'):
            para.append('\n')  # Add a newline at the end of each paragraph for markdown separation
        # Use .get_text() with separator to keep paragraph separation
        return soup.get_text(separator='\n\n')

    async def fetch_and_extract_article(url: str) -> Optional[dict]:
        """Fetch *url*, extract its article data and post-process the content."""
        html = await fetch_html(url)
        print("HTML Content:", html[:500])  # Print first 500 characters of the HTML for inspection
        article_data = extract_article_data(html)
        if not article_data:
            return None
        article_data['content'] = convert_html_to_markdown(article_data['content'])
        return article_data

    # Using asyncio.run to handle event loop creation and execution
    return asyncio.run(fetch_and_extract_article(url))
#
#
#######################################################################################################################