"""Streamlit app that scrapes the visible text of a web page and translates it to English."""

import re
from typing import Optional

import requests
import streamlit as st
from bs4 import BeautifulSoup
from googletrans import Translator
from langdetect import detect
from requests.sessions import Session

# Seconds to wait on each HTTP call. requests applies no timeout by default,
# so without this a stalled server would block the app indefinitely.
REQUEST_TIMEOUT = 30

# Tags stripped before any text is collected. "header" is handled separately
# because its text has to be read before the tag is detached.
_NON_CONTENT_TAGS = [
    "script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img",
]


def _fetch_page(session, url, email=None, password=None, login_url=None):
    """Return the response for *url*, logging in on *session* first if possible.

    The login POST is only attempted when email, password and login_url are
    all provided; otherwise the page is requested anonymously.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-2xx status from either request.
    """
    if email and password and login_url:
        login_data = {
            'email': email,
            'password': password,
            # Include other necessary fields as required by the website
        }
        login_response = session.post(login_url, data=login_data, timeout=REQUEST_TIMEOUT)
        login_response.raise_for_status()

    # Always fetch the target page: after a login the session carries the
    # cookies, and the login response itself is not the content we want.
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _extract_text(soup, query_selector=None):
    """Return the whitespace-normalised visible text of *soup*.

    With *query_selector*, only the elements matching that CSS selector
    contribute text. Without it, the text of the first <header> is followed
    by the text of the body's children.
    """
    for tag in soup(_NON_CONTENT_TAGS):
        tag.extract()

    # Read the header text *before* detaching <header> elements; once they
    # are extracted, soup.find("header") can no longer see them.
    header = soup.find("header")
    header_text = header.get_text() if header else ""
    for tag in soup("header"):
        tag.extract()

    if query_selector:
        matches = soup.select(query_selector)
        text_content = " ".join(element.get_text() for element in matches)
    else:
        # Fragments without a <body> would otherwise fail on iterating None.
        root = soup.body if soup.body is not None else soup
        body_text = " ".join(child.get_text() for child in root.children)
        text_content = f"{header_text}\n\n{body_text}"

    return re.sub(r'\s+', ' ', text_content).strip()


def _sentence_to_english(translator, sentence):
    """Return *sentence* translated to English, or unchanged if that fails."""
    try:
        if detect(sentence) == 'en':
            return sentence
        return translator.translate(sentence, dest='en').text
    except Exception:
        # Deliberate best effort: language detection fails on empty or
        # symbol-only fragments and the translation call can fail too.
        # Keeping the original sentence is better than losing the page.
        return sentence


def _translate_to_english(text):
    """Translate every non-English sentence of *text* to English."""
    if not text:
        return text

    translator = Translator()
    sentences = re.split(r'(?<=[.!?]) +', text)
    return ' '.join(_sentence_to_english(translator, sentence) for sentence in sentences)


def scrape_visible_text_from_url(
    url: str,
    query_selector: Optional[str] = None,
    email: Optional[str] = None,
    password: Optional[str] = None,
    login_url: Optional[str] = None,
) -> Optional[str]:
    """Scrape the visible text of *url* and return it in English.

    Args:
        url: Page to scrape.
        query_selector: Optional CSS selector restricting which elements
            contribute text.
        email: Login e-mail, sent as the ``email`` form field.
        password: Login password, sent as the ``password`` form field.
        login_url: URL the login form is POSTed to. A login is attempted
            only when email, password and login_url are all given.

    Returns:
        The cleaned, translated text, or ``None`` if anything went wrong.
        Failures are reported to the user through ``st.error`` rather than
        raised.
    """
    try:
        # The context manager closes the session's connections when done.
        with Session() as session:
            response = _fetch_page(session, url, email, password, login_url)

        soup = BeautifulSoup(response.content, 'html.parser')
        visible_text = _extract_text(soup, query_selector)
        return _translate_to_english(visible_text)
    except Exception as e:
        # Top-level boundary of the scraper: surface the problem in the UI.
        st.error(f"Error occurred while scraping the data: {e}")
        return None


def main():
    """Render the scraper form and show the scraped text on request."""
    st.title("🌐 Web Data Scraper")

    # Strip the free-text inputs so stray whitespace from copy/paste does
    # not turn into an invalid URL or selector. The password is left as is.
    url_input = st.text_input("Enter the URL :", "").strip()
    query_selector = st.text_input("Enter a query selector (optional):", "").strip()
    email = st.text_input("Email (if authentication required):", "").strip()
    password = st.text_input("Password (if authentication required):", "", type="password")
    login_url = st.text_input("Enter the login URL (if authentication required):", "").strip()

    if not st.button("💿 Load Data"):
        return

    if not url_input:
        st.warning("Please enter a valid URL.")
        return

    # Empty form fields are passed on as None, i.e. "not provided".
    data = scrape_visible_text_from_url(
        url=url_input,
        query_selector=query_selector or None,
        email=email or None,
        password=password or None,
        login_url=login_url or None,
    )

    if data:
        st.success("Data text successfully scraped!")
        st.subheader("Scraped Text:")
        st.write(data)
    else:
        st.warning("Failed to load data from the URL.")


if __name__ == "__main__":
    main()