import re
import time
from datetime import datetime, timedelta

import cloudscraper
import pandas as pd
import pymorphy2
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from transformers import pipeline


######
class NewsData:
    def __init__(self) -> None:
        """
        Article parser for ru.investing.com.
        """
        self.urls = [
            ("https://ru.investing.com/news/forex-news/", "forex-news"),
            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/'
        }
        self.proxies = {
            "http": "http://82.146.37.145:80",
            "https": "https://82.146.37.145:80"
        }
        self.morph = pymorphy2.MorphAnalyzer()  # used to reduce words to their base (dictionary) form
        self.scraper = cloudscraper.create_scraper()

    def get_data(self):
        # Scrape articles published within the last 7 days.
        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
        articles_data = []
        for base_url, tag in self.urls:
            page = 1
            publish_date = datetime.now()  # fallback so the date-limit check below is always defined
            while True:
                url = f"{base_url}{page}/"
                # response = requests.get(url, headers=self.headers)
                response = self.scraper.get(url, headers=self.headers, proxies=self.proxies)
                print(response)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
                    if not articles:
                        break
                    daily_count = 0
                    for article in articles:
                        title = article.text.strip()
                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
                        article_url = article["href"]
                        if date_tag:
                            publish_date_str = date_tag["datetime"]
                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
                            articles_data.append([title, publish_date_str, tag, article_url])
                            daily_count += 1
                    page += 1
                    if publish_date < date_limit:
                        print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
                        print(f"Total number of downloaded articles: {len(articles_data)}")
                        break
                else:
                    if response.status_code == 404:
                        break
                    print(f"Error requesting page {page}, status code: {response.status_code}")
                time.sleep(5)
        return self.get_proccesing_news(articles_data)

    def clean_text(self, text):
        # Keep only Cyrillic characters, lowercase, drop short words, then lemmatize.
        text = re.sub(r'[^а-яА-Я\s]', '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if len(word) >= 3])
        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
        return text

    def get_proccesing_news(self, articles_data):
        # Process the collected data and add sentiment analysis.
        sentiment_model = pipeline(
            "sentiment-analysis",
            model="mxlcw/rubert-tiny2-russian-financial-sentiment",
            framework="pt",
        )
        tqdm.pandas()  # register progress_apply on pandas objects
        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", "url"])
        news['title'] = news['title'].ffill().map(self.clean_text)
        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
        news = pd.get_dummies(news, columns=['sentiment'])
        news['DATE'] = pd.to_datetime(news['DATE'])
        news['day'] = news['DATE'].dt.day
        news['year'] = news['DATE'].dt.year
        news['month'] = news['DATE'].dt.month
        news['hour'] = news['DATE'].dt.hour
        return news
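

# Usage sketch (not part of the original module): assuming the class above is used as-is,
# this shows one way to drive the scrape-and-sentiment pipeline end to end. The output
# filename "news_sentiment.csv" is an arbitrary, illustrative choice.
if __name__ == "__main__":
    parser = NewsData()
    news_df = parser.get_data()  # scrapes ~7 days of headlines and returns the processed DataFrame

    # Inspect the engineered columns: lemmatized titles, one-hot sentiment flags, date parts.
    print(news_df.head())
    print(news_df.columns.tolist())

    # Persist the result for downstream feature engineering (hypothetical destination).
    news_df.to_csv("news_sentiment.csv", index=False)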