import numpy as np
import torch
from scipy.stats import zscore
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class SentimentAnalyzer:
    """Aggregate sentiment probabilities over a collection of news items.

    Two pretrained sequence classifiers and their tokenizers are loaded at
    construction time; ``analyze`` currently scores text with the
    ``'financial_sentiment'`` pair only.
    """

    # An item is dropped as an outlier when any of its class probabilities
    # lies at least this many standard deviations from the mean over items.
    OUTLIER_Z_THRESHOLD = 2.0

    def __init__(self):
        """Load the pretrained models and tokenizers (may download weights)."""
        self.models = {
            'finbert': AutoModelForSequenceClassification.from_pretrained(
                "yiyanghkust/finbert-tone"
            ),
            'financial_sentiment': AutoModelForSequenceClassification.from_pretrained(
                "ahmedrachid/FinancialBERT-Sentiment-Analysis"
            ),
        }
        self.tokenizers = {
            'finbert': AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone"),
            'financial_sentiment': AutoTokenizer.from_pretrained(
                "ahmedrachid/FinancialBERT-Sentiment-Analysis"
            ),
        }
        self.max_length = 512  # Model limit

    def chunk_text(self, text, tokenizer):
        """Split ``text`` into lists of token ids that fit the model window.

        Special tokens are left out of the ids and room for them is reserved
        in every chunk, so that adding them back when the chunk is
        re-tokenized does not push it past ``self.max_length`` and silently
        truncate real content.

        Args:
            text: The string to tokenize.
            tokenizer: A tokenizer exposing ``encode`` and
                ``num_special_tokens_to_add``.

        Returns:
            A list of token-id lists; empty when ``text`` yields no tokens.
        """
        tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
        reserved = tokenizer.num_special_tokens_to_add(pair=False)
        chunk_size = max(1, self.max_length - reserved)
        return [
            tokens[start:start + chunk_size]
            for start in range(0, len(tokens), chunk_size)
        ]

    def preprocess_text(self, item):
        """Join an item's title and content into one string.

        Missing or ``None`` fields count as empty, so they never reach the
        model as the literal text ``"None"``.

        Args:
            item: A mapping that may contain ``'title'`` and ``'content'``.

        Returns:
            The combined, stripped text, or ``None`` when both are empty.
        """
        parts = []
        for key in ('title', 'content'):
            value = item.get(key)
            parts.append('' if value is None else str(value).strip())
        text = f"{parts[0]} {parts[1]}".strip()
        return text if text else None

    def analyze(self, news):
        """Return the average sentiment distribution over ``news``.

        Args:
            news: An iterable of dicts with optional ``'title'`` and
                ``'content'`` entries. Non-dict entries and items without
                text are skipped.

        Returns:
            A dict with float values under ``'negative'``, ``'neutral'`` and
            ``'positive'``. A fixed 0.33/0.33/0.33 result is returned when
            there is nothing to score.
        """
        if not news:
            return self._neutral_result()

        tokenizer = self.tokenizers['financial_sentiment']
        model = self.models['financial_sentiment']

        sentiment_scores = []
        for item in news:
            if not isinstance(item, dict):
                continue
            text = self.preprocess_text(item)
            if not text:
                continue
            item_score = self._score_text(text, tokenizer, model)
            if item_score is not None:
                sentiment_scores.append(item_score)

        if not sentiment_scores:
            return self._neutral_result()

        # Outlier filter
        avg_sentiment = self._filter_outliers(sentiment_scores).mean(axis=0)

        # NOTE(review): assumes the model's output order is
        # (negative, neutral, positive) - confirm against model.config.id2label.
        return {
            'negative': float(avg_sentiment[0]),
            'neutral': float(avg_sentiment[1]),
            'positive': float(avg_sentiment[2]),
        }

    @staticmethod
    def _neutral_result():
        """Return a fresh copy of the fallback result used for empty input."""
        return {'negative': 0.33, 'neutral': 0.33, 'positive': 0.33}

    def _score_text(self, text, tokenizer, model):
        """Return the mean class probabilities over the chunks of ``text``.

        Returns ``None`` when the text produces no chunks.
        """
        chunk_scores = []
        # Inference only: skip building autograd graphs.
        with torch.no_grad():
            for chunk in self.chunk_text(text, tokenizer):
                chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
                # truncation stays on as a safety net in case the
                # decode/re-encode round trip yields a few more tokens.
                inputs = tokenizer(
                    chunk_str,
                    return_tensors="pt",
                    truncation=True,
                    max_length=self.max_length,
                )
                logits = model(**inputs).logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1)
                chunk_scores.append(probabilities[0].cpu().numpy())
        if not chunk_scores:
            return None
        return np.mean(chunk_scores, axis=0)

    def _filter_outliers(self, scores):
        """Drop items whose probabilities are outliers relative to the others.

        Z-scores are computed per class across items (``axis=0``). Taking
        them within a single 3-element vector, as was done before, can never
        reach the threshold, so nothing was ever filtered.

        Args:
            scores: A non-empty sequence of per-item probability vectors.

        Returns:
            A 2-D array of the retained rows; all rows are kept when there
            are too few items to judge or when every row would be removed.
        """
        matrix = np.asarray(scores, dtype=float)
        if matrix.shape[0] < 3:
            return matrix
        with np.errstate(invalid='ignore', divide='ignore'):
            z = zscore(matrix, axis=0)
        # A class with zero variance across items yields NaN; no row deviates.
        z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)
        keep = (np.abs(z) < self.OUTLIER_Z_THRESHOLD).all(axis=1)
        return matrix[keep] if keep.any() else matrix