import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the required NLTK tokenizer data
nltk.download('punkt')

# Load the raw text data from a file, one sentence per line
def load_text_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization: lowercase the text and split it into word tokens
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocabulary, reserving special tokens for unknown words and padding
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Dataset: turn each sentence into (input, target) index sequences shifted by one token
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=10):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            # Build input and target sequences for next-word prediction
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i + seq_length], indices[i + 1:i + seq_length + 1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=10)
pad_idx = vocab['<pad>']
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=lambda batch: (
        pad_sequence([item[0] for item in batch], batch_first=True, padding_value=pad_idx),
        pad_sequence([item[1] for item in batch], batch_first=True, padding_value=pad_idx),
    ),
)

# LSTM language model with dropout between layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0.5, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Hyperparameters and model
embed_size = 64
hidden_size = 256
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)  # ignore padded positions in the loss
optimizer = optim.AdamW(model.parameters(), lr=0.001)  # AdamW instead of Adam

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten the batch and time dimensions to compute the loss
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Sentence generation with greedy (argmax) decoding
def generate_sentence(model, start_text, vocab, tokenizer, max_length=50):
    model.eval()
    index_to_word = {index: word for word, index in vocab.items()}  # reverse lookup table
    indices = [vocab.get(token, vocab['<unk>']) for token in tokenizer(start_text)]
    input_seq = torch.tensor(indices).unsqueeze(0)  # convert to a tensor and add a batch dimension
    generated_sentence = start_text
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_seq)
            next_word_probs = outputs[:, -1, :]
            predicted_index = torch.argmax(next_word_probs, dim=1).item()
            predicted_word = index_to_word[predicted_index]
            generated_sentence += ' ' + predicted_word
            # Append the predicted word to the input sequence to predict the next word,
            # keeping only the last 10 tokens as the new input
            indices.append(predicted_index)
            input_seq = torch.tensor(indices[-10:]).unsqueeze(0)
    return generated_sentence

# Generate a sentence from a seed phrase
start_text = "Deep learning is"
generated_sentence = generate_sentence(model, start_text, vocab, tokenize)
print(f"Generated sentence: {generated_sentence}")