Upload 6 files
- lstm-chatbot-training/chatbot.py +78 -0
- lstm-chatbot-training/chatbot2.py +74 -0
- lstm-chatbot-training/train.py +105 -0
- lstm-chatbot-training/train2.py +137 -0
- lstm-chatbot-training/train3.py +105 -0
- lstm-chatbot-training/train4.py +109 -0
lstm-chatbot-training/chatbot.py
ADDED
@@ -0,0 +1,78 @@
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
import json
import numpy as np

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Load the vocabulary
with open('vocab.json', 'r') as f:
    vocab = json.load(f)
vocab_size = len(vocab)

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Load the trained model
embed_size = 64
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
model.load_state_dict(torch.load('lstm_model.pth', weights_only=True))
model.eval()

# Beam search with top-k sampling
def beam_search_with_top_k_sampling(model, start_text, vocab, tokenizer, beam_width=1, top_k=2, max_length=64):
    model.eval()
    indices = [vocab.get(token, vocab['<unk>']) for token in tokenize(start_text)]
    input_seq = torch.tensor(indices).unsqueeze(0)
    beams = [(input_seq, start_text, 0)]  # (sequence, text, score)

    for _ in range(max_length):
        new_beams = []
        for seq, text, score in beams:
            with torch.no_grad():
                outputs = model(seq)
                next_word_probs = outputs[:, -1, :]
                topk_probs, topk_indices = torch.topk(next_word_probs, top_k, dim=1)

            # Sample candidates from the top-k probabilities
            probabilities = torch.softmax(topk_probs, dim=1).squeeze(0).cpu().numpy()
            for _ in range(top_k):
                choice = np.random.choice(top_k, p=probabilities)
                next_index = topk_indices[0, choice].item()
                next_word = [word for word, index in vocab.items() if index == next_index][0]
                new_seq = torch.cat([seq, torch.tensor([[next_index]])], dim=1)
                new_score = score + np.log(probabilities[choice])  # update score with the log probability of the sampled word
                new_beams.append((new_seq, text + ' ' + next_word, new_score))

        # Keep the highest-scoring beams
        beams = sorted(new_beams, key=lambda x: x[2], reverse=True)[:beam_width]

    return beams[0][1]  # return the highest-scoring beam

# Chatbot
def chat():
    print("Chatbot is ready. Type 'exit' to end the chat.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            break
        response = beam_search_with_top_k_sampling(model, user_input, vocab, tokenize)
        print(f"Bot: {response}")

# Start the chat
chat()
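Note: the sampling loop above recovers each generated word by scanning vocab.items(), which is O(vocab_size) per token. A minimal sketch of a one-off inverse mapping (same vocab.json assumed; the helper name is illustrative), which chatbot2.py below effectively does with its indices_to_tokens helper:

import json

# Build the index -> word mapping once, instead of scanning vocab.items() per generated token.
with open('vocab.json', 'r') as f:
    vocab = json.load(f)
inv_vocab = {index: word for word, index in vocab.items()}

def index_to_word(next_index, unk='<unk>'):
    # O(1) lookup with a fallback for indices outside the vocabulary.
    return inv_vocab.get(int(next_index), unk)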
lstm-chatbot-training/chatbot2.py
ADDED
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn
import json
from nltk.tokenize import word_tokenize

# Load the vocabulary
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

# Load the model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

vocab_size = len(vocab)
embed_size = 64
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
model.load_state_dict(torch.load('lstm_model.pth', weights_only=True))
model.eval()

# Tokenization and input processing
def tokenize(text):
    return word_tokenize(text.lower())

def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

def indices_to_tokens(indices, vocab):
    inv_vocab = {v: k for k, v in vocab.items()}
    return [inv_vocab.get(index, '<unk>') for index in indices]

# Generate a response
def generate_response(model, input_text, vocab, max_length=20):
    tokens = tokenize(input_text)
    input_indices = tokens_to_indices(tokens, vocab)
    input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0)  # add a batch dimension

    response_indices = []
    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_tensor)
            next_token_id = output.argmax(dim=-1)[:, -1].item()  # greedily pick the next word
            response_indices.append(next_token_id)
            input_tensor = torch.cat([input_tensor, torch.tensor([[next_token_id]], dtype=torch.long)], dim=1)

            if next_token_id == vocab['<pad>']:
                break

    response_tokens = indices_to_tokens(response_indices, vocab)
    return ' '.join(response_tokens).replace('<pad>', '')

# Chat loop
def chat():
    print("Chatbot is ready! Type 'quit' to exit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'quit':
            print("Goodbye!")
            break
        response = generate_response(model, user_input, vocab)
        print(f"Bot: {response}")

if __name__ == "__main__":
    chat()
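Note: generate_response decodes greedily with argmax, which tends to loop on small models. A hedged alternative sketch using temperature sampling over the last-position logits (not part of the upload; the 0.8 temperature is an arbitrary illustrative value):

import torch

def sample_next_token(logits, temperature=0.8):
    # logits: model output at the last position, shape (1, vocab_size).
    # Lower temperature sharpens the distribution; higher temperature flattens it.
    probs = torch.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

# Inside generate_response this would replace the argmax line:
# next_token_id = sample_next_token(output[:, -1, :])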
lstm-chatbot-training/train.py
ADDED
@@ -0,0 +1,105 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import json

# Download the nltk data (only needed once)
# nltk.download('punkt')

# Load the data from a file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocabulary
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocabulary
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=8):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=8)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=lambda x: (
    pad_sequence([i[0] for i in x], batch_first=True),
    pad_sequence([i[1] for i in x], batch_first=True)
))

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Parameters and model
embed_size = 64
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)  # AdamW instead of Adam

# Train the model
num_epochs = 8
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))  # reshape for the loss computation
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model
model_path = 'lstm_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
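Note: the collate function above pads with pad_sequence's default value of 0, which is a real word index in this vocabulary, and the loss does not mask padding; with the fixed-length windows built here the pad path is rarely hit, but train2.py below makes the handling explicit. A minimal sketch of that variant applied to this script (assuming the vocab.json written above):

import json
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

with open('vocab.json', 'r') as f:
    vocab = json.load(f)

def collate_fn(batch):
    # Pad with the dedicated <pad> index instead of the default 0.
    inputs, targets = zip(*batch)
    inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=vocab['<pad>'])
    targets_pad = pad_sequence(targets, batch_first=True, padding_value=vocab['<pad>'])
    return inputs_pad, targets_pad

# Skip padded positions when computing the loss, as train2.py does.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])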
lstm-chatbot-training/train2.py
ADDED
@@ -0,0 +1,137 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import json

# Download the nltk data (only needed once)
# nltk.download('punkt')

# Load the data from a file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)
print(f"Loaded sentences: {sentences[:5]}")  # show the first few sentences for inspection

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocabulary
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocabulary
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=10):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            if len(indices) >= seq_length + 1:
                for i in range(len(indices) - seq_length):
                    self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))
        print(f"Dataset size: {len(self.data)}")  # show the number of samples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=10)
print(f"Number of samples in dataset: {len(dataset)}")

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=vocab['<pad>'])
    targets_pad = pad_sequence(targets, batch_first=True, padding_value=vocab['<pad>'])
    return inputs_pad, targets_pad

dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

# Transformer model
class Transformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, num_layers):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size,
            batch_first=True  # make sure batch_first is set
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        output = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
        output = self.fc(output)
        return output

# Parameters and model
embed_size = 10
num_heads = 5
hidden_size = 100
num_layers = 2
model = Transformer(vocab_size, embed_size, num_heads, hidden_size, num_layers)

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = optim.AdamW(model.parameters(), lr=0.01)

# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0
    for src_batch, tgt_batch in dataloader:
        src_batch = src_batch.squeeze(0).to(device)
        tgt_batch = tgt_batch.squeeze(0).to(device)

        src_mask = tgt_mask = None  # suitable masks could be added here

        optimizer.zero_grad()
        outputs = model(src_batch, tgt_batch, src_mask, tgt_mask)
        loss = criterion(outputs.view(-1, vocab_size), tgt_batch.view(-1))  # reshape for the loss computation
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Save the model
model_path = 'transformer_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
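Note: the training loop passes src_mask = tgt_mask = None, so the decoder can attend to the very target tokens it is asked to predict. A minimal sketch of the causal target mask PyTorch provides, under the batch-first setup above (helper name is illustrative):

import torch.nn as nn

def make_tgt_mask(tgt_batch, device):
    # Square subsequent (causal) mask: position i may only attend to positions <= i.
    seq_len = tgt_batch.size(-1)
    return nn.Transformer.generate_square_subsequent_mask(seq_len).to(device)

# Inside the epoch loop this would replace `src_mask = tgt_mask = None`:
# tgt_mask = make_tgt_mask(tgt_batch, device)
# outputs = model(src_batch, tgt_batch, src_mask=None, tgt_mask=tgt_mask)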
lstm-chatbot-training/train3.py
ADDED
@@ -0,0 +1,105 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import json

# Download the nltk data (only needed once)
# nltk.download('punkt')

# Load the data from a file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocabulary
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocabulary
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=64):  # increased seq_length
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=64)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda x: (  # increased batch_size
    pad_sequence([i[0] for i in x], batch_first=True),
    pad_sequence([i[1] for i in x], batch_first=True)
))

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2):  # increased num_layers
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True, bidirectional=True)  # added bidirectional=True
        self.fc = nn.Linear(hidden_size * 2, vocab_size)  # adjust the fc input size for the bidirectional LSTM

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Parameters and model
embed_size = 64  # increased embed_size
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-5)  # added weight decay

# Train the model
num_epochs = 8  # increased num_epochs
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))  # reshape for the loss computation
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model
model_path = 'lstm_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
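Note: train3.py predicts the next token with a bidirectional LSTM, so the backward direction already reads the tokens the model is being trained to predict; for left-to-right generation only the forward half of the outputs is causal. A hedged sketch of slicing out just the forward direction (sizes match the script above; vocab_size here is only a placeholder):

import torch
import torch.nn as nn

embed_size, hidden_size, vocab_size = 64, 512, 1000  # vocab_size is a placeholder for illustration

embedding = nn.Embedding(vocab_size, embed_size)
lstm = nn.LSTM(embed_size, hidden_size, num_layers=2, batch_first=True, bidirectional=True)
fc = nn.Linear(hidden_size, vocab_size)  # fed only the forward half below

x = embedding(torch.randint(0, vocab_size, (1, 8)))
lstm_out, _ = lstm(x)                       # shape: (1, 8, hidden_size * 2)
forward_only = lstm_out[..., :hidden_size]  # PyTorch concatenates forward then backward features
logits = fc(forward_only)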
lstm-chatbot-training/train4.py
ADDED
@@ -0,0 +1,109 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import spacy
from collections import Counter
import json

# Load the spaCy tokenization model
nlp = spacy.load('en_core_web_sm')

# Load the data from a file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization
def tokenize(text):
    return [token.text.lower() for token in nlp(text)]

# Build the vocabulary
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocabulary
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=8):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=8)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=lambda x: (
    pad_sequence([i[0] for i in x], batch_first=True),
    pad_sequence([i[1] for i in x], batch_first=True)
))

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Parameters and model
embed_size = 64  # larger embedding size for better learning capacity
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)  # lower learning rate for better accuracy

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # enable training mode
    total_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))  # reshape for the loss computation
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # guard against exploding gradients
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Save the model
model_path = 'lstm_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
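Note: train4.py loads the full en_core_web_sm pipeline only to tokenize, which runs tagging, parsing and NER on every line. A minimal sketch of tokenizer-only processing (assuming the same data.txt input; the disabled component names are from the stock en_core_web_sm pipeline):

import spacy

# Disable the components the script never uses; only the tokenizer stays active.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

def tokenize(text):
    return [token.text.lower() for token in nlp(text)]

# nlp.pipe streams lines in batches, which is much faster than calling nlp() per line.
def tokenize_lines(lines):
    return [[token.text.lower() for token in doc] for doc in nlp.pipe(lines)]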