ysn-rfd committed
Commit 83e7c05 · verified · 1 Parent(s): d994c11

Upload 6 files

lstm-chatbot-training/chatbot.py ADDED
@@ -0,0 +1,78 @@
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
import json
import numpy as np

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Load vocab
with open('vocab.json', 'r') as f:
    vocab = json.load(f)
vocab_size = len(vocab)

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Load the model
embed_size = 64
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
model.load_state_dict(torch.load('lstm_model.pth', weights_only=True))
model.eval()

# Beam search with top-k sampling
def beam_search_with_top_k_sampling(model, start_text, vocab, tokenizer, beam_width=1, top_k=2, max_length=64):
    model.eval()
    indices = [vocab.get(token, vocab['<unk>']) for token in tokenizer(start_text)]
    input_seq = torch.tensor(indices).unsqueeze(0)
    beams = [(input_seq, start_text, 0.0)]  # (sequence, text, score)

    for _ in range(max_length):
        new_beams = []
        for seq, text, score in beams:
            with torch.no_grad():
                outputs = model(seq)
            next_word_logits = outputs[:, -1, :]
            topk_logits, topk_indices = torch.topk(next_word_logits, top_k, dim=1)

            # Sample from the top-k probabilities
            probabilities = torch.softmax(topk_logits, dim=1).squeeze(0).cpu().numpy()
            for _ in range(top_k):
                pos = np.random.choice(top_k, p=probabilities)  # position sampled within the top-k
                next_index = int(topk_indices[0, pos].item())
                next_word = [word for word, index in vocab.items() if index == next_index][0]
                new_seq = torch.cat([seq, torch.tensor([[next_index]])], dim=1)
                new_score = score + np.log(probabilities[pos])  # update score with the sampled token's log probability
                new_beams.append((new_seq, text + ' ' + next_word, new_score))

        # Keep the highest-scoring beams
        beams = sorted(new_beams, key=lambda x: x[2], reverse=True)[:beam_width]

    return beams[0][1]  # Return the highest-scoring beam

# Chatbot
def chat():
    print("Chatbot is ready. Type 'exit' to end the chat.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            break
        response = beam_search_with_top_k_sampling(model, user_input, vocab, tokenize)
        print(f"Bot: {response}")

# Start chatting
chat()
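One note on the generation loop above: the reverse vocab lookup scans every entry of vocab.items() on each sampling step. A precomputed inverse map (the inv_vocab name below is illustrative, not part of the uploaded file) would make it a constant-time lookup:

inv_vocab = {index: word for word, index in vocab.items()}  # build once, outside the generation loop
next_word = inv_vocab[next_index]                           # replaces the per-step list comprehension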
lstm-chatbot-training/chatbot2.py ADDED
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn
import json
from nltk.tokenize import word_tokenize

# Load vocab
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

# Load the model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

vocab_size = len(vocab)
embed_size = 64
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
model.load_state_dict(torch.load('lstm_model.pth', weights_only=True))
model.eval()

# Tokenization and input processing
def tokenize(text):
    return word_tokenize(text.lower())

def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

def indices_to_tokens(indices, vocab):
    inv_vocab = {v: k for k, v in vocab.items()}
    return [inv_vocab.get(index, '<unk>') for index in indices]

# Generate a response
def generate_response(model, input_text, vocab, max_length=20):
    tokens = tokenize(input_text)
    input_indices = tokens_to_indices(tokens, vocab)
    input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0)  # add a batch dimension

    response_indices = []
    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_tensor)
            next_token_id = output.argmax(dim=-1)[:, -1].item()  # greedy pick of the next word
            response_indices.append(next_token_id)
            input_tensor = torch.cat([input_tensor, torch.tensor([[next_token_id]], dtype=torch.long)], dim=1)

            if next_token_id == vocab['<pad>']:
                break

    response_tokens = indices_to_tokens(response_indices, vocab)
    return ' '.join(response_tokens).replace('<pad>', '')

# Chat loop
def chat():
    print("Chatbot is ready! Type 'quit' to exit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'quit':
            print("Goodbye!")
            break
        response = generate_response(model, user_input, vocab)
        print(f"Bot: {response}")

if __name__ == "__main__":
    chat()
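As a sketch only: greedy argmax decoding like the loop above often falls into repetition on a small LSTM language model. Softmax sampling with a temperature (the 0.8 below is an arbitrary illustrative value, not taken from the upload) is a common drop-in alternative for the argmax line:

logits = output[:, -1, :]                    # scores for the next token
probs = torch.softmax(logits / 0.8, dim=-1)  # temperature < 1 sharpens, > 1 flattens the distribution
next_token_id = torch.multinomial(probs, num_samples=1).item()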
lstm-chatbot-training/train.py ADDED
@@ -0,0 +1,105 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import json

# Download nltk packages
# nltk.download('punkt')

# Load data from file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocab
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=8):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=8)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=lambda x: (
    pad_sequence([i[0] for i in x], batch_first=True),
    pad_sequence([i[1] for i in x], batch_first=True)
))

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Parameters and model
embed_size = 64
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)  # use AdamW instead of Adam

# Train the model
num_epochs = 8
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))  # reshape for the loss computation
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model
model_path = 'lstm_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
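One detail worth noting: pad_sequence defaults to padding_value=0, and index 0 is a real word in this vocab. A padding-aware collate like the one train2.py defines (sketched here under the same vocab assumptions, paired with ignore_index=vocab['<pad>'] in the loss) keeps padding from being trained on as a word:

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=vocab['<pad>'])
    targets_pad = pad_sequence(targets, batch_first=True, padding_value=vocab['<pad>'])
    return inputs_pad, targets_pad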
lstm-chatbot-training/train2.py ADDED
@@ -0,0 +1,137 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import json

# Download nltk packages
# nltk.download('punkt')

# Load data from file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)
print(f"Loaded sentences: {sentences[:5]}")  # show the first few sentences for inspection

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocab
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=10):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            if len(indices) >= seq_length + 1:
                for i in range(len(indices) - seq_length):
                    self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))
        print(f"Dataset size: {len(self.data)}")  # show the number of samples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=10)
print(f"Number of samples in dataset: {len(dataset)}")

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=vocab['<pad>'])
    targets_pad = pad_sequence(targets, batch_first=True, padding_value=vocab['<pad>'])
    return inputs_pad, targets_pad

dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

# Transformer model
class Transformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, num_layers):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size,
            batch_first=True  # make sure batch_first=True is set
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        output = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
        output = self.fc(output)
        return output

# Parameters and model
embed_size = 10
num_heads = 5
hidden_size = 100
num_layers = 2
model = Transformer(vocab_size, embed_size, num_heads, hidden_size, num_layers)

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = optim.AdamW(model.parameters(), lr=0.01)

# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0
    for src_batch, tgt_batch in dataloader:
        src_batch = src_batch.squeeze(0).to(device)  # batch_size=1, so training runs on unbatched sequences
        tgt_batch = tgt_batch.squeeze(0).to(device)

        src_mask = tgt_mask = None  # appropriate masks could be added here

        optimizer.zero_grad()
        outputs = model(src_batch, tgt_batch, src_mask, tgt_mask)
        loss = criterion(outputs.view(-1, vocab_size), tgt_batch.view(-1))  # reshape for the loss computation
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Save the model
model_path = 'transformer_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
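The loop above leaves src_mask and tgt_mask as None, so the decoder can attend to future target positions while being trained to predict them. A minimal sketch of a causal target mask, using PyTorch's built-in helper and assuming the same unbatched (seq_len,) tensors produced by squeeze(0):

seq_len = tgt_batch.size(0)
tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(device)  # -inf above the diagonal
outputs = model(src_batch, tgt_batch, None, tgt_mask)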
lstm-chatbot-training/train3.py ADDED
@@ -0,0 +1,105 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import json

# Download nltk packages
# nltk.download('punkt')

# Load data from file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization
def tokenize(text):
    return word_tokenize(text.lower())

# Build the vocab
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=64):  # increased seq_length
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=64)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda x: (  # increased batch_size
    pad_sequence([i[0] for i in x], batch_first=True),
    pad_sequence([i[1] for i in x], batch_first=True)
))

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2):  # increased num_layers
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=0, batch_first=True, bidirectional=True)  # added bidirectional=True
        self.fc = nn.Linear(hidden_size * 2, vocab_size)  # fc input size doubled for the bidirectional LSTM

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Parameters and model
embed_size = 64  # increased embed_size
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-5)  # added weight decay

# Train the model
num_epochs = 8  # increased num_epochs
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))  # reshape for the loss computation
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model
model_path = 'lstm_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
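One caveat worth flagging: with bidirectional=True the backward direction reads the positions to the right of each step, which are exactly the tokens the model is trained to predict, so training loss can look much better than generation quality. A minimal causal variant (an assumption for illustration, not part of the upload) keeps only the forward direction:

causal_lstm = nn.LSTM(embed_size, hidden_size, num_layers=2, batch_first=True)  # forward direction only
causal_fc = nn.Linear(hidden_size, vocab_size)                                  # no *2 once bidirectional is dropped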
lstm-chatbot-training/train4.py ADDED
@@ -0,0 +1,109 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import spacy
from collections import Counter
import json

# Load the spaCy tokenization model
nlp = spacy.load('en_core_web_sm')

# Load data from file
def load_text_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load the data
file_path = 'data.txt'
sentences = load_text_data(file_path)

# Tokenization
def tokenize(text):
    return [token.text.lower() for token in nlp(text)]

# Build the vocab
def build_vocab(sentences):
    tokens = [token for sentence in sentences for token in tokenize(sentence)]
    vocab = {word: i for i, (word, _) in enumerate(Counter(tokens).items())}
    vocab['<unk>'] = len(vocab)
    vocab['<pad>'] = len(vocab)
    return vocab

vocab = build_vocab(sentences)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Save the vocab
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)
print('Vocabulary saved to vocab.json')

# Prepare the data
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, seq_length=8):
        self.data = []
        self.vocab = vocab
        self.seq_length = seq_length
        for sentence in sentences:
            tokens = tokenize(sentence)
            indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
            for i in range(len(indices) - seq_length):
                self.data.append((indices[i:i+seq_length], indices[i+1:i+seq_length+1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

dataset = TextDataset(sentences, vocab, seq_length=8)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=lambda x: (
    pad_sequence([i[0] for i in x], batch_first=True),
    pad_sequence([i[1] for i in x], batch_first=True)
))

# LSTM model with dropout and multiple layers
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

# Parameters and model
embed_size = 64  # larger embedding size for better learning capacity
hidden_size = 512
model = LSTMModel(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)  # lower learning rate for better accuracy

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # enable training mode
    total_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))  # reshape for the loss computation
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # prevent exploding gradients
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Save the model
model_path = 'lstm_model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
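spacy.load('en_core_web_sm') raises OSError when that model package has not been installed. A guarded load (a sketch, assuming network access for the one-time download) avoids the hard failure at startup:

import spacy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    from spacy.cli import download
    download('en_core_web_sm')  # one-time setup, same as: python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')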