File size: 5,605 Bytes
0a83766
997991d
de7d627
0a83766
a8d1617
b0226fd
8ad732e
3caf963
8ad732e
 
7a2d473
8ad732e
4270cfb
c899f78
3caf963
 
 
 
 
 
 
 
c899f78
b2c4316
b0226fd
997991d
 
 
b0226fd
997991d
 
 
b0226fd
 
 
de7d627
a8d1617
 
 
 
 
155ecd2
 
 
 
 
 
a8d1617
 
 
 
 
997991d
 
 
 
 
 
155ecd2
 
 
 
 
 
 
 
 
 
 
997991d
4270cfb
3caf963
 
 
 
 
 
 
 
 
 
b2c4316
 
b0226fd
997991d
b0226fd
997991d
 
b0226fd
997991d
 
8ad732e
 
 
b0226fd
b2c4316
b0226fd
 
 
 
b2c4316
 
997991d
b2c4316
997991d
b0226fd
 
 
b2c4316
997991d
 
a8d1617
b2c4316
 
 
b0226fd
3c13618
b0226fd
 
 
 
 
 
 
b2c4316
b0226fd
b2c4316
0a83766
b0226fd
 
997991d
b0226fd
0a83766
b0226fd
b2c4316
 
b0226fd
8ad732e
 
 
b0226fd
 
 
 
 
 
8ad732e
b0226fd
 
8ad732e
 
 
 
 
b0226fd
8ad732e
b2c4316
 
8ad732e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import json
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch
from huggingface_hub import Repository, HfFolder
import subprocess

# Authenticate Hugging Face Hub
# Reads the "HF_TOKEN" entry from Streamlit's secrets store and passes it to
# HfFolder.save_token. This runs at import time, before any UI is drawn.
# NOTE(review): presumably the saved token is what the later Repository
# clone/push in main() relies on for credentials — confirm.
# NOTE(review): a missing "HF_TOKEN" secret is not handled here — confirm that
# failing at startup is the intended behaviour.
hf_token = st.secrets["HF_TOKEN"]  # Store your token in the Hugging Face Space Secrets
HfFolder.save_token(hf_token)

# Set Git user identity
def set_git_config():
    """Configure the Git author identity for the current repository.

    Runs ``git config user.email`` and ``git config user.name`` in the
    current working directory. Failures are reported with ``st.error``
    instead of being raised, so the rest of the page can still render.
    """
    # NOTE(review): the e-mail value below looks like a redaction placeholder
    # rather than a real address — confirm the intended value.
    identity = (
        ('user.email', '[email protected]'),
        ('user.name', 'Nilesh'),
    )
    try:
        for key, value in identity:
            subprocess.run(['git', 'config', key, value], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ``git`` binary; that
        # failure happens before any exit status exists, so ``check=True``
        # never turns it into CalledProcessError.
        st.error(f"Git configuration error: {str(e)}")


set_git_config()

@st.cache_data
def _read_json_file(file_path, mtime):
    """Parse and return the JSON document stored at *file_path*.

    *mtime* is not used in the body; it only takes part in the cache key so
    that an edited file is re-read instead of served from the cache. Errors
    are allowed to propagate: raising (rather than returning ``None``) keeps
    failed loads out of the cache.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_data(file_path):
    """Load the training data JSON file at *file_path*.

    Args:
        file_path: Path to a JSON file.

    Returns:
        The parsed JSON content, or ``None`` when the file is missing or
        cannot be read/parsed. In the failure cases the problem is shown to
        the user with ``st.error``.
    """
    if not os.path.exists(file_path):
        st.error(f"File not found: {file_path}")
        return None
    try:
        return _read_json_file(file_path, os.path.getmtime(file_path))
    except Exception as e:
        # UI boundary: any read/parse problem is reported, never raised.
        st.error(f"Error loading dataset: {str(e)}")
        return None

@st.cache_resource
def _load_model_and_tokenizer(model_name):
    """Load the tokenizer and causal-LM for *model_name*; errors propagate.

    Kept separate from the error handling so that only successful loads are
    stored by ``st.cache_resource``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Set the pad token to the eos token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        # Take the id from the tokenizer so the model config agrees with the
        # padding the tokenizer will actually emit.
        model.config.pad_token_id = tokenizer.pad_token_id

    return tokenizer, model


def initialize_model_and_tokenizer(model_name):
    """Return ``(tokenizer, model)`` for *model_name*.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``(None, None)`` when loading
        fails. Failures are shown with ``st.error`` and are not cached, so a
        later call with the same name tries again.
    """
    try:
        return _load_model_and_tokenizer(model_name)
    except Exception as e:
        # UI boundary: report any load failure instead of crashing the page.
        st.error(f"Error initializing model and tokenizer: {str(e)}")
        return None, None

def create_dataset(data, tokenizer, max_length):
    """Tokenize prompt/response pairs into fixed-length training examples.

    Args:
        data: Iterable of mappings, each with ``'prompt'`` and ``'response'``
            keys.
        tokenizer: Object exposing ``encode_plus``; it is called with
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``.
        max_length: Length every example is padded or truncated to.

    Returns:
        A list with one dict per item, holding ``'input_ids'`` and
        ``'attention_mask'`` with the leading batch dimension removed.

    Raises:
        KeyError: If an item lacks ``'prompt'`` or ``'response'``.
    """
    inputs = []
    for item in data:
        full_text = f"Human: {item['prompt']}\nAssistant: {item['response']}"
        encoded = tokenizer.encode_plus(
            full_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch dimension. A bare squeeze() would
        # also collapse a length-1 sequence into a 0-d tensor.
        inputs.append({
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        })
    return inputs

class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over a list of already-tokenized examples.

    ``encodings`` is a sequence in which each element is one example: a
    mapping from field name (e.g. ``'input_ids'``) to that example's value.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a shallow copy of the example at position *idx*."""
        # ``self.encodings[idx]`` is already a single example. Indexing each
        # field with ``idx`` again would return one element of that field and
        # fail once ``idx`` reaches the sequence length.
        return dict(self.encodings[idx])

    def __len__(self):
        """Return the number of examples."""
        return len(self.encodings)

def main():
    """Render the Streamlit page and, on request, fine-tune and upload a model.

    Flow: collect the settings from the widgets, load the model/tokenizer and
    the JSON dataset, tokenize it, build a ``Trainer``, and when the
    "Start Training" button is pressed run a single training pass. After
    every epoch the model is saved under ``./results/model_epoch_<n>`` and
    the ``./results`` repository is pushed to the Hub.

    Returns early (after showing a warning or error) when the model, the
    dataset file, or the dataset contents cannot be used.
    """
    st.title("Model Training with Streamlit")

    # User inputs with recommended values
    model_name = st.text_input("Enter model name", "distilgpt2")
    file_path = st.text_input("Enter path to training data JSON file", "appointment_training_data.json")
    max_length = st.number_input("Enter max token length", min_value=32, max_value=512, value=256)
    num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
    batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
    learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")

    # Specify your Hugging Face model repository ID
    repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")

    tokenizer, model = initialize_model_and_tokenizer(model_name)

    if tokenizer is None or model is None:
        st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
        return

    st.write("Loading and processing dataset...")
    data = load_data(file_path)

    if data is None:
        st.warning("Failed to load dataset. Please check the file path and try again.")
        return

    st.write("Tokenizing dataset...")
    try:
        tokenized_dataset = create_dataset(data, tokenizer, max_length)
    except (KeyError, TypeError) as e:
        # The JSON parsed, but its entries are not prompt/response mappings.
        st.error(f"Dataset entries must provide 'prompt' and 'response': {str(e)}")
        return

    if not tokenized_dataset:
        st.warning("The dataset is empty. Please provide at least one training example.")
        return

    dataset = SimpleDataset(tokenized_dataset)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='no',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    if st.button('Start Training'):
        # Local import: only this branch needs the callback base class.
        from transformers import TrainerCallback

        st.write("Starting training...")
        progress_bar = st.progress(0)

        # Initialize the Hugging Face repository
        repo = Repository(local_dir="./results", clone_from=repo_id)

        total_epochs = int(num_epochs)

        class _EpochReporter(TrainerCallback):
            """Update the progress bar, save and push after each epoch."""

            def __init__(self):
                self.completed = 0

            def on_epoch_end(self, args, state, control, **kwargs):
                self.completed += 1
                epoch = self.completed
                progress_bar.progress(min(epoch / total_epochs, 1.0))

                # Save the model after each epoch locally
                model_path = f"./results/model_epoch_{epoch}"
                trainer.save_model(model_path)
                st.write(f"Model saved locally: {model_path}")

                # Push to Hugging Face Hub
                repo.push_to_hub(commit_message=f"Model after epoch {epoch}")
                st.write(f"Model pushed to Hugging Face Hub: {repo_id}")

        trainer.add_callback(_EpochReporter())

        # One call already covers ``num_train_epochs`` epochs. Calling
        # train() once per epoch in a loop would repeat the whole schedule
        # that many times.
        trainer.train()

        st.write("Training complete. Model is available on the Hugging Face Hub.")


if __name__ == "__main__":
    main()