# PeVe_mistral / app.py
import os
import json
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch
from huggingface_hub import Repository, HfFolder
import subprocess
# Authenticate Hugging Face Hub
hf_token = st.secrets["HF_TOKEN"] # Store your token in the Hugging Face Space Secrets
HfFolder.save_token(hf_token)
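# Note: HfFolder.save_token is deprecated on newer huggingface_hub releases;
# the equivalent there would be:
#   from huggingface_hub import login
#   login(token=hf_token)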
# Set Git user identity
def set_git_config():
    try:
        subprocess.run(['git', 'config', 'user.email', '[email protected]'], check=True)
        subprocess.run(['git', 'config', 'user.name', 'Nilesh'], check=True)
    except subprocess.CalledProcessError as e:
        st.error(f"Git configuration error: {str(e)}")

set_git_config()
@st.cache_data
def load_data(file_path):
    if not os.path.exists(file_path):
        st.error(f"File not found: {file_path}")
        return None
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        return data
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return None
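# The loader expects a JSON array of prompt/response records, matching the
# keys read in create_dataset() below. An illustrative (hypothetical) example:
# [
#   {"prompt": "Can I book an appointment for Friday?",
#    "response": "Sure - what time on Friday works for you?"}
# ]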
@st.cache_resource
def initialize_model_and_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        # Set the pad token to the eos token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id
        return tokenizer, model
    except Exception as e:
        st.error(f"Error initializing model and tokenizer: {str(e)}")
        return None, None
def create_dataset(data, tokenizer, max_length):
    inputs = []
    for item in data:
        prompt = item['prompt']
        response = item['response']
        full_text = f"Human: {prompt}\nAssistant: {response}"
        encoded = tokenizer.encode_plus(
            full_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs.append({
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        })
    return inputs
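# No labels are stored here: DataCollatorForLanguageModeling(mlm=False) will
# clone input_ids into labels at batch time and mask pad positions with -100.
# Because the pad token is reused from the EOS token above, any EOS tokens in
# the text are masked out of the loss too - a known trade-off of this setup.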
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # encodings is a list of per-example dicts of 1-D tensors,
        # so a single list index returns the complete example
        return self.encodings[idx]

    def __len__(self):
        return len(self.encodings)
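# Illustrative (hypothetical) shape check:
#   ds = SimpleDataset(create_dataset(data, tokenizer, max_length))
#   ds[0]['input_ids'].shape  # torch.Size([max_length])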
def main():
    st.title("Model Training with Streamlit")

    # User inputs with recommended values
    model_name = st.text_input("Enter model name", "distilgpt2")
    file_path = st.text_input("Enter path to training data JSON file", "appointment_training_data.json")
    max_length = st.number_input("Enter max token length", min_value=32, max_value=512, value=256)
    num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
    batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
    learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")

    # Specify your Hugging Face model repository ID
    repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")

    tokenizer, model = initialize_model_and_tokenizer(model_name)
    if tokenizer is None or model is None:
        st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
        return

    st.write("Loading and processing dataset...")
    data = load_data(file_path)
    if data is None:
        st.warning("Failed to load dataset. Please check the file path and try again.")
        return

    st.write("Tokenizing dataset...")
    tokenized_dataset = create_dataset(data, tokenizer, max_length)
    dataset = SimpleDataset(tokenized_dataset)
    # Define training arguments. num_train_epochs is fixed at 1 because the
    # loop below calls trainer.train() once per UI-selected epoch; passing
    # num_epochs here as well would train num_epochs * num_epochs epochs total.
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='no',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
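    # Note: on newer transformers releases 'evaluation_strategy' is spelled
    # 'eval_strategy'; keep whichever matches the installed version.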
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    if st.button('Start Training'):
        st.write("Starting training...")
        progress_bar = st.progress(0)

        # Initialize the Hugging Face repository. The clone must happen before
        # anything is written to ./results; note that Repository is deprecated
        # in recent huggingface_hub releases in favor of HfApi.upload_folder.
        repo = Repository(local_dir="./results", clone_from=repo_id)

        for epoch in range(int(num_epochs)):
            # Each call runs exactly one epoch (num_train_epochs=1); the
            # optimizer and LR schedule restart on every call
            trainer.train()
            progress = (epoch + 1) / num_epochs
            progress_bar.progress(progress)

            # Save the model after each epoch locally
            model_path = f"./results/model_epoch_{epoch+1}"
            trainer.save_model(model_path)
            st.write(f"Model saved locally: {model_path}")

            # Push to Hugging Face Hub
            repo.push_to_hub(commit_message=f"Model after epoch {epoch+1}")
            st.write(f"Model pushed to Hugging Face Hub: {repo_id}")

        st.write("Training complete. Model is available on the Hugging Face Hub.")
if __name__ == "__main__":
    main()
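# Run locally with: streamlit run app.py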