# PeVe_mistral / app.py
import os
import json
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch
from huggingface_hub import Repository, HfFolder
import subprocess
# Authenticate Hugging Face Hub
hf_token = st.secrets["HF_TOKEN"] # Store your token in the Hugging Face Space Secrets
HfFolder.save_token(hf_token)
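# Note: HfFolder.save_token is deprecated on newer huggingface_hub releases;
# the equivalent there would be:
#   from huggingface_hub import login
#   login(token=hf_token)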
# Set Git user identity
def set_git_config():
    try:
        subprocess.run(['git', 'config', 'user.email', '[email protected]'], check=True)
        subprocess.run(['git', 'config', 'user.name', 'Nilesh'], check=True)
    except subprocess.CalledProcessError as e:
        st.error(f"Git configuration error: {str(e)}")

set_git_config()
@st.cache_data
def load_data(file_path):
    if not os.path.exists(file_path):
        st.error(f"File not found: {file_path}")
        return None
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        return data
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return None
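# The loader expects a JSON array of prompt/response records, matching the
# keys read in create_dataset() below. An illustrative (hypothetical) example:
# [
#   {"prompt": "Can I book an appointment for Friday?",
#    "response": "Sure - what time on Friday works for you?"}
# ]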
@st.cache_resource
def initialize_model_and_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        # Set the pad token to the eos token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id
        return tokenizer, model
    except Exception as e:
        st.error(f"Error initializing model and tokenizer: {str(e)}")
        return None, None
def create_dataset(data, tokenizer, max_length):
    inputs = []
    for item in data:
        prompt = item['prompt']
        response = item['response']
        full_text = f"Human: {prompt}\nAssistant: {response}"
        encoded = tokenizer.encode_plus(
            full_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs.append({
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        })
    return inputs
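# No labels are stored here: DataCollatorForLanguageModeling(mlm=False) will
# clone input_ids into labels at batch time and mask pad positions with -100.
# Because the pad token is reused from the EOS token above, any EOS tokens in
# the text are masked out of the loss too - a known trade-off of this setup.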
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # encodings is a list of per-example dicts of 1-D tensors,
        # so a single list index returns the complete example
        return self.encodings[idx]

    def __len__(self):
        return len(self.encodings)
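# Illustrative (hypothetical) shape check:
#   ds = SimpleDataset(create_dataset(data, tokenizer, max_length))
#   ds[0]['input_ids'].shape  # torch.Size([max_length])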
def main():
    st.title("Model Training with Streamlit")

    # User inputs with recommended values
    model_name = st.text_input("Enter model name", "distilgpt2")
    file_path = st.text_input("Enter path to training data JSON file", "appointment_training_data.json")
    max_length = st.number_input("Enter max token length", min_value=32, max_value=512, value=256)
    num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
    batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
    learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")

    # Specify your Hugging Face model repository ID
    repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")

    tokenizer, model = initialize_model_and_tokenizer(model_name)
    if tokenizer is None or model is None:
        st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
        return

    st.write("Loading and processing dataset...")
    data = load_data(file_path)
    if data is None:
        st.warning("Failed to load dataset. Please check the file path and try again.")
        return

    st.write("Tokenizing dataset...")
    tokenized_dataset = create_dataset(data, tokenizer, max_length)
    dataset = SimpleDataset(tokenized_dataset)
    # Define training arguments. num_train_epochs is fixed at 1 because the
    # loop below calls trainer.train() once per UI-selected epoch; passing
    # num_epochs here as well would train num_epochs * num_epochs epochs total.
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='no',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
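    # Note: on newer transformers releases 'evaluation_strategy' is spelled
    # 'eval_strategy'; keep whichever matches the installed version.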
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )
    if st.button('Start Training'):
        st.write("Starting training...")
        progress_bar = st.progress(0)

        # Initialize the Hugging Face repository. The clone must happen before
        # anything is written to ./results; note that Repository is deprecated
        # in recent huggingface_hub releases in favor of HfApi.upload_folder.
        repo = Repository(local_dir="./results", clone_from=repo_id)

        for epoch in range(int(num_epochs)):
            # Each call runs exactly one epoch (num_train_epochs=1); the
            # optimizer and LR schedule restart on every call
            trainer.train()
            progress = (epoch + 1) / num_epochs
            progress_bar.progress(progress)

            # Save the model after each epoch locally
            model_path = f"./results/model_epoch_{epoch+1}"
            trainer.save_model(model_path)
            st.write(f"Model saved locally: {model_path}")

            # Push to Hugging Face Hub
            repo.push_to_hub(commit_message=f"Model after epoch {epoch+1}")
            st.write(f"Model pushed to Hugging Face Hub: {repo_id}")

        st.write("Training complete. Model is available on the Hugging Face Hub.")
if __name__ == "__main__":
    main()
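# Run locally with: streamlit run app.py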