import os
import json
import random
import subprocess

import streamlit as st
from datasets import Dataset
from huggingface_hub import HfFolder
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Authenticate with the Hugging Face Hub
hf_token = st.secrets["HF_TOKEN"]
HfFolder.save_token(hf_token)

def set_git_config():
    """Set the Git user identity required for pushing to the Hub."""
    try:
        subprocess.run(['git', 'config', '--global', 'user.email', 'nilesh.hanotia@outlook.com'], check=True)
        subprocess.run(['git', 'config', '--global', 'user.name', 'Nilesh'], check=True)
        st.success("Git configuration set successfully.")
    except subprocess.CalledProcessError as e:
        st.error(f"Git configuration error: {str(e)}")

# Configure Git once at the start of the script
set_git_config()

@st.cache_data
def load_data(file_paths):
    """Load and merge examples from one or more intent JSON files.

    Each file is expected to look like:
    {"intents": [{"examples": [{"prompt": "...", "label": 0}, ...]}, ...]}
    """
    combined_data = []
    for file_path in file_paths:
        file_path = file_path.strip()
        if not os.path.exists(file_path):
            st.error(f"File not found: {file_path}")
            return None
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            if 'intents' in data:
                for intent in data['intents']:
                    combined_data.extend(intent['examples'])
            else:
                st.error(f"Invalid format in file: {file_path}")
                return None
        except Exception as e:
            st.error(f"Error loading dataset from {file_path}: {str(e)}")
            return None
    return combined_data

@st.cache_resource
def initialize_model_and_tokenizer(model_name, num_labels):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # GPT-2-style models ship without a pad token; reuse the EOS token so padding works
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id
        return tokenizer, model
    except Exception as e:
        st.error(f"Error initializing model and tokenizer: {str(e)}")
        return None, None

def create_dataset(data, tokenizer, max_length, num_labels):
    texts = [item.get('prompt', '') for item in data]
    labels = [item.get('label', -1) for item in data]

    # Debugging: log labels to the server console to check for invalid values
    print(f"Labels before adjustment: {labels}")

    # Clamp out-of-range labels (including the -1 default for missing ones) to class 0
    labels = [label if 0 <= label < num_labels else 0 for label in labels]
    print(f"Labels after adjustment: {labels}")

    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,
    })

def split_data(data, test_size=0.2):
    if not data:
        raise ValueError("Data is empty, cannot split.")
    data = list(data)  # copy so the cached dataset is not shuffled in place
    random.shuffle(data)
    split_index = int(len(data) * (1 - test_size))
    return data[:split_index], data[split_index:]

def main():
    st.title("Appointment Classification Model Training")

    model_name = st.text_input("Enter model name", "distilgpt2")
    file_paths = st.text_area(
        "Enter paths to training data JSON files (comma-separated)",
        "training_data1.json,training_data2.json",
    ).split(',')
    max_length = st.number_input("Enter max token length", min_value=32, max_value=512, value=128)
    num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
    batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
    learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")
    num_labels = 3  # Three classes: schedule, reschedule, cancel
    repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")

    tokenizer, model = initialize_model_and_tokenizer(model_name, num_labels)
    if tokenizer is None or model is None:
        st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
        return

    st.write("Loading and processing dataset...")
    data = load_data(file_paths)
    if data is None:
        st.warning("Failed to load dataset. Please check the file paths and try again.")
        return

    st.write("Preparing dataset...")
    # Split the data into train and evaluation sets
    try:
        train_data, eval_data = split_data(data)
    except ValueError as e:
        st.error(f"Data splitting error: {str(e)}")
        return

    train_dataset = create_dataset(train_data, tokenizer, max_length, num_labels)
    eval_dataset = create_dataset(eval_data, tokenizer, max_length, num_labels)

    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        push_to_hub=True,
        hub_model_id=repo_id,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    if st.button('Start Training'):
        st.write("Starting training...")
        trainer.train()
        trainer.push_to_hub()
        st.write(f"Training complete. Model is available on the Hugging Face Hub: {repo_id}")

if __name__ == "__main__":
    main()