nileshhanotia committed
Commit de7d627
1 Parent(s): 0a83766

Update app.py

Files changed (1)
  1. app.py +40 -57
app.py CHANGED
@@ -1,73 +1,56 @@
-import streamlit as st
 import os
+import streamlit as st
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
-import pandas as pd
-from io import StringIO
 
+# Load training data
+@st.cache
+def load_data():
+    return load_dataset('json', data_files='training_data.json')
+
+dataset = load_data()
+
+# Initialize the tokenizer and model
+model_name = 'mistral/Mixtral-8x7B'  # Replace with the correct model name
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+# Define a preprocessing function
 def preprocess_function(examples):
-    if 'prompt' not in examples:
-        raise ValueError("Key 'prompt' not found in examples. Please check your dataset fields.")
     return tokenizer(examples['prompt'], truncation=True, padding="max_length", max_length=128)
 
-def train_model(training_data):
-    # Load the dataset
-    dataset = load_dataset('json', data_files={'train': training_data})
-
-    # Initialize the tokenizer and model
-    model_name = 'mistral/Mixtral-8x7B'  # Replace with the correct model name
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-
-    # Tokenize the dataset
-    tokenized_dataset = dataset['train'].map(preprocess_function, batched=True)
+# Tokenize the dataset
+tokenized_dataset = dataset['train'].map(preprocess_function, batched=True)
 
-    # Define training arguments
-    training_args = TrainingArguments(
-        output_dir='./results',  # Output directory
-        evaluation_strategy='epoch',  # Evaluation strategy
-        learning_rate=2e-5,  # Learning rate
-        per_device_train_batch_size=4,  # Batch size for training
-        per_device_eval_batch_size=4,  # Batch size for evaluation
-        num_train_epochs=3,  # Number of training epochs
-        weight_decay=0.01,  # Strength of weight decay
-        logging_dir='./logs',  # Directory for storing logs
-        logging_steps=10,  # Log every 10 steps
-    )
-
-    # Initialize the Trainer
+# Define training arguments
+training_args = TrainingArguments(
+    output_dir='./results',  # Output directory
+    evaluation_strategy='epoch',  # Evaluation strategy
+    learning_rate=2e-5,  # Learning rate
+    per_device_train_batch_size=4,  # Batch size for training
+    per_device_eval_batch_size=4,  # Batch size for evaluation
+    num_train_epochs=3,  # Number of training epochs
+    weight_decay=0.01,  # Strength of weight decay
+    logging_dir='./logs',  # Directory for storing logs
+    logging_steps=10,  # Log every 10 steps
+)
+
+# Define the training function
+def train_model():
     trainer = Trainer(
         model=model,  # The model to train
         args=training_args,  # Training arguments
         train_dataset=tokenized_dataset,  # Training dataset
     )
-
-    # Start training
     trainer.train()
 
-def main():
-    st.title("Model Training with Streamlit")
-
-    st.write("Upload your training data in JSON format:")
-    uploaded_file = st.file_uploader("Choose a file", type="json")
-
-    if uploaded_file is not None:
-        st.write("File uploaded successfully!")
-
-        # Read the file into a pandas DataFrame
-        file_contents = uploaded_file.read().decode("utf-8")
-        st.write("Preview of uploaded data:")
-        st.text(file_contents[:1000])  # Display first 1000 characters for preview
-
-        # Save the file to a temporary location
-        temp_file_path = 'training_data.json'
-        with open(temp_file_path, 'w') as f:
-            f.write(file_contents)
-
-        # Call the train_model function
-        st.write("Training the model...")
-        train_model(temp_file_path)
-        st.write("Training completed!")
+# Streamlit UI
+st.title("Fine-Tuning a Language Model")
+
+if st.button('Start Training'):
+    with st.spinner('Training in progress...'):
+        train_model()
+    st.success('Training completed!')
 
-if __name__ == "__main__":
-    main()
+# Display some example outputs (optional)
+st.write("Example training data:", dataset['train'].select(range(5)))