nileshhanotia committed
Commit de7d627
1 Parent(s): 0a83766

Update app.py

Files changed (1)
  1. app.py +40 -57
app.py CHANGED
@@ -1,73 +1,56 @@
-import streamlit as st
 import os
+import streamlit as st
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
-import pandas as pd
-from io import StringIO
 
+# Load training data
+@st.cache
+def load_data():
+    return load_dataset('json', data_files='training_data.json')
+
+dataset = load_data()
+
+# Initialize the tokenizer and model
+model_name = 'mistral/Mixtral-8x7B'  # Replace with the correct model name
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+# Define a preprocessing function
 def preprocess_function(examples):
-    if 'prompt' not in examples:
-        raise ValueError("Key 'prompt' not found in examples. Please check your dataset fields.")
     return tokenizer(examples['prompt'], truncation=True, padding="max_length", max_length=128)
 
-def train_model(training_data):
-    # Load the dataset
-    dataset = load_dataset('json', data_files={'train': training_data})
-
-    # Initialize the tokenizer and model
-    model_name = 'mistral/Mixtral-8x7B'  # Replace with the correct model name
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-
-    # Tokenize the dataset
-    tokenized_dataset = dataset['train'].map(preprocess_function, batched=True)
+# Tokenize the dataset
+tokenized_dataset = dataset['train'].map(preprocess_function, batched=True)
 
-    # Define training arguments
-    training_args = TrainingArguments(
-        output_dir='./results',  # Output directory
-        evaluation_strategy='epoch',  # Evaluation strategy
-        learning_rate=2e-5,  # Learning rate
-        per_device_train_batch_size=4,  # Batch size for training
-        per_device_eval_batch_size=4,  # Batch size for evaluation
-        num_train_epochs=3,  # Number of training epochs
-        weight_decay=0.01,  # Strength of weight decay
-        logging_dir='./logs',  # Directory for storing logs
-        logging_steps=10,  # Log every 10 steps
-    )
-
-    # Initialize the Trainer
+# Define training arguments
+training_args = TrainingArguments(
+    output_dir='./results',  # Output directory
+    evaluation_strategy='epoch',  # Evaluation strategy
+    learning_rate=2e-5,  # Learning rate
+    per_device_train_batch_size=4,  # Batch size for training
+    per_device_eval_batch_size=4,  # Batch size for evaluation
+    num_train_epochs=3,  # Number of training epochs
+    weight_decay=0.01,  # Strength of weight decay
+    logging_dir='./logs',  # Directory for storing logs
+    logging_steps=10,  # Log every 10 steps
+)
+
+# Define the training function
+def train_model():
     trainer = Trainer(
         model=model,  # The model to train
         args=training_args,  # Training arguments
         train_dataset=tokenized_dataset,  # Training dataset
     )
-
-    # Start training
     trainer.train()
 
-def main():
-    st.title("Model Training with Streamlit")
-
-    st.write("Upload your training data in JSON format:")
-    uploaded_file = st.file_uploader("Choose a file", type="json")
-
-    if uploaded_file is not None:
-        st.write("File uploaded successfully!")
-
-        # Read the file into a pandas DataFrame
-        file_contents = uploaded_file.read().decode("utf-8")
-        st.write("Preview of uploaded data:")
-        st.text(file_contents[:1000])  # Display first 1000 characters for preview
-
-        # Save the file to a temporary location
-        temp_file_path = 'training_data.json'
-        with open(temp_file_path, 'w') as f:
-            f.write(file_contents)
-
-        # Call the train_model function
-        st.write("Training the model...")
-        train_model(temp_file_path)
-        st.write("Training completed!")
+# Streamlit UI
+st.title("Fine-Tuning a Language Model")
+
+if st.button('Start Training'):
+    with st.spinner('Training in progress...'):
+        train_model()
+    st.success('Training completed!')
 
-if __name__ == "__main__":
-    main()
+# Display some example outputs (optional)
+st.write("Example training data:", dataset['train'].select(range(5)))