Spaces:
Sleeping
Sleeping
nileshhanotia
committed on
Commit
•
3caf963
1
Parent(s):
c899f78
Update app.py
Browse files
app.py
CHANGED
@@ -5,16 +5,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingA
|
|
5 |
from transformers import DataCollatorForLanguageModeling
|
6 |
import torch
|
7 |
from huggingface_hub import Repository, HfFolder
|
|
|
8 |
|
9 |
# Authenticate Hugging Face Hub
|
10 |
hf_token = st.secrets["HF_TOKEN"] # Store your token in the Hugging Face Space Secrets
|
11 |
HfFolder.save_token(hf_token)
|
12 |
|
13 |
-
import os
|
14 |
-
|
15 |
# Set Git user identity
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
@st.cache_data
|
20 |
def load_data(file_path):
|
@@ -64,6 +69,16 @@ def create_dataset(data, tokenizer, max_length):
|
|
64 |
})
|
65 |
return inputs
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
def main():
|
68 |
st.title("Model Training with Streamlit")
|
69 |
|
@@ -94,17 +109,6 @@ def main():
|
|
94 |
st.write("Tokenizing dataset...")
|
95 |
tokenized_dataset = create_dataset(data, tokenizer, max_length)
|
96 |
|
97 |
-
# Convert tokenized_dataset to a torch Dataset
|
98 |
-
class SimpleDataset(torch.utils.data.Dataset):
|
99 |
-
def __init__(self, encodings):
|
100 |
-
self.encodings = encodings
|
101 |
-
|
102 |
-
def __getitem__(self, idx):
|
103 |
-
return {key: val[idx] for key, val in self.encodings[idx].items()}
|
104 |
-
|
105 |
-
def __len__(self):
|
106 |
-
return len(self.encodings)
|
107 |
-
|
108 |
dataset = SimpleDataset(tokenized_dataset)
|
109 |
|
110 |
# Define training arguments
|
|
|
5 |
from transformers import DataCollatorForLanguageModeling
|
6 |
import torch
|
7 |
from huggingface_hub import Repository, HfFolder
|
8 |
+
import subprocess
|
9 |
|
10 |
# Authenticate Hugging Face Hub
|
11 |
hf_token = st.secrets["HF_TOKEN"] # Store your token in the Hugging Face Space Secrets
|
12 |
HfFolder.save_token(hf_token)
|
13 |
|
|
|
|
|
14 |
# Set Git user identity
|
15 |
+
def set_git_config():
    """Configure the local git identity (user.email / user.name).

    Must run before any commit/push is made from this Space's working
    copy, otherwise git refuses to commit. Failures are surfaced in the
    Streamlit UI via ``st.error`` instead of crashing the app.
    """
    try:
        subprocess.run(['git', 'config', 'user.email', '[email protected]'], check=True)
        subprocess.run(['git', 'config', 'user.name', 'Nilesh'], check=True)
    except subprocess.CalledProcessError as e:
        st.error(f"Git configuration error: {str(e)}")
    except FileNotFoundError:
        # BUG FIX: subprocess.run raises FileNotFoundError (not
        # CalledProcessError) when the `git` binary itself is absent from
        # the container; without this branch the whole app crashed.
        st.error("Git is not installed or not on PATH; cannot set git identity.")

set_git_config()
|
23 |
|
24 |
@st.cache_data
|
25 |
def load_data(file_path):
|
|
|
69 |
})
|
70 |
return inputs
|
71 |
|
72 |
+
class SimpleDataset(torch.utils.data.Dataset):
    """Thin ``torch.utils.data.Dataset`` over pre-tokenized examples.

    ``encodings`` is assumed to be a sequence in which each element is one
    example's dict of tokenized fields (e.g. ``input_ids`` /
    ``attention_mask``) — consistent with ``__len__`` returning
    ``len(encodings)``. TODO(review): confirm against ``create_dataset``'s
    return shape.
    """

    def __init__(self, encodings):
        # Per-example encoding dicts, kept as-is (no copying).
        self.encodings = encodings

    def __getitem__(self, idx):
        # BUG FIX: the original returned
        #   {key: val[idx] for key, val in self.encodings[idx].items()}
        # which double-indexes — it selected the idx-th example and then
        # sliced each of that example's field values at position idx again,
        # yielding wrong data (or IndexError for short sequences). The
        # idx-th example's dict is already the item to return.
        return self.encodings[idx]

    def __len__(self):
        return len(self.encodings)
|
81 |
+
|
82 |
def main():
|
83 |
st.title("Model Training with Streamlit")
|
84 |
|
|
|
109 |
st.write("Tokenizing dataset...")
|
110 |
tokenized_dataset = create_dataset(data, tokenizer, max_length)
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
dataset = SimpleDataset(tokenized_dataset)
|
113 |
|
114 |
# Define training arguments
|