Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*- | |
"""Copy of assessment3_Elina_Hemink.ipynb | |
Automatically generated by Colaboratory. | |
Original file is located at | |
https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW | |
## Create embeddings of the email dataset and store them in a ChromaDB database
""" | |
# `!pip install chromadb` is IPython shell magic and a syntax error when this
# notebook export is run as a plain Python script, so the same install is done
# through pip's module entry point. For a deployed app, declaring the package
# in requirements.txt is the better home for this dependency.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "chromadb"])
import chromadb | |
from chromadb.utils import embedding_functions | |
import pandas as pd | |
import email | |
from sklearn.model_selection import train_test_split | |
# Read the e-mail dump (emails.csv) into a DataFrame.
emails = pd.read_csv('emails.csv')

# Preview the first rows of the table.
print(emails.head())

# Show one complete raw message to see what needs parsing.
print(emails.message[0])
def _extract_body(raw_message):
    """Return the text body of one raw e-mail, flattened onto a single line.

    Args:
        raw_message: Complete message source (headers plus body) as a string.

    Returns:
        The payload with line breaks and the "> >>> > >" quote marker removed.
    """
    parsed = email.message_from_string(raw_message)
    if parsed.is_multipart():
        # For multipart mail get_payload() returns a list of sub-messages
        # rather than a string, so collect the text parts instead of calling
        # str.replace on a list.
        body = "".join(
            part.get_payload()
            for part in parsed.walk()
            if not part.is_multipart()
            and part.get_content_maintype() == "text"
        )
    else:
        body = parsed.get_payload()
    # NOTE(review): line breaks are removed without inserting a separator, so
    # the last word of one line is fused with the first word of the next.
    # Kept as in the original; confirm this is intended.
    return body.replace("\n", "").replace("\r", "").replace("> >>> > >", "")


# Getting the content of the emails and saving to a list
content_text = [_extract_body(item) for item in emails.message]

# Checking content of emails (first 5 items)
print(content_text[:5])
# The full corpus is too large to get through the embedding step, so keep a
# random 1 % of the messages as `train`; everything else lands in `test`.
train, test = train_test_split(content_text, train_size=0.01)

# Show the first five sampled messages, then the size of the sample.
print(train[:5], len(train), sep="\n")
# Setting up ids for ChromaDB collections: one string per sampled document,
# numbered from 1 ('id1', 'id2', ...). Built with a comprehension so the
# builtin `id` is no longer shadowed by a module-level variable.
ids = [f"id{position}" for position in range(1, len(train) + 1)]
# Create the "Enron_emails" collection on a default ChromaDB client.
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")

# Store the sampled messages under their generated ids. No embeddings are
# passed here, so computing them is left to the collection.
collection.add(ids=ids, documents=train)
"""## Fine-tune a Language Model on the Dataset""" | |
# `!pip install transformers[torch] accelerate -U` is IPython shell magic and a
# syntax error when this notebook export is run as a plain Python script, so
# the same upgrade is done through pip's module entry point. For a deployed
# app, declaring the packages in requirements.txt is the better home for them.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-U", "transformers[torch]", "accelerate"]
)
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
)

# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 has no padding token. After registering one, grow the embedding matrix
# so the new token id has a row; otherwise the saved tokenizer and the saved
# model disagree on vocabulary size.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# TextDataset reads the file as text and tokenizes it itself. The file must
# therefore hold the raw e-mail text (one message per line); writing token ids
# as digit strings would fine-tune the model on numbers instead of language.
# The file name is kept from the original script.
with open('tokenized_emails.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{message}\n" for message in train)

# Initialize TextDataset with the file path. `train` is a fresh random sample
# on every run, so features cached by a previous run must not be reused.
# NOTE(review): TextDataset is marked deprecated in recent transformers
# releases; confirm it is still available in the installed version.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='tokenized_emails.txt',
    block_size=128,
    overwrite_cache=True,
)

# Define data collator (mlm=False selects causal language modelling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side.
# NOTE(review): this is an absolute path at the filesystem root and needs
# write permission there; it must stay in sync with the directory the model is
# loaded from later in this script.
model.save_pretrained("/fine_tuned_model")
tokenizer.save_pretrained("/fine_tuned_model")
"""## Create a Gradio Interface""" | |
# `!pip install gradio` is IPython shell magic and a syntax error when this
# notebook export is run as a plain Python script, so the same install is done
# through pip's module entry point. For a deployed app, declaring the package
# in requirements.txt is the better home for this dependency.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio"])
import gradio as gr

# Reload the fine-tuned model and tokenizer from the directory they were
# saved to.
model_dir = "/fine_tuned_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

# Load chromadb collection to pass as context.
# The first positional parameter of Collection.get() is `ids`, so
# get(["documents"]) searched for a record whose id is "documents" and came
# back empty. Request the stored texts explicitly instead; the result is a
# dict and the texts are the list under its "documents" key.
documents = collection.get(include=["documents"])
# Define function to answer questions using the fine-tuned model and ChromaDB collection | |
def answer_question(question):
    """Answer a question with the fine-tuned GPT-2 model, using similar e-mails as context.

    The e-mails most similar to the question are retrieved from the ChromaDB
    collection and placed in the prompt
    "Question: ... Context: ... Answer:". Only the text generated after the
    prompt is returned.

    Args:
        question: Free-text question from the Gradio textbox.

    Returns:
        The generated answer as a string; "" when the question is blank.
    """
    import torch  # local import: this file has no module-level torch import

    if not question or not question.strip():
        return ""

    max_new_tokens = 50  # room reserved for the generated answer
    n_results = 3        # number of e-mails retrieved as context

    # Retrieve only the closest e-mails. Joining every stored document would
    # exceed the model's context window many times over.
    hits = collection.query(query_texts=[question], n_results=n_results)
    context = " ".join(hits["documents"][0])

    # Build the prompt at token level so the context can be trimmed to the
    # room left once the question and the answer budget are accounted for.
    head_ids = tokenizer.encode(f"Question: {question} Context:")
    tail_ids = tokenizer.encode(" Answer:")
    prompt_budget = model.config.n_positions - max_new_tokens
    room = prompt_budget - len(head_ids) - len(tail_ids)
    context_ids = tokenizer.encode(f" {context}")[:max(room, 0)]
    prompt_ids = head_ids + context_ids + tail_ids
    # Safety clamp for an over-long question: keep the end of the prompt so
    # the "Answer:" cue survives.
    prompt_ids = prompt_ids[-prompt_budget:]
    input_ids = torch.tensor([prompt_ids])

    # max_new_tokens bounds only the generated part; the original max_length
    # also counted the prompt and left no room to generate.
    generated = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; decode the continuation only.
    new_tokens = generated[0][input_ids.shape[1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return answer.strip()
# Expose answer_question through a text-in / text-out web UI and start it.
demo = gr.Interface(
    fn=answer_question,
    inputs="text",
    outputs="text",
)
demo.launch()
"""## Deploy the Gradio Interface in a Huggingface Space""" | |