# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW
## Create embeddings of the email dataset and store in a chromadb database
"""
!pip install chromadb
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split
# Loading email.csv dataset
emails = pd.read_csv('emails.csv')
print(emails.head())
# What a message looks like
print(emails['message'][0])
# Getting the content of the emails and saving to a list
content_text = []
for item in emails.message:
    text = email.message_from_string(item)
    message = text.get_payload()
    cleaned_message = message.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
    content_text.append(cleaned_message)
# Checking content of emails (first 5 items)
print(content_text[:5])
# Taking a 1% sample of the dataset (the full corpus is too large to embed)
train, test = train_test_split(content_text, train_size=0.01)
print(train[:5])
print(len(train))
# Setting up ids for ChromaDB collections
ids = []
for i in range(len(train)):
    doc_id = 'id' + str(i + 1)
    ids.append(doc_id)
# Creating collection
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
collection.add(
    documents=train,
    ids=ids
)
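# Optional sanity check (not part of the original notebook flow): query the
# collection to confirm the documents were embedded and can be retrieved by
# similarity. The query text below is an arbitrary example.
results = collection.query(query_texts=["project meeting schedule"], n_results=3)
print(results["documents"])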
"""## Fine-tune a Language Model on the Dataset"""
!pip install transformers[torch] accelerate -U
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# GPT-2 has no padding token by default, so add one and resize the model's
# token embeddings to match the enlarged vocabulary
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
# Save the training emails to a plain-text file, one message per line.
# TextDataset tokenizes the raw text itself, so the file should contain the
# email text rather than pre-computed token IDs.
with open('train_emails.txt', 'w') as f:
    for message in train:
        f.write(message + '\n')
# Initialize TextDataset with the file path
dataset = TextDataset(tokenizer=tokenizer, file_path='train_emails.txt', block_size=128)
# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
# Fine-tune the model
trainer.train()
# Save the fine-tuned model
model.save_pretrained("/fine_tuned_model")
tokenizer.save_pretrained("/fine_tuned_model")
"""## Create a Gradio Interface"""
!pip install gradio
import gradio as gr
model_dir = "/fine_tuned_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)
# Load all documents from the ChromaDB collection to pass as context
documents = collection.get(include=["documents"])["documents"]
# Define function to answer questions using the fine-tuned model and the ChromaDB documents
def answer_question(question):
    # Concatenate the stored email texts to create the context
    context = " ".join(documents)
    # Combine the question and context into a single prompt
    input_text = f"Question: {question} Context: {context} Answer:"
    # Truncate the prompt so the prompt plus 50 generated tokens stays within
    # GPT-2's 1024-token context window
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=974)
    generated = model.generate(input_ids, max_new_tokens=50, num_return_sequences=1)
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    return answer
gr.Interface(fn=answer_question, inputs="text", outputs="text").launch()
"""## Deploy the Gradio Interface in a Huggingface Space"""