# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW

## Create embeddings of the email dataset and store them in a ChromaDB database
"""

!pip install chromadb
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split


# Read the e-mail CSV into a DataFrame and preview its first rows
emails = pd.read_csv("emails.csv")
print(emails.head(5))

# Print the first raw message string to show what a single record looks like
print(emails.message[0])

# Getting the content of the emails and saving to a list
def _extract_body(raw_message):
    """Return the cleaned body text of one raw e-mail string.

    The message is parsed with the standard ``email`` package. For a
    multipart message ``get_payload()`` returns a list of sub-messages rather
    than a string, so the ``text/plain`` leaf parts are joined instead; a
    plain message uses its payload directly. Line breaks and the
    ``"> >>> > >"`` quote marker are then stripped from the text.
    """
    parsed = email.message_from_string(raw_message)
    if parsed.is_multipart():
        body = "".join(
            part.get_payload()
            for part in parsed.walk()
            if not part.is_multipart() and part.get_content_type() == "text/plain"
        )
    else:
        body = parsed.get_payload()
    return body.replace("\n", "").replace("\r", "").replace("> >>> > >", "")


content_text = [_extract_body(raw_message) for raw_message in emails.message]

# Checking content of emails (first 5 items)
print(content_text[:5])

# Taking a sample of the dataset
# The full dataset is too large to complete the embedding step, so only 1% of
# the emails are kept. A fixed random_state makes that sample identical on
# every run, so the ids, the stored collection and the fine-tuning data stay
# consistent between re-executions of the notebook.
train, test = train_test_split(content_text, train_size=0.01, random_state=42)

print(train[:5])
print(len(train))

# Setting up ids for ChromaDB collections
# One id per sampled email: 'id1', 'id2', ... Built with a comprehension so no
# module-level variable named `id` is created (that would hide the builtin
# id() for the rest of the notebook).
ids = [f"id{position}" for position in range(1, len(train) + 1)]

# Creating collection
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once a collection with this name already exists on the client.
collection = client.get_or_create_collection(name="Enron_emails")
# upsert inserts new ids and overwrites existing ones, so running the cell a
# second time refreshes the stored emails instead of clashing on the ids.
collection.upsert(
    documents=train,
    ids=ids,
)

"""## Fine-tune a Language Model on the Dataset"""

!pip install transformers[torch] accelerate -U
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 has no padding token. Reuse the end-of-text token rather than adding a
# new '[PAD]' entry: a new entry grows the tokenizer vocabulary beyond the
# model's embedding matrix unless the embeddings are resized as well.
tokenizer.pad_token = tokenizer.eos_token

# Save the raw email text to a file, one email per line.
# TextDataset tokenizes the file contents itself, so the file has to contain
# the email text. Writing token ids out as numbers would make the model learn
# from strings of digits instead of from the emails.
TRAIN_FILE = 'train_emails.txt'
with open(TRAIN_FILE, 'w', encoding='utf-8') as f:
    f.write('\n'.join(train))
    f.write('\n')

# Initialize TextDataset with the file path.
# overwrite_cache=True re-tokenizes the file on every run instead of silently
# reusing features cached from an earlier version of the file.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=TRAIN_FILE,
    block_size=128,
    overwrite_cache=True,
)

# Define data collator (mlm=False selects causal language modelling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and its tokenizer to the same directory so they
# can be reloaded together
model.save_pretrained("/fine_tuned_model")
tokenizer.save_pretrained("/fine_tuned_model")

"""## Create a Gradio Interface"""

!pip install gradio
import gradio as gr

# Reload the fine-tuned tokenizer and model from the directory they were saved in
model_dir = "/fine_tuned_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

# Load chromadb collection to pass as context
# `include` has to be given by keyword: the first positional parameter of
# Collection.get is `ids`, so collection.get(["documents"]) would look up a
# record whose id is "documents" and come back empty.
# The result is a dict; the stored texts are under its "documents" key.
documents = collection.get(include=["documents"])

# Define function to answer questions using the fine-tuned model and ChromaDB collection
def answer_question(question):
    """Answer a question with the fine-tuned model, using stored emails as context.

    The emails most similar to the question are retrieved from the ChromaDB
    collection and placed in the prompt. The context is clipped so that the
    prompt plus the generated answer fit inside the model's context window.

    Args:
        question: Text entered in the Gradio text input.

    Returns:
        The generated answer text (the prompt itself is not included), or a
        short hint when no question was entered.
    """
    # Local import: torch is not imported at the top of this file.
    import torch

    n_results = 3        # number of emails retrieved as context
    max_new_tokens = 50  # length budget for the generated answer

    if not question or not question.strip():
        return "Please enter a question."

    # Retrieve the emails closest to the question instead of concatenating the
    # whole collection, which could never fit into the prompt.
    hits = collection.query(query_texts=[question], n_results=n_results)
    retrieved = hits.get("documents") or [[]]
    context = " ".join(doc for doc in retrieved[0] if doc)

    # Build the prompt "Question: ... Context: ... Answer:" from token ids so
    # that only the context is shortened when the prompt is too long.
    prefix_ids = tokenizer.encode(f"Question: {question} Context:")
    suffix_ids = tokenizer.encode(" Answer:")
    window = model.config.n_positions - max_new_tokens
    budget = window - len(prefix_ids) - len(suffix_ids)
    if context and budget > 0:
        context_ids = tokenizer.encode(
            " " + context, truncation=True, max_length=budget
        )
    else:
        context_ids = []
    # Last resort for an extremely long question: keep the tail of the prompt
    # so the model input never exceeds the context window.
    prompt_ids = (prefix_ids + context_ids + suffix_ids)[-window:]

    input_ids = torch.tensor([prompt_ids], device=model.device)
    attention_mask = torch.ones_like(input_ids)

    # max_new_tokens limits only the answer; max_length would also count the
    # prompt tokens and leave no room to generate.
    generated = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt and hand them back to
    # Gradio; without a return value the interface shows nothing.
    answer_ids = generated[0][input_ids.shape[1]:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()


# Build a text-in / text-out Gradio app around answer_question and start it
demo = gr.Interface(fn=answer_question, inputs="text", outputs="text")
demo.launch()

"""## Deploy the Gradio Interface in a Huggingface Space"""