File size: 3,964 Bytes
6cfbf56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f47c9d2
6cfbf56
 
f47c9d2
6cfbf56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW

## Create embeddings of the email dataset and store in a chromadb database
"""

!pip install chromadb
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split



# Loading email.csv dataset
emails = pd.read_csv('emails.csv')
print(emails.head())

# What a message looks like
print(emails['message'][0])

def _payload_text(msg):
    """Return the full text payload of an email.message.Message.

    BUG FIX: get_payload() returns a *list of sub-messages* for multipart
    emails, which made the original string cleanup crash. Recurse into
    multipart messages and join the parts instead.
    """
    if msg.is_multipart():
        return "".join(_payload_text(part) for part in msg.get_payload())
    return msg.get_payload()

# Getting the content of the emails and saving to a list.
# Strip newlines and the quoted-reply artifact left by the export.
content_text = []
for item in emails.message:
  parsed = email.message_from_string(item)
  message = _payload_text(parsed)
  cleaned_message = message.replace("\n","").replace("\r","").replace("> >>> > >","")
  content_text.append(cleaned_message)

# Checking content of emails (first 5 items)
print(content_text[:5])

# Taking a sample of the dataset
train, test = train_test_split(content_text, train_size = 0.01) # Dataset is too large to complete embedding step

print(train[:5])
print(len(train))

# Setting up ids for ChromaDB collections ('id1', 'id2', ...).
# Rewritten as a comprehension; the original also shadowed the builtin `id`.
ids = ['id' + str(i + 1) for i in range(len(train))]

# Creating collection
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
collection.add(
    documents = train,
    ids = ids
)

"""## Fine-tune a Language Model on the Dataset"""

!pip install transformers[torch] accelerate -U
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize the dataset
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenized_emails = tokenizer(train, truncation=True, padding=True)

# Extract token IDs from BatchEncoding object
token_ids_list = tokenized_emails['input_ids']

# Save token IDs to a text file
with open('tokenized_emails.txt', 'w') as f:
    for token_ids in token_ids_list:
        f.write(' '.join(map(str, token_ids)) + '\n')

# Initialize TextDataset with the file path
dataset = TextDataset(tokenizer=tokenizer, file_path = 'tokenized_emails.txt', block_size=128)

# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("/fine_tuned_model")
tokenizer.save_pretrained("/fine_tuned_model")

"""## Create a Gradio Interface"""

!pip install gradio
import gradio as gr

model_dir= "/fine_tuned_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

# Load chromadb collection to pass as context
documents = collection.get(["documents"])

# Define function to answer questions using the fine-tuned model and ChromaDB collection
def answer_question(question):
    # Concatenate document contents to create context
    context = " ".join(doc["content"] for doc in documents)

    # Append question to the context
    input_text = f"Question: {question} Context: {context} Answer:"

    # Generate answer using the model
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    generated = model.generate(input_ids, max_length=50, num_return_sequences=1)
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)


gr.Interface(fn=answer_question, inputs="text", outputs="text").launch()

"""## Deploy the Gradio Interface in a Huggingface Space"""