eHemink committed on
Commit 6cfbf56 · verified · 1 Parent(s): a1f0f16

Upload app.py

Files changed (1)
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
+ # -*- coding: utf-8 -*-
+ """Copy of assessment3_Elina_Hemink.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW
+
+ ## Create embeddings of the email dataset and store in a chromadb database
+ """
+
+ !pip install chromadb
+ import chromadb
+ from chromadb.utils import embedding_functions
+ import pandas as pd
+ import email
+ from sklearn.model_selection import train_test_split
+
+ from google.colab import drive
+ drive.mount('/content/drive')
+
+ # Loading the emails.csv dataset
+ emails = pd.read_csv('/content/drive/MyDrive/emails.csv')
+ print(emails.head())
+
+ # What a message looks like
+ print(emails['message'][0])
+
+ # Getting the content of the emails and saving it to a list
+ content_text = []
+ for item in emails.message:
+     text = email.message_from_string(item)
+     message = text.get_payload()
+     cleaned_message = message.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
+     content_text.append(cleaned_message)
+
+ # Checking the content of the emails (first 5 items)
+ print(content_text[:5])
+
+ # Taking a sample of the dataset (the full dataset is too large to embed here)
+ train, test = train_test_split(content_text, train_size=0.01)
+
+ print(train[:5])
+ print(len(train))
+
+ # Setting up ids for the ChromaDB collection
+ ids = []
+ for i in range(len(train)):
+     ids.append('id' + str(i + 1))
+
+ # Creating the collection (uses ChromaDB's default embedding function)
+ client = chromadb.Client()
+ collection = client.create_collection(name="Enron_emails")
+ collection.add(
+     documents=train,
+     ids=ids,
+ )
+
+ """## Fine-tune a Language Model on the Dataset"""
+
+ !pip install transformers[torch] accelerate -U
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+
+ # Load pre-trained GPT2 tokenizer and model
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+ # GPT2 has no padding token by default; add one and resize the embeddings to match
+ tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+ model.resize_token_embeddings(len(tokenizer))
+
+ # Save the sampled email texts to a plain-text file. TextDataset tokenizes the
+ # file contents itself, so it needs the raw text rather than pre-computed token IDs.
+ with open('train_emails.txt', 'w') as f:
+     for message in train:
+         f.write(message + '\n')
+
+ # Initialize TextDataset with the file path
+ dataset = TextDataset(tokenizer=tokenizer, file_path='train_emails.txt', block_size=128)
+
+ # Define data collator for causal (non-masked) language modeling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir='./output',
+     num_train_epochs=3,
+     per_device_train_batch_size=8,
+ )
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=dataset,
+ )
+
+ # Fine-tune the model
+ trainer.train()
+
+ # Save the fine-tuned model and tokenizer
+ model.save_pretrained("/fine_tuned_model")
+ tokenizer.save_pretrained("/fine_tuned_model")
+
+ """## Create a Gradio Interface"""
+
+ !pip install gradio
+ import gradio as gr
+
+ model_dir = "/fine_tuned_model"
+ tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
+ model = GPT2LMHeadModel.from_pretrained(model_dir)
+
+ # Load the stored documents from the ChromaDB collection to pass as context
+ documents = collection.get(include=["documents"])["documents"]
+
+ # Define function to answer questions using the fine-tuned model and ChromaDB collection
+ def answer_question(question):
+     # Concatenate document contents to create context
+     context = " ".join(documents)
+
+     # Prepend the question to the context
+     input_text = f"Question: {question} Context: {context} Answer:"
+
+     # Tokenize, truncating so the prompt plus 50 generated tokens fits GPT2's context window
+     input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=model.config.n_positions - 50)
+
+     # Generate an answer using the model
+     generated = model.generate(input_ids, max_new_tokens=50, num_return_sequences=1)
+
+     # Decode only the newly generated tokens and return the answer
+     answer = tokenizer.decode(generated[0][input_ids.shape[-1]:], skip_special_tokens=True)
+     return answer
+
+ gr.Interface(fn=answer_question, inputs="text", outputs="text").launch()
+
+ """## Deploy the Gradio Interface in a Huggingface Space"""
+
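The heading above closes the committed file without any deployment code. As a rough sketch of the usual Gradio Space layout (an assumption, not part of this commit): a Space runs app.py directly and installs packages from a requirements.txt (here that would mean gradio, transformers, torch, accelerate, chromadb, pandas, scikit-learn), so the Colab-only lines (!pip install, google.colab.drive) would not run there. A hypothetical entry-point shape:

# app.py on the Space (hypothetical sketch; names are illustrative)
import gradio as gr

def answer_question(question):
    # ChromaDB retrieval + GPT-2 generation, as implemented in app.py above
    ...

demo = gr.Interface(fn=answer_question, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()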