# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW
## Create embeddings of the email dataset and store in a chromadb database
"""
# !pip install chromadb  # Colab-style install; in a Hugging Face Space, list chromadb in requirements.txt instead
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split
# Loading the emails.csv dataset
emails = pd.read_csv('emails.csv')
print(emails.head())
# What a message looks like
print(emails['message'][0])
# Getting the content of the emails and saving to a list
content_text = []
for item in emails.message:
    text = email.message_from_string(item)
    message = text.get_payload()
    cleaned_message = message.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
    content_text.append(cleaned_message)
# Checking content of emails (first 5 items)
print(content_text[:5])
# Taking a sample of the dataset
train, test = train_test_split(content_text, train_size = 0.01) # Dataset is too large to complete embedding step
print(train[:5])
print(len(train))
# Setting up ids for ChromaDB collections
ids = ['id' + str(i + 1) for i in range(len(train))]
# Creating collection
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
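# Note (assumption, not in the original notebook): because no embedding_function is passed,
# create_collection uses ChromaDB's default Sentence Transformers MiniLM embedding model.
# The embedding_functions import above can make this explicit and lets you inspect the embeddings:
default_ef = embedding_functions.DefaultEmbeddingFunction()
print(len(default_ef(["sample email text"])[0]))  # embedding dimensionality (384 for the default MiniLM model)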
collection.add(
    documents=train,
    ids=ids
)
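# Optional sanity check (not in the original notebook): query the collection to confirm
# the stored embeddings support semantic retrieval. The query text is an illustrative placeholder.
results = collection.query(query_texts=["meeting schedule"], n_results=3)
print(results["documents"][0])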
"""## Fine-tune a Language Model on the Dataset"""
# !pip install transformers[torch] accelerate -U  # Colab-style install; in a Space, add these to requirements.txt
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Add a padding token to the GPT-2 tokenizer and resize the model embeddings to match
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
# Save the training text to a file for TextDataset
# (TextDataset tokenizes the file contents itself, so the file should contain the
# raw email text rather than already-encoded token IDs)
with open('train_emails.txt', 'w') as f:
    for message in train:
        f.write(message + '\n')
# Initialize TextDataset with the file path
dataset = TextDataset(tokenizer=tokenizer, file_path='train_emails.txt', block_size=128)
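# Optional check (not in the original notebook): confirm the dataset produced
# fixed-size blocks of token IDs that the Trainer can consume.
print(f"Training blocks: {len(dataset)}")
print(dataset[0][:10])  # first 10 token IDs of the first block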
# Define data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
# Fine-tune the model
trainer.train()
# Save the fine-tuned model and tokenizer (relative path; the filesystem root may not be writable)
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
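# Optional smoke test (not part of the original flow; the prompt is an illustrative placeholder):
# confirm the fine-tuned model still generates text after training.
sample_ids = tokenizer.encode("Please review the attached", return_tensors="pt")
sample_output = model.generate(sample_ids, max_new_tokens=20)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))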
"""## Create a Gradio Interface"""
# !pip install gradio  # Colab-style install; in a Space, add gradio to requirements.txt
import gradio as gr
model_dir = "./fine_tuned_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)
# Load the documents stored in the ChromaDB collection to pass as context
documents = collection.get(include=["documents"])["documents"]
# Define function to answer questions using the fine-tuned model and ChromaDB collection
def answer_question(question):
    # Concatenate document contents to create context
    context = " ".join(documents)
    # Append question to the context
    input_text = f"Question: {question} Context: {context} Answer:"
    # Generate answer using the model; truncate the prompt so that prompt plus answer
    # fit within GPT-2's 1024-token context window
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=974)
    generated = model.generate(input_ids, max_new_tokens=50, num_return_sequences=1)
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    return answer
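# Quick local check before launching the interface (the question below is a hypothetical placeholder):
print(answer_question("What time is the meeting?"))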
gr.Interface(fn=answer_question, inputs="text", outputs="text").launch()
"""## Deploy the Gradio Interface in a Huggingface Space"""