# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW

## Create embeddings of the email dataset and store in a chromadb database
"""

# Notebook-only shell escape; it is not valid Python in a plain .py file.
# Run it in a notebook cell (or `pip install chromadb` in a shell) first:
#   !pip install chromadb

import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split


def _extract_body(raw_message):
    """Return the cleaned body text of one raw RFC 822 email string.

    Multipart messages are flattened by concatenating the payload of every
    leaf part; calling ``get_payload()`` directly on a multipart message
    returns a list, which has no ``.replace``.
    """
    parsed = email.message_from_string(raw_message)
    body = "".join(
        part.get_payload() for part in parsed.walk() if not part.is_multipart()
    )
    return body.replace("\n", "").replace("\r", "").replace("> >>> > >", "")


# Loading emails.csv dataset
emails = pd.read_csv('emails.csv')
print(emails.head())

# What a message looks like
print(emails['message'][0])

# Getting the content of the emails and saving to a list
content_text = [_extract_body(item) for item in emails.message]

# Checking content of emails (first 5 items)
print(content_text[:5])

# Taking a sample of the dataset
# Dataset is too large to complete embedding step
train, test = train_test_split(content_text, train_size=0.01)
print(train[:5])
print(len(train))

# Setting up ids for ChromaDB collections ('id1', 'id2', ...)
ids = [f"id{position}" for position in range(1, len(train) + 1)]

# Creating collection
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")

# Add in modest batches so a large sample never exceeds the client's
# per-call batch limit.
ADD_BATCH_SIZE = 1000
for start in range(0, len(train), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    collection.add(
        documents=train[start:stop],
        ids=ids[start:stop],
    )

"""## Fine-tune a Language Model on the Dataset"""

# Notebook-only shell escape (see note at the top of the file):
#   !pip install transformers[torch] accelerate -U

from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

MODEL_DIR = "/fine_tuned_model"
TRAIN_FILE = 'train_emails.txt'

# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 has no pad token. Adding one grows the vocabulary, so the model's
# embedding matrix must be resized to match or the new id is out of range.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# TextDataset tokenizes the file's text itself, so the file must contain the
# raw email text. (Writing token ids as decimal strings would make the model
# train on digit sequences.) A distinct file name also avoids picking up a
# stale TextDataset cache created from an id-based file.
with open(TRAIN_FILE, 'w', encoding='utf-8') as f:
    f.writelines(f"{text}\n" for text in train)

# Initialize TextDataset with the file path
dataset = TextDataset(tokenizer=tokenizer, file_path=TRAIN_FILE, block_size=128)

# Define data collator (causal LM, so no masked-language-modeling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

"""## Create a Gradio Interface"""

# Notebook-only shell escape (see note at the top of the file):
#   !pip install gradio

import gradio as gr

model_dir = MODEL_DIR
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

# Number of emails retrieved from ChromaDB as context for each question.
N_CONTEXT_DOCS = 3
# Token budget for the retrieved context inside the prompt.
CONTEXT_TOKEN_BUDGET = 800
# Number of tokens the model may generate for the answer.
MAX_NEW_TOKENS = 50


# Define function to answer questions using the fine-tuned model and ChromaDB collection
def answer_question(question):
    """Answer *question* with the fine-tuned model, using retrieved emails as context.

    Args:
        question: The user's question as plain text.

    Returns:
        The text generated by the model after the prompt (the prompt itself
        is not echoed back).
    """
    # Retrieve the stored emails closest to the question instead of passing
    # the whole collection, which could never fit in the model's context.
    n_results = min(N_CONTEXT_DOCS, collection.count())
    if n_results > 0:
        hits = collection.query(
            query_texts=[question],
            n_results=n_results,
            include=["documents"],
        )
        # One inner list of documents per query text; we sent one query.
        context = " ".join(hits["documents"][0])
    else:
        context = ""

    # Trim the context to a fixed token budget so the question and the
    # trailing "Answer:" cue survive truncation.
    context_ids = tokenizer.encode(
        context, truncation=True, max_length=CONTEXT_TOKEN_BUDGET
    )
    context = tokenizer.decode(context_ids)

    # Append question to the context
    input_text = f"Question: {question} Context: {context} Answer:"

    # Final safety net for very long questions: prompt plus generated tokens
    # must fit in the model's position embeddings.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.n_positions - MAX_NEW_TOKENS,
    )

    # Generate answer using the model. max_new_tokens bounds only the
    # continuation; max_length would also count the (long) prompt.
    generated = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only the tokens produced after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()


gr.Interface(fn=answer_question, inputs="text", outputs="text").launch()

"""## Deploy the Gradio Interface in a Huggingface Space"""