# https://www.kaggle.com/datasets/wcukierski/enron-email-dataset
from google.colab import drive
drive.mount('/content/drive')

# libraries
#!pip install transformers --upgrade
#!pip install gradio
#!pip install datasets
#!pip install huggingface-hub
#!pip install chromadb
#!pip install accelerate==0.21.0
#!pip install transformers[torch]
#!pip install git+https://github.com/huggingface/accelerate.git
import pandas as pd
import numpy as np
import torch
import gradio as gr
import chromadb
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    pipeline,
)
file_path = '/content/drive/MyDrive/emails.csv'
df = pd.read_csv(file_path)
df_columns = df.columns
print(df.head(10))
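# A hedged memory-saving variant (the row cap is an assumption, not from the
# original run): the full CSV holds ~500,000 emails, which can exhaust Colab
# RAM, so pandas' nrows parameter can limit how much is loaded.
# df = pd.read_csv(file_path, nrows=50000)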
messages_df = df['message']  # extract the message column
print(messages_df.head())
print(type(messages_df))
# Take a small random sample to work with. (The test_size was repeatedly
# reduced to stop Colab crashing; 0.000008 of ~500,000 emails is only a
# handful of messages, not the 5,000 originally planned.)
emails_train, emails_test = train_test_split(messages_df, test_size=0.000008, random_state=42)
print(emails_test)
print(type(emails_test))

pd.set_option('display.max_colwidth', None)  # show full cell content
print(emails_test.head())  # first 5 rows
print(type(emails_test))
# Embeddings
# Truncate each email to 512 *characters* (a rough guard against exceeding
# BERT's 512-token sequence limit; this is not token-level truncation)
max_seq_length = 512
truncated_emails_test = [email[:max_seq_length] for email in emails_test]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
embeddings_pipeline = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
embeddings = embeddings_pipeline(truncated_emails_test)
print(type(embeddings))
#print(embeddings[:5])  # too large to inspect directly
# To inspect the embeddings, save each one to a separate file...
for i, emb in enumerate(embeddings):
    np.save(f"embedding_{i}.npy", emb)

# ...then load each embedding back from its file
loaded_embeddings = []
for i in range(len(embeddings)):
    emb = np.load(f"embedding_{i}.npy")
    loaded_embeddings.append(emb)

for i, emb in enumerate(loaded_embeddings):
    print(f"Embedding {i}:")
    print(emb)
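# A hedged alternative sketch (not part of the original run): tokenize with
# token-level truncation instead of the character cut above, and mean-pool the
# last hidden state into one 768-dim vector per email. The helper name and the
# pooling choice are assumptions; the pipeline above keeps per-token vectors.
def embed_mean_pooled(texts, batch_size=8):
    vectors = []
    for i in range(0, len(texts), batch_size):
        batch = list(texts[i:i + batch_size])
        enc = tokenizer(batch, padding=True, truncation=True,
                        max_length=512, return_tensors="pt")
        with torch.no_grad():
            out = model(**enc)
        mask = enc["attention_mask"].unsqueeze(-1)          # zero out padding positions
        summed = (out.last_hidden_state * mask).sum(dim=1)  # sum over real tokens
        counts = mask.sum(dim=1)                            # number of real tokens
        vectors.extend((summed / counts).tolist())          # mean per email
    return vectors

# mean_embeddings = embed_mean_pooled(truncated_emails_test)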
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="michelletest")

# Each pipeline output is nested as [1][num_tokens][768]; take the first
# token's ([CLS]) vector as a single fixed-length embedding per email
extracted_embeddings = [embedding[0][0] for embedding in embeddings]

# Add embeddings to the ChromaDB collection
collection.add(
    embeddings=extracted_embeddings[:5],  # the first 5 embeddings
    documents=emails_test.tolist()[:5],   # the first 5 documents
    metadatas=[{"source": "emails_test"} for _ in range(5)],  # metadata per document
    ids=[f"id{i}" for i in range(5)]      # a unique ID per document
)
collection.count()  # check how many entries are in the database

# Retrieve the stored entries from ChromaDB to check that the add worked
collection.get()
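# A minimal retrieval sketch (the query text is an assumed example): embed a
# query with the same BERT pipeline, then ask ChromaDB for the closest emails.
query_text = "meeting schedule"
query_embedding = embeddings_pipeline(query_text)[0][0]  # [CLS] vector, as above
results = collection.query(query_embeddings=[query_embedding], n_results=2)
print(results["documents"])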
# Convert the Series to a DataFrame
emails_test_df = emails_test.to_frame()

# Print the column names of the DataFrame
print(emails_test_df.columns)

# Check message content before fine-tuning the model
print(emails_test_df['message'])
print(emails_test_df['message'].head())

num_entries = emails_test_df.shape[0]
print("Number of entries in emails_test_df:", num_entries)
# Take a second, slightly larger sample (again shrunk repeatedly to keep
# Colab from crashing; 0.00001 of ~500,000 emails is roughly 5 messages)
emails_train, emails_test2 = train_test_split(messages_df, test_size=0.00001, random_state=42)
print(emails_test2)
print(type(emails_test2))

num_entries2 = emails_test2.shape[0]
print("Number of entries in emails_test2:", num_entries2)
# Convert the pandas Series to a list of strings
text_list = emails_test_df['message'].tolist()

# Verify the type and content
print(type(text_list))
print(text_list[:5])  # first 5 entries
print(text_list[2])   # inspect an average email to decide what to clean up
def remove_sections(email):
    """Remove header/metadata lines that are not useful for fine-tuning.

    `email` is a list of lines; any line containing one of the markers
    below (From:, Sent:, To:, Subject:, and similar headers) is dropped.
    """
    sections_to_remove = [
        "----- Original Message -----",
        "From:",
        "Sent:",
        "To:",
        "CC:",
        "Subject:",
        "Message-ID:",
        "Date:",
        "Mime-Version:",
        "Content-Type:",
        "Content-Transfer-Encoding:",
        "X-cc:",
        "X-bcc:",
        "X-Folder:",
        "X-Origin:",
        "X-FileName:",
        "-----Original Message-----"
    ]
    for section in sections_to_remove:
        email = [line for line in email if section not in line]
    return email

# Remove header sections from each email in the list
cleaned_text_list = [remove_sections(email.split("\n")) for email in text_list]

# Print the cleaned emails to check the content looks OK
for cleaned_email in cleaned_text_list:
    print("\n".join(cleaned_email))
    print("=" * 50)  # separator between emails for readability
# Fine-tune a language model
# Define the pre-trained model name (bart-base)
model_name = "facebook/bart-base"

# Load the tokenizer for bart-base
tokenizer = AutoTokenizer.from_pretrained(model_name)

def prepare_data(text_list):
    """Preprocess text data for training the BART model.

    Args:
        text_list: A list of strings containing the text data.

    Returns:
        A Dataset object containing the preprocessed data.
    """
    # Tokenize the text with padding and truncation (BART handles these well)
    inputs = tokenizer(text_list, padding="max_length", truncation=True)
    # Use the input IDs as labels (the model learns to reconstruct its input),
    # masking padding positions with -100 so they are ignored by the loss
    labels = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in ids]
        for ids in inputs["input_ids"]
    ]
    # Create a Dataset object from the preprocessed data
    return Dataset.from_dict({"input_ids": inputs["input_ids"], "labels": labels})
# Prepare the training data from the text list
train_data = prepare_data(text_list)

# Define the fine-tuning model (BART for sequence-to-sequence tasks)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training hyperparameters (adjust as needed)
batch_size = 8
learning_rate = 2e-5
num_epochs = 3

# Define the Trainer object for training management
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",  # output directory for checkpoints etc.
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
    ),
    train_dataset=train_data,
)
# Start the fine-tuning process
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned_bart")
tokenizer.save_pretrained("./fine-tuned_bart")
print("Fine-tuning completed! Model saved in ./fine-tuned_bart")
# A very small input sample was used so that Colab stopped crashing
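# A minimal generation sketch for sanity-checking the saved seq2seq model
# (the prompt is an assumed example, not taken from the training data):
ft_tokenizer = AutoTokenizer.from_pretrained("./fine-tuned_bart")
ft_model = AutoModelForSeq2SeqLM.from_pretrained("./fine-tuned_bart")

prompt = "Please review the attached contract and"
gen_inputs = ft_tokenizer(prompt, return_tensors="pt")
output_ids = ft_model.generate(**gen_inputs, max_new_tokens=40)
print(ft_tokenizer.decode(output_ids[0], skip_special_tokens=True))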
import torch
import gradio as gr
from transformers import BartForQuestionAnswering, BartTokenizer

# Load the fine-tuned BART model with a question-answering head.
# Note: the fine-tuning above trained a seq2seq model, so the span-prediction
# head loaded here is freshly initialised and will give unreliable answers
# until it is fine-tuned on a QA task.
model = BartForQuestionAnswering.from_pretrained("./fine-tuned_bart")
tokenizer = BartTokenizer.from_pretrained("./fine-tuned_bart")

# Function to answer questions by predicting an answer span in the input
def answer_question(question):
    inputs = tokenizer(question, return_tensors="pt", max_length=512, truncation=True)
    input_ids = inputs["input_ids"].tolist()[0]
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(input_ids[answer_start:answer_end])
    return answer
# Create the Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs="text",
    outputs="text",
    title="Question Answering Model",
    description="Enter a question to get the answer."
)

# Launch the interface
iface.launch()
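# In Colab the local server is not directly reachable from the browser;
# Gradio's share flag creates a temporary public link (assumed usage):
# iface.launch(share=True)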