# -*- coding: utf-8 -*-
"""09 - TFIDF.py

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1aDPOMUJa_ml-2rRqyxKp24Gd9hlVFDwv
"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile 09-TFIDF.py
# import math
# import pandas as pd
# 
# class TFIDF:
#     def __init__(self):
#         self.raw_data_set = []   # Raw input data (list of documents as lists of words)
#         self.vocab_list = []     # List of unique words (vocabulary)
#         self.data_matrix = []    # Matrix of word counts (Bag of Words representation)
#         self.tfidf_matrix = []   # Matrix of TF-IDF values
#         self.num_terms = []      # Number of terms in each document (for TF calculation)
# 
#     def load_data(self, file_path):
#         """Load the dataset (CSV file with one document per row)."""
#         print("Loading data...")
#         df = pd.read_csv(file_path, header=None)
#         for _, row in df.iterrows():
#             self.raw_data_set.append(row[0].split())  # Split each document into words
#         print(f"Loaded {len(self.raw_data_set)} documents.")
# 
#     def create_vocab_list(self):
#         """Create a list of unique words (vocabulary) from the raw data."""
#         print("Creating vocabulary list...")
#         vocab_set = set()
#         for document in self.raw_data_set:
#             vocab_set.update(document)
#         self.vocab_list = sorted(vocab_set)
#         print(f"Vocabulary size: {len(self.vocab_list)}")
# 
#     def bag_of_words_to_vector(self, document):
#         """Convert a document (list of words) to a word-count vector."""
#         word_vector = [0] * len(self.vocab_list)
#         for word in document:
#             if word in self.vocab_list:
#                 idx = self.vocab_list.index(word)
#                 word_vector[idx] += 1
#         return word_vector
# 
#     def convert_to_matrix(self):
#         """Convert the entire dataset into a Bag of Words matrix."""
#         print("Converting data to Bag of Words matrix...")
#         for document in self.raw_data_set:
#             self.data_matrix.append(self.bag_of_words_to_vector(document))
#             self.num_terms.append(len(document))
#         print(f"Data matrix shape: {len(self.data_matrix)} x {len(self.vocab_list)}")
# 
#     def compute_tfidf(self):
#         """Calculate the TF-IDF matrix."""
#         print("Calculating TF-IDF matrix...")
#         num_docs = len(self.raw_data_set)
#         doc_term_count = [0] * len(self.vocab_list)
# 
#         # Count the number of documents each term appears in (for IDF calculation)
#         for doc_vector in self.data_matrix:
#             for idx, count in enumerate(doc_vector):
#                 if count > 0:
#                     doc_term_count[idx] += 1
# 
#         # Calculate TF-IDF for each document-term pair
#         for i, doc_vector in enumerate(self.data_matrix):
#             tfidf_vector = []
#             for j, count in enumerate(doc_vector):
#                 tf = count / self.num_terms[i]  # Term Frequency
#                 idf = math.log(num_docs / (1 + doc_term_count[j]))  # Inverse Document Frequency (smoothed)
#                 tfidf_vector.append(tf * idf)
#             self.tfidf_matrix.append(tfidf_vector)
#         print(f"TF-IDF matrix calculated with shape: {len(self.tfidf_matrix)} x {len(self.vocab_list)}")
# 
#     def save_tfidf_matrix(self, output_file):
#         """Save the TF-IDF matrix to a CSV file."""
#         print(f"Saving TF-IDF matrix to {output_file}...")
#         df = pd.DataFrame(self.tfidf_matrix, columns=self.vocab_list)
#         df.to_csv(output_file, index=False)
#         print(f"TF-IDF matrix saved to {output_file}.")
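# 
# # A minimal usage sketch (an addition, not part of the original notebook) that wires
# # the steps together so running the script does something end to end. The file names
# # "documents.csv" and "tfidf_matrix.csv" are hypothetical placeholders for a CSV with
# # one document per row and the desired output path.
# if __name__ == "__main__":
#     tfidf = TFIDF()
#     tfidf.load_data("documents.csv")             # hypothetical input file
#     tfidf.create_vocab_list()
#     tfidf.convert_to_matrix()
#     tfidf.compute_tfidf()
#     tfidf.save_tfidf_matrix("tfidf_matrix.csv")  # hypothetical output file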
# 
# # Explanation of the Python code:
# 
# # 1. Imports: We use math for the logarithm in the IDF formula and pandas to read the input CSV and save the resulting matrix.
# 
# # 2. Initialization (__init__): Initializes the necessary attributes: raw_data_set, vocab_list, data_matrix, tfidf_matrix, and num_terms.
# 
# # 3. Loading Data (load_data): Reads a CSV file in which each row contains one document. Each document is split into words and stored in raw_data_set.
# 
# # 4. Vocabulary Creation (create_vocab_list): Extracts the unique words from all documents to build the vocabulary list. A set avoids duplicates, and sorting keeps the column order consistent.
# 
# # 5. Bag of Words Conversion (bag_of_words_to_vector): Converts a document (list of words) into a vector in which each index corresponds to a word in the vocabulary list and the value is that word's frequency in the document.
# 
# # 6. Matrix Conversion (convert_to_matrix): Converts the entire dataset (list of documents) into a Bag of Words matrix and records the number of terms in each document for the TF calculation.
# 
# # 7. TF-IDF Calculation (compute_tfidf): For each document and each word, computes the TF-IDF value from the Term Frequency (TF) and the Inverse Document Frequency (IDF). TF is the word count divided by the total number of terms in the document; IDF is log(total_documents / (1 + documents_containing_the_term)), where the +1 smooths the denominator.
# 
# # 8. Saving the TF-IDF Matrix (save_tfidf_matrix): Saves the TF-IDF matrix to a CSV file for further use or analysis.

# !python /content/09-TFIDF.py
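
# A small worked example of the formulas from step 7 (a sketch with made-up numbers,
# independent of the class above): in a corpus of 3 documents, suppose a term appears
# 2 times in a 10-word document and occurs in 2 of the 3 documents.
import math

tf = 2 / 10                   # term count / total terms in the document = 0.2
idf = math.log(3 / (1 + 2))   # log(num_docs / (1 + docs containing the term)) = log(1) = 0
print(tf * idf)               # 0.0; with this smoothing, widespread terms get weight <= 0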