|
|
|
import csv
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
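# NOTE: the stopword list is read from the local NLTK data directory.
# If it has not been downloaded yet, a one-off setup step such as the
# following is needed (run once, outside this script):
#
#   import nltk
#   nltk.download('stopwords')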
|
|
|
class Preprocessing:

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
|
|
|
    def remove_special_char(self, text: tuple[str, str]) -> tuple[str, str]:
        """
        Replaces punctuation and other special characters with spaces.
        Word characters (letters, digits, underscore) and whitespace are kept.
        """
        sub, mes = text
        sub = re.sub(r'[^\w\s]', ' ', sub)
        mes = re.sub(r'[^\w\s]', ' ', mes)
        return sub, mes
|
|
|
    def lowercase_conversion(self, text: tuple[str, str]) -> tuple[str, str]:
        """
        Converts all characters in the text to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()
|
|
|
    def tokenize(self, text: tuple[str, str]) -> tuple[list[str], list[str]]:
        """
        Splits the text into individual words (tokens) on whitespace.
        """
        sub, mes = text
        return sub.split(), mes.split()
|
|
|
    def removal_of_stop_words(self, tokens: tuple[list[str], list[str]]) -> tuple[list[str], list[str]]:
        """
        Removes English stopwords from the tokenized text.
        Note: the NLTK stopword list is all lowercase, so lowercase_conversion
        must run before this step for the membership check to match.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if word not in self.stop_words]
        mes_tokens = [word for word in mes_tokens if word not in self.stop_words]
        return sub_tokens, mes_tokens
|
|
|
    def stem_words(self, tokens: tuple[list[str], list[str]]) -> list[str]:
        """
        Stems each word, merging subject and message tokens.
        Removes duplicates by returning a unique list of stems.
        """
        sub_tokens, mes_tokens = tokens
        return list({self.stemmer.stem(word) for word in sub_tokens + mes_tokens})
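

# Illustrative sketch of the full pipeline on a single (subject, message)
# pair; the sample strings below are made up for illustration only:
#
#   pre = Preprocessing()
#   text = ("Win a FREE prize!!!", "Claim your $100 reward today.")
#   text = pre.remove_special_char(text)
#   text = pre.lowercase_conversion(text)
#   tokens = pre.tokenize(text)
#   tokens = pre.removal_of_stop_words(tokens)
#   stems = pre.stem_words(tokens)
#   # -> a de-duplicated list of stems such as 'win', 'free', 'prize',
#   #    'claim', '100', 'reward', 'today' (order is not guaranteed,
#   #    because a set is used internally)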
|
|
|
|
|
|
|
if __name__ == "__main__":

    preprocessor = Preprocessing()

    # Collects every unique stemmed word seen across the whole dataset.
    unique_words = set()

    with open("Final_Dataset.csv", "r", encoding="utf-8") as infile:
        csv_reader = csv.reader(infile)
        next(csv_reader)  # skip the header row

        for i, row in enumerate(csv_reader):
            # The first two columns are assumed to hold the subject and
            # the message body, in that order.
            subject = row[0]
            message = row[1]

            text = (subject, message)
            text = preprocessor.remove_special_char(text)
            text = preprocessor.lowercase_conversion(text)
            tokens = preprocessor.tokenize(text)
            filtered_tokens = preprocessor.removal_of_stop_words(tokens)
            stemmed_tokens = preprocessor.stem_words(filtered_tokens)

            unique_words.update(stemmed_tokens)

            print(f"Processed row {i + 1}")

with open("processed_data.txt", "w", encoding="utf-8") as outfile: |
|
outfile.write(" ".join(unique_words)) |
|
|
|
print("Unique words have been saved to uniquewords.txt.") |
|
|