# Spam_Email_Detection / 02-Preprocessing.py
import re # For regular expressions
from nltk.stem import PorterStemmer # For stemming
from nltk.corpus import stopwords # For stopword removal
from typing import List, Tuple # For type hinting
import string # For string operations
import nltk # For downloading stopwords
# Ensure the stopwords corpus is available (download only on first run)
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
class Preprocessing:
def __init__(self):
self.stop_words = set(stopwords.words('english')) # Initialize a set of stopwords
self.stemmer = PorterStemmer() # Initialize the PorterStemmer for stemming
def check_special_char(self, ch: str) -> bool:
"""
Checks if a character is a special character or a digit.
Returns True if it is, otherwise False.
"""
return ch in string.punctuation or ch.isdigit()
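    # e.g. check_special_char('!') -> True, check_special_char('7') -> True,
    #      check_special_char('a') -> False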
def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
"""
Removes special characters and digits from the text.
Replaces them with a space to preserve word boundaries.
"""
sub, mes = text
sub = ''.join([' ' if self.check_special_char(c) else c for c in sub])
mes = ''.join([' ' if self.check_special_char(c) else c for c in mes])
return sub, mes
def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
"""
Converts all characters in the text to lowercase.
"""
sub, mes = text
return sub.lower(), mes.lower()
def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
"""
Splits the text into individual words (tokens) based on spaces.
"""
sub, mes = text
return sub.split(), mes.split()
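    # Note: tokenization here is plain whitespace splitting via str.split().
    # A heavier alternative (an assumption, not used in this file) would be
    # nltk.word_tokenize, which additionally requires the 'punkt' resource.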
def check_stop_words(self, word: str) -> bool:
"""
Checks if a word is a stopword.
"""
return word in self.stop_words
def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
"""
Removes stopwords from the tokenized text.
"""
sub_tokens, mes_tokens = tokens
sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
return sub_tokens, mes_tokens
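    # e.g. (["this", "is", "spam"], ["buy", "now"]) -> (["spam"], ["buy"]),
    # since "this", "is", and "now" are all in NLTK's English stopword list.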
def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
"""
Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates by returning a list of unique stems; the order is
        arbitrary because a set is used internally.
"""
sub_tokens, mes_tokens = tokens
unique_stems = set()
# Stem tokens from both subject and message
for word in sub_tokens + mes_tokens:
unique_stems.add(self.stemmer.stem(word))
return list(unique_stems)
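    # e.g. PorterStemmer maps "running" -> "run" and "connection" -> "connect",
    # so morphological variants collapse to a single vocabulary entry.

    # A minimal convenience wrapper chaining the steps above in the order the
    # example below applies them. The name `preprocess` is an assumption, not
    # part of the original pipeline; it is one possible way to bundle it.
    def preprocess(self, text: Tuple[str, str]) -> List[str]:
        """
        Runs the full pipeline: special-character/digit removal, lowercasing,
        tokenization, stopword removal, and stemming. Returns unique stems.
        """
        text = self.remove_special_char(text)
        text = self.lowercase_conversion(text)
        tokens = self.tokenize(text)
        tokens = self.removal_of_stop_words(tokens)
        return self.stem_words(tokens)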
# Example Usage
if __name__ == "__main__":
# Example input: (subject, message)
text = ("HELLO!!! This is an example subject 123.", "This is an example message with special chars!! @@#$")
preprocessor = Preprocessing()
# Remove special characters
text = preprocessor.remove_special_char(text)
print("After removing special characters:", text)
# Convert to lowercase
text = preprocessor.lowercase_conversion(text)
print("After converting to lowercase:", text)
# Tokenize
tokens = preprocessor.tokenize(text)
print("After tokenizing:", tokens)
# Remove stopwords
tokens = preprocessor.removal_of_stop_words(tokens)
print("After removing stopwords:", tokens)
# Stem words
stems = preprocessor.stem_words(tokens)
print("After stemming:", stems)