import string
from typing import List, Tuple

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the stopword corpus once; this is a no-op if it is already present.
nltk.download('stopwords', quiet=True)

class Preprocessing:

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
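        # stop_words is a set so check_stop_words() is an O(1) membership test.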

    def check_special_char(self, ch: str) -> bool:
        """
        Checks whether a character is a special character (punctuation) or a digit.
        Returns True if it is, otherwise False.
        """
        return ch in string.punctuation or ch.isdigit()
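
    # Example behaviour: check_special_char('!') -> True,
    # check_special_char('7') -> True, check_special_char('a') -> False.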

    def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Removes special characters and digits from the (subject, message) pair,
        replacing each with a space to preserve word boundaries.
        """
        sub, mes = text
        sub = ''.join(' ' if self.check_special_char(c) else c for c in sub)
        mes = ''.join(' ' if self.check_special_char(c) else c for c in mes)
        return sub, mes
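
    # An equivalent single-pass alternative (an illustrative sketch, not part
    # of this class) would be a regex substitution such as
    #   re.sub(r"[^A-Za-z\s]", " ", sub)
    # which likewise replaces punctuation and digits with spaces.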

    def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Converts all characters in the subject and message to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()

    def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
        """
        Splits the subject and message into individual word tokens on whitespace.
        """
        sub, mes = text
        return sub.split(), mes.split()
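
    # str.split() with no arguments also collapses the runs of spaces left
    # behind by remove_special_char(), so no empty tokens are produced.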

    def check_stop_words(self, word: str) -> bool:
        """
        Checks whether a word is a stopword.
        """
        return word in self.stop_words

    def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
        """
        Removes stopwords from the tokenized subject and message.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
        mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
        return sub_tokens, mes_tokens
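
    # NLTK's English stopword list includes words such as "this", "is", "an",
    # and "with", so those drop out of the example text in __main__ below.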

    def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
        """
        Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates and returns the unique stems in sorted order,
        which keeps the output deterministic.
        """
        sub_tokens, mes_tokens = tokens
        unique_stems = {self.stemmer.stem(word) for word in sub_tokens + mes_tokens}
        # Sorting gives a reproducible order; iterating a raw set would not.
        return sorted(unique_stems)
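
    # Hypothetical convenience wrapper (not part of the original API): chains
    # the five steps demonstrated in the __main__ block below into one call.
    def preprocess(self, text: Tuple[str, str]) -> List[str]:
        """
        Runs the full pipeline: special-character removal, lowercasing,
        tokenization, stopword removal, and stemming.
        """
        text = self.remove_special_char(text)
        text = self.lowercase_conversion(text)
        tokens = self.tokenize(text)
        tokens = self.removal_of_stop_words(tokens)
        return self.stem_words(tokens)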


if __name__ == "__main__":
    text = ("HELLO!!! This is an example subject 123.",
            "This is an example message with special chars!! @@#$")

    preprocessor = Preprocessing()

    text = preprocessor.remove_special_char(text)
    print("After removing special characters:", text)

    text = preprocessor.lowercase_conversion(text)
    print("After converting to lowercase:", text)

    tokens = preprocessor.tokenize(text)
    print("After tokenizing:", tokens)

    tokens = preprocessor.removal_of_stop_words(tokens)
    print("After removing stopwords:", tokens)

    stems = preprocessor.stem_words(tokens)
    print("After stemming:", stems)