Upload 02-Preprocessing.py

02-Preprocessing.py (new file, +102 lines)
import re  # For regular expressions
from nltk.stem import PorterStemmer  # For stemming
from nltk.corpus import stopwords  # For stopword removal
from typing import List, Tuple  # For type hinting
import string  # For string operations
import nltk  # For downloading stopwords

# Ensure the stopwords resource is downloaded
nltk.download('stopwords')
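# Note: nltk.download() skips the fetch when the corpus is already present;
# passing quiet=True suppresses its status output on repeated runs.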

class Preprocessing:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))  # Initialize a set of stopwords
        self.stemmer = PorterStemmer()  # Initialize the PorterStemmer for stemming

    def check_special_char(self, ch: str) -> bool:
        """
        Checks if a character is a special character or a digit.
        Returns True if it is, otherwise False.
        """
        return ch in string.punctuation or ch.isdigit()

    def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Removes special characters and digits from the text.
        Replaces them with a space to preserve word boundaries.
        """
        sub, mes = text
        sub = ''.join([' ' if self.check_special_char(c) else c for c in sub])
        mes = ''.join([' ' if self.check_special_char(c) else c for c in mes])
        return sub, mes

    def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Converts all characters in the text to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()

    def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
        """
        Splits the text into individual words (tokens) based on spaces.
        """
        sub, mes = text
        return sub.split(), mes.split()

    def check_stop_words(self, word: str) -> bool:
        """
        Checks if a word is a stopword.
        """
        return word in self.stop_words

    def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
        """
        Removes stopwords from the tokenized text.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
        mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
        return sub_tokens, mes_tokens

    def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
        """
        Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates by returning a unique list of stems.
        """
        sub_tokens, mes_tokens = tokens
        unique_stems = set()

        # Stem tokens from both subject and message
        for word in sub_tokens + mes_tokens:
            unique_stems.add(self.stemmer.stem(word))

        return list(unique_stems)

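# Convenience sketch: the five steps can be chained into one call. The name
# `preprocess_pair` is a hypothetical helper (not part of the class above);
# it applies the same sequence the example under __main__ walks through.
def preprocess_pair(text: Tuple[str, str]) -> List[str]:
    """Run the full pipeline on a (subject, message) pair and return unique stems."""
    p = Preprocessing()
    text = p.remove_special_char(text)        # Strip punctuation and digits
    text = p.lowercase_conversion(text)       # Normalize case
    tokens = p.tokenize(text)                 # Split on whitespace
    tokens = p.removal_of_stop_words(tokens)  # Drop stopwords
    return p.stem_words(tokens)               # Stem and de-duplicate
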
# Example Usage
if __name__ == "__main__":
    # Example input: (subject, message)
    text = ("HELLO!!! This is an example subject 123.", "This is an example message with special chars!! @@#$")

    preprocessor = Preprocessing()

    # Remove special characters
    text = preprocessor.remove_special_char(text)
    print("After removing special characters:", text)

    # Convert to lowercase
    text = preprocessor.lowercase_conversion(text)
    print("After converting to lowercase:", text)

    # Tokenize
    tokens = preprocessor.tokenize(text)
    print("After tokenizing:", tokens)

    # Remove stopwords
    tokens = preprocessor.removal_of_stop_words(tokens)
    print("After removing stopwords:", tokens)

    # Stem words
    stems = preprocessor.stem_words(tokens)
    print("After stemming:", stems)
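    # For the sample input, the final line should contain these stems (order
    # may vary, since they are collected in an unordered set):
    # ['hello', 'exampl', 'subject', 'messag', 'special', 'char']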