KarthikaRajagopal committed
Commit e4eb02a · verified · 1 Parent(s): 35e2d85

Upload 02-Preprocessing.py

Files changed (1):
  02-Preprocessing.py +102 -0
02-Preprocessing.py ADDED
from nltk.stem import PorterStemmer  # For stemming
from nltk.corpus import stopwords  # For stopword removal
from typing import List, Tuple  # For type hinting
import string  # For string operations
import nltk  # For downloading the stopwords corpus

# Ensure the stopwords resource is downloaded
nltk.download('stopwords')

class Preprocessing:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))  # Initialize a set of stopwords
        self.stemmer = PorterStemmer()  # Initialize the PorterStemmer for stemming

    def check_special_char(self, ch: str) -> bool:
        """
        Checks if a character is a punctuation mark or a digit.
        Returns True if it is, otherwise False.
        """
        return ch in string.punctuation or ch.isdigit()

    def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Removes special characters and digits from the text.
        Replaces them with a space to preserve word boundaries.
        """
        sub, mes = text
        sub = ''.join([' ' if self.check_special_char(c) else c for c in sub])
        mes = ''.join([' ' if self.check_special_char(c) else c for c in mes])
        return sub, mes

    def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Converts all characters in the text to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()

    def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
        """
        Splits the text into individual words (tokens) on whitespace.
        """
        sub, mes = text
        return sub.split(), mes.split()

    def check_stop_words(self, word: str) -> bool:
        """
        Checks if a word is a stopword.
        """
        return word in self.stop_words

    def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
        """
        Removes stopwords from the tokenized text.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
        mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
        return sub_tokens, mes_tokens

    def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
        """
        Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates by returning a list of unique stems
        (order is arbitrary, since the stems come from a set).
        """
        sub_tokens, mes_tokens = tokens
        unique_stems = set()

        # Stem tokens from both subject and message
        for word in sub_tokens + mes_tokens:
            unique_stems.add(self.stemmer.stem(word))

        return list(unique_stems)

# Example Usage
if __name__ == "__main__":
    # Example input: (subject, message)
    text = ("HELLO!!! This is an example subject 123.", "This is an example message with special chars!! @@#$")

    preprocessor = Preprocessing()

    # Remove special characters and digits
    text = preprocessor.remove_special_char(text)
    print("After removing special characters:", text)

    # Convert to lowercase
    text = preprocessor.lowercase_conversion(text)
    print("After converting to lowercase:", text)

    # Tokenize
    tokens = preprocessor.tokenize(text)
    print("After tokenizing:", tokens)

    # Remove stopwords
    tokens = preprocessor.removal_of_stop_words(tokens)
    print("After removing stopwords:", tokens)

    # Stem words
    stems = preprocessor.stem_words(tokens)
    print("After stemming:", stems)