Upload 02-Preprocessing.py

02-Preprocessing.py (new file, +102 lines)
import re  # For regular expressions
from nltk.stem import PorterStemmer  # For stemming
from nltk.corpus import stopwords  # For stopword removal
from typing import List, Tuple  # For type hinting
import string  # For string operations
import nltk  # For downloading stopwords

# Ensure the stopwords resource is downloaded
nltk.download('stopwords')
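# Note: nltk.download() skips the fetch when the corpus is already present;
# passing quiet=True suppresses its status output on repeated runs.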

class Preprocessing:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))  # Initialize a set of stopwords
        self.stemmer = PorterStemmer()  # Initialize the PorterStemmer for stemming

    def check_special_char(self, ch: str) -> bool:
        """
        Checks if a character is a special character or a digit.
        Returns True if it is, otherwise False.
        """
        return ch in string.punctuation or ch.isdigit()

    def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Removes special characters and digits from the text.
        Replaces them with a space to preserve word boundaries.
        """
        sub, mes = text
        sub = ''.join([' ' if self.check_special_char(c) else c for c in sub])
        mes = ''.join([' ' if self.check_special_char(c) else c for c in mes])
        return sub, mes

    def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Converts all characters in the text to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()

    def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
        """
        Splits the text into individual words (tokens) based on spaces.
        """
        sub, mes = text
        return sub.split(), mes.split()

    def check_stop_words(self, word: str) -> bool:
        """
        Checks if a word is a stopword.
        """
        return word in self.stop_words

    def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
        """
        Removes stopwords from the tokenized text.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
        mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
        return sub_tokens, mes_tokens

    def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
        """
        Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates by returning a unique list of stems.
        """
        sub_tokens, mes_tokens = tokens
        unique_stems = set()

        # Stem tokens from both subject and message
        for word in sub_tokens + mes_tokens:
            unique_stems.add(self.stemmer.stem(word))

        return list(unique_stems)

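# Convenience sketch: the five steps can be chained into one call. The name
# `preprocess_pair` is a hypothetical helper (not part of the class above);
# it applies the same sequence the example under __main__ walks through.
def preprocess_pair(text: Tuple[str, str]) -> List[str]:
    """Run the full pipeline on a (subject, message) pair and return unique stems."""
    p = Preprocessing()
    text = p.remove_special_char(text)        # Strip punctuation and digits
    text = p.lowercase_conversion(text)       # Normalize case
    tokens = p.tokenize(text)                 # Split on whitespace
    tokens = p.removal_of_stop_words(tokens)  # Drop stopwords
    return p.stem_words(tokens)               # Stem and de-duplicate
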
# Example Usage
if __name__ == "__main__":
    # Example input: (subject, message)
    text = ("HELLO!!! This is an example subject 123.", "This is an example message with special chars!! @@#$")

    preprocessor = Preprocessing()

    # Remove special characters
    text = preprocessor.remove_special_char(text)
    print("After removing special characters:", text)

    # Convert to lowercase
    text = preprocessor.lowercase_conversion(text)
    print("After converting to lowercase:", text)

    # Tokenize
    tokens = preprocessor.tokenize(text)
    print("After tokenizing:", tokens)

    # Remove stopwords
    tokens = preprocessor.removal_of_stop_words(tokens)
    print("After removing stopwords:", tokens)

    # Stem words
    stems = preprocessor.stem_words(tokens)
    print("After stemming:", stems)
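    # For the sample input, the final line should contain these stems (order
    # may vary, since they are collected in an unordered set):
    # ['hello', 'exampl', 'subject', 'messag', 'special', 'char']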