import string
from typing import List, Tuple

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the stopword corpus once; this is a no-op if it is already present.
nltk.download('stopwords', quiet=True)

class Preprocessing:

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
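        # stop_words is a set so check_stop_words() is an O(1) membership test.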

    def check_special_char(self, ch: str) -> bool:
        """
        Checks whether a character is a special character (punctuation) or a digit.
        Returns True if it is, otherwise False.
        """
        return ch in string.punctuation or ch.isdigit()
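
    # Example behaviour: check_special_char('!') -> True,
    # check_special_char('7') -> True, check_special_char('a') -> False.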

    def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Removes special characters and digits from the (subject, message) pair,
        replacing each with a space to preserve word boundaries.
        """
        sub, mes = text
        sub = ''.join(' ' if self.check_special_char(c) else c for c in sub)
        mes = ''.join(' ' if self.check_special_char(c) else c for c in mes)
        return sub, mes
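
    # An equivalent single-pass alternative (an illustrative sketch, not part
    # of this class) would be a regex substitution such as
    #   re.sub(r"[^A-Za-z\s]", " ", sub)
    # which likewise replaces punctuation and digits with spaces.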

    def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Converts all characters in the subject and message to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()

    def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
        """
        Splits the subject and message into individual word tokens on whitespace.
        """
        sub, mes = text
        return sub.split(), mes.split()
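
    # str.split() with no arguments also collapses the runs of spaces left
    # behind by remove_special_char(), so no empty tokens are produced.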

    def check_stop_words(self, word: str) -> bool:
        """
        Checks whether a word is a stopword.
        """
        return word in self.stop_words

    def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
        """
        Removes stopwords from the tokenized subject and message.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
        mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
        return sub_tokens, mes_tokens
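
    # NLTK's English stopword list includes words such as "this", "is", "an",
    # and "with", so those drop out of the example text in __main__ below.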

    def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
        """
        Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates and returns the unique stems in sorted order,
        which keeps the output deterministic.
        """
        sub_tokens, mes_tokens = tokens
        unique_stems = {self.stemmer.stem(word) for word in sub_tokens + mes_tokens}
        # Sorting gives a reproducible order; iterating a raw set would not.
        return sorted(unique_stems)
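
    # Hypothetical convenience wrapper (not part of the original API): chains
    # the five steps demonstrated in the __main__ block below into one call.
    def preprocess(self, text: Tuple[str, str]) -> List[str]:
        """
        Runs the full pipeline: special-character removal, lowercasing,
        tokenization, stopword removal, and stemming.
        """
        text = self.remove_special_char(text)
        text = self.lowercase_conversion(text)
        tokens = self.tokenize(text)
        tokens = self.removal_of_stop_words(tokens)
        return self.stem_words(tokens)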


if __name__ == "__main__":
    text = ("HELLO!!! This is an example subject 123.",
            "This is an example message with special chars!! @@#$")

    preprocessor = Preprocessing()

    text = preprocessor.remove_special_char(text)
    print("After removing special characters:", text)

    text = preprocessor.lowercase_conversion(text)
    print("After converting to lowercase:", text)

    tokens = preprocessor.tokenize(text)
    print("After tokenizing:", tokens)

    tokens = preprocessor.removal_of_stop_words(tokens)
    print("After removing stopwords:", tokens)

    stems = preprocessor.stem_words(tokens)
    print("After stemming:", stems)