Spaces:
Running
Running
File size: 2,871 Bytes
78e8beb 6f27821 78e8beb f138a14 2bc2fff f138a14 78e8beb f138a14 78e8beb f138a14 78e8beb f138a14 78e8beb f138a14 78e8beb 2bc2fff 78e8beb f138a14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import json
import re
import unicodedata
from utils.norm_config import norm_config
def text_normalize(
    text: str,
    iso_code: str = "xxx",
    lower_case: bool = True,
    remove_numbers: bool = False,
    remove_brackets: bool = False,
    rm_extra_spaces: bool = False,
) -> str:
    """Normalize ``text`` using the rules configured for ``iso_code``.

    The steps run in this order: Unicode normalization, optional lower-casing,
    removal of bracketed spans, regex mappings, punctuation replacement,
    character deletion, optional removal of digit-only words, optional
    diacritic removal and optional whitespace collapsing.

    Args:
        text: The string to be normalized.
        iso_code: Key into ``norm_config`` selecting the per-language rules.
            Unknown codes, and fields missing from a language entry, fall back
            to the ``"*"`` entry.
        lower_case: Lower-case the text. Only takes effect when the selected
            config also enables ``lower_case``.
        remove_numbers: Replace every whitespace-delimited word made up solely
            of characters from the config's ``digit_set`` with a space.
        remove_brackets: Remove every ``(...)`` span. Spans that contain a
            digit are removed regardless of this flag.
        rm_extra_spaces: Collapse runs of whitespace into a single space and
            strip leading/trailing whitespace.

    Returns:
        The string after all normalization steps.
    """
    # Layer the language entry over the "*" defaults in a fresh dict, so the
    # shared module-level ``norm_config`` is never modified by a call.
    defaults = norm_config["*"]
    config = {**defaults, **norm_config.get(iso_code, {})}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Lower-casing needs both the language config and the caller to agree.
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Always remove bracketed text that contains a number; it usually
    # corresponds to a reference such as "(Sam 23:17)".
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply mappings. Keys are used as regular-expression patterns.
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    # NOTE: punc_set / del_set / digit_set are spliced verbatim into a regex
    # character class, so they are expected to already be valid class content.
    # An empty set would yield the invalid pattern "[]", hence the guards.

    # Replace punctuation with a space.
    normalized_text = text
    punc_set = config["punc_set"]
    if punc_set:
        normalized_text = re.sub("[" + punc_set + "]", " ", normalized_text)

    # Remove characters in the delete list.
    del_set = config["del_set"]
    if del_set:
        normalized_text = re.sub("[" + del_set + "]", "", normalized_text)

    # Remove words containing only digits. A digit run counts as a word when
    # it is bounded on each side by whitespace or by the start/end of the
    # text. Lookarounds keep the delimiters out of the match, so adjacent
    # numbers separated by one space are all replaced. This also covers a
    # text that is nothing but a single number.
    digit_set = config["digit_set"]
    if remove_numbers and digit_set:
        digit_word_pattern = r"(?<!\S)[" + digit_set + r"]+(?!\S)"
        normalized_text = re.sub(digit_word_pattern, " ", normalized_text)

    # A config entry without "rm_diacritics" means diacritics are kept.
    if config.get("rm_diacritics", False):
        # Imported lazily so unidecode is only required when actually used.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    if rm_extra_spaces:
        normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
|