# (page-header residue from the hosting UI: "Spaces: Running")
import os
import re
# ASCII punctuation marks. "." and "?" are passed through re.escape so the
# stored text is the backslash-escaped form of each character.
colon = ":"
comma = ","
exclamation_mark = "!"
period = re.escape(".")
question_mark = re.escape("?")
semicolon = ";"
left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

# Concatenation of the basic marks, in this exact order. Note that
# `semicolon` and `quotation_mark` are not part of `basic_punc`.
basic_punc = "".join(
    (
        period,
        question_mark,
        comma,
        colon,
        exclamation_mark,
        left_curly_bracket,
        right_curly_bracket,
    )
)
# General punc unicode block (0x2000-0x206F)
# These are raw strings: each one holds the six-character escape text
# (backslash, "u", four hex digits), not the code point itself.
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Here are some commonly ill-typed versions of apostrophe
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"
# Language specific definitions

# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"

# Hindi
# NOTE: unlike its neighbours this literal is not a raw string, so it holds
# the single character U+0964 rather than the escape text.
hindi_danda = "\u0964"

# Egyptian Arabic
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
# The two values below are "first-last" ranges rather than single characters.
arabic_diacritics = r"\u064B-\u0652"
arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
# Chinese
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# U+FE41-U+FE44 are the vertical presentation forms of the corner brackets
# listed in `quotation_mark_horizontal`. The range previously written here,
# U+FF41-U+FF44, is the fullwidth Latin letters a-d, which are not punctuation.
quotation_mark_vertical = r"\uFE41-\uFE44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"

# All Chinese marks concatenated. The order is kept fixed because several
# members are "first-last" ranges.
chinese_punc = "".join(
    (
        full_stop,
        full_comma,
        full_exclamation_mark,
        full_question_mark,
        full_semicolon,
        full_colon,
        full_parentheses,
        quotation_mark_horizontal,
        quotation_mark_vertical,
        title_marks,
        wavy_low_line,
        ellipsis,
        enumeration_comma,
        hyphenation_point,
        forward_slash,
        wavy_dash,
        box_drawings_light_horizontal,
        fullwidth_low_line,
    )
)
# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# Unambiguous name for the Armenian exclamation mark.
armenian_exclamation_mark = r"\u055C"
# NOTE: this rebinds `exclamation_mark`, which is bound to the ASCII "!"
# earlier in the file. The rebinding is kept so the module's final value of
# `exclamation_mark` does not change for existing importers; new code should
# use `armenian_exclamation_mark`.
exclamation_mark = armenian_exclamation_mark
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"

armenian_punc = "".join(
    (
        armenian_apostrophe,
        emphasis_mark,
        armenian_exclamation_mark,
        armenian_comma,
        armenian_question_mark,
        abbreviation_mark,
        armenian_full_stop,
    )
)
# Written-out HTML entities (no trailing semicolon) that can appear verbatim
# in text.
# NOTE(review): these three literals reached this file as "<", ">" and a
# whitespace character, which looks like entity decoding by the page this
# source was copied from. As received they duplicated the `*_sign` constants
# below, and a whitespace key would delete spaces. Restored to the entity
# text - confirm against the upstream file.
lesser_than_symbol = r"&lt"
greater_than_symbol = r"&gt"

# The literal angle-bracket characters, written as escape text.
lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

nbsp_written_form = r"&nbsp"
# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

# Every quote variant above plus the two apostrophe look-alikes
# (`right_single_quotation_mark` / `left_single_quotation_mark`) defined
# earlier in this file.
all_punct_quotes = "".join(
    (
        left_double_quotes,
        right_double_quotes,
        left_double_angle,
        right_double_angle,
        left_single_angle,
        right_single_angle,
        low_double_quotes,
        low_single_quotes,
        high_double_quotes,
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
    )
)

# Bracketed character class of the single-quote variants. Within this file it
# is referenced only by a commented-out entry of the shared mapping.
mapping_quotes = "".join(
    (
        "[",
        high_single_quotes,
        right_single_quotation_mark,
        left_single_quotation_mark,
        "]",
    )
)
# Digits
# Each value is a "first-last" range written as escape text, except
# `nominal_digit_shapes`, which is a single code point.
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"
# Load punctuations from MMS-lab data
# os.path.join is used instead of "/" concatenation: when dirname(__file__) is
# empty (relative __file__), the concatenated form would resolve to
# "/punctuations.lst" at the filesystem root.
# The file is decoded explicitly as UTF-8 rather than with the platform
# default (TODO confirm the list's encoding).
with open(
    os.path.join(os.path.dirname(__file__), "punctuations.lst"),
    "r",
    encoding="utf-8",
) as punc_f:
    punc_list = punc_f.readlines()

# The first tab-separated field of each line is the punc to be removed. Each
# field is regex-escaped and all of them are concatenated. The trailing newline
# is stripped so that a line without a tab does not add "\n" to the pattern.
punct_pattern = "".join(
    re.escape(punc_line.split("\t")[0].rstrip("\r\n")) for punc_line in punc_list
)
# Concatenation of every digit range defined above; used as the default
# "digit_set" of the normalisation config.
shared_digits = "".join(
    (
        english_digits,
        bengali_digits,
        khmer_digits,
        devanagari_digits,
        oriya_digits,
        extended_arabic_indic_digits,
        kayah_li_digits,
        fullwidth_digits,
        malayam_digits,
        myanmar_digits,
        roman_numeral,
        nominal_digit_shapes,
    )
)
# Default "punc_set": the basic, quote, language-specific and file-loaded
# punctuation concatenated into one string. The order is preserved exactly.
# `full_stop` and `enumeration_comma` also occur inside `chinese_punc`, so
# they appear twice in the result.
shared_punc_list = "".join(
    (
        basic_punc,
        all_punct_quotes,
        greater_than_sign,
        lesser_than_sign,
        inverted_question_mark,
        full_stop,
        semicolon,
        armenian_punc,
        inverted_exclamation_mark,
        arabic_comma,
        enumeration_comma,
        hindi_danda,
        quotation_mark,
        arabic_semicolon,
        arabic_question_mark,
        chinese_punc,
        punct_pattern,
    )
)
# Default "mapping": each key is replaced by its value. All three entries map
# to the empty string, i.e. the key text is dropped.
shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    # r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",  # slow to run
}
# Default "del_set": directional/zero-width format characters and Arabic
# diacritic ranges, concatenated in this exact order.
shared_deletion_list = "".join(
    (
        left_to_right_mark,
        zero_width_nonjoiner,
        arabic_subscript_alef_and_inverted_damma,
        zero_width_space,
        arabic_diacritics,
        pop_directional_formatting,
        right_to_left_mark,
        left_to_right_embedding,
    )
)
# Normalisation settings keyed by language code. "*" holds the defaults that
# the language-specific entries further down are copied from.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}
def _derive_config(base):
    """Return a copy of a language config that owns its own ``mapping`` dict.

    ``dict.copy()`` is shallow, so a plain copy would share the nested
    ``mapping`` dict with *base*; adding a rule for one language would then
    silently add it to every other config, including ``"*"``. The string
    entries are immutable, so ``+=`` on them only rebinds the key in the copy.
    """
    derived = base.copy()
    derived["mapping"] = dict(base["mapping"])
    return derived


# =============== Mongolian ===============#
norm_config["mon"] = _derive_config(norm_config["*"])
# add soft hyphen to the deletion set to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"
norm_config["khk"] = _derive_config(norm_config["mon"])

# =============== Hebrew ===============#
norm_config["heb"] = _derive_config(norm_config["*"])
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#
norm_config["tha"] = _derive_config(norm_config["*"])
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = _derive_config(norm_config["*"])
# This rule now lands in Arabic's own mapping dict instead of the one shared
# by all languages.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = _derive_config(norm_config["ara"])

# =============== Javanese ===============#
norm_config["jav"] = _derive_config(norm_config["*"])
norm_config["jav"]["rm_diacritics"] = True