Spaces:

Jwrockon
/

ArtemisAIWhisper

Sleeping

App Files Files Community

ArtemisAIWhisper / whisper (1) /english_normalizer.py

Jwrockon

Upload 232 files (#1)

5e1b738 verified 9 days ago

raw

history blame contribute delete

22.8 kB

	# Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved.
	# Most of the code is copy pasted from the original whisper repository
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import re
	import unicodedata
	from fractions import Fraction
	from typing import Iterator, List, Match, Optional, Union

	import regex


	# Letters that "NFKD" normalization does not split into base + mark; each one
	# is mapped by hand to an ASCII spelling (lowercase / uppercase side by side).
	ADDITIONAL_DIACRITICS = {
	    "œ": "oe", "Œ": "OE",
	    "ø": "o", "Ø": "O",
	    "æ": "ae", "Æ": "AE",
	    "ß": "ss", "ẞ": "SS",
	    "đ": "d", "Đ": "D",
	    "ð": "d", "Ð": "D",
	    "þ": "th", "Þ": "th",
	    "ł": "l", "Ł": "L",
	}


	def remove_symbols_and_diacritics(s: str, keep=""):
	    """
	    Strip diacritics and blank out markers, symbols and punctuation.

	    The input is NFKD-normalized and then processed character by character:
	    characters found in `keep` pass through untouched, characters listed in
	    `ADDITIONAL_DIACRITICS` are replaced by their manual mapping, combining
	    marks (category 'Mn') are dropped, any other character whose category
	    starts with 'M', 'S' or 'P' becomes a single space, and everything else is
	    kept as is.
	    """
	    pieces = []
	    for ch in unicodedata.normalize("NFKD", s):
	        if ch in keep:
	            pieces.append(ch)
	            continue

	        if ch in ADDITIONAL_DIACRITICS:
	            pieces.append(ADDITIONAL_DIACRITICS[ch])
	            continue

	        category = unicodedata.category(ch)
	        if category == "Mn":
	            # combining mark left over from the decomposition: drop it
	            continue

	        pieces.append(" " if category[0] in "MSP" else ch)

	    return "".join(pieces)


	def remove_symbols(s: str):
	    """
	    Blank out markers, symbols and punctuation while keeping diacritics.

	    The input is NFKC-normalized; every character whose Unicode category starts
	    with 'M', 'S' or 'P' is replaced by a single space.
	    """
	    cleaned = []
	    for ch in unicodedata.normalize("NFKC", s):
	        major_class = unicodedata.category(ch)[0]
	        cleaned.append(" " if major_class in "MSP" else ch)
	    return "".join(cleaned)


	class BasicTextNormalizer:
	    """
	    Language-agnostic text normalizer.

	    Lowercases the text, removes bracketed and parenthesised spans, replaces
	    symbols/punctuation with spaces and collapses whitespace runs.
	    """

	    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
	        """
	        Args:
	            remove_diacritics: when true, clean with `remove_symbols_and_diacritics`,
	                otherwise with `remove_symbols`.
	            split_letters: when true, separate every grapheme cluster with a space.
	        """
	        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
	        self.split_letters = split_letters

	    def __call__(self, s: str):
	        """Return the normalized form of `s`."""
	        s = s.lower()
	        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
	        # The parentheses must be escaped: an unescaped "$(...)$" is a pair of
	        # end-of-string anchors around a group and never matches "(text)".
	        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
	        s = self.clean(s).lower()

	        if self.split_letters:
	            # \X matches one extended grapheme cluster (third-party `regex` module)
	            s = " ".join(regex.findall(r"\X", s, regex.U))

	        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

	        return s


	class EnglishNumberNormalizer:
	    """
	    Convert spelled-out English numbers into arabic numerals, while handling:

	    - keeping suffixes such as: `1960s`, `274th`, `32nd`, etc.
	    - turning sign words into a prefix: `minus five` -> `-5`
	    - turning currency words that follow a number into a symbol prefix:
	      `twenty dollars` -> `$20`, `two dollars and seven cents` -> `$2.07`
	    - turning `percent` / `per cent` after a number into a `%` suffix
	    - writing a bare `1` / `1s` back as `one` / `ones`
	    - interpreting successive single-digit numbers as nominal: `one oh one` -> `101`
	    """

	    def __init__(self):
	        """Build the vocabulary tables used by `process_words`."""
	        super().__init__()

	        self.zeros = {"o", "oh", "zero"}
	        # fmt: off
	        self.ones = {
	            name: i
	            for i, name in enumerate(
	                ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"],
	                start=1,
	            )
	        }
	        # fmt: on
	        self.ones_plural = {
	            "sixes" if name == "six" else name + "s": (value, "s") for name, value in self.ones.items()
	        }
	        self.ones_ordinal = {
	            "zeroth": (0, "th"),
	            "first": (1, "st"),
	            "second": (2, "nd"),
	            "third": (3, "rd"),
	            "fifth": (5, "th"),
	            # the generated table below only produces "nineth"; accept the
	            # regular spelling as well
	            "ninth": (9, "th"),
	            "twelfth": (12, "th"),
	            **{
	                name + ("h" if name.endswith("t") else "th"): (value, "th")
	                for name, value in self.ones.items()
	                if value > 3 and value != 5 and value != 12
	            },
	        }
	        # Merge the two word -> (value, suffix) tables. A plain `{a, b}` would
	        # try to build a set of dicts and raise TypeError (dicts are unhashable).
	        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

	        self.tens = {
	            "twenty": 20,
	            "thirty": 30,
	            "forty": 40,
	            "fifty": 50,
	            "sixty": 60,
	            "seventy": 70,
	            "eighty": 80,
	            "ninety": 90,
	        }
	        self.tens_plural = {name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()}
	        self.tens_ordinal = {name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()}
	        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

	        self.multipliers = {
	            "hundred": 100,
	            "thousand": 1_000,
	            "million": 1_000_000,
	            "billion": 1_000_000_000,
	            "trillion": 1_000_000_000_000,
	            "quadrillion": 1_000_000_000_000_000,
	            "quintillion": 1_000_000_000_000_000_000,
	            "sextillion": 1_000_000_000_000_000_000_000,
	            "septillion": 1_000_000_000_000_000_000_000_000,
	            "octillion": 1_000_000_000_000_000_000_000_000_000,
	            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
	            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
	        }
	        self.multipliers_plural = {name + "s": (value, "s") for name, value in self.multipliers.items()}
	        self.multipliers_ordinal = {name + "th": (value, "th") for name, value in self.multipliers.items()}
	        self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
	        # set of every word that may appear after "point"
	        self.decimals = {*self.ones, *self.tens, *self.zeros}

	        self.preceding_prefixers = {
	            "minus": "-",
	            "negative": "-",
	            "plus": "+",
	            "positive": "+",
	        }
	        self.following_prefixers = {
	            "pound": "£",
	            "pounds": "£",
	            "euro": "€",
	            "euros": "€",
	            "dollar": "$",
	            "dollars": "$",
	            "cent": "¢",
	            "cents": "¢",
	        }
	        self.prefixes = set(list(self.preceding_prefixers.values()) + list(self.following_prefixers.values()))
	        self.suffixers = {
	            "per": {"cent": "%"},
	            "percent": "%",
	        }
	        self.specials = {"and", "double", "triple", "point"}

	        # every word that process_words treats as (part of) a number
	        self.words = {
	            key
	            for mapping in [
	                self.zeros,
	                self.ones,
	                self.ones_suffixed,
	                self.tens,
	                self.tens_suffixed,
	                self.multipliers,
	                self.multipliers_suffixed,
	                self.preceding_prefixers,
	                self.following_prefixers,
	                self.suffixers,
	                self.specials,
	            ]
	            for key in mapping
	        }
	        self.literal_words = {"one", "ones"}

	    def process_words(self, words: List[str]) -> Iterator[str]:
	        """
	        Walk over `words` and yield output tokens with number words folded
	        into arabic numerals.

	        State is kept in `value` (the number being accumulated: an int, or a
	        str once digits are being concatenated) and `prefix` (a pending sign
	        or currency symbol). Non-numeric words flush the pending number and
	        are yielded unchanged.

	        Raises:
	            ValueError: if a token that matched the numeric pattern cannot be
	                parsed, or a word from the vocabulary reaches no branch.
	        """
	        prefix: Optional[str] = None
	        value: Optional[Union[str, int]] = None
	        skip = False
	        numeric = re.compile(r"^\d+(\.\d+)?$")

	        def to_fraction(s: str):
	            try:
	                return Fraction(s)
	            except ValueError:
	                return None

	        def output(result: Union[str, int]):
	            # emit `result` with the pending prefix and reset the state
	            nonlocal prefix, value
	            result = str(result)
	            if prefix is not None:
	                result = prefix + result
	            value = None
	            prefix = None
	            return result

	        if not words:
	            return

	        for i, current in enumerate(words):
	            prev = words[i - 1] if i != 0 else None
	            upcoming = words[i + 1] if i != len(words) - 1 else None
	            if skip:
	                # this word was already consumed together with the previous one
	                skip = False
	                continue

	            next_is_numeric = upcoming is not None and numeric.match(upcoming) is not None
	            has_prefix = current[0] in self.prefixes
	            current_without_prefix = current[1:] if has_prefix else current
	            if numeric.match(current_without_prefix):
	                # arabic numbers (potentially with signs and fractions)
	                f = to_fraction(current_without_prefix)
	                if f is None:
	                    raise ValueError("Converting the fraction failed")

	                if value is not None:
	                    if isinstance(value, str) and value.endswith("."):
	                        # concatenate decimals / ip address components
	                        value = str(value) + str(current)
	                        continue
	                    else:
	                        yield output(value)

	                prefix = current[0] if has_prefix else prefix
	                if f.denominator == 1:
	                    value = f.numerator  # store integers as int
	                else:
	                    value = current_without_prefix
	            elif current not in self.words:
	                # non-numeric words
	                if value is not None:
	                    yield output(value)
	                yield output(current)
	            elif current in self.zeros:
	                value = str(value or "") + "0"
	            elif current in self.ones:
	                ones = self.ones[current]

	                if value is None:
	                    value = ones
	                elif isinstance(value, str) or prev in self.ones:
	                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
	                        value = value[:-1] + str(ones)
	                    else:
	                        value = str(value) + str(ones)
	                elif ones < 10:
	                    if value % 10 == 0:
	                        value += ones
	                    else:
	                        value = str(value) + str(ones)
	                else:  # eleven to nineteen
	                    if value % 100 == 0:
	                        value += ones
	                    else:
	                        value = str(value) + str(ones)
	            elif current in self.ones_suffixed:
	                # ordinal or cardinal; yield the number right away
	                ones, suffix = self.ones_suffixed[current]
	                if value is None:
	                    yield output(str(ones) + suffix)
	                elif isinstance(value, str) or prev in self.ones:
	                    if prev in self.tens and ones < 10:
	                        yield output(value[:-1] + str(ones) + suffix)
	                    else:
	                        yield output(str(value) + str(ones) + suffix)
	                elif ones < 10:
	                    if value % 10 == 0:
	                        yield output(str(value + ones) + suffix)
	                    else:
	                        yield output(str(value) + str(ones) + suffix)
	                else:  # eleven to nineteen
	                    if value % 100 == 0:
	                        yield output(str(value + ones) + suffix)
	                    else:
	                        yield output(str(value) + str(ones) + suffix)
	                value = None
	            elif current in self.tens:
	                tens = self.tens[current]
	                if value is None:
	                    value = tens
	                elif isinstance(value, str):
	                    value = str(value) + str(tens)
	                else:
	                    if value % 100 == 0:
	                        value += tens
	                    else:
	                        value = str(value) + str(tens)
	            elif current in self.tens_suffixed:
	                # ordinal or cardinal; yield the number right away
	                tens, suffix = self.tens_suffixed[current]
	                if value is None:
	                    yield output(str(tens) + suffix)
	                elif isinstance(value, str):
	                    yield output(str(value) + str(tens) + suffix)
	                else:
	                    if value % 100 == 0:
	                        yield output(str(value + tens) + suffix)
	                    else:
	                        yield output(str(value) + str(tens) + suffix)
	            elif current in self.multipliers:
	                multiplier = self.multipliers[current]
	                if value is None:
	                    value = multiplier
	                elif isinstance(value, str) or value == 0:
	                    f = to_fraction(value)
	                    p = f * multiplier if f is not None else None
	                    if f is not None and p.denominator == 1:
	                        value = p.numerator
	                    else:
	                        yield output(value)
	                        value = multiplier
	                else:
	                    # multiply only the part below one thousand
	                    before = value // 1000 * 1000
	                    residual = value % 1000
	                    value = before + residual * multiplier
	            elif current in self.multipliers_suffixed:
	                multiplier, suffix = self.multipliers_suffixed[current]
	                if value is None:
	                    yield output(str(multiplier) + suffix)
	                elif isinstance(value, str):
	                    f = to_fraction(value)
	                    p = f * multiplier if f is not None else None
	                    if f is not None and p.denominator == 1:
	                        yield output(str(p.numerator) + suffix)
	                    else:
	                        yield output(value)
	                        yield output(str(multiplier) + suffix)
	                else:  # int
	                    before = value // 1000 * 1000
	                    residual = value % 1000
	                    value = before + residual * multiplier
	                    yield output(str(value) + suffix)
	                value = None
	            elif current in self.preceding_prefixers:
	                # apply prefix (positive, minus, etc.) if it precedes a number
	                if value is not None:
	                    yield output(value)

	                if upcoming in self.words or next_is_numeric:
	                    prefix = self.preceding_prefixers[current]
	                else:
	                    yield output(current)
	            elif current in self.following_prefixers:
	                # apply prefix (dollars, cents, etc.) only after a number
	                if value is not None:
	                    prefix = self.following_prefixers[current]
	                    yield output(value)
	                else:
	                    yield output(current)
	            elif current in self.suffixers:
	                # apply suffix symbols (percent -> '%')
	                if value is not None:
	                    suffix = self.suffixers[current]
	                    if isinstance(suffix, dict):
	                        # two-word suffix such as "per cent"
	                        if upcoming in suffix:
	                            yield output(str(value) + suffix[upcoming])
	                            skip = True
	                        else:
	                            yield output(value)
	                            yield output(current)
	                    else:
	                        yield output(str(value) + suffix)
	                else:
	                    yield output(current)
	            elif current in self.specials:
	                if upcoming not in self.words and not next_is_numeric:
	                    # apply special handling only if the next word can be numeric
	                    if value is not None:
	                        yield output(value)
	                    yield output(current)
	                elif current == "and":
	                    # ignore "and" after hundreds, thousands, etc.
	                    if prev not in self.multipliers:
	                        if value is not None:
	                            yield output(value)
	                        yield output(current)
	                elif current == "double" or current == "triple":
	                    if upcoming in self.ones or upcoming in self.zeros:
	                        repeats = 2 if current == "double" else 3
	                        ones = self.ones.get(upcoming, 0)
	                        value = str(value or "") + str(ones) * repeats
	                        skip = True
	                    else:
	                        if value is not None:
	                            yield output(value)
	                        yield output(current)
	                elif current == "point":
	                    if upcoming in self.decimals or next_is_numeric:
	                        value = str(value or "") + "."
	                else:
	                    # should all have been covered at this point
	                    raise ValueError(f"Unexpected token: {current}")
	            else:
	                # all should have been covered at this point
	                raise ValueError(f"Unexpected token: {current}")

	        if value is not None:
	            yield output(value)

	    def preprocess(self, s: str):
	        """
	        Prepare `s` for word-level processing.

	        Rewrites "<number word> and a half" as "<number word> point five",
	        inserts a space at letter/digit boundaries, and re-attaches the
	        suffixes `st`, `nd`, `rd`, `th`, `s` to a preceding digit.
	        """
	        # replace "<number> and a half" with "<number> point five"
	        results = []

	        segments = re.split(r"\band\s+a\s+half\b", s)
	        last_index = len(segments) - 1
	        for i, segment in enumerate(segments):
	            if segment.strip():
	                results.append(segment)
	            if i == last_index:
	                continue

	            # A separator follows this segment. Put the phrase back even when
	            # the segment is empty so a leading "and a half" is not lost.
	            preceding = segment.split()
	            last_word = preceding[-1] if preceding else None
	            if last_word in self.decimals or last_word in self.multipliers:
	                results.append("point five")
	            else:
	                results.append("and a half")

	        s = " ".join(results)

	        # put a space at number/letter boundary
	        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
	        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

	        # but remove spaces which could be a suffix
	        # (the alternation needs bare `|`; `\|` would match a literal pipe)
	        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

	        return s

	    def postprocess(self, s: str):
	        """
	        Tidy the joined output: merge "<currency><n> [and ]¢<c>" into one
	        amount, turn "<currency>0.<c>" into "¢<c>", and write a standalone
	        `1` / `1s` as `one` / `ones`.
	        """

	        def combine_cents(m: Match):
	            try:
	                currency = m.group(1)
	                integer = m.group(2)
	                cents = int(m.group(3))
	                return f"{currency}{integer}.{cents:02d}"
	            except ValueError:
	                # leave the matched text unchanged (m.string would splice the
	                # whole input in place of the match)
	                return m.group(0)

	        def extract_cents(m: Match):
	            try:
	                return f"¢{int(m.group(1))}"
	            except ValueError:
	                return m.group(0)

	        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
	        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
	        # the decimal point is escaped so only "<currency>0.<digits>" matches
	        s = re.sub(r"[€£$]0\.([0-9]{1,2})\b", extract_cents, s)

	        # write "one(s)" instead of "1(s)", just for the readability
	        s = re.sub(r"\b1(s?)\b", r"one\1", s)

	        return s

	    def __call__(self, s: str):
	        """Run preprocess, word-level conversion and postprocess on `s`."""
	        s = self.preprocess(s)
	        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
	        s = self.postprocess(s)

	        return s


	class EnglishSpellingNormalizer:
	    """
	    Rewrites words according to a British-American spelling table such as the one in [1].

	    [1] https://www.tysto.com/uk-us-spelling-list.html
	    """

	    def __init__(self, english_spelling_mapping):
	        # mapping from a word to its replacement spelling
	        self.mapping = english_spelling_mapping

	    def __call__(self, s: str):
	        """Replace each whitespace-separated word found in the mapping; rejoin with single spaces."""
	        lookup = self.mapping.get
	        respelled = [lookup(token, token) for token in s.split()]
	        return " ".join(respelled)


	class EnglishTextNormalizer:
	    """
	    English text normalizer: lowercases, strips bracketed/parenthesised spans
	    and filler words, expands contractions and abbreviated titles, removes
	    symbols and diacritics, then standardizes numbers and spellings.
	    """

	    def __init__(self, english_spelling_mapping):
	        """
	        Args:
	            english_spelling_mapping: word -> replacement table handed to
	                `EnglishSpellingNormalizer`.
	        """
	        # filler words to drop (alternation uses bare `|`; `\|` would only
	        # match a literal pipe character)
	        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
	        # applied in insertion order, so specific patterns precede general ones
	        self.replacers = {
	            # common contractions
	            r"\bwon't\b": "will not",
	            r"\bcan't\b": "can not",
	            r"\blet's\b": "let us",
	            r"\bain't\b": "aint",
	            r"\by'all\b": "you all",
	            r"\bwanna\b": "want to",
	            r"\bgotta\b": "got to",
	            r"\bgonna\b": "going to",
	            r"\bi'ma\b": "i am going to",
	            r"\bimma\b": "i am going to",
	            r"\bwoulda\b": "would have",
	            r"\bcoulda\b": "could have",
	            r"\bshoulda\b": "should have",
	            r"\bma'am\b": "madam",
	            # contractions in titles/prefixes
	            r"\bmr\b": "mister ",
	            r"\bmrs\b": "missus ",
	            r"\bst\b": "saint ",
	            r"\bdr\b": "doctor ",
	            r"\bprof\b": "professor ",
	            r"\bcapt\b": "captain ",
	            r"\bgov\b": "governor ",
	            r"\bald\b": "alderman ",
	            r"\bgen\b": "general ",
	            r"\bsen\b": "senator ",
	            r"\brep\b": "representative ",
	            r"\bpres\b": "president ",
	            r"\brev\b": "reverend ",
	            r"\bhon\b": "honorable ",
	            r"\basst\b": "assistant ",
	            r"\bassoc\b": "associate ",
	            r"\blt\b": "lieutenant ",
	            r"\bcol\b": "colonel ",
	            r"\bjr\b": "junior ",
	            r"\bsr\b": "senior ",
	            r"\besq\b": "esquire ",
	            # perfect tenses, ideally it should be any past participles, but it's harder..
	            r"'d been\b": " had been",
	            r"'s been\b": " has been",
	            r"'d gone\b": " had gone",
	            r"'s gone\b": " has gone",
	            r"'d done\b": " had done",  # "'s done" is ambiguous
	            r"'s got\b": " has got",
	            # general contractions
	            r"n't\b": " not",
	            r"'re\b": " are",
	            r"'s\b": " is",
	            r"'d\b": " would",
	            r"'ll\b": " will",
	            r"'t\b": " not",
	            r"'ve\b": " have",
	            r"'m\b": " am",
	        }
	        self.standardize_numbers = EnglishNumberNormalizer()
	        self.standardize_spellings = EnglishSpellingNormalizer(english_spelling_mapping)

	    def __call__(self, s: str):
	        """Return the normalized form of `s`."""
	        s = s.lower()

	        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
	        # parentheses must be escaped; "$(...)$" is a pair of anchors and never matches "(text)"
	        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
	        s = re.sub(self.ignore_patterns, "", s)
	        s = re.sub(r"\s+'", "'", s)  # standardize when there's a space before an apostrophe

	        for pattern, replacement in self.replacers.items():
	            s = re.sub(pattern, replacement, s)

	        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
	        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
	        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep some symbols for numerics

	        s = self.standardize_numbers(s)
	        s = self.standardize_spellings(s)

	        # now remove prefix/suffix symbols that are not preceded/followed by numbers
	        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
	        s = re.sub(r"([^0-9])%", r"\1 ", s)

	        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

	        return s