cdactvm committed on
Commit
cb9e139
·
verified ·
1 Parent(s): fc31623

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -147
app.py CHANGED
@@ -21,16 +21,56 @@ def transcribe_odiya_eng(speech):
21
  from indictrans import Transliterator
22
  trn = Transliterator(source='ori', target='eng', build_lookup=True)
23
  text = p1(speech)["text"]
24
- text=trn.transform(text)
25
- text = master_function(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  return text
 
27
  def transcribe_hin_eng(speech):
28
  from indictrans import Transliterator
29
  trn = Transliterator(source='hin', target='eng', build_lookup=True)
30
  text = p2(speech)["text"]
31
- text=trn.transform(text)
32
- text = master_function(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return text
 
34
  def sel_lng(lng,mic=None, file=None):
35
  if mic is not None:
36
  audio = mic
@@ -50,20 +90,49 @@ def sel_lng(lng,mic=None, file=None):
50
  #####################################################
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def soundex(word):
54
  word = word.upper()
55
  word = ''.join(filter(str.isalpha, word))
56
  if not word:
57
  return None
58
  soundex_mapping = {
59
- 'B': '1', 'F': '1', 'P': '1', 'V': '1',
60
- 'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2',
61
- 'S': '2', 'X': '2', 'Z': '2',
62
- 'D': '3', 'T': '3',
63
- 'L': '4',
64
- 'M': '5', 'N': '5',
65
- 'R': '6'
66
- }
67
  soundex_code = word[0]
68
  for char in word[1:]:
69
  if char not in ('H', 'W'):
@@ -72,27 +141,12 @@ def soundex(word):
72
  soundex_code = soundex_code.replace('0', '') + '000'
73
  return soundex_code[:4]
74
 
75
- # convert special tecken to numbers
76
-
77
- def is_number(x):
78
- if type(x) == str:
79
- x = x.replace(',', '')
80
- try:
81
- float(x)
82
- except:
83
- return False
84
- return True
85
-
86
- def text2int (textnum, numwords={}):
87
-
88
  units = ['Z600', 'O500','T000','T600','F600','F100','S220','S150','E300','N500',
89
  'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
90
- # teens = ['T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
91
  tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
92
  scales = ['H536', 'T253', 'M450', 'C600']
93
- # scale_values = [100, 1_000, 10_0000, 1000_000_000]
94
- indian_scales = ['L200', 'C600', 'A610', 'K610']
95
- conjunction = ['and']
96
  ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
97
  'sixth': 'S200','seventh': 'S150','eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
98
  ordinal_endings = [('ieth', 'y'), ('th', '')]
@@ -141,7 +195,6 @@ def text2int (textnum, numwords={}):
141
 
142
  if (not is_numword(word)) or (word == 'and' and not lastscale):
143
  if onnumber:
144
- # Flush the current number we are building
145
  curstring += repr(result + current) + " "
146
  curstring += word + " "
147
  result = current = 0
@@ -152,25 +205,23 @@ def text2int (textnum, numwords={}):
152
  scale, increment = from_numword(word)
153
  onnumber = True
154
 
155
- if lastunit and (word not in scales):
156
- # Assume this is part of a string of individual numbers to
157
- # be flushed, such as a zipcode "one two three four five"
158
- curstring += repr(result + current)
159
- result = current = 0
160
-
161
- if scale > 1:
162
- current = max(1, current)
163
-
164
- current = current * scale + increment
165
- if scale > 100:
166
- result += current
167
- current = 0
168
-
169
- lastscale = False
170
- lastunit = False
171
- if word in scales:
172
- lastscale = True
173
- elif word in units:
174
  lastunit = True
175
 
176
  if onnumber:
@@ -178,107 +229,29 @@ def text2int (textnum, numwords={}):
178
 
179
  return curstring
180
 
181
- # replace those words which are not correctly spelled to correct words.
182
- def replace_words(sentence):
183
- # Define the replacements
184
- replacements = [
185
- (r'\bjiro\b', 'zero'),
186
- (r'\bjero\b', 'zero'),
187
- (r'\bnn\b', 'one'),
188
- (r'\bn\b', 'one'),
189
- (r'\bna\b', 'one'),
190
- (r'\btu\b', 'two'),
191
- (r'\btoo\b', 'two'),
192
- (r'\bthiri\b', 'three'),
193
- (r'\bfor\b', 'four'),
194
- (r'\bfore\b', 'four'),
195
- (r'\bfib\b', 'five'),
196
- (r'\bdublseven\b', 'double seven'),
197
- (r'\bdubalathri\b', 'double three'),
198
- (r'\bnineeit\b', 'nine eight'),
199
- (r'\bfipeit\b', 'five eight'),
200
- (r'\bdubal\b', 'double'),
201
- (r'\bsevenatu\b', 'seven two'),
202
- ]
203
- # Apply the replacements
204
- for pattern, replacement in replacements:
205
- sentence = re.sub(pattern, replacement, sentence)
206
- return sentence
207
 
208
- # split text and numbers and get it into different sentences.
209
- def split_sentence(sentence):
210
- # List of word-based numbers
211
- word_numbers = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
212
- "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
213
- "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty",
214
- "sixty", "seventy", "eighty", "ninety", "hundred", "thousand", "million",
215
- 'zero', 'one', 'on','na','n','tu','two','to','too', 'three','tree','four',
216
- 'for','five','fib', 'six', 'seven', 'eight', 'eit', 'nine', 'eit', 'seven',
217
- 'sics', 'thri', 'for', 'eittu', 'eittu', 'nine','dubal','sikas','tri', 'double']
218
 
219
- # Split the sentence into tokens
220
- tokens = sentence.split()
221
- # Initialize variables to store the parts of the sentence
222
- pre_numbers = []
223
- numbers = []
224
- post_numbers = []
225
- found_numbers = False
226
- # Iterate through the tokens to classify them
227
- for token in tokens:
228
- if token.lower() in word_numbers:
229
- found_numbers = True
230
- numbers.append(token)
231
- else:
232
- if found_numbers:
233
- post_numbers.append(token)
234
- else:
235
- pre_numbers.append(token)
236
- # Join the parts back into sentences
237
- sentence1 = ' '.join(pre_numbers)
238
- number = ' '.join(numbers)
239
- sentence3 = ' '.join(post_numbers)
240
- return sentence1, number, sentence3
241
-
242
- # Process double followed by a numbers.
243
- def process_doubles(sentence):
244
- tokens = sentence.split()
245
- result = []
246
-
247
- i = 0
248
- while i < len(tokens):
249
- if tokens[i] == "double" or tokens[i] == "dubal":
250
- if i + 1 < len(tokens):
251
- # Repeat the next word twice
252
- result.append(tokens[i + 1])
253
- result.append(tokens[i + 1])
254
- i += 2 # Skip the next word as it's already added twice
255
- else:
256
- # If "double" is the last word, just add it (although this case is unusual)
257
- result.append(tokens[i])
258
- i += 1
259
- else:
260
- result.append(tokens[i])
261
- i += 1
262
-
263
- return ' '.join(result)
264
 
265
- # Concatenate text and numbers and form a single sentence.
266
- def concatenate_sentences(sentence1, numbers, sentence3):
267
- full_sentence = f"{sentence1} {numbers} {sentence3}"
268
- return full_sentence
269
-
270
- # define a master function to run all the above functions.
271
- def master_function(initial_input):
272
- output_string1 = replace_words(initial_input)
273
- sentence1, number, sentence3 = split_sentence(output_string1)
274
- processed_sentence = process_doubles(number)
275
- text = processed_sentence
276
- words = text.strip().split()
277
- soundex_codes = [soundex(word) for word in words]
278
- combined_text = " ".join(soundex_codes)
279
- numbers=text2int(combined_text)
280
- full_sentence = concatenate_sentences(sentence1, numbers, sentence3)
281
- return full_sentence
282
 
283
  ######################################################
284
  demo=gr.Interface(
 
21
  from indictrans import Transliterator
22
  trn = Transliterator(source='ori', target='eng', build_lookup=True)
23
  text = p1(speech)["text"]
24
+ #text=trn.transform(text)
25
+ #text = master_function(text)
26
+ sentence = trn.transform(text)
27
+ replaced_words = replace_words(sentence)
28
+ processed_sentence = process_doubles(replaced_words)
29
+ input_sentence_1 = processed_sentence
30
+ # Create empty mappings
31
+ word_to_code_map = {}
32
+ code_to_word_map = {}
33
+
34
+ # Convert sentence to transcript
35
+ transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)
36
+
37
+ # Convert transcript to numerical representation
38
+ numbers = text2int(transcript_1)
39
+
40
+ # Create reverse mapping
41
+ code_to_word_map = {v: k for k, v in word_to_code_map.items()}
42
+
43
+ # Convert transcript back to sentence
44
+ text = transcript_to_sentence(numbers, code_to_word_map)
45
  return text
46
+
47
  def transcribe_hin_eng(speech):
48
  from indictrans import Transliterator
49
  trn = Transliterator(source='hin', target='eng', build_lookup=True)
50
  text = p2(speech)["text"]
51
+ #text=trn.transform(text)
52
+ #text = master_function(text)
53
+ sentence = trn.transform(text)
54
+ replaced_words = replace_words(sentence)
55
+ processed_sentence = process_doubles(replaced_words)
56
+ input_sentence_1 = processed_sentence
57
+ # Create empty mappings
58
+ word_to_code_map = {}
59
+ code_to_word_map = {}
60
+
61
+ # Convert sentence to transcript
62
+ transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)
63
+
64
+ # Convert transcript to numerical representation
65
+ numbers = text2int(transcript_1)
66
+
67
+ # Create reverse mapping
68
+ code_to_word_map = {v: k for k, v in word_to_code_map.items()}
69
+
70
+ # Convert transcript back to sentence
71
+ text = transcript_to_sentence(numbers, code_to_word_map)
72
  return text
73
+
74
  def sel_lng(lng,mic=None, file=None):
75
  if mic is not None:
76
  audio = mic
 
90
  #####################################################
91
 
92
 
93
+ def replace_words(sentence):
94
+ replacements = [
95
+ (r'\bjiro\b', 'zero'), (r'\bjero\b', 'zero'), (r'\bnn\b', 'one'),
96
+ (r'\bn\b', 'one'), (r'\bna\b', 'one'), (r'\btu\b', 'two'),
97
+ (r'\btoo\b', 'two'), (r'\bthiri\b', 'three'), (r'\bfor\b', 'four'),
98
+ (r'\bfore\b', 'four'), (r'\bfib\b', 'five'), (r'\bdublseven\b', 'double seven'),
99
+ (r'\bdubalathri\b', 'double three'), (r'\bnineeit\b', 'nine eight'),
100
+ (r'\bfipeit\b', 'five eight'), (r'\bdubal\b', 'double'), (r'\bsevenatu\b', 'seven two'),
101
+ ]
102
+ for pattern, replacement in replacements:
103
+ sentence = re.sub(pattern, replacement, sentence)
104
+ return sentence
105
+
106
+ # Function to process "double" followed by a number
107
+ def process_doubles(sentence):
108
+ tokens = sentence.split()
109
+ result = []
110
+ i = 0
111
+ while i < len(tokens):
112
+ if tokens[i] in ("double", "dubal"):
113
+ if i + 1 < len(tokens):
114
+ result.append(tokens[i + 1])
115
+ result.append(tokens[i + 1])
116
+ i += 2
117
+ else:
118
+ result.append(tokens[i])
119
+ i += 1
120
+ else:
121
+ result.append(tokens[i])
122
+ i += 1
123
+ return ' '.join(result)
124
+
125
+ # Function to generate Soundex code for a word
126
  def soundex(word):
127
  word = word.upper()
128
  word = ''.join(filter(str.isalpha, word))
129
  if not word:
130
  return None
131
  soundex_mapping = {
132
+ 'B': '1', 'F': '1', 'P': '1', 'V': '1',
133
+ 'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
134
+ 'D': '3', 'T': '3', 'L': '4', 'M': '5', 'N': '5', 'R': '6'
135
+ }
 
 
 
 
136
  soundex_code = word[0]
137
  for char in word[1:]:
138
  if char not in ('H', 'W'):
 
141
  soundex_code = soundex_code.replace('0', '') + '000'
142
  return soundex_code[:4]
143
 
144
+ # Function to convert text to numerical representation
145
+ def text2int(textnum, numwords={}):
 
 
 
 
 
 
 
 
 
 
 
146
  units = ['Z600', 'O500','T000','T600','F600','F100','S220','S150','E300','N500',
147
  'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
 
148
  tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
149
  scales = ['H536', 'T253', 'M450', 'C600']
 
 
 
150
  ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
151
  'sixth': 'S200','seventh': 'S150','eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
152
  ordinal_endings = [('ieth', 'y'), ('th', '')]
 
195
 
196
  if (not is_numword(word)) or (word == 'and' and not lastscale):
197
  if onnumber:
 
198
  curstring += repr(result + current) + " "
199
  curstring += word + " "
200
  result = current = 0
 
205
  scale, increment = from_numword(word)
206
  onnumber = True
207
 
208
+ if lastunit and (word not in scales):
209
+ curstring += repr(result + current)
210
+ result = current = 0
211
+
212
+ if scale > 1:
213
+ current = max(1, current)
214
+
215
+ current = current * scale + increment
216
+ if scale > 100:
217
+ result += current
218
+ current = 0
219
+
220
+ lastscale = False
221
+ lastunit = False
222
+ if word in scales:
223
+ lastscale = True
224
+ elif word in units:
 
 
225
  lastunit = True
226
 
227
  if onnumber:
 
229
 
230
  return curstring
231
 
232
+ # Convert sentence to transcript using Soundex
233
+ def sentence_to_transcript(sentence, word_to_code_map):
234
+ words = sentence.split()
235
+ transcript_codes = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
+ for word in words:
238
+ if word not in word_to_code_map:
239
+ word_to_code_map[word] = soundex(word)
240
+ transcript_codes.append(word_to_code_map[word])
 
 
 
 
 
 
241
 
242
+ transcript = ' '.join(transcript_codes)
243
+ return transcript
244
+
245
+ # Convert transcript back to sentence using mapping
246
+ def transcript_to_sentence(transcript, code_to_word_map):
247
+ codes = transcript.split()
248
+ sentence_words = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
+ for code in codes:
251
+ sentence_words.append(code_to_word_map.get(code, code))
252
+
253
+ sentence = ' '.join(sentence_words)
254
+ return sentence
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
  ######################################################
257
  demo=gr.Interface(