cdactvm committed on
Commit
a875242
·
verified ·
1 Parent(s): 12efc5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -156
app.py CHANGED
@@ -21,55 +21,27 @@ def transcribe_odiya_eng(speech):
21
  from indictrans import Transliterator
22
  trn = Transliterator(source='ori', target='eng', build_lookup=True)
23
  text = p1(speech)["text"]
24
- #text=trn.transform(text)
25
- #text = master_function(text)
26
  sentence = trn.transform(text)
 
 
27
  replaced_words = replace_words(sentence)
28
  processed_sentence = process_doubles(replaced_words)
29
- input_sentence_1 = processed_sentence
30
- # Create empty mappings
31
- word_to_code_map = {}
32
- code_to_word_map = {}
33
-
34
- # Convert sentence to transcript
35
- transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)
36
-
37
- # Convert transcript to numerical representation
38
- numbers = text2int(transcript_1)
39
-
40
- # Create reverse mapping
41
- code_to_word_map = {v: k for k, v in word_to_code_map.items()}
42
-
43
- # Convert transcript back to sentence
44
- text = transcript_to_sentence(numbers, code_to_word_map)
45
- return text
46
 
47
  def transcribe_hin_eng(speech):
48
  from indictrans import Transliterator
49
  trn = Transliterator(source='hin', target='eng', build_lookup=True)
50
  text = p2(speech)["text"]
51
- #text=trn.transform(text)
52
- #text = master_function(text)
53
  sentence = trn.transform(text)
54
- #replaced_words = replace_words(sentence)
55
- #processed_sentence = process_doubles(replaced_words)
56
- #input_sentence_1 = processed_sentence
57
- # Create empty mappings
58
- #word_to_code_map = {}
59
- #code_to_word_map = {}
60
-
61
- # Convert sentence to transcript
62
- #transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)
63
-
64
- # Convert transcript to numerical representation
65
- #numbers = text2int(transcript_1)
66
-
67
- # Create reverse mapping
68
- #code_to_word_map = {v: k for k, v in word_to_code_map.items()}
69
-
70
- # Convert transcript back to sentence
71
- #text = transcript_to_sentence(numbers, code_to_word_map)
72
- return sentence
73
 
74
  def sel_lng(lng,mic=None, file=None):
75
  if mic is not None:
@@ -89,6 +61,21 @@ def sel_lng(lng,mic=None, file=None):
89
 
90
  #####################################################
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def replace_words(sentence):
94
  replacements = [
@@ -103,7 +90,6 @@ def replace_words(sentence):
103
  sentence = re.sub(pattern, replacement, sentence)
104
  return sentence
105
 
106
- # Function to process "double" followed by a number
107
  def process_doubles(sentence):
108
  tokens = sentence.split()
109
  result = []
@@ -122,7 +108,6 @@ def process_doubles(sentence):
122
  i += 1
123
  return ' '.join(result)
124
 
125
- # Function to generate Soundex code for a word
126
  def soundex(word):
127
  word = word.upper()
128
  word = ''.join(filter(str.isalpha, word))
@@ -150,122 +135,10 @@ def is_number(x):
150
  return False
151
  return True
152
 
153
- # Function to convert text to numerical representation
154
  def text2int(textnum, numwords={}):
155
- units = ['Z600', 'O500','T000','T600','F600','F100','S220','S150','E300','N500',
156
- 'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
157
- tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
158
- scales = ['H536', 'T253', 'M450', 'C600']
159
- ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
160
- 'sixth': 'S200','seventh': 'S150','eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
161
- ordinal_endings = [('ieth', 'y'), ('th', '')]
162
- if not numwords:
163
- numwords['and'] = (1, 0)
164
- for idx, word in enumerate(units): numwords[word] = (1, idx)
165
- for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
166
- for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
167
-
168
- textnum = textnum.replace('-', ' ')
169
-
170
- current = result = 0
171
- curstring = ''
172
- onnumber = False
173
- lastunit = False
174
- lastscale = False
175
-
176
- def is_numword(x):
177
- if is_number(x):
178
- return True
179
- if word in numwords:
180
- return True
181
- return False
182
-
183
-
184
- def from_numword(x):
185
- if is_number(x):
186
- scale = 0
187
- increment = int(x.replace(',', ''))
188
- return scale, increment
189
- return numwords[x]
190
-
191
- for word in textnum.split():
192
- if word in ordinal_words:
193
- scale, increment = (1, ordinal_words[word])
194
- current = current * scale + increment
195
- if scale > 100:
196
- result += current
197
- current = 0
198
- onnumber = True
199
- lastunit = False
200
- lastscale = False
201
- else:
202
- for ending, replacement in ordinal_endings:
203
- if word.endswith(ending):
204
- word = "%s%s" % (word[:-len(ending)], replacement)
205
-
206
- if (not is_numword(word)) or (word == 'and' and not lastscale):
207
- if onnumber:
208
- # Flush the current number we are building
209
- curstring += repr(result + current) + " "
210
- curstring += word + " "
211
- result = current = 0
212
- onnumber = False
213
- lastunit = False
214
- lastscale = False
215
- else:
216
- scale, increment = from_numword(word)
217
- onnumber = True
218
-
219
- if lastunit and (word not in scales):
220
- # Assume this is part of a string of individual numbers to
221
- # be flushed, such as a zipcode "one two three four five"
222
- curstring += repr(result + current)
223
- result = current = 0
224
-
225
- if scale > 1:
226
- current = max(1, current)
227
-
228
- current = current * scale + increment
229
- if scale > 100:
230
- result += current
231
- current = 0
232
-
233
- lastscale = False
234
- lastunit = False
235
- if word in scales:
236
- lastscale = True
237
- elif word in units:
238
- lastunit = True
239
-
240
- if onnumber:
241
- curstring += repr(result + current)
242
-
243
- return curstring
244
-
245
-
246
- # Convert sentence to transcript using Soundex
247
- def sentence_to_transcript(sentence, word_to_code_map):
248
- words = sentence.split()
249
- transcript_codes = []
250
-
251
- for word in words:
252
- if word not in word_to_code_map:
253
- word_to_code_map[word] = soundex(word)
254
- transcript_codes.append(word_to_code_map[word])
255
-
256
- transcript = ' '.join(transcript_codes)
257
- return transcript
258
-
259
- # Convert transcript back to sentence using mapping
260
- def transcript_to_sentence(transcript, code_to_word_map):
261
- codes = transcript.split()
262
- sentence_words = []
263
 
264
- for code in codes:
265
- sentence_words.append(code_to_word_map.get(code, code))
266
-
267
- sentence = ' '.join(sentence_words)
268
- return sentence
269
 
270
  ######################################################
271
  demo=gr.Interface(
 
21
  from indictrans import Transliterator
22
  trn = Transliterator(source='ori', target='eng', build_lookup=True)
23
  text = p1(speech)["text"]
24
+ if text is None:
25
+ return "Error: ASR returned None"
26
  sentence = trn.transform(text)
27
+ if sentence is None:
28
+ return "Error: Transliteration returned None"
29
  replaced_words = replace_words(sentence)
30
  processed_sentence = process_doubles(replaced_words)
31
+ return process_transcription(processed_sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def transcribe_hin_eng(speech):
34
  from indictrans import Transliterator
35
  trn = Transliterator(source='hin', target='eng', build_lookup=True)
36
  text = p2(speech)["text"]
37
+ if text is None:
38
+ return "Error: ASR returned None"
39
  sentence = trn.transform(text)
40
+ if sentence is None:
41
+ return "Error: Transliteration returned None"
42
+ replaced_words = replace_words(sentence)
43
+ processed_sentence = process_doubles(replaced_words)
44
+ return process_transcription(processed_sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def sel_lng(lng,mic=None, file=None):
47
  if mic is not None:
 
61
 
62
  #####################################################
63
 
64
+ def process_transcription(input_sentence):
65
+ word_to_code_map = {}
66
+ code_to_word_map = {}
67
+
68
+ transcript_1 = sentence_to_transcript(input_sentence, word_to_code_map)
69
+ if transcript_1 is None:
70
+ return "Error: Transcript conversion returned None"
71
+
72
+ numbers = text2int(transcript_1)
73
+ if numbers is None:
74
+ return "Error: Text to number conversion returned None"
75
+
76
+ code_to_word_map = {v: k for k, v in word_to_code_map.items()}
77
+ text = transcript_to_sentence(numbers, code_to_word_map)
78
+ return text
79
 
80
  def replace_words(sentence):
81
  replacements = [
 
90
  sentence = re.sub(pattern, replacement, sentence)
91
  return sentence
92
 
 
93
  def process_doubles(sentence):
94
  tokens = sentence.split()
95
  result = []
 
108
  i += 1
109
  return ' '.join(result)
110
 
 
111
  def soundex(word):
112
  word = word.upper()
113
  word = ''.join(filter(str.isalpha, word))
 
135
  return False
136
  return True
137
 
 
138
  def text2int(textnum, numwords={}):
139
+ if not textnum:
140
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
 
 
 
 
 
142
 
143
  ######################################################
144
  demo=gr.Interface(