HwwwH committed on
Commit
7616f50
·
1 Parent(s): 1bbb766
Files changed (1) hide show
  1. processing_minicpmo.py +26 -26
processing_minicpmo.py CHANGED
@@ -102,6 +102,31 @@ class MiniCPMOProcessor(ProcessorMixin):
102
 
103
  return MiniCPMOBatchFeature(data={**model_inputs})
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def audio_feature_extract(
106
  self,
107
  audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
@@ -111,31 +136,6 @@ class MiniCPMOProcessor(ProcessorMixin):
111
  chunk_length: Optional[int] = 1,
112
  **kwargs,
113
  ):
114
- def get_audio_placeholder(audio_lens, chunk_input):
115
- pool_step = 2
116
- feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
117
-
118
- feature_lens = (feature_lens - 1) // 2 + 1
119
- output_lens = (feature_lens - pool_step) // pool_step + 1
120
-
121
- if chunk_input:
122
- fbank_feat_in_chunk = int(chunk_length * 100)
123
- cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
124
- audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
125
- num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
126
-
127
- place_holders = ""
128
- total_unk_len = 0
129
- for _ in range(num_audio_chunks):
130
- unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
131
- place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
132
- total_unk_len += unk_len
133
- audio_placeholder = place_holders
134
- else:
135
- audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
136
-
137
- return audio_placeholder
138
-
139
  if isinstance(audios, np.ndarray):
140
  audios_list = [[audios]]
141
  elif isinstance(audios[0], np.ndarray):
@@ -156,7 +156,7 @@ class MiniCPMOProcessor(ProcessorMixin):
156
  # audio placeholder not dependent on audio_parts
157
  for audios in audios_list:
158
  if audios:
159
- audio_ph_list.append([get_audio_placeholder(len(a), chunk_input) for a in audios])
160
  else:
161
  audio_ph_list.append([])
162
 
 
102
 
103
  return MiniCPMOBatchFeature(data={**model_inputs})
104
 
105
+ def get_audio_placeholder(self, audio_lens, chunk_input, chunk_length):
106
+ pool_step = 2
107
+ feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
108
+
109
+ feature_lens = (feature_lens - 1) // 2 + 1
110
+ output_lens = (feature_lens - pool_step) // pool_step + 1
111
+
112
+ if chunk_input:
113
+ fbank_feat_in_chunk = int(chunk_length * 100)
114
+ cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
115
+ audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
116
+ num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
117
+
118
+ place_holders = ""
119
+ total_unk_len = 0
120
+ for _ in range(num_audio_chunks):
121
+ unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
122
+ place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
123
+ total_unk_len += unk_len
124
+ audio_placeholder = place_holders
125
+ else:
126
+ audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
127
+
128
+ return audio_placeholder
129
+
130
  def audio_feature_extract(
131
  self,
132
  audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
 
136
  chunk_length: Optional[int] = 1,
137
  **kwargs,
138
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  if isinstance(audios, np.ndarray):
140
  audios_list = [[audios]]
141
  elif isinstance(audios[0], np.ndarray):
 
156
  # audio placeholder not dependent on audio_parts
157
  for audios in audios_list:
158
  if audios:
159
+ audio_ph_list.append([self.get_audio_placeholder(len(a), chunk_input, chunk_length) for a in audios])
160
  else:
161
  audio_ph_list.append([])
162