run_speech_recognition_ctc.py
CHANGED
@@ -511,7 +511,6 @@ def main():
     tokenizer_kwargs = {
         "config": config if config.tokenizer_class is not None else None,
         "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
-        "bos_token": "<s>",
         "unk_token": unk_token,
         "pad_token": pad_token,
         "word_delimiter_token": word_delimiter_token,
@@ -522,11 +521,10 @@
     # one local process can concurrently download model & vocab.

     # load feature_extractor and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_name_or_path,
-        use_auth_token=data_args.use_auth_token,
-        **tokenizer_kwargs,
-    )
+    tokenizer = Wav2Vec2CTCTokenizer(tokenizer_name_or_path,
+        use_auth_token=data_args.use_auth_token,
+        **tokenizer_kwargs,
+    )
     feature_extractor = AutoFeatureExtractor.from_pretrained(
         model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
     )
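
For context, here is a minimal sketch (not part of the diff) of how Wav2Vec2CTCTokenizer consumes the same special-token kwargs the script builds in tokenizer_kwargs. The toy vocabulary and temporary path below are illustrative assumptions; the training script derives its vocabulary from the dataset instead.

# Minimal sketch, assuming a toy character vocabulary written to a temp dir;
# it only demonstrates the unk/pad/word-delimiter kwargs used in the diff above.
import json
import os
import tempfile

from transformers import Wav2Vec2CTCTokenizer

# toy character vocabulary (illustrative only)
vocab = {"<pad>": 0, "<unk>": 1, "|": 2, "a": 3, "b": 4, "c": 5}

tmp_dir = tempfile.mkdtemp()
vocab_file = os.path.join(tmp_dir, "vocab.json")
with open(vocab_file, "w") as f:
    json.dump(vocab, f)

tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file,
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|",
)

# characters map to vocab ids; spaces become the word-delimiter token "|"
print(tokenizer("abc ab").input_ids)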