File size: 3,550 Bytes
1b5d3b4
 
 
 
 
 
645c5d6
72733aa
1b5d3b4
f9d356e
87448d5
72733aa
1b5d3b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87448d5
 
 
 
 
 
c442756
 
 
72733aa
 
18a3831
72733aa
 
 
c442756
 
 
87448d5
c442756
 
 
 
 
 
 
 
 
 
1b5d3b4
645c5d6
1b5d3b4
 
 
f9d356e
 
 
 
 
 
1b5d3b4
f9d356e
1b5d3b4
 
f9d356e
1b5d3b4
 
 
 
f9d356e
645c5d6
 
c442756
645c5d6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import subprocess

import gradio as gr
import soundfile as sf
import sox
import torch
from fuzzywuzzy import fuzz
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

from data import get_data


# Loaded once at import time. get_answer iterates over this and reads the
# "question" and "answer" keys of every item.
DATASET = get_data()

def read_file_and_process(wav_file):
    """Resample an audio file to 16 kHz and run it through the processor.

    A resampled copy is written next to the input, named after the input with
    its extension replaced by ``16k.wav`` (``rec.wav`` -> ``rec16k.wav``).

    Args:
        wav_file: Path of the recorded audio file.

    Returns:
        The result of calling the module-level ``processor`` on the resampled
        samples with ``return_tensors="pt"`` and ``padding=True``.
    """
    # splitext strips only the final extension. Splitting on the first "."
    # would truncate paths such as "./rec.wav" or "/tmp/run.1/rec.wav".
    base_path, _ = os.path.splitext(wav_file)
    filename_16k = base_path + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)

    return inputs


def resampler(input_file_path, output_file_path):
    """Convert an audio file to 16 kHz mono using the ``ffmpeg`` executable.

    Args:
        input_file_path: Path of the audio file to convert.
        output_file_path: Path the converted audio is written to. An existing
            file at this path is overwritten.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being interpreted as
    # shell syntax.
    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "panic",
        # Overwrite without prompting so a leftover file from an earlier run
        # cannot block the conversion or be read back as stale audio.
        "-y",
        "-i", input_file_path,
        "-ar", "16000",
        "-ac", "1",
        "-bits_per_raw_sample", "16",
        "-vn",
        output_file_path,
    ]
    # check=True surfaces a failed conversion here rather than as a confusing
    # read error on the missing output file later.
    subprocess.run(command, check=True)


def parse_transcription(logits):
    """Decode model logits into text.

    Takes the arg-max over the last dimension of ``logits``, keeps the first
    entry of the result, and decodes it with the module-level ``processor``
    while skipping special tokens.

    Args:
        logits: Logits tensor produced by the model.

    Returns:
        The decoded transcription.
    """
    first_sequence_ids = torch.argmax(logits, dim=-1)[0]
    return processor.decode(first_sequence_ids, skip_special_tokens=True)


# def parse(wav_file):
#     input_values = read_file_and_process(wav_file)
#     with torch.no_grad():
#         logits = model(**input_values).logits
#     user_question = parse_transcription(logits)
#     return user_question


def get_answer(wav_file):
    """Transcribe a recorded question and return the closest dataset answer.

    The audio is transcribed with the module-level ``model``. The
    transcription is then compared against the ``"question"`` of every item in
    ``DATASET`` using ``fuzz.token_set_ratio``. The ``"answer"`` of the
    best-scoring item is returned when its score is at least 80; on ties the
    earliest item wins.

    Args:
        wav_file: Path of the recorded audio file. May be ``None`` or empty
            when no recording was provided.

    Returns:
        The matched answer, or a fixed fallback message when no audio was
        provided or no question is similar enough.
    """
    fallback_answer = "I don't have an answer to that question."
    similarity_threshold = 80  # Adjust the similarity threshold as needed

    # Nothing was recorded: answer with the fallback instead of failing on a
    # missing path further down.
    if not wav_file:
        return fallback_answer

    input_values = read_file_and_process(wav_file)
    with torch.no_grad():
        logits = model(**input_values).logits
    user_question = parse_transcription(logits)

    highest_score = 0
    best_answer = None
    for item in DATASET:
        similarity_score = fuzz.token_set_ratio(user_question, item["question"])
        # Strict comparison: only a strictly better score replaces the current best.
        if similarity_score <= highest_score:
            continue
        highest_score = similarity_score
        best_answer = item["answer"]

    if highest_score < similarity_threshold:
        return fallback_answer
    return best_answer


# Model identifier shared by the processor and the CTC model below.
model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
# Both objects are created once at import time: `processor` is used by
# read_file_and_process and parse_transcription, `model` by get_answer.
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# Microphone input component. type="filepath" means the handler receives a
# path string, which is what get_answer passes to read_file_and_process.
input_ = gr.Audio(source="microphone",
                  type="filepath",
                  label="لطفا دکمه ضبط صدا را بزنید و شروع به صحبت کنید و بعذ از اتمام صحبت دوباره دکمه ضبط را فشار دهید.",
                  show_download_button=True,
                  show_edit_button=True,
                 )
# Output component: a five-line, right-aligned text box with a copy button
# that displays the string returned by the handler.
txtbox = gr.Textbox(
            label="متن گفتار شما: ",
            lines=5,
            text_align="right",
            show_label=True,
            show_copy_button=True,
        )

# Page text shown by the interface: heading, usage instructions, and an HTML
# footer link.
# NOTE(review): the title says speech-to-text, but the handler below is
# get_answer, which returns a dataset answer rather than the raw
# transcription; confirm this is intended.
title = "Speech-to-Text (persian)"
description = "، توجه داشته باشید که هرچه گفتار شما شمرده تر باشد خروجی با کیفیت تری دارید.روی دکمه ضبط صدا کلیک کنید و سپس دسترسی مرورگر خود را به میکروفون دستگاه بدهید، سپس شروع به صحبت کنید و برای اتمام ضبط دوباره روی دکمه کلیک کنید"
article = "<p style='text-align: center'><a href='https://github.com/nimaprgrmr'>Large-Scale Self- and Semi-Supervised Learning for Speech Translation</a></p>"

# Wire the microphone input and the text box to get_answer.
# NOTE(review): streaming, interactive, show_tips and enable_queue are passed
# straight through to gr.Interface; confirm the installed Gradio version
# accepts them.
demo = gr.Interface(fn=get_answer, inputs = input_,  outputs=txtbox, title=title, description=description, article = article,
             streaming=True, interactive=True,
             analytics_enabled=False, show_tips=False, enable_queue=True)
# Runs at import time; share=True is passed to launch().
demo.launch(share=True)