#!/usr/bin/env python
# coding: utf-8
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import openai  # Used for the Whisper transcription API

# Load the semantic search model
model_checkpoint = "sickcell69/cti-semantic-search-minilm"
#model_checkpoint = "sickcell69/bert-finetuned-ner"
model = SentenceTransformer(model_checkpoint)

# Load the labeled CTI data
data_path = 'labeled_cti_data.json'
data = pd.read_json(data_path)

# Load the precomputed corpus embeddings
embeddings_path = 'corpus_embeddings.pt'
corpus_embeddings = torch.load(embeddings_path)

def semantic_search(query):
    query_embedding = model.encode(query, convert_to_tensor=True)
    search_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    results = []
    for hit in search_hits[0]:
        text = " ".join(data.iloc[hit['corpus_id']]['tokens'])
        results.append(f"Score: {hit['score']:.4f} - Text: {text}")
    return "\n".join(results)

# Transcribe an uploaded audio file with the Whisper API (legacy openai<1.0 client).
# The API expects an open file object, not raw bytes.
def transcribe_audio(audio_file):
    # Gradio's "file" input yields either a path string or a tempfile-like
    # object with a .name attribute, depending on the Gradio version.
    path = audio_file if isinstance(audio_file, str) else audio_file.name
    with open(path, "rb") as f:
        response = openai.Audio.transcribe("whisper-1", f)
    return response['text']

# Route the request: transcribe the audio if a file was uploaded,
# otherwise search with the text query directly.
def handle_input(input_text, audio_file):
    if audio_file is not None:
        input_text = transcribe_audio(audio_file)
    return semantic_search(input_text)

# Interface with both text and audio-file inputs
iface = gr.Interface(
    fn=handle_input,
    inputs=["text", "file"],
    outputs="text",
    title="Semantic Search App",
    description="Enter a query or upload an audio file; the model returns the most similar results.",
    examples=[["", "example_audio.wav"]]  # One example value per input component
)

if __name__ == "__main__":
    #iface.launch()
    iface.launch(share=True)  # share=True because the local page would not load
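
# Note: openai.Audio.transcribe is the legacy (openai<1.0) interface. A minimal
# sketch of the equivalent call with the v1 client is below; it assumes
# OPENAI_API_KEY is set in the environment:
#
#   from openai import OpenAI
#   client = OpenAI()
#   with open(path, "rb") as f:
#       transcript = client.audio.transcriptions.create(model="whisper-1", file=f)
#   text = transcript.text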