#!/usr/bin/env python
# coding: utf-8
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import openai # New import for Whisper API
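# Note: the Whisper call below assumes openai.api_key is configured
# (e.g. via the OPENAI_API_KEY environment variable read by the pre-1.0 openai SDK).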
# Load the semantic search model
model_checkpoint = "sickcell69/cti-semantic-search-minilm"
#model_checkpoint = "sickcell69/bert-finetuned-ner"
model = SentenceTransformer(model_checkpoint)
# Load the labeled CTI data
data_path = 'labeled_cti_data.json'
data = pd.read_json(data_path)
# Load the precomputed corpus embeddings
embeddings_path = 'corpus_embeddings.pt'
corpus_embeddings = torch.load(embeddings_path)
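# If corpus_embeddings.pt is missing, it could be regenerated from the token lists
# in labeled_cti_data.json (a sketch, assuming the same model and a 'tokens' column):
#   corpus_sentences = data['tokens'].apply(" ".join).tolist()
#   corpus_embeddings = model.encode(corpus_sentences, convert_to_tensor=True)
#   torch.save(corpus_embeddings, embeddings_path)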
# Encode the query and return the top-5 most similar corpus entries
def semantic_search(query):
    query_embedding = model.encode(query, convert_to_tensor=True)
    search_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    results = []
    for hit in search_hits[0]:
        text = " ".join(data.iloc[hit['corpus_id']]['tokens'])
        results.append(f"Score: {hit['score']:.4f} - Text: {text}")
    return "\n".join(results)
# Transcribe an uploaded audio file with the OpenAI Whisper API
# (the pre-1.0 openai SDK expects a file object, not raw bytes)
def transcribe_audio(audio_file):
    # gradio's "file" input yields a temp-file wrapper; reopen it by path in binary mode
    with open(audio_file.name, "rb") as f:
        response = openai.Audio.transcribe("whisper-1", f)
    return response["text"]
# Combined handler: transcribe the audio file if one is provided, otherwise search the text query
def handle_input(input_text, audio_file):
    if audio_file is not None:
        input_text = transcribe_audio(audio_file)
    return semantic_search(input_text)

# Interface with both text and audio inputs, routed through handle_input
iface = gr.Interface(
    fn=handle_input,
    inputs=["text", "file"],  # text query plus optional audio file
    outputs="text",
    title="Semantic Search App",
    description="Enter a query or upload an audio file, and the model will return the most similar results.",
    examples=[["", "example_audio.wav"]]  # each example must supply a value per input
)
if __name__ == "__main__":
    #iface.launch()
    iface.launch(share=True)  # the local page would not load, so launch with a public share link