import json
import os

import torch
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr


def _load_texts(data_file):
    """Return the ``'text'`` field of every record in the JSON file *data_file*.

    The file is expected to contain a JSON array of objects, each with a
    ``'text'`` key (this is how the original script reads it).
    """
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [item['text'] for item in data]


def _to_unit_float32(vectors):
    """Convert a 2-D tensor of row vectors to an L2-normalised float32 array.

    Faiss requires C-contiguous float32 input.  Normalising to unit length is
    what makes ``1 - squared_l2 / 2`` equal to the cosine similarity.
    """
    array = np.ascontiguousarray(
        vectors.detach().cpu().numpy(), dtype=np.float32
    )
    norms = np.linalg.norm(array, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero rows untouched instead of dividing by 0
    return array / norms


def load_or_create_model_and_embeddings(model_name, data_file):
    """Load a cached model and corpus embeddings, or build them from scratch.

    Looks for ``saved_model`` and ``corpus_embeddings.pt`` inside the
    module-level ``output_dir``.  When both exist they are loaded; otherwise
    *model_name* is instantiated and the texts in *data_file* are encoded.
    This function never writes the cache itself.

    Args:
        model_name: Name or path passed to ``SentenceTransformer`` when no
            cached model is found.
        data_file: Path to the JSON corpus; only read when no cache is found.

    Returns:
        A ``(model, embeddings)`` tuple, where ``embeddings`` is a 2-D tensor
        with one row per corpus text.
    """
    model_path = os.path.join(output_dir, 'saved_model')
    embeddings_path = os.path.join(output_dir, 'corpus_embeddings.pt')

    if os.path.exists(model_path) and os.path.exists(embeddings_path):
        print("載入已保存的模型和嵌入...")
        model = SentenceTransformer(model_path)
        # NOTE: torch.load reads a pickle-based format; only load files you
        # trust.  map_location='cpu' lets a tensor saved on a GPU be loaded
        # on a CPU-only host (it is moved to the CPU for Faiss anyway).
        embeddings = torch.load(embeddings_path, map_location='cpu')
    else:
        model = SentenceTransformer(model_name)
        embeddings = model.encode(_load_texts(data_file), convert_to_tensor=True)

    return model, embeddings


# Parameters
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
data_file = 'labeled_cti_data.json'
output_dir = '.'

# Load or create the model and the corpus embeddings
model, embeddings = load_or_create_model_and_embeddings(model_name, data_file)

# The corpus texts are needed to map Faiss row indices back to documents.
# They must be loaded on both the cached and the freshly-encoded path.
texts = _load_texts(data_file)
if len(texts) != embeddings.shape[0]:
    raise ValueError(
        f"{data_file} contains {len(texts)} texts but the embeddings have "
        f"{embeddings.shape[0]} rows; the cached embeddings in "
        f"{output_dir!r} do not match the data file."
    )

# Build the Faiss index over unit-length vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(_to_unit_float32(embeddings))


def semantic_search(query, top_k=3):
    """Return up to *top_k* corpus texts most similar to *query*.

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with keys ``'text'`` and ``'similarity_score'``,
        ordered from most to least similar.  The score is the cosine
        similarity, derived from the squared L2 distance between unit
        vectors as ``1 - d / 2``.
    """
    if top_k < 1:
        return []

    query_vector = _to_unit_float32(
        model.encode([query], convert_to_tensor=True)
    )
    distances, indices = index.search(query_vector, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # Faiss pads with -1 when fewer than top_k vectors are indexed;
        # without this guard texts[-1] would silently return the last text.
        if idx < 0:
            continue
        results.append({
            'text': texts[idx],
            'similarity_score': float(1 - distance / 2),
        })
    return results


def search_and_format(query):
    """Run :func:`semantic_search` and render the hits as numbered text."""
    results = semantic_search(query)
    return "".join(
        f"{i}. 相似度分數: {result['similarity_score']:.4f}\n"
        f" 情一: {result['text']}\n\n"
        for i, result in enumerate(results, 1)
    )


# Build the Gradio interface
iface = gr.Interface(
    fn=search_and_format,
    inputs=gr.Textbox(lines=2, placeholder="輸入您的搜索查詢..."),
    outputs=gr.Textbox(lines=10),
    title="語義搜索",
    description="輸入查詢以搜索相關文本。將顯示前3個最相關的結果。"
)

# Launch the Gradio interface
iface.launch()