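"""Streamlit app: semantic search over the Mishnah Berurah (诪砖谞讛 讘专讜专讛).

Candidate passages are retrieved with multilingual-e5 embeddings and can
optionally be re-ranked by an OpenAI chat model.
"""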
import datasets
import pandas as pd
import streamlit as st
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import PromptTemplate
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from sentence_transformers import util
from streamlit.logger import get_logger
from torch import tensor

LOGGER = get_logger(__name__)

@st.cache_data
def get_df() -> pd.DataFrame:
    """Load the pre-embedded dataset and keep only the Mishnah Berurah rows."""
    ds = datasets.load_dataset('sivan22/orach-chaim-embeddings-e5')
    df = pd.DataFrame.from_dict(ds['train'])
    # The leading space in the book name matches the raw value stored in the dataset.
    df = df[df['bookname'] == ' 诪砖谞讛 讘专讜专讛']
    return df

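# Columns referenced below (per their use in this app): 'bookname', 'siman',
# 'sek', 'text', and 'embeddings' (precomputed E5 vectors, one per passage).
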
@st.cache_resource
def get_model() -> HuggingFaceEmbeddings:
    """Build the multilingual E5 model used to embed queries."""
    model_name = "intfloat/multilingual-e5-large"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    embeddings_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embeddings_model

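# Note: normalize_embeddings is False here, but util.semantic_search scores
# with cosine similarity by default, which normalizes internally, so the
# ranking is unaffected.
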
@st.cache_resource
def get_chat_api(api_key: str) -> ChatOpenAI:
    """Create the OpenAI chat client used for LLM re-ranking."""
    return ChatOpenAI(model="gpt-3.5-turbo-16k", api_key=api_key)

def get_results(embeddings_model, query, df, num_of_results) -> pd.DataFrame:
    """Embed the query and return the top-k most similar passages."""
    # E5 models expect the 'query: ' prefix on search queries.
    embeddings = embeddings_model.embed_query('query: ' + query)
    hits = util.semantic_search(tensor(embeddings),
                                tensor(df['embeddings'].tolist()),
                                top_k=num_of_results)
    hit_list = [hit['corpus_id'] for hit in hits[0]]
    return df.iloc[hit_list]

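# For reference, util.semantic_search returns one list of hits per query,
# e.g. (illustrative values): [[{'corpus_id': 17, 'score': 0.89}, ...]]
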
def get_llm_results(query, chat, results):
    """Ask the LLM to score each candidate answer for relevance to the query."""
    prompt_template = PromptTemplate.from_template(
        """
        Your mission is to rank the given answers by their relevance to the given
        question. Provide a relevancy score between 0 (not relevant) and
        1 (highly relevant) for each possible answer.
        Return the results in the following JSON format: "answer": "score",
        "answer": "score", where answer is the possible answer's text and score
        is its relevancy score.

        The question is: {query}

        The possible answers are:
        {answers}
        """)

    messages = [
        SystemMessage(content="""
            You're a helpful assistant.
            Return a JSON formatted string.
        """),
        HumanMessage(content=prompt_template.format(
            query=query,
            answers='\n'.join(results['text'].head(10).tolist()))),
    ]

    response = chat.invoke(messages)
    # orient='index' turns the returned JSON object's keys (answer texts) into
    # the index and its values (scores) into column 0.
    llm_results_df = pd.read_json(response.content, orient='index')
    llm_results_df.rename(columns={0: 'score'}, inplace=True)
    llm_results_df.sort_values(by='score', ascending=False, inplace=True)
    return llm_results_df

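# Illustrative shape of the JSON reply get_llm_results expects (values made up):
#   {"answer text 1": 0.95, "answer text 2": 0.20}
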
def run():
    st.set_page_config(
        page_title="讞讬驻讜砖 住诪谞讟讬 讘诪砖谞讛 讘专讜专讛",  # "Semantic search in the Mishnah Berurah"
        page_icon="馃摎",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    st.write("# 讞讬驻讜砖 讞讻诐 讘住驻专 诪砖谞讛 讘专讜专讛")  # "Smart search in the Mishnah Berurah"

    embeddings_model = get_model()
    df = get_df()

    # "Type your question here" / placeholder: "How many candles are lit on
    # each night of Hanukkah"
    user_input = st.text_input(
        '讻转讜讘 讻讗谉 讗转 砖讗诇转讱',
        placeholder='讻诪讛 谞专讜转 诪讚诇讬拽讬诐 讘讻诇 诇讬诇讛 诪诇讬诇讜转 讛讞谞讜讻讛')
    # "Number of results you would like to display:"
    num_of_results = st.sidebar.slider('诪住驻专 讛转讜爪讗讜转 砖讘专爪讜谞讱 诇讛爪讬讙:', 1, 25, 5)
    # "Use a language model to improve the results"
    use_llm = st.sidebar.checkbox("讛砖转诪砖 讘诪讜讚诇 砖驻讛 讻讚讬 诇砖驻专 转讜爪讗讜转", False)
    openai_key = st.sidebar.text_input("OpenAI API key", type="password")

    # '讞驻砖' = "Search"; also search when the user submits text with Enter.
    if (st.button('讞驻砖') or user_input) and user_input != "":
        results = get_results(embeddings_model, user_input, df, num_of_results)

        if use_llm:
            if not openai_key:
                st.write("诇讗 讛讜讻谞住 诪驻转讞 砖诇 OpenAI")  # "No OpenAI key was entered"
            else:
                chat = get_chat_api(openai_key)
                llm_results = get_llm_results(user_input, chat, results)
                st.write(llm_results)
        else:
            st.write(results[['siman', 'sek', 'text']].head(10))

if __name__ == "__main__":
    run()
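
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py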