"""RAG chatbot and keyword search helpers built on a vector store of speeches.

Importing this module instantiates the shared embedding model and the
HuggingFace Hub LLM client that the functions below use as defaults.
"""

import pandas as pd
from dotenv import load_dotenv, find_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from langchain_core.prompts import ChatPromptTemplate

from src.vectordatabase import RAG, get_vectorstore

# Load environment variables from the .env file
#load_dotenv(find_dotenv())

# Shared embedding model; used as the default by chatbot() and keyword_search().
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")

llm = HuggingFaceHub(
    # Try different models here
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    # repo_id="CohereForAI/c4ai-command-r-v01", # too large 69gb
    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large 22 gb
    # repo_id="meta-llama/Meta-Llama-3-8B", # too large 16 gb
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
    #,huggingfacehub_api_token
)

# TODO: Experiment with different templates.
# NOTE(review): the template texts below are kept exactly as they appear in the
# source (single line, single spaces) -- confirm whether line breaks between
# the instruction, the context and the question were intended.
prompt_test = ChatPromptTemplate.from_template(
    "[INST] Instruction: Beantworte die folgende Frage auf deutsch und nur "
    "auf der Grundlage des angegebenen Kontexts: Context: {context} "
    "Question: {input} [/INST]"
)

# Returns the answer in German
prompt_de = ChatPromptTemplate.from_template(
    "Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des "
    "angegebenen Kontexts: {context} Frage: {input} "
)

# Returns the answer in German
# NOTE(review): despite its name, this template is the same German text as
# prompt_de -- confirm whether an English template was intended.
prompt_en = ChatPromptTemplate.from_template(
    "Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des "
    "angegebenen Kontexts: {context} Frage: {input} "
)

#folder_path =
#index_name = "speeches_1949_09_12"
#index_name = "legislature20"
#db = get


def chatbot(message, history, db_inputs, llm=llm, prompt=prompt_de):
    """Answer a question with the RAG pipeline over the selected vector store.

    Parameters
    ----------
    message : str
        The question; passed to ``RAG`` as ``question``.
    history
        Accepted for interface compatibility; not used by this function.
    db_inputs
        Passed to ``get_vectorstore`` as ``inputs`` to obtain the vector store.
    llm : optional
        Language model handed to ``RAG`` (default is the module-level ``llm``).
    prompt : optional
        Prompt template handed to ``RAG`` (default is ``prompt_de``).

    Returns
    -------
    The ``'answer'`` entry of the ``RAG`` result. If it contains the marker
    ``"Antwort: "``, only the segment directly after the first marker is
    returned; otherwise the answer is returned unchanged.
    """
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    answer = raw_response['answer']

    # Only necessary because Mistral includes extra structure in its output:
    # keep just the text that follows the "Antwort: " marker when it exists.
    try:
        return answer.split("Antwort: ")[1]
    except (IndexError, AttributeError):
        # No marker in the output (or the answer is not a string): return as is.
        return answer


def keyword_search(db, query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
    """Retrieve speech contents for a keyword query from the vector store.

    Parameters
    ----------
    db : FAISS
        The FAISS vector store containing speech embeddings.
    query : str
        The keyword(s) to search for in the speech contents.
    n : int, optional
        The number of results requested from the vector store (default is 10).
        Filtering by party happens afterwards, so fewer rows may be returned.
    embeddings : Embeddings, optional
        Embedding instance used to embed the query (default is the
        module-level ``embeddings``).
    method : str, optional
        ``'mmr'`` selects maximal marginal relevance; any other value
        (default ``'ss'``) selects plain similarity search.
    party_filter : str, optional
        Keep only speeches whose ``party`` metadata equals this value.
        ``'All'`` disables the filter (default is ``'All'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Speech Content'``, ``'Date'`` and ``'Party'``. For
        ``method='mmr'`` an additional ``'Relevance'`` column holds the score
        rounded to two digits, and rows are sorted ascending by it.
    """
    query_embedding = embeddings.embed_query(query)

    def keep(party):
        # 'All' disables the party filter entirely.
        return party_filter == 'All' or party == party_filter

    # Rows are collected in a list and the DataFrame is built once; growing a
    # DataFrame with pd.concat inside the loop copies all rows on every step.

    # Maximal Marginal Relevance
    if method == 'mmr':
        results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
        rows = [
            (doc.page_content, doc.metadata["date"], doc.metadata["party"], round(score, ndigits=2))
            for doc, score in results
            if keep(doc.metadata["party"])
        ]
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        df_res.sort_values('Relevance', inplace=True, ascending=True)
        return df_res

    # Similarity Search
    results = db.similarity_search_by_vector(query_embedding, k=n)
    rows = [
        (doc.page_content, doc.metadata["date"], doc.metadata["party"])
        for doc in results
        if keep(doc.metadata["party"])
    ]
    return pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party'])