from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings
from src.vectordatabase import RAG, get_vectorstore
import pandas as pd
from dotenv import load_dotenv, find_dotenv
# Load environment variables (e.g. the Hugging Face API token) from the .env file
# load_dotenv(find_dotenv())
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
llm = HuggingFaceHub(
    # Try different models here
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    # repo_id="CohereForAI/c4ai-command-r-v01",      # too large: 69 GB
    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large: 22 GB
    # repo_id="meta-llama/Meta-Llama-3-8B",          # too large: 16 GB
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
    # huggingfacehub_api_token is read from the HUGGINGFACEHUB_API_TOKEN
    # environment variable when not passed explicitly
)
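
# Note: langchain_community has deprecated HuggingFaceHub in favour of
# HuggingFaceEndpoint. A possible drop-in sketch with the same settings
# (untested here, kept commented out so behaviour is unchanged):
# from langchain_community.llms import HuggingFaceEndpoint
# llm = HuggingFaceEndpoint(
#     repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
#     task="text-generation",
#     max_new_tokens=512,
#     top_k=30,
#     temperature=0.1,
#     repetition_penalty=1.03,
# )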
# TODO: Experiment with templates that reply in German or English depending on the input language
# Mixtral [INST] template; the German instruction reads: "Answer the following
# question in German and only on the basis of the given context."
prompt1 = ChatPromptTemplate.from_template("""<s>[INST]
Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
Context: {context}
Question: {input}
[/INST]"""
    # Despite the instruction, this template tends to return the answer in English
)
# Plain template with the context wrapped in tags; the German instruction is the
# same as above, and this variant does return the answer in German
prompt2 = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
<context>
{context}
</context>
Frage: {input}
""")
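
# One way to tackle the TODO above: pick the template from the detected input
# language. Sketch only; it assumes the third-party `langdetect` package is
# installed and is not wired into chatbot() below.
def select_prompt(message):
    from langdetect import detect  # local import so the module loads without langdetect
    # detect() returns an ISO 639-1 code such as 'de' or 'en'
    return prompt2 if detect(message) == 'de' else prompt1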
# folder_path =
# index_name = "speeches_1949_09_12"
# index_name = "legislature20"
# db = get

def chatbot(message, history, db_inputs, llm=llm, prompt=prompt2):
    """Answer a chat message via RAG over the vector stores selected in db_inputs."""
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    # Mistral echoes the prompt, so keep only the text after the "Antwort: "
    # marker; fall back to the full answer if the marker is missing.
    answer = raw_response['answer']
    response = answer.split("Antwort: ", 1)[1] if "Antwort: " in answer else answer
    return response
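
# Illustrative wiring (an assumption, not part of this file): the
# (message, history, db_inputs) signature matches gradio's ChatInterface
# with one additional input, e.g.
#   import gradio as gr
#   demo = gr.ChatInterface(
#       chatbot,
#       additional_inputs=gr.CheckboxGroup(["legislature20"], label="Vector store"),
#   )
#   demo.launch()
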
def keyword_search(db, query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
    """
    Retrieve speech contents matching a keyword query from the vector store.

    Parameters
    ----------
    db : FAISS
        The FAISS vector store containing speech embeddings.
    query : str
        The keyword(s) to search for in the speech contents.
    n : int, optional
        The number of speech contents to retrieve (default is 10).
    embeddings : Embeddings, optional
        The embedding model used to embed the query (defaults to the
        module-level `embeddings`).
    method : str, optional
        The retrieval method: 'ss' (semantic similarity search, default) or
        'mmr' (maximal marginal relevance).
    party_filter : str, optional
        Restrict results to a single party; 'All' (default) keeps speeches
        from all parties.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with the speech contents, dates, and party affiliations,
        plus a relevance score when method is 'mmr'.
    """
    query_embedding = embeddings.embed_query(query)

    # Maximal Marginal Relevance
    if method == 'mmr':
        results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
        rows = []
        for doc, score in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         'Date': doc.metadata["date"],
                         'Party': party,
                         'Relevance': round(score, ndigits=2)})
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        # FAISS scores are distances, so lower values mean higher relevance
        df_res.sort_values('Relevance', inplace=True, ascending=True)

    # Similarity Search (default)
    else:
        results = db.similarity_search_by_vector(query_embedding, k=n)
        rows = []
        for doc in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         'Date': doc.metadata["date"],
                         'Party': party})
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party'])

    return df_res
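

# Illustrative usage with a hypothetical query string; assumes the
# "legislature20" index referenced above exists and that get_vectorstore
# accepts a list of index names (as db_inputs suggests). Guarded so that
# importing this module stays side-effect free.
if __name__ == "__main__":
    demo_db = get_vectorstore(inputs=["legislature20"], embeddings=embeddings)
    hits = keyword_search(demo_db, "Klimapolitik", n=5, method='mmr')
    print(hits[['Date', 'Party', 'Relevance']])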