File size: 5,913 Bytes
f3d0f1e
 
 
 
85df319
f3d0f1e
 
0d7e513
f3d0f1e
 
5b20ce0
227833b
f3d0f1e
 
 
 
3ebff47
 
 
 
f3d0f1e
 
 
 
 
 
 
3ebff47
 
f3d0f1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85df319
 
b6312c1
85df319
 
 
 
 
 
f3d0f1e
85df319
 
f3d0f1e
85df319
3ebff47
f3d0f1e
 
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3d0f1e
85df319
 
0d7e513
85df319
 
0d7e513
e681b03
 
85df319
0d7e513
 
85df319
0d7e513
85df319
 
 
0d7e513
85df319
 
0d7e513
85df319
 
0d7e513
 
e681b03
85df319
0d7e513
 
 
85df319
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings


from src.vectordatabase import RAG, get_vectorstore
import pandas as pd
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file (call currently disabled).
#load_dotenv(find_dotenv())


# Sentence-transformer used to embed documents and queries; multilingual so
# German speeches and German/English queries share one embedding space.
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
# Hosted generation model, called through the HuggingFace Hub inference API.
# NOTE(review): the API token is presumably picked up from the environment
# (see the commented-out huggingfacehub_api_token below) — confirm.
llm = HuggingFaceHub(
    # Try different model here
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    # repo_id="CohereForAI/c4ai-command-r-v01", # too large 69gb
    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large 22 gb
    # repo_id="meta-llama/Meta-Llama-3-8B", # too large 16 gb
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,  # cap on generated answer length
        "top_k": 30,  # sample only from the 30 most likely tokens
        "temperature": 0.1,  # near-deterministic answers
        "repetition_penalty": 1.03,
        }
        #,huggingfacehub_api_token

)
# TODO: Experiment with templates that reply in German or English depending on the input language.
# Mixtral [INST]-style template: asks for a German answer grounded only in the
# retrieved {context} for the user's {input} question.
prompt1 = ChatPromptTemplate.from_template("""<s>[INST] 

                    Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:



                    Context: {context}



                    Question: {input}  

                    [/INST]"""
                    # Returns the answer in English!?
) 

# Plain-text template with an explicit <context> block; empirically keeps the
# model answering in German (see comment below), so it is the default prompt.
prompt2 = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:



        <context>

        {context}

        </context>



        Frage: {input}

        """
        # Returns the answer in German
)

 
#folder_path = 
#index_name = "speeches_1949_09_12"
#index_name = "legislature20"
#db = get



    
     

def chatbot(message, history, db_inputs, llm=llm, prompt=prompt2):
    """Answer *message* with retrieval-augmented generation.

    Parameters
    ----------
    message : str
        The user's question.
    history : list
        Chat history; unused here — presumably required by the chat-UI
        callback signature (TODO confirm against the caller).
    db_inputs : list
        Vector-store selection forwarded to ``get_vectorstore``.
    llm : optional
        Generation model (defaults to the module-level Mixtral setup).
    prompt : optional
        Prompt template (defaults to the German ``prompt2``).

    Returns
    -------
    str
        The model's answer with any leading prompt echo stripped.
    """
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    answer = raw_response['answer']
    # Mistral tends to echo the prompt and prefix its reply with "Antwort: ".
    # Keep only the text after that marker; fall back to the full answer
    # instead of raising IndexError when the marker is absent.
    _, marker, tail = answer.partition("Antwort: ")
    return tail if marker else answer


def keyword_search(db, query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
    """
    Retrieve speech contents based on keywords using a specified method.

    Parameters:
    ----------
    db : FAISS
        The FAISS vector store containing speech embeddings.
    query : str
        The keyword(s) to search for in the speech contents.
    n : int, optional
        The number of speech contents to retrieve (default is 10).
    embeddings : Embeddings, optional
        An instance of embeddings used for embedding queries (defaults to the
        module-level multilingual model).
    method : str, optional
        The method used for retrieving speech contents. Options are 'ss'
        (semantic search) and 'mmr' (maximal marginal relevance)
        (default is 'ss').
    party_filter : str, optional
        A filter for retrieving speech contents by party affiliation.
        Specify 'All' to retrieve speeches from all parties (default is 'All').

    Returns:
    -------
    pandas.DataFrame
        A DataFrame with columns 'Speech Content', 'Date', 'Party' and, for
        method='mmr', 'Relevance' (sorted ascending, i.e. lowest score first).

    Notes:
    -----
    - The party filter is applied *after* retrieving n documents, so fewer
      than n rows may be returned when a specific party is requested.
    """
    query_embedding = embeddings.embed_query(query)

    # Collect plain dict rows and build the DataFrame once at the end;
    # pd.concat inside the loop would copy the frame per row (O(n^2)).
    rows = []

    # Maximal Marginal Relevance: returns (document, score) pairs.
    if method == 'mmr':
        results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
        for doc, score in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         'Date': doc.metadata["date"],
                         'Party': party,
                         'Relevance': round(score, ndigits=2)})
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        df_res.sort_values('Relevance', inplace=True, ascending=True)

    # Similarity Search: returns documents only (no scores).
    else:
        results = db.similarity_search_by_vector(query_embedding, k=n)
        for doc in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         'Date': doc.metadata["date"],
                         'Party': party})
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party'])
    return df_res