File size: 5,807 Bytes
11e59d2
 
d2399db
11e59d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b2baef
d2399db
11e59d2
 
1b2baef
 
 
 
11e59d2
 
 
 
 
 
 
 
 
a6ed7c3
 
11e59d2
 
 
a4d1f8f
11e59d2
1b2baef
11e59d2
 
 
b5253d5
11e59d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2399db
 
 
 
 
 
 
11e59d2
d2399db
11e59d2
 
1ffd215
ed7883c
1ffd215
ed7883c
11e59d2
 
 
 
 
 
 
d2399db
11e59d2
5b58187
 
d2399db
5b58187
d2399db
5b58187
 
 
 
 
d2399db
 
 
 
 
5b58187
d2399db
5b58187
d2399db
 
5b58187
d2399db
 
 
11e59d2
 
d2399db
5b58187
11e59d2
 
 
 
 
 
 
 
 
 
 
 
b240311
11e59d2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import nest_asyncio
import gradio as gr
import tiktoken
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import LLMRerank
import logging
import sys
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.legacy.llms.huggingface import HuggingFaceInferenceAPI, HuggingFaceLLM
from llama_index.core import Settings
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI
import os
import pandas as pd
from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import QueryBundle
import time
from huggingface_hub import login
from gradio import ChatMessage
# Apply the nest_asyncio patch before any of the libraries below are used.
nest_asyncio.apply()
# Hugging Face API token, read from the `hf_token` environment variable
# (os.getenv returns None when the variable is not set).
hf_token = os.getenv('hf_token')


# Log in to the Hugging Face Hub with the token read from the environment above.
login(token=hf_token)
# quantize to save memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Local generation model, loaded on the first CUDA device with the 4-bit
# quantization settings defined above.
llm = HuggingFaceLLM(
    model_name="kheopss/kheops_hermes-202k-e3-v0.14-bnb-16bit",
    tokenizer_name="kheopss/kheops_hermes-202k-e3-v0.14-bnb-16bit",
    context_window=3900,
    max_new_tokens=2560,
    model_kwargs={"quantization_config": quantization_config},
    # NOTE(review): `top_k` is normally an integer count of candidate tokens;
    # 0.95 looks like a copy of `top_p` -- confirm the intended value.
    generate_kwargs={"temperature": 0.01, "top_k": 0.95, "top_p": 0.95},
    device_map="cuda:0",
 
)

# Embedding model used to vectorize the documents and the queries.
embed_model = HuggingFaceEmbedding(
    model_name="jinaai/jina-embeddings-v3",
)
# Register both models as the LlamaIndex global defaults.
Settings.llm=llm
Settings.embed_model=embed_model
# JSON file holding the source records; each record is read below through its
# 'values', 'file_name' and 'file_description' fields.


file_path = 'response_metropo_cleaned.json'

data = pd.read_json(file_path)

# One Document per record: 'values' becomes the text, the file name and
# description are kept as metadata. The vector index is built once at import time.
documents = [Document(text=row['values'],metadata={"filename": row['file_name'], "description":row['file_description']},) for index, row in data.iterrows()]
index = VectorStoreIndex.from_documents(documents, show_progress=True)

def get_retrieved_nodes(
    query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False
):
    """Fetch the nodes matching ``query_str`` from the module-level ``index``.

    Args:
        query_str: The user query to search for.
        vector_top_k: Number of nodes requested from the vector retriever.
        reranker_top_n: Number of nodes kept by the reranker (only used when
            ``with_reranker`` is true).
        with_reranker: When true, pass the retrieved nodes through
            ``LLMRerank`` before returning them.

    Returns:
        The retrieved nodes, reranked when ``with_reranker`` is true.

    The wall-clock duration of each phase is printed; the reranking timing
    line is printed even when reranking is skipped.
    """
    bundle = QueryBundle(query_str)

    # Phase 01: vector similarity search.
    retrieval_t0 = time.time()
    nodes = VectorIndexRetriever(
        index=index,
        similarity_top_k=vector_top_k,
    ).retrieve(bundle)
    retrieval_t1 = time.time()
    print(f"Phase 01 <RETRIEVING> took  : {retrieval_t1-retrieval_t0}")

    # Phase 02: optional LLM-based reranking.
    # No llm is passed to LLMRerank; presumably it falls back to the
    # globally configured one -- confirm against the LlamaIndex docs.
    rerank_t0 = time.time()
    if with_reranker:
        nodes = LLMRerank(
            choice_batch_size=5,
            top_n=reranker_top_n,
        ).postprocess_nodes(nodes, bundle)
    rerank_t1 = time.time()
    print(f"Phase 02 <RERANKING> took  : {rerank_t1-rerank_t0}")

    return nodes

def get_all_text(new_nodes):
    """Concatenate the text of every node into one numbered listing.

    Each node contributes ``"\\nDocument <i> : <text>"`` (numbering starts at
    1) and the pieces are joined with a single space. An empty input yields
    an empty string.
    """
    return ' '.join(
        f"\nDocument {position} : {item.get_text()}"
        for position, item in enumerate(new_nodes, start=1)
    )

# Load the cl100k_base tokenizer.
# NOTE(review): this is a tiktoken encoding, not the tokenizer configured for
# the Hugging Face model in this file, so token counts are presumably only an
# approximation -- confirm this is acceptable.
encoding = tiktoken.get_encoding("cl100k_base")

def estimate_tokens(text):
    """Return how many tokens ``text`` occupies under the module-level ``encoding``."""
    return len(encoding.encode(text))

def _build_history_prompt(history, max_tokens=3000):
    """Render past chat turns as ChatML text, keeping the most recent ones.

    Turns are walked from newest to oldest and prepended to the result so the
    final text stays in chronological order. Walking stops at the first turn
    that would push the running token count above ``max_tokens``.

    Args:
        history: Sequence of turns indexed as ``turn[0]`` (user text) and
            ``turn[1]`` (assistant text); either part may be empty/None.
            Assumes Gradio's pair-style history -- confirm if the interface
            is switched to another history format. ``None`` is treated as an
            empty history.
        max_tokens: Token budget for the rendered history, as measured by
            ``estimate_tokens``.

    Returns:
        A ``(rendered_history, token_count)`` tuple.
    """
    rendered = ""
    used_tokens = 0
    for turn in reversed(history or []):
        # Reset both parts for every turn: a missing user or assistant part
        # must contribute nothing, rather than raising UnboundLocalError or
        # silently reusing the text of the previously processed turn.
        user_p = f" <|im_start|>user \n {turn[0]}\n<|im_end|>" if turn[0] else ""
        assistant_p = (
            f" <|im_start|>assistant \n {turn[1]}\n<|im_end|>" if turn[1] else ""
        )
        current_tokens = estimate_tokens(user_p + assistant_p)
        # Stop as soon as adding this turn would exceed the budget.
        if used_tokens + current_tokens > max_tokens:
            break
        rendered = user_p + assistant_p + rendered
        used_tokens += current_tokens
    return rendered, used_tokens


def process_final(user_prom,history):
    """Answer ``user_prom`` with the LLM, streaming the generated text.

    Retrieves and reranks documents for the query, builds a ChatML prompt
    from the system instructions, the retrieved documents, as much recent
    chat history as fits the token budget, and the new user message, then
    streams the completion.

    Args:
        user_prom: The new user message.
        history: Previous chat turns as passed by the chat interface (each
            turn indexed as ``[0]`` user text, ``[1]`` assistant text).

    Yields:
        ``response.text`` for each item produced by ``llm.stream_complete``.
    """
    system_p = '''
You are a conversational AI assistant tasked with helping public agents in Nice guide residents and citizens to appropriate services. Your role is to respond to user queries using only the information provided in the documents. You are not allowed to invent or infer information beyond what is given in the documents.

Always respond in French, ensuring that your answers are clear, concise, and grounded in the document content. Make sure to provide helpful and accurate responses based solely on the documents provided, while engaging in conversation to assist based on user questions.'''
    new_nodes = get_retrieved_nodes(
      user_prom,
      vector_top_k=5,
      reranker_top_n=3,
      with_reranker=True,
    )
    get_texts = get_all_text(new_nodes)

    print("PHASE 03 passing to LLM\n")
    sys_p = f"<|im_start|>system \n{system_p}\n DOCUMENTS {get_texts}\n<|im_end|>"

    # Only the history is counted against the budget; the system prompt,
    # the documents and the new user message are not included in the count.
    history_p, total_tokens = _build_history_prompt(history)

    prompt_f=f"{sys_p} {history_p} <|im_start|>user \n{user_prom} \n<|im_end|><|im_start|>assistant \n"
    # formatted=True: the prompt already carries the chat template markers.
    gen =llm.stream_complete(formatted=True, prompt=prompt_f)
    print (f"le nombre TOTAL de tokens : {total_tokens}\n")
    print("_"*100)
    print(prompt_f)
    print("o"*100)
    for response in gen:
        yield response.text


# HTML shown under the title of the chat page: a centered logo and a
# right-aligned credit line.
description = """
<p>
<center>
<img src="https://www.nicecotedazur.org/wp-content/themes/mnca/images/logo-metropole-nca.png" alt="rick" width="250"/>
</center>
</p>
<p style="text-align:right"> Made by KHEOPS AI</p>
"""   
# Chat UI: every user message is handled by process_final, whose yielded
# values are used as the reply.
demo = gr.ChatInterface(
    fn=process_final, 
    title="METROPOLE CHATBOT",
    description=description,
)
# Start the app at import time with sharing and debug mode enabled.
# NOTE(review): share=True presumably exposes the app through a public
# Gradio link -- confirm this is intended for this deployment.
demo.launch(share=True, debug =True)