luanpoppe committed · Commit 3143cff · 1 parent: ca8a144

feat: adding lookup of a user's model to go into the final system prompt
- _utils/resumo_completo_cursor.py  +29 -16
- resumos/serializer.py  +4 -6
- setup/environment.py  +3 -1
_utils/resumo_completo_cursor.py
CHANGED
@@ -16,6 +16,9 @@ import numpy as np
 from rank_bm25 import BM25Okapi
 import logging
 from cohere import Client
+import requests
+from setup.environment import api_url
+from rest_framework.response import Response
 
 def reciprocal_rank_fusion(result_lists, weights=None):
     """Combine multiple ranked lists using reciprocal rank fusion"""
@@ -85,21 +88,20 @@ class DocumentSummarizer:
     def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         loader = PyPDFLoader(pdf_path)
-        pages = loader.load()
+        pages = loader.load()  # Produces a list of Document objects, each item corresponding to ONE full page of the PDF.
         chunks = []
         char_count = 0
 
         for page in pages:
             text = page.page_content
-            #
-            page_chunks = self.text_splitter.split_text(text)
+            page_chunks = self.text_splitter.split_text(text)  # Breaks the one-full-page Document into a list of chunks, pieces smaller than a page.
 
             for chunk in page_chunks:
                 chunk_id = str(uuid.uuid4())
-                start_char = text.find(chunk)
+                start_char = text.find(chunk)  # Returns the position of the chunk within the full page.
                 end_char = start_char + len(chunk)
 
-                doc_chunk = DocumentChunk(
+                doc_chunk = DocumentChunk(  # Builds the chunk object with extra information, such as the chunk's position and id.
                     content=chunk,
                     page_number=page.metadata.get('page') + 1,  # 1-based page numbering
                     chunk_id=chunk_id,
@@ -119,7 +121,7 @@ class DocumentSummarizer:
 
         return chunks
 
-    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
+    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:  # This function is never used.
         """Create vector store with metadata"""
         texts = [chunk.content for chunk in chunks]
         metadatas = [{
@@ -136,7 +138,7 @@ class DocumentSummarizer:
         )
         return vector_store
 
-    def rerank_chunks(
+    def rerank_chunks(  # This function is never used.
         self,
         chunks: List[Dict],
         query: str,
@@ -180,7 +182,7 @@ class DocumentSummarizer:
             logging.error(f"Reranking failed: {str(e)}")
             return chunks[:k]  # Fallback to original ordering
 
-    def generate_summary_with_sources(
+    def generate_summary_with_sources(  # This function is never used.
         self,
         vector_store: Chroma,
         query: str = "Summarize the main points of this document"
@@ -256,7 +258,7 @@ class DocumentSummarizer:
 
         return structured_output
 
-    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
+    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:  # This function is never used.
         """Get extended context around a specific chunk"""
         metadata = self.chunk_metadata.get(chunk_id)
         if not metadata:
@@ -270,7 +272,7 @@ class DocumentSummarizer:
 
 class ContextualRetriever:
     def __init__(self, config: RetrievalConfig, claude_api_key: str, claude_context_model):
-        self.config = config
+        self.config = config  # At the moment this self.config is not used anywhere in this class. Check whether it should be.
         self.claude_client = Anthropic(api_key=claude_api_key)
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
@@ -293,12 +295,12 @@ class ContextualRetriever:
                 max_tokens=100,
                 messages=[{"role": "user", "content": prompt}]
             )
-            return response.content[0].text
+            return response.content[0].text  # response.content is a list because a list of messages is passed in and a list of messages is returned, the first one being the most recent, i.e. the model's reply.
         except Exception as e:
             self.logger.error(f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}")
             return ""
 
-    def contextualize_chunks(self, full_text: str, chunks: List[DocumentChunk]) -> List[ContextualizedChunk]:
+    def contextualize_chunks(self, full_text: str, chunks: List[DocumentChunk]) -> List[ContextualizedChunk]:  # Takes each chunk and simply adds a context property to it, that property being the response of the function above, which calls a Claude model to describe the chunk's context.
         """Add context to all chunks"""
         contextualized_chunks = []
         for chunk in chunks:
@@ -315,7 +317,7 @@ class ContextualRetriever:
         return contextualized_chunks
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
-    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature):
+    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature, id_modelo_do_usuario):
         super().__init__(openai_api_key, os.environ.get("COHERE_API_KEY"), embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank)
         self.config = config
         self.contextual_retriever = ContextualRetriever(config, claude_api_key, claude_context_model)
@@ -323,6 +325,7 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         self.system_prompt = system_prompt
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
+        self.id_modelo_do_usuario = id_modelo_do_usuario
 
     def create_enhanced_vector_store(self, chunks: List[ContextualizedChunk]) -> Tuple[Chroma, BM25Okapi, List[str]]:
         """Create vector store and BM25 index with contextualized chunks"""
@@ -453,18 +456,27 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         prompt_template = self.system_prompt
 
+        url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
+        resposta = requests.get(url_request)
+
+        if (resposta.status_code != 200):
+            return Response({"error": "Ocorreu um problema. Pode ser que o modelo não tenha sido encontrado. Tente novamente e/ou entre em contato com a equipe técnica"})
+
+        modelo_buscado = resposta.json()["modelo"]
+
         prompt = PromptTemplate(
             template=prompt_template,
-            input_variables=["context"]
+            input_variables=["context", "modelo_usuario"]
         )
 
         llm = ChatOpenAI(
             temperature=self.gpt_temperature,
             model_name=self.gpt_model,
             api_key=self.openai_api_key,
+
         )
 
-        response = llm.predict(prompt.format(context="\n\n".join(contexts)))
+        response = llm.predict(prompt.format(context="\n\n".join(contexts), modelo_usuario=modelo_buscado))
 
         # Split the response into paragraphs
         summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
@@ -515,7 +527,8 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         claude_context_model=serializer["claude_context_model"],
         system_prompt=serializer["system_prompt"],
         gpt_model=serializer["model"],
-        gpt_temperature=serializer["gpt_temperature"]
+        gpt_temperature=serializer["gpt_temperature"],
+        id_modelo_do_usuario=serializer["id_modelo_do_usuario"]
     )
 
     # # Load and process document
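Taken together, the change works like this: before building the prompt, the summarizer fetches the user's model from the backend and passes it to the template as modelo_usuario. A minimal standalone sketch of that flow (the endpoint path /modelo/<id> and the "modelo" response key come from the hunks above; the id value and the short template here are hypothetical placeholders):

import requests

api_url = "https://luanpoppe-vella-backend.hf.space"
id_modelo_do_usuario = 1  # hypothetical id; in the app it arrives through the serializer

resposta = requests.get(f"{api_url}/modelo/{id_modelo_do_usuario}")
if resposta.status_code != 200:
    # The commit returns a DRF Response carrying an error message at this point.
    raise RuntimeError(f"model lookup failed: HTTP {resposta.status_code}")

modelo_buscado = resposta.json()["modelo"]

# The fetched text is then formatted into the system prompt next to the retrieved context:
template = "Context: {context}\n\nModelo do usuário: {modelo_usuario}\n\nKey points:"
print(template.format(context="...", modelo_usuario=modelo_buscado))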
resumos/serializer.py
CHANGED
@@ -37,17 +37,14 @@ system_prompt = """
 
 Context: {context}
 
+Modelo do usuário: {modelo_usuario}
+
 Key points:
 """
 user_message = "What are the main points of this document?"
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
-    # files = serializers.ListField(child=serializers.FileField(), required=True)
     system_prompt = serializers.CharField(required=False, default=system_prompt)
     user_message = serializers.CharField(required=False, default=user_message)
-    # model = serializers.CharField(required=False, default=default_model)
-    # hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-    # chunk_size = serializers.IntegerField(required=False, default=1000)
-    # chunk_overlap = serializers.IntegerField(required=False, default=200)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
@@ -57,4 +54,5 @@ class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
     model_cohere_rerank = serializers.CharField(required=False, default="rerank-english-v2.0")
     more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
     claude_context_model = serializers.CharField(required=False, default="claude-3-haiku-20240307")
-    gpt_temperature = serializers.FloatField(default=0)
+    gpt_temperature = serializers.FloatField(default=0)
+    id_modelo_do_usuario = serializers.IntegerField(required=True)
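Since id_modelo_do_usuario is declared with required=True, requests that omit it now fail validation. A minimal sketch of that effect (assumes a configured Django/DRF environment; whether the other fields validate depends on the parent ResumoCursorSerializer, which this diff does not show):

from resumos.serializer import ResumoCursorCompeltoSerializer

serializer = ResumoCursorCompeltoSerializer(data={})
serializer.is_valid()
print(serializer.errors.get("id_modelo_do_usuario"))  # required-field error for the new field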
setup/environment.py
CHANGED
@@ -1,2 +1,4 @@
 default_model = "gpt-4o-mini"
-# default_model = "gpt-4o"
+# default_model = "gpt-4o"
+
+api_url = "https://luanpoppe-vella-backend.hf.space"