luanpoppe committed on
Commit 3143cff · 1 Parent(s): ca8a144

feat: adding a lookup of the user's "modelo" (template) to be included in the final system prompt

_utils/resumo_completo_cursor.py CHANGED
@@ -16,6 +16,9 @@ import numpy as np
 from rank_bm25 import BM25Okapi
 import logging
 from cohere import Client
+import requests
+from setup.environment import api_url
+from rest_framework.response import Response
 
 def reciprocal_rank_fusion(result_lists, weights=None):
     """Combine multiple ranked lists using reciprocal rank fusion"""
@@ -85,21 +88,20 @@ class DocumentSummarizer:
     def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         loader = PyPDFLoader(pdf_path)
-        pages = loader.load()
+        pages = loader.load()  # Produces a list of Document objects, each item covering one full page of the PDF.
         chunks = []
         char_count = 0
 
         for page in pages:
             text = page.page_content
-            # Split the page content
-            page_chunks = self.text_splitter.split_text(text)
+            page_chunks = self.text_splitter.split_text(text)  # Breaks the single full-page Document into a list of chunks, pieces smaller than one page.
 
             for chunk in page_chunks:
                 chunk_id = str(uuid.uuid4())
-                start_char = text.find(chunk)
+                start_char = text.find(chunk)  # Returns the position of the chunk within the full page.
                 end_char = start_char + len(chunk)
 
-                doc_chunk = DocumentChunk(
+                doc_chunk = DocumentChunk(  # Builds the chunk object with extra information, such as the chunk's position and id.
                     content=chunk,
                     page_number=page.metadata.get('page') + 1,  # 1-based page numbering
                     chunk_id=chunk_id,
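The offset bookkeeping above has one caveat: text.find(chunk) returns the first occurrence, so an identical chunk repeated within a page would be assigned the first match's offsets. A standalone sketch of the same bookkeeping, assuming a RecursiveCharacterTextSplitter with placeholder parameters (which splitter self.text_splitter actually holds is not shown in this diff):

# Standalone sketch of the start_char/end_char bookkeeping; chunk_size and
# chunk_overlap are placeholder values, not the ones this repo uses.
from langchain.text_splitter import RecursiveCharacterTextSplitter

page_text = "Lorem ipsum dolor sit amet. " * 20
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
for chunk in splitter.split_text(page_text):
    start_char = page_text.find(chunk)  # first occurrence only
    end_char = start_char + len(chunk)
    print(start_char, end_char)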
@@ -119,7 +121,7 @@ class DocumentSummarizer:
 
         return chunks
 
-    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
+    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:  # This function is never used.
         """Create vector store with metadata"""
         texts = [chunk.content for chunk in chunks]
         metadatas = [{
@@ -136,7 +138,7 @@ class DocumentSummarizer:
         )
         return vector_store
 
-    def rerank_chunks(
+    def rerank_chunks(  # This function is never used.
         self,
         chunks: List[Dict],
         query: str,
@@ -180,7 +182,7 @@ class DocumentSummarizer:
             logging.error(f"Reranking failed: {str(e)}")
             return chunks[:k]  # Fallback to original ordering
 
-    def generate_summary_with_sources(
+    def generate_summary_with_sources(  # This function is never used.
         self,
         vector_store: Chroma,
         query: str = "Summarize the main points of this document"
@@ -256,7 +258,7 @@ class DocumentSummarizer:
 
         return structured_output
 
-    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
+    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:  # This function is never used.
         """Get extended context around a specific chunk"""
         metadata = self.chunk_metadata.get(chunk_id)
         if not metadata:
@@ -270,7 +272,7 @@
 
 class ContextualRetriever:
     def __init__(self, config: RetrievalConfig, claude_api_key: str, claude_context_model):
-        self.config = config
+        self.config = config  # Currently not used anywhere in this class; review whether it should be.
         self.claude_client = Anthropic(api_key=claude_api_key)
        self.logger = logging.getLogger(__name__)
        self.bm25 = None
@@ -293,12 +295,12 @@ class ContextualRetriever:
                 max_tokens=100,
                 messages=[{"role": "user", "content": prompt}]
             )
-            return response.content[0].text
+            return response.content[0].text  # response.content is a list of content blocks; the first block holds the model's reply text.
         except Exception as e:
             self.logger.error(f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}")
             return ""
 
-    def contextualize_chunks(self, full_text: str, chunks: List[DocumentChunk]) -> List[ContextualizedChunk]:
+    def contextualize_chunks(self, full_text: str, chunks: List[DocumentChunk]) -> List[ContextualizedChunk]:  # Takes each chunk and just adds a context property to it; the property is the response of the function above, which calls a Claude model to describe the chunk's context.
         """Add context to all chunks"""
         contextualized_chunks = []
         for chunk in chunks:
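The unpacking of response.content above follows the Anthropic Messages API, where content is a list of blocks. A hedged sketch of the call shape; the prompt is a placeholder, and the model name reuses the claude_context_model default visible in resumos/serializer.py below:

# Sketch of the Claude call whose response is unpacked above. response.content
# is a list of content blocks; block 0 carries the reply text.
from anthropic import Anthropic

client = Anthropic(api_key="...")
response = client.messages.create(
    model="claude-3-haiku-20240307",
    max_tokens=100,
    messages=[{"role": "user", "content": "Describe where this chunk fits in the document."}],
)
print(response.content[0].text)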
@@ -315,7 +317,7 @@
         return contextualized_chunks
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
-    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature):
+    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature, id_modelo_do_usuario):
         super().__init__(openai_api_key, os.environ.get("COHERE_API_KEY"), embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank)
         self.config = config
         self.contextual_retriever = ContextualRetriever(config, claude_api_key, claude_context_model)
@@ -323,6 +325,7 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         self.system_prompt = system_prompt
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
+        self.id_modelo_do_usuario = id_modelo_do_usuario
 
     def create_enhanced_vector_store(self, chunks: List[ContextualizedChunk]) -> Tuple[Chroma, BM25Okapi, List[str]]:
         """Create vector store and BM25 index with contextualized chunks"""
@@ -453,18 +456,27 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         prompt_template = self.system_prompt
 
+        url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
+        resposta = requests.get(url_request)
+
+        if (resposta.status_code != 200):
+            return Response({"error": "Ocorreu um problema. Pode ser que o modelo não tenha sido encontrado. Tente novamente e/ou entre em contato com a equipe técnica"})
+
+        modelo_buscado = resposta.json()["modelo"]
+
         prompt = PromptTemplate(
             template=prompt_template,
-            input_variables=["context"]
+            input_variables=["context", "modelo_usuario"]
         )
 
         llm = ChatOpenAI(
             temperature=self.gpt_temperature,
             model_name=self.gpt_model,
             api_key=self.openai_api_key,
+
         )
 
-        response = llm.predict(prompt.format(context="\n\n".join(contexts)))
+        response = llm.predict(prompt.format(context="\n\n".join(contexts), modelo_usuario=modelo_buscado))
 
         # Split the response into paragraphs
         summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
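The added block above is the core of this commit: fetch the user's modelo from the backend and pass it to the prompt as a second input variable. A hedged end-to-end sketch; the endpoint path and the "modelo" response key come from this diff, the id and template values are placeholders, and the sketch raises on failure instead of returning a DRF Response (which, from a non-view helper like this method, would reach the caller as an ordinary value rather than an HTTP response):

# End-to-end sketch of the new flow: fetch the user's "modelo" and inject it
# into the prompt next to the retrieved context. Sample values are placeholders.
import requests
from langchain.prompts import PromptTemplate

api_url = "https://luanpoppe-vella-backend.hf.space"
resposta = requests.get(f"{api_url}/modelo/123")  # 123 is a placeholder id
resposta.raise_for_status()  # the diff returns a DRF Response on non-200 instead
modelo_buscado = resposta.json()["modelo"]

prompt = PromptTemplate(
    template="Context: {context}\n\nModelo do usuário: {modelo_usuario}\n\nKey points:",
    input_variables=["context", "modelo_usuario"],
)
print(prompt.format(context="chunk one\n\nchunk two", modelo_usuario=modelo_buscado))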
@@ -515,7 +527,8 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         claude_context_model=serializer["claude_context_model"],
         system_prompt=serializer["system_prompt"],
         gpt_model=serializer["model"],
-        gpt_temperature=serializer["gpt_temperature"]
+        gpt_temperature=serializer["gpt_temperature"],
+        id_modelo_do_usuario=serializer["id_modelo_do_usuario"]
     )
 
     # # Load and process document
resumos/serializer.py CHANGED
@@ -37,17 +37,14 @@ system_prompt = """
 
 Context: {context}
 
+Modelo do usuário: {modelo_usuario}
+
 Key points:
 """
 user_message = "What are the main points of this document?"
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
-    # files = serializers.ListField(child=serializers.FileField(), required=True)
     system_prompt = serializers.CharField(required=False, default=system_prompt)
     user_message = serializers.CharField(required=False, default=user_message)
-    # model = serializers.CharField(required=False, default=default_model)
-    # hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-    # chunk_size = serializers.IntegerField(required=False, default=1000)
-    # chunk_overlap = serializers.IntegerField(required=False, default=200)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
@@ -57,4 +54,5 @@ class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
     model_cohere_rerank = serializers.CharField(required=False, default="rerank-english-v2.0")
     more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
     claude_context_model = serializers.CharField(required=False, default="claude-3-haiku-20240307")
-    gpt_temperature = serializers.FloatField(default=0)
+    gpt_temperature = serializers.FloatField(default=0)
+    id_modelo_do_usuario = serializers.IntegerField(required=True)
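Since id_modelo_do_usuario is required, requests that omit it will now fail validation. A hypothetical example payload; the field names come from this serializer and the values are placeholders:

# Example payload for the updated serializer; values are placeholders.
payload = {
    "system_prompt": "...",  # optional, falls back to the default above
    "user_message": "What are the main points of this document?",
    "model": "gpt-4o-mini",
    "gpt_temperature": 0,
    "id_modelo_do_usuario": 42,  # new, required
}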
setup/environment.py CHANGED
@@ -1,2 +1,4 @@
 default_model = "gpt-4o-mini"
-# default_model = "gpt-4o"
+# default_model = "gpt-4o"
+
+api_url = "https://luanpoppe-vella-backend.hf.space"