luanpoppe committed
Commit 12d3e1a · 1 Parent(s): 12b0dd7

feat: code improvements and refactorings

_utils/LLMs/LLM_class.py ADDED
@@ -0,0 +1,9 @@
+from setup.environment import default_model
+
+
+class LLM:
+    def __init__(self):
+        pass
+
+    # def create_GPT_model(self, model=default_model):
+    #     return ChatOpenAI()
_utils/chains/Chain_class.py ADDED
@@ -0,0 +1,11 @@
+class Chain:
+    def __init__(self, prompt, model):
+        self.prompt = prompt
+        self.model = model
+
+    def create_prompt_model_chain(self):
+        return self.prompt | self.model
+
+    def invoke_prompt_model_chain(self, invoke_params):
+        chain = self.create_prompt_model_chain()
+        return chain.invoke(invoke_params)
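A minimal usage sketch of the new Chain helper, assuming a LangChain prompt/model pair (the prompt text and model name are illustrative, not from this commit):

```python
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from _utils.chains.Chain_class import Chain

# Illustrative prompt/model pair; Chain only composes them with the LCEL
# pipe operator, so any runnable prompt + chat model will work.
prompt = ChatPromptTemplate.from_messages(
    [("system", "Answer briefly."), ("user", "{question}")]
)
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

chain = Chain(prompt, model)
answer = chain.invoke_prompt_model_chain({"question": "What does LCEL do?"})
print(answer.content)
```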
_utils/gerar_relatorio_modelo_usuario/DocumentSummarizer_simples.py CHANGED
@@ -1,18 +1,16 @@
-import os
 from typing import List, Dict, Tuple, Optional
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.chat_models import ChatOpenAI
-from langchain.prompts import PromptTemplate
-import uuid
+from _utils.splitters.Splitter_class import Splitter
+from setup.easy_imports import (
+    HuggingFaceEmbeddings,
+    Chroma,
+    ChatOpenAI,
+    PromptTemplate,
+)
 import logging
 from cohere import Client
 from _utils.models.gerar_relatorio import (
     DocumentChunk,
 )
-from langchain.schema import Document


 class DocumentSummarizer:
@@ -29,94 +27,10 @@ class DocumentSummarizer:
         self.openai_api_key = openai_api_key
         self.cohere_client = Client(cohere_api_key)
         self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-        self.text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap
-        )
-        self.chunk_metadata = {}  # Store chunk metadata for tracing
         self.num_k_rerank = num_k_rerank
         self.model_cohere_rerank = model_cohere_rerank

-    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
-        """Load PDF and split into chunks with metadata"""
-        loader = PyPDFLoader(pdf_path)
-        pages = (
-            loader.load()
-        )  # Produces a list of Document objects, each item corresponding to ONE full page of the PDF.
-        chunks = []
-        char_count = 0
-
-        for page in pages:
-            text = page.page_content
-            page_chunks = self.text_splitter.split_text(
-                text
-            )  # Breaks the one-page Document into a list whose items are chunks, pieces smaller than a page.
-
-            for chunk in page_chunks:
-                chunk_id = str(uuid.uuid4())
-                start_char = text.find(
-                    chunk
-                )  # Returns the position of the chunk within the full page
-                end_char = start_char + len(chunk)
-
-                doc_chunk = DocumentChunk(  # Builds the chunk object with extra information, such as the chunk's position and id
-                    content=chunk,
-                    page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                    chunk_id=chunk_id,
-                    start_char=char_count + start_char,
-                    end_char=char_count + end_char,
-                )
-                chunks.append(doc_chunk)
-
-                # Store metadata for later retrieval
-                self.chunk_metadata[chunk_id] = {
-                    "page": doc_chunk.page_number,
-                    "start_char": doc_chunk.start_char,
-                    "end_char": doc_chunk.end_char,
-                }
-
-            char_count += len(text)
-
-        return chunks
-
-    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
-        """Load text and split into chunks with metadata - created only for ragas"""
-        page = Document(page_content=text, metadata={"page": 1})
-        chunks = []
-        char_count = 0
-
-        text = page.page_content
-        page_chunks = self.text_splitter.split_text(
-            text
-        )  # Breaks the one-page Document into a list whose items are chunks, pieces smaller than a page.
-        print("\n\n\n")
-        print("page_chunks: ", page_chunks)
-
-        for chunk in page_chunks:
-            chunk_id = str(uuid.uuid4())
-            start_char = text.find(
-                chunk
-            )  # Returns the position of the chunk within the full page
-            end_char = start_char + len(chunk)
-
-            doc_chunk = DocumentChunk(  # Builds the chunk object with extra information, such as the chunk's position and id
-                content=chunk,
-                page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                chunk_id=chunk_id,
-                start_char=char_count + start_char,
-                end_char=char_count + end_char,
-            )
-            chunks.append(doc_chunk)
-
-            # Store metadata for later retrieval
-            self.chunk_metadata[chunk_id] = {
-                "page": doc_chunk.page_number,
-                "start_char": doc_chunk.start_char,
-                "end_char": doc_chunk.end_char,
-            }
-
-        char_count += len(text)
-
-        return chunks
+        self.splitter = Splitter(chunk_size, chunk_overlap)

     def create_vector_store(
         self, chunks: List[DocumentChunk]
@@ -233,7 +147,7 @@ class DocumentSummarizer:
             temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
         )

-        response = llm.predict(prompt.format(context="\n\n".join(contexts)))
+        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content

         # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
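Several files now pull their third-party names from setup/easy_imports.py, which is not shown in this commit. A plausible sketch of that module, inferred purely from the names imported from it across this diff (the upstream source paths are assumptions):

```python
# setup/easy_imports.py -- hypothetical reconstruction; only the exported
# names are attested by this diff, the source modules are best guesses.
from langchain.chains import create_extraction_chain
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from rank_bm25 import BM25Okapi
from rest_framework.response import Response
```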
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py CHANGED
@@ -1,22 +1,23 @@
 import os
 from typing import List, Dict, Tuple, Optional
-from langchain_community.vectorstores import Chroma
-from langchain_community.chat_models import ChatOpenAI
-from langchain.chains import create_extraction_chain
-from langchain.prompts import PromptTemplate
-from rank_bm25 import BM25Okapi
+from _utils.vector_stores.Vector_store_class import VectorStore
+from setup.easy_imports import (
+    Chroma,
+    ChatOpenAI,
+    PromptTemplate,
+    BM25Okapi,
+    Response,
+)
 import logging
 import requests
 from _utils.gerar_relatorio_modelo_usuario.DocumentSummarizer_simples import (
     DocumentSummarizer,
 )
 from _utils.models.gerar_relatorio import (
-    ContextualizedChunk,
     RetrievalConfig,
 )
 from modelos_usuarios.serializer import ModeloUsuarioSerializer
 from setup.environment import api_url
-from rest_framework.response import Response
 from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
     ContextualRetriever,
 )
@@ -24,6 +25,7 @@ from asgiref.sync import sync_to_async


 class EnhancedDocumentSummarizer(DocumentSummarizer):
+
     def __init__(
         self,
         openai_api_key: str,
@@ -35,12 +37,12 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         num_k_rerank,
         model_cohere_rerank,
         claude_context_model,
-        prompt_relatorio,
+        prompt_auxiliar,
         gpt_model,
         gpt_temperature,
         id_modelo_do_usuario,
-        prompt_modelo,
-        reciprocal_rank_fusion
+        prompt_gerar_documento,
+        reciprocal_rank_fusion,
     ):
         super().__init__(
             openai_api_key,
@@ -56,58 +58,15 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             config, claude_api_key, claude_context_model
         )
         self.logger = logging.getLogger(__name__)
-        self.prompt_relatorio = prompt_relatorio
+        self.prompt_auxiliar = prompt_auxiliar
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
         self.id_modelo_do_usuario = id_modelo_do_usuario
-        self.prompt_modelo = prompt_modelo
+        self.prompt_gerar_documento = prompt_gerar_documento
         self.reciprocal_rank_fusion = reciprocal_rank_fusion
         self.resumo_gerado = ""

-    def create_enhanced_vector_store(
-        self, chunks: List[ContextualizedChunk], is_contextualized_chunk
-    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
-        """Create vector store and BM25 index with contextualized chunks"""
-        try:
-            # Prepare texts with context
-            if is_contextualized_chunk:
-                texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
-            else:
-                texts = [f"{chunk.content}" for chunk in chunks]
-
-            # Create vector store
-            metadatas = []
-            for chunk in chunks:
-                if is_contextualized_chunk:
-                    context = chunk.context
-                else:
-                    context = ""
-                metadatas.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "page": chunk.page_number,
-                        "start_char": chunk.start_char,
-                        "end_char": chunk.end_char,
-                        "context": context,
-                    }
-                )
-
-            vector_store = Chroma.from_texts(
-                texts=texts, metadatas=metadatas, embedding=self.embeddings
-            )
-
-            # Create BM25 index
-            tokenized_texts = [text.split() for text in texts]
-            bm25 = BM25Okapi(tokenized_texts)
-
-            # Get chunk IDs in order
-            chunk_ids = [chunk.chunk_id for chunk in chunks]
-
-            return vector_store, bm25, chunk_ids
-
-        except Exception as e:
-            self.logger.error(f"Error creating enhanced vector store: {str(e)}")
-            raise
+        self.vector_store = VectorStore(embedding_model)

     def retrieve_with_rank_fusion(
         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
@@ -254,25 +213,25 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         )

         prompt_gerar_relatorio = PromptTemplate(
-            template=self.prompt_relatorio, input_variables=["context"]
+            template=self.prompt_auxiliar, input_variables=["context"]
         )

-        relatorio_gerado = llm.predict(
+        relatorio_gerado = llm.invoke(
             prompt_gerar_relatorio.format(context="\n\n".join(contexts))
         )

-        self.resumo_gerado = relatorio_gerado
+        self.resumo_gerado = relatorio_gerado.content

         prompt_gerar_modelo = PromptTemplate(
-            template=self.prompt_modelo,
+            template=self.prompt_gerar_documento,
             input_variables=["context", "modelo_usuario"],
         )

-        modelo_gerado = llm.predict(
+        modelo_gerado = llm.invoke(
             prompt_gerar_modelo.format(
                 context=relatorio_gerado, modelo_usuario=serializer.data["modelo"]
             )
-        )
+        ).content

         # Split the response into paragraphs
         summaries = [p.strip() for p in modelo_gerado.split("\n\n") if p.strip()]
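The removed create_enhanced_vector_store method moves into the new VectorStore helper; its file, _utils/vector_stores/Vector_store_class.py, is not part of this diff. A sketch of what it presumably contains, reconstructed from the deleted method — the constructor signature follows the VectorStore(embedding_model) call above, everything else is an assumption:

```python
from typing import List, Tuple

from setup.easy_imports import BM25Okapi, Chroma, HuggingFaceEmbeddings
from _utils.models.gerar_relatorio import ContextualizedChunk


class VectorStore:
    def __init__(self, embedding_model):
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self, chunks: List[ContextualizedChunk], is_contextualized_chunk
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        # Prepend the generated context to each chunk when contextualized
        if is_contextualized_chunk:
            texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
        else:
            texts = [chunk.content for chunk in chunks]

        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
                "context": chunk.context if is_contextualized_chunk else "",
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        bm25 = BM25Okapi([text.split() for text in texts])  # BM25 over raw tokens
        chunk_ids = [chunk.chunk_id for chunk in chunks]
        return vector_store, bm25, chunk_ids
```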
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -1,4 +1,11 @@
 import os
+# from _utils.gerar_relatorio_modelo_usuario.prompts import (
+#     prompt_auxiliar_do_contextual_prompt,
+# )
+from _utils.chains.Chain_class import Chain
+from _utils.prompts.Prompt_class import Prompt
+from _utils.splitters.Splitter_class import Splitter
+from setup.easy_imports import PyPDFLoader
 from langchain_openai import ChatOpenAI
 from typing import List, Dict, Tuple, Optional
 from anthropic import Anthropic, AsyncAnthropic
@@ -12,7 +19,7 @@ from dataclasses import dataclass
 from langchain_core.messages import HumanMessage
 from asgiref.sync import sync_to_async

-from _utils.gerar_relatorio_modelo_usuario.llm_calls import claude_answer, gpt_answer
+from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
 from _utils.models.gerar_relatorio import (
     ContextualizedChunk,
@@ -39,11 +46,11 @@ class ContextualRetriever:
         try:
             print("COMEÇOU A REQUISIÇÃO")
             prompt = contextual_prompt(full_text, chunk.content)
-            # response = await claude_answer(
+            # response = await aclaude_answer(
             #     self.claude_client, self.claude_context_model, prompt
             # )

-            response = await gpt_answer(prompt)
+            response = await agpt_answer(prompt)
             return response
         except Exception as e:
             self.logger.error(
@@ -51,6 +58,13 @@ class ContextualRetriever:
             )
             return ""

+    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
+    #     prompt = Prompt().create_prompt_template(
+    #         "", prompt_auxiliar_do_contextual_prompt
+    #     )
+    #     Chain(prompt, ChatOpenAI())
+    #     return
+
     async def create_contextualized_chunk(self, chunk, full_text):
         lista_contador.append(0)
         print("contador: ", len(lista_contador))
@@ -90,3 +104,40 @@ class ContextualRetriever:
         contextualized_chunks = [task.result() for task in tasks]

         return contextualized_chunks
+
+
+def get_full_text_and_all_PDFs_chunks(contexto, listaPDFs, splitterObject: Splitter):
+    all_PDFs_chunks = []
+    full_text = ""
+    if contexto:
+        full_text = contexto
+        chunks = splitterObject.load_and_split_text(full_text)
+        all_PDFs_chunks = chunks
+    else:
+        # Load and process document
+        for pdf in listaPDFs:
+            pdf_path = pdf
+            chunks = splitterObject.load_and_split_document(pdf_path)
+            all_PDFs_chunks = all_PDFs_chunks + chunks
+            # Get full text for contextualization
+            loader = PyPDFLoader(pdf_path)
+            pages = loader.load()
+            full_text = " ".join([page.page_content for page in pages])
+
+    return full_text, all_PDFs_chunks, pages
+
+
+async def contextualize_chunk_based_on_serializer(
+    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
+):
+    if serializer["should_have_contextual_chunks"]:
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            pages, all_PDFs_chunks
+        )
+        chunks_passados = contextualized_chunks
+        is_contextualized_chunk = True
+    else:
+        chunks_passados = all_PDFs_chunks
+        is_contextualized_chunk = False
+
+    return chunks_passados, is_contextualized_chunk
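A hedged sketch of how these two new module-level helpers chain together, mirroring their use in _utils/resumo_completo_cursor.py below (file path and parameter values are illustrative). Note that `pages` is only assigned on the PDF branch of get_full_text_and_all_PDFs_chunks, so the `contexto` path relies on the caller not consuming it:

```python
from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    contextualize_chunk_based_on_serializer,
    get_full_text_and_all_PDFs_chunks,
)
from _utils.splitters.Splitter_class import Splitter


async def exemplo(retriever):  # retriever: a configured ContextualRetriever
    full_text, all_chunks, pages = get_full_text_and_all_PDFs_chunks(
        contexto=None,  # pass raw text here (the ragas test path) instead of PDFs
        listaPDFs=["./processo.pdf"],
        splitterObject=Splitter(chunk_size=1000, chunk_overlap=200),
    )
    chunks, is_contextualized = await contextualize_chunk_based_on_serializer(
        {"should_have_contextual_chunks": True}, retriever, pages, all_chunks
    )
    return full_text, chunks, is_contextualized
```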
_utils/gerar_relatorio_modelo_usuario/llm_calls.py CHANGED
@@ -1,12 +1,11 @@
 import os
+from setup.environment import default_model
 from langchain_core.messages import HumanMessage
 from langchain_openai import ChatOpenAI


-async def claude_answer(claude_client, claude_context_model, prompt):
-    print("\n")
-    print("Começou uma requisição pelo Claude")
-    print("\n")
+async def aclaude_answer(claude_client, claude_context_model, prompt):
+    print("\n\nComeçou uma requisição pelo Claude")
     response = await claude_client.messages.create(
         model=claude_context_model,
         max_tokens=100,
@@ -17,7 +16,7 @@ async def claude_answer(claude_client, claude_context_model, prompt):
     ].text  # response.content is a list: a list of messages is passed in and a list of messages is returned, the first one being the most recent, i.e. the model's reply


-async def gpt_answer(prompt):
+async def agpt_answer(prompt):
     gpt = ChatOpenAI(
         temperature=0,
         model="gpt-4o-mini",
@@ -26,3 +25,23 @@ async def gpt_answer(prompt):
     )
     response = await gpt.ainvoke([HumanMessage(content=prompt)])
     return response.content
+
+
+def gpt_answer(
+    prompt,
+    temperature=0,
+    model=default_model,
+    max_retries=5,
+    shouldReturnFullResponse=False,
+):
+    gpt = ChatOpenAI(
+        temperature=temperature,
+        model=model,
+        api_key=os.environ.get("OPENAI_API_KEY"),
+        max_retries=max_retries,
+    )
+    response = gpt.invoke([HumanMessage(content=prompt)])
+    if shouldReturnFullResponse:
+        return response
+    else:
+        return response.content
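A quick usage sketch of the new synchronous gpt_answer helper (assumes OPENAI_API_KEY is set; the prompts are illustrative):

```python
from _utils.gerar_relatorio_modelo_usuario.llm_calls import gpt_answer

# Returns just the text by default...
text = gpt_answer("Resuma o caso em uma frase.")

# ...or the full AIMessage (with metadata) when requested.
message = gpt_answer(
    "Resuma o caso em uma frase.",
    temperature=0.2,
    shouldReturnFullResponse=True,
)
print(message.content)
```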
_utils/gerar_relatorio_modelo_usuario/prompts.py CHANGED
@@ -17,12 +17,55 @@ Please return only the succinct context (without displaying your internal reasoning):
 ```
 """

-system_prompt_modelo = """
-You are a large language model that must produce a single final sentence in **Portuguese**. To do this, you will follow a private chain of thought and then produce a final answer. The final answer must follow the formatting and stylistic conventions shown in the user-provided model `user's template`. The information to be included in the final sentence is derived from the `context` (a report describing a legal case).
+# New name --> prompt_auxiliar --> for generating documents (used as an auxiliary to the final prompt)
+prompt_auxiliar = """
+You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+**Instructions**:
+1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+2. **Reading the Context**: Extract the following information from `context`:
+- The name of the defendant (réu).
+- The crime they have been accused of (nome_do_crime).
+- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+- The date the accusation was accepted (data_do_recebimento).
+- The ID of the decision document (id_do_documento).
+3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+```
+<relatorio>
+Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+Não há outras causas interruptivas ou suspensivas da prescrição.
+</relatorio>
+```
+5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+**Reminder**:
+- Do not include your chain of thought in the final output.
+- Do not add extra information or commentary beyond the specified format.
+- The final answer must be in Portuguese.
+---
+
+**Contextual Information (provided separately):**
+{context}
+---
+**Example with a given context**:
+- Input:
+`context` = 'Em 10/03/2021, o Ministério Público denunciou João da Silva, imputando-lhe o crime de furto qualificado, previsto no art. 155, §4º, inciso II, do Código Penal. A denúncia foi recebida em 12/03/2021, conforme Decisão nº 20210312-01. Não há menção a qualquer causa interruptiva ou suspensiva da prescrição.'
+- Expected final answer:
+```
+<formato>
+Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+Não há outras causas interruptivas ou suspensivas da prescrição.
+</formato>
+"""
+
+# New name --> prompt_gerar_documento --> for generating documents
+prompt_gerar_documento = """
+You are a large language model that must produce a single final document in **Portuguese**. To do this, you will follow a private chain of thought and then produce a final answer. The final answer must follow the formatting and stylistic conventions shown in the user-provided model `user's template`. The information to be included in the final document is derived from the `context` (a report describing a legal case).
 **Contextual Information (provided separately):**
 {context}
 **User Model (provided separately):**
-<user's_template>{modelo_usuario}</user's_template>
+<user's_template>PROMPT DO MODELO DO USUÁRIO</user's_template>
 **Instructions:**
 1. **Goal:** Produce one single final sentence in Portuguese that matches the structure, format, and style given by `user's template`.
 2. **Chain of Thought (private to the assistant and not to be shown in the final answer):**
@@ -46,8 +89,7 @@ After composing the sentence, but before presenting it as the final answer, reflect:
 - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
 """

-system_prompt_relatorio = """
-You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
 **Instructions**:
 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
 2. **Reading the Context**: Extract the following information from `context`:
@@ -59,26 +101,17 @@ You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese.
 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
 ```
-<relatorio>
+<formato>
 Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
 A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
 Não há outras causas interruptivas ou suspensivas da prescrição.
-</relatorio>
+</formato>
 ```
 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
 **Reminder**:
 - Do not include your chain of thought in the final output.
 - Do not add extra information or commentary beyond the specified format.
 - The final answer must be in Portuguese.
----
-
-**Contextual Information (provided separately):**
-{context}
----
-**Example with a given context**:
-- Input:
-`context` = "Em 10/03/2021, o Ministério Público denunciou João da Silva, imputando-lhe o crime de furto qualificado, previsto no art. 155, §4º, inciso II, do Código Penal. A denúncia foi recebida em 12/03/2021, conforme Decisão nº 20210312-01. Não há menção a qualquer causa interruptiva ou suspensiva da prescrição."
-- Expected final answer:
 ```
 <formato>
 Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
@@ -86,3 +119,61 @@ A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
 Não há outras causas interruptivas ou suspensivas da prescrição.
 </formato>
 """
+
+prompt_auxiliar_do_contextual_prompt = """Você é um assistente jurídico especializado em direito brasileiro. Sua tarefa é criar um resumo conciso e informativo de um processo jurídico, de acordo com as leis do Brasil. O resumo deve focar nos momentos cruciais do processo, na última movimentação processual e nas principais movimentações que ocorreram.
+
+Aqui estão as 10 principais peças processuais em ordem cronológica do processo civil brasileiro que você deve priorizar em sua análise:
+1. Petição Inicial
+2. Contestação
+3. Réplica
+4. Decisão de Saneamento
+5. Sentença
+6. Recurso de Apelação
+7. Embargos de Declaração
+8. Cumprimento de Sentença
+9. Embargos à Execução
+10. Agravo de Instrumento
+
+Siga este passo a passo para criar o resumo:
+
+1. Leia atentamente todo o processo jurídico fornecido.
+<processo_juridico>
+{{PROCESSO_JURIDICO}}
+</processo_juridico>
+
+2. Identifique e anote as datas e conteúdos relevantes relacionados às 10 peças processuais listadas acima.
+
+3. Organize cronologicamente as informações coletadas.
+
+4. Destaque a última movimentação processual e seu significado para o andamento do processo.
+
+5. Resuma as principais movimentações, focando em seu impacto no processo.
+
+6. Elabore um texto coeso que apresente o fluxo do processo, destacando os pontos cruciais e as decisões mais importantes.
+
+Após criar o resumo inicial, utilize a técnica socrática de reflexão para garantir a precisão e completude do resumo. Faça a si mesmo as seguintes perguntas:
+
+1. O resumo abrange todas as 10 peças processuais principais?
+2. A última movimentação processual está claramente identificada e explicada?
+3. O texto apresenta uma visão clara do fluxo do processo?
+4. Todas as informações cruciais para o entendimento do caso estão incluídas?
+5. O resumo está livre de opiniões pessoais e se atém aos fatos do processo?
+6. A linguagem utilizada é clara e acessível, mesmo para quem não é especialista em direito?
+
+Revise e ajuste o resumo conforme necessário com base nessa reflexão.
+
+O resumo final deve ter no máximo 2 páginas de extensão (aproximadamente 1000 palavras).
+
+Formate sua resposta da seguinte maneira:
+
+<resumo_processo>
+[Insira aqui o resumo do processo jurídico]
+</resumo_processo>
+
+<reflexao_socratica>
+[Insira aqui suas respostas às perguntas da reflexão socrática]
+</reflexao_socratica>
+
+<resumo_final>
+[Insira aqui o resumo final revisado, se houver alterações após a reflexão]
+</resumo_final>"""
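These module-level strings are consumed as plain PromptTemplates elsewhere in this commit (see EnhancedDocumentSummarizer above). A minimal sketch of filling prompt_auxiliar, with an illustrative context value:

```python
from setup.easy_imports import PromptTemplate
from _utils.gerar_relatorio_modelo_usuario.prompts import prompt_auxiliar

# {context} is the only input variable in prompt_auxiliar.
template = PromptTemplate(template=prompt_auxiliar, input_variables=["context"])
texto = template.format(
    context="Em 10/03/2021, o Ministério Público denunciou João da Silva ..."
)
```

The Portuguese strings themselves are kept verbatim: they are functional prompt data whose output format must be in Portuguese.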
_utils/gerar_relatorio_modelo_usuario/utils.py ADDED
@@ -0,0 +1,22 @@
+def gerar_resposta_compilada(serializer):
+    return {
+        "num_chunks_retrieval": serializer["num_chunks_retrieval"],
+        "embedding_weight": serializer["embedding_weight"],
+        "bm25_weight": serializer["bm25_weight"],
+        "context_window": serializer["context_window"],
+        "chunk_overlap": serializer["chunk_overlap"],
+        "num_k_rerank": serializer["num_k_rerank"],
+        "model_cohere_rerank": serializer["model_cohere_rerank"],
+        "more_initial_chunks_for_reranking": serializer[
+            "more_initial_chunks_for_reranking"
+        ],
+        "claude_context_model": serializer["claude_context_model"],
+        "gpt_temperature": serializer["gpt_temperature"],
+        "user_message": serializer["user_message"],
+        "model": serializer["model"],
+        "hf_embedding": serializer["hf_embedding"],
+        "chunk_size": serializer["chunk_size"],
+        "chunk_overlap": serializer["chunk_overlap"],
+        "prompt_auxiliar": serializer["prompt_auxiliar"],
+        "prompt_gerar_documento": serializer["prompt_gerar_documento"],
+    }
_utils/prompts/Prompt_class.py ADDED
@@ -0,0 +1,12 @@
+from setup.easy_imports import ChatPromptTemplate
+
+
+class Prompt:
+    def __init__(self):
+        pass
+
+    def create_prompt_template(self, system_prompt, user_prompt):
+        prompt_template = ChatPromptTemplate.from_messages(
+            [("system", system_prompt), ("user", user_prompt)]
+        )
+        return prompt_template
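The new Prompt and Chain classes are built to compose; a hedged sketch of the intended pairing (model choice and prompt text are illustrative, not from this commit):

```python
from langchain_openai import ChatOpenAI

from _utils.chains.Chain_class import Chain
from _utils.prompts.Prompt_class import Prompt

prompt = Prompt().create_prompt_template(
    "Você é um assistente jurídico.",  # system prompt
    "Resuma: {texto}",                 # user prompt with one input variable
)
chain = Chain(prompt, ChatOpenAI(model="gpt-4o-mini", temperature=0))
print(chain.invoke_prompt_model_chain({"texto": "..."}).content)
```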
_utils/resumo_completo_cursor.py CHANGED
@@ -1,9 +1,13 @@
 import os
-from langchain_community.document_loaders import PyPDFLoader
-import json
+from _utils.gerar_relatorio_modelo_usuario.prompts import prompt_auxiliar_SEM_CONTEXT
 from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
     EnhancedDocumentSummarizer,
 )
+from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
+    contextualize_chunk_based_on_serializer,
+    get_full_text_and_all_PDFs_chunks,
+)
+from _utils.gerar_relatorio_modelo_usuario.utils import gerar_resposta_compilada
 from _utils.models.gerar_relatorio import (
     RetrievalConfig,
 )
@@ -38,7 +42,6 @@ async def get_llm_summary_answer_by_cursor_complete(
     serializer, listaPDFs=None, contexto=None
 ):
     """The "contexto" parameter should only be passed when running the ragas test and therefore not passing PDFs"""
-    allPdfsChunks = []
     # Configuration
     config = RetrievalConfig(
         num_chunks=serializer["num_chunks_retrieval"],
@@ -59,82 +62,31 @@ async def get_llm_summary_answer_by_cursor_complete(
         num_k_rerank=serializer["num_k_rerank"],
         model_cohere_rerank=serializer["model_cohere_rerank"],
         claude_context_model=serializer["claude_context_model"],
-        prompt_relatorio=serializer["prompt_relatorio"],
+        prompt_auxiliar=serializer["prompt_auxiliar"],
         gpt_model=serializer["model"],
         gpt_temperature=serializer["gpt_temperature"],
         id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
-        prompt_modelo=serializer["prompt_modelo"],
+        prompt_gerar_documento=serializer["prompt_gerar_documento"],
         reciprocal_rank_fusion=reciprocal_rank_fusion,
     )

-    full_text = ""
-    if contexto:
-        full_text = contexto
-        chunks = summarizer.load_and_split_text(full_text)
-        allPdfsChunks = chunks
-    else:
-        # # Load and process document
-        # pdf_path = "./Im_a_storyteller.pdf"
-        # chunks = summarizer.load_and_split_document(pdf_path)
-
-        # Load and process document
-        for pdf in listaPDFs:
-            pdf_path = pdf
-            chunks = summarizer.load_and_split_document(pdf_path)
-            allPdfsChunks = allPdfsChunks + chunks
-
-            # Get full text for contextualization
-            loader = PyPDFLoader(pdf_path)
-            pages = loader.load()
-            full_text = " ".join([page.page_content for page in pages])
-    # Contextualize chunks
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = (
-            await summarizer.contextual_retriever.contextualize_all_chunks(
-                pages, allPdfsChunks
-            )
-        )
-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = allPdfsChunks
-        is_contextualized_chunk = False
+    full_text, allPdfsChunks, pages = get_full_text_and_all_PDFs_chunks(
+        contexto, listaPDFs, summarizer.splitter
+    )
+
+    chunks_passados, is_contextualized_chunk = (
+        await contextualize_chunk_based_on_serializer(
+            serializer, summarizer.contextual_retriever, pages, allPdfsChunks
+        )
+    )

     # Create enhanced vector store and BM25 index
-    vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
-        chunks_passados, is_contextualized_chunk
-    )
+    vector_store, bm25, chunk_ids = (
+        summarizer.vector_store.create_enhanced_vector_store(
+            chunks_passados, is_contextualized_chunk
+        )
+    )

-    prompt_resumo_sem_context = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
-**Instructions**:
-1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
-2. **Reading the Context**: Extract the following information from `context`:
-- The name of the defendant (réu).
-- The crime they have been accused of (nome_do_crime).
-- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
-- The date the accusation was accepted (data_do_recebimento).
-- The ID of the decision document (id_do_documento).
-3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
-4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
-A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
-Não há outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-```
-5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
-**Reminder**:
-- Do not include your chain of thought in the final output.
-- Do not add extra information or commentary beyond the specified format.
-- The final answer must be in Portuguese.
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
-A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
-Não há outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-"""
     # Generate enhanced summary
     structured_summaries = await summarizer.generate_enhanced_summary(
         vector_store,
@@ -142,7 +94,7 @@ async def get_llm_summary_answer_by_cursor_complete(
         chunk_ids
         # , serializer["user_message"]
         ,
-        prompt_resumo_sem_context,
+        prompt_auxiliar_SEM_CONTEXT,
     )

     if not isinstance(structured_summaries, list):
@@ -150,41 +102,15 @@ async def get_llm_summary_answer_by_cursor_complete(

         return Response({"erro": structured_summaries})

-    # Output results as JSON
-    # json_output = json.dumps(structured_summaries, indent=2)
-    # print("\nStructured Summaries:")
-    # print(json_output)
-    texto_completo = ""
-    print("\n\n\n")
-    print("summarizer.resumo_gerado: ", summarizer.resumo_gerado)
-    texto_completo += summarizer.resumo_gerado
-    texto_completo += "\n\n"
-    print("\n\n\n")
-    print("structured_summaries: ", structured_summaries)
+    texto_completo = summarizer.resumo_gerado + "\n\n"
+
     for x in structured_summaries:
         texto_completo = texto_completo + x["content"] + "\n"
+
+    print("\n\ntexto_completo: ", texto_completo)
+
     return {
         "resultado": structured_summaries,
         "texto_completo": texto_completo,
-        "parametros-utilizados": {
-            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
-            "embedding_weight": serializer["embedding_weight"],
-            "bm25_weight": serializer["bm25_weight"],
-            "context_window": serializer["context_window"],
-            "chunk_overlap": serializer["chunk_overlap"],
-            "num_k_rerank": serializer["num_k_rerank"],
-            "model_cohere_rerank": serializer["model_cohere_rerank"],
-            "more_initial_chunks_for_reranking": serializer[
-                "more_initial_chunks_for_reranking"
-            ],
-            "claude_context_model": serializer["claude_context_model"],
-            "gpt_temperature": serializer["gpt_temperature"],
-            "user_message": serializer["user_message"],
-            "model": serializer["model"],
-            "hf_embedding": serializer["hf_embedding"],
-            "chunk_size": serializer["chunk_size"],
-            "chunk_overlap": serializer["chunk_overlap"],
-            "prompt_relatorio": serializer["prompt_relatorio"],
-            "prompt_modelo": serializer["prompt_modelo"],
-        },
+        "parametros-utilizados": gerar_resposta_compilada(serializer),
     }
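A hedged sketch of invoking the refactored entry point. The serializer keys are the ones this module and the classes it builds read in this diff; all values are illustrative, and other keys may be consumed elsewhere:

```python
import asyncio

from _utils.resumo_completo_cursor import get_llm_summary_answer_by_cursor_complete
from _utils.gerar_relatorio_modelo_usuario.prompts import (
    prompt_auxiliar,
    prompt_gerar_documento,
)

serializer = {
    "num_chunks_retrieval": 20,
    "embedding_weight": 0.5,
    "bm25_weight": 0.5,
    "context_window": 3,
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "num_k_rerank": 5,
    "model_cohere_rerank": "rerank-multilingual-v3.0",  # illustrative
    "more_initial_chunks_for_reranking": 20,
    "claude_context_model": "claude-3-haiku-20240307",  # illustrative
    "gpt_temperature": 0,
    "user_message": "",
    "model": "gpt-4o-mini",
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # illustrative
    "id_modelo_do_usuario": 1,
    "should_have_contextual_chunks": False,
    "prompt_auxiliar": prompt_auxiliar,
    "prompt_gerar_documento": prompt_gerar_documento,
}

resultado = asyncio.run(
    get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=["./processo.pdf"])
)
```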
_utils/resumo_simples_cursor.py CHANGED
@@ -1,221 +1,234 @@
1
  import os
2
  from typing import List, Dict, Tuple
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain.document_loaders import PyPDFLoader
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
- from langchain.vectorstores import Chroma
7
- from langchain.chat_models import ChatOpenAI
8
- from langchain.chains import create_extraction_chain
9
- from langchain.prompts import PromptTemplate
 
 
10
  from dataclasses import dataclass
11
  import uuid
12
  import json
13
  from langchain_huggingface import HuggingFaceEndpoint
14
  from setup.environment import default_model
15
 
16
- os.environ["LANGCHAIN_TRACING_V2"]="true"
17
- os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
18
  os.environ.get("LANGCHAIN_API_KEY")
19
- os.environ["LANGCHAIN_PROJECT"]="VELLA"
 
20
 
21
  @dataclass
22
  class DocumentChunk:
23
- content: str
24
- page_number: int
25
- chunk_id: str
26
- start_char: int
27
- end_char: int
 
28
 
29
  class DocumentSummarizer:
30
- def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
31
- self.model = model
32
- self.system_prompt = system_prompt
33
- self.openai_api_key = openai_api_key
34
- self.embeddings = HuggingFaceEmbeddings(
35
- model_name=embedding
36
- )
37
- self.text_splitter = RecursiveCharacterTextSplitter(
38
- chunk_size=chunk_config["size"],
39
- chunk_overlap=chunk_config["overlap"]
40
- )
41
- self.chunk_metadata = {} # Store chunk metadata for tracing
42
-
43
- def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
44
- """Load PDF and split into chunks with metadata"""
45
- loader = PyPDFLoader(pdf_path)
46
- pages = loader.load()
47
- chunks = []
48
- char_count = 0
49
-
50
- for page in pages:
51
- text = page.page_content
52
- # Split the page content
53
- page_chunks = self.text_splitter.split_text(text)
54
-
55
- for chunk in page_chunks:
56
- chunk_id = str(uuid.uuid4())
57
- start_char = text.find(chunk)
58
- end_char = start_char + len(chunk)
59
-
60
- doc_chunk = DocumentChunk(
61
- content=chunk,
62
- page_number=page.metadata.get('page') + 1, # 1-based page numbering
63
- chunk_id=chunk_id,
64
- start_char=char_count + start_char,
65
- end_char=char_count + end_char
66
- )
67
- chunks.append(doc_chunk)
68
-
69
- # Store metadata for later retrieval
70
- self.chunk_metadata[chunk_id] = {
71
- 'page': doc_chunk.page_number,
72
- 'start_char': doc_chunk.start_char,
73
- 'end_char': doc_chunk.end_char
74
- }
75
-
76
- char_count += len(text)
77
-
78
- return chunks
79
-
80
- def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
81
- """Create vector store with metadata"""
82
- texts = [chunk.content for chunk in chunks]
83
- metadatas = [{
84
- 'chunk_id': chunk.chunk_id,
85
- 'page': chunk.page_number,
86
- 'start_char': chunk.start_char,
87
- 'end_char': chunk.end_char
88
- } for chunk in chunks]
89
-
90
- vector_store = Chroma.from_texts(
91
- texts=texts,
92
- metadatas=metadatas,
93
- embedding=self.embeddings
94
- )
95
- return vector_store
96
-
97
- def generate_summary_with_sources(
98
- self,
99
- vector_store: Chroma,
100
- query: str = "Summarize the main points of this document"
101
- ) -> List[Dict]:
102
- """Generate summary with source citations, returning structured JSON data"""
103
- # Retrieve relevant chunks with metadata
104
- relevant_docs = vector_store.similarity_search_with_score(query, k=5)
105
-
106
- # Prepare context and track sources
107
- contexts = []
108
- sources = []
109
-
110
- for doc, score in relevant_docs:
111
- chunk_id = doc.metadata['chunk_id']
112
- context = doc.page_content
113
- contexts.append(context)
114
-
115
- sources.append({
116
- 'content': context,
117
- 'page': doc.metadata['page'],
118
- 'chunk_id': chunk_id,
119
- 'relevance_score': score
120
- })
121
-
122
- prompt = PromptTemplate(
123
- template=self.system_prompt,
124
- input_variables=["context"]
125
- )
126
- llm = ""
127
-
128
- if (self.model == default_model):
129
- llm = ChatOpenAI(
130
- temperature=0,
131
- model_name="gpt-4o-mini",
132
- api_key=self.openai_api_key
133
  )
134
- else:
135
- llm = HuggingFaceEndpoint(
136
- repo_id=self.model,
137
- task="text-generation",
138
- max_new_tokens=1100,
139
- do_sample=False,
140
- huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
-
144
- response = llm.predict(prompt.format(context="\n\n".join(contexts)))
145
-
146
- # Split the response into paragraphs
147
- summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
148
-
149
- # Create structured output
150
- structured_output = []
151
- for idx, summary in enumerate(summaries):
152
- # Associate each summary with the most relevant source
153
- structured_output.append({
154
- "content": summary,
155
- "source": {
156
- "page": sources[min(idx, len(sources)-1)]['page'],
157
- "text": sources[min(idx, len(sources)-1)]['content'][:200] + "...",
158
- "relevance_score": sources[min(idx, len(sources)-1)]['relevance_score']
159
- }
160
- })
161
-
162
- return structured_output
163
-
164
- def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
165
- """Get extended context around a specific chunk"""
166
- metadata = self.chunk_metadata.get(chunk_id)
167
- if not metadata:
168
- return None
169
-
170
- return {
171
- 'page': metadata['page'],
172
- 'start_char': metadata['start_char'],
173
- 'end_char': metadata['end_char']
174
- }
175
 
176
  def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
177
- # By Luan
178
- allPdfsChunks = []
179
-
180
- # Initialize summarizer
181
- summarizer = DocumentSummarizer(
182
- openai_api_key=os.environ.get("OPENAI_API_KEY"),
183
- embedding=serializer["hf_embedding"],
184
- chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
185
- system_prompt=serializer["system_prompt"],
186
- model=serializer["model"]
187
- )
188
-
189
- # Load and process document
190
- for pdf in listaPDFs:
191
- pdf_path = pdf
192
- chunks = summarizer.load_and_split_document(pdf_path)
193
- allPdfsChunks = allPdfsChunks + chunks
194
-
195
- vector_store = summarizer.create_vector_store(allPdfsChunks)
196
-
197
- # Generate structured summary
198
- structured_summaries = summarizer.generate_summary_with_sources(vector_store)
199
-
200
- # Print or return the structured data
201
- # print(structured_summaries)
202
- json_data = json.dumps(structured_summaries)
203
- print("\n\n")
204
- print(json_data)
205
- return structured_summaries
206
- # If you need to send to frontend, you can just return structured_summaries
207
- # It will be in the format:
208
- # [
209
- # {
210
- # "content": "Summary point 1...",
211
- # "source": {
212
- # "page": 1,
213
- # "text": "Source text...",
214
- # "relevance_score": 0.95
215
- # }
216
- # },
217
- # ...
218
- # ]
 
 
 
 
219
 
220
  if __name__ == "__main__":
221
  get_llm_summary_answer_by_cursor()
 
1
  import os
2
  from typing import List, Dict, Tuple
3
+ from setup.easy_imports import (
4
+ HuggingFaceEmbeddings,
5
+ PyPDFLoader,
6
+ Chroma,
7
+ ChatOpenAI,
8
+ create_extraction_chain,
9
+ PromptTemplate,
10
+ RecursiveCharacterTextSplitter,
11
+ )
12
  from dataclasses import dataclass
13
  import uuid
14
  import json
15
  from langchain_huggingface import HuggingFaceEndpoint
16
  from setup.environment import default_model
17
 
18
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
19
+ os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
20
  os.environ.get("LANGCHAIN_API_KEY")
21
+ os.environ["LANGCHAIN_PROJECT"] = "VELLA"
22
+
23
 
24
  @dataclass
25
  class DocumentChunk:
26
+    content: str
+    page_number: int
+    chunk_id: str
+    start_char: int
+    end_char: int
+
 
 class DocumentSummarizer:
+
+    def __init__(
+        self, openai_api_key: str, model, embedding, chunk_config, system_prompt
+    ):
+        self.model = model
+        self.system_prompt = system_prompt
+        self.openai_api_key = openai_api_key
+        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
         )
+        self.chunk_metadata = {}  # Store chunk metadata for tracing
+
+    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
+        """Load PDF and split into chunks with metadata"""
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()
+        chunks = []
+        char_count = 0
+
+        for page in pages:
+            text = page.page_content
+            # Split the page content
+            page_chunks = self.text_splitter.split_text(text)
+
+            for chunk in page_chunks:
+                chunk_id = str(uuid.uuid4())
+                start_char = text.find(chunk)
+                end_char = start_char + len(chunk)
+
+                doc_chunk = DocumentChunk(
+                    content=chunk,
+                    page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
+                    chunk_id=chunk_id,
+                    start_char=char_count + start_char,
+                    end_char=char_count + end_char,
+                )
+                chunks.append(doc_chunk)
+
+                # Store metadata for later retrieval
+                self.chunk_metadata[chunk_id] = {
+                    "page": doc_chunk.page_number,
+                    "start_char": doc_chunk.start_char,
+                    "end_char": doc_chunk.end_char,
+                }
+
+            char_count += len(text)
+
+        return chunks
+
+    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
+        """Create vector store with metadata"""
+        texts = [chunk.content for chunk in chunks]
+        metadatas = [
+            {
+                "chunk_id": chunk.chunk_id,
+                "page": chunk.page_number,
+                "start_char": chunk.start_char,
+                "end_char": chunk.end_char,
+            }
+            for chunk in chunks
+        ]
+
+        vector_store = Chroma.from_texts(
+            texts=texts, metadatas=metadatas, embedding=self.embeddings
         )
+        return vector_store
+
+    def generate_summary_with_sources(
+        self,
+        vector_store: Chroma,
+        query: str = "Summarize the main points of this document",
+    ) -> List[Dict]:
+        """Generate summary with source citations, returning structured JSON data"""
+        # Retrieve relevant chunks with metadata
+        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
+
+        # Prepare context and track sources
+        contexts = []
+        sources = []
+
+        for doc, score in relevant_docs:
+            chunk_id = doc.metadata["chunk_id"]
+            context = doc.page_content
+            contexts.append(context)
+
+            sources.append(
+                {
+                    "content": context,
+                    "page": doc.metadata["page"],
+                    "chunk_id": chunk_id,
+                    "relevance_score": score,
+                }
+            )
+
+        prompt = PromptTemplate(
+            template=self.system_prompt, input_variables=["context"]
+        )
+
+        if self.model == default_model:
+            llm = ChatOpenAI(
+                temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
+            )
+        else:
+            llm = HuggingFaceEndpoint(
+                repo_id=self.model,
+                task="text-generation",
+                max_new_tokens=1100,
+                do_sample=False,
+                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
+            )
+
+        raw = llm.invoke(prompt.format(context="\n\n".join(contexts)))
+        # Chat models return a message object; HuggingFaceEndpoint returns a plain string
+        response = raw.content if hasattr(raw, "content") else raw
+
+        # Split the response into paragraphs
+        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
+
+        # Create structured output
+        structured_output = []
+        for idx, summary in enumerate(summaries):
+            # Associate each summary with the most relevant source
+            structured_output.append(
+                {
+                    "content": summary,
+                    "source": {
+                        "page": sources[min(idx, len(sources) - 1)]["page"],
+                        "text": sources[min(idx, len(sources) - 1)]["content"][:200]
+                        + "...",
+                        "relevance_score": sources[min(idx, len(sources) - 1)][
+                            "relevance_score"
+                        ],
+                    },
+                }
+            )
+
+        return structured_output
+
+    def get_source_context(self, chunk_id: str, window: int = 100) -> Optional[Dict]:
+        """Get extended context around a specific chunk"""
+        metadata = self.chunk_metadata.get(chunk_id)
+        if not metadata:
+            return None
+
+        return {
+            "page": metadata["page"],
+            "start_char": metadata["start_char"],
+            "end_char": metadata["end_char"],
+        }
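Note on the scores surfaced above: Chroma's similarity_search_with_score returns a raw vector distance, so a lower relevance_score means a closer match. If a UI needs a higher-is-better number, a minimal sketch follows; the helper is ours, not part of this commit, and assumes the collection's default distance metric:

# Hypothetical helper: map a distance into a similarity-like score in (0, 1].
def distance_to_relevance(distance: float) -> float:
    return 1.0 / (1.0 + distance)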
 
 
 def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
+    # By Luan
+    allPdfsChunks = []
+
+    # Initialize summarizer
+    summarizer = DocumentSummarizer(
+        openai_api_key=os.environ.get("OPENAI_API_KEY"),
+        embedding=serializer["hf_embedding"],
+        chunk_config={
+            "size": serializer["chunk_size"],
+            "overlap": serializer["chunk_overlap"],
+        },
+        system_prompt=serializer["system_prompt"],
+        model=serializer["model"],
+    )
+
+    # Load and process the documents
+    for pdf_path in listaPDFs:
+        chunks = summarizer.load_and_split_document(pdf_path)
+        allPdfsChunks.extend(chunks)
+
+    vector_store = summarizer.create_vector_store(allPdfsChunks)
+
+    # Generate structured summary
+    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
+
+    # Print or return the structured data
+    json_data = json.dumps(structured_summaries)
+    print("\n\n")
+    print(json_data)
+    return structured_summaries
+    # If you need to send this to the frontend, just return structured_summaries.
+    # It will be in the format:
+    # [
+    #     {
+    #         "content": "Summary point 1...",
+    #         "source": {
+    #             "page": 1,
+    #             "text": "Source text...",
+    #             "relevance_score": 0.95
+    #         }
+    #     },
+    #     ...
+    # ]
 
 
 if __name__ == "__main__":
     get_llm_summary_answer_by_cursor()
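For reference, a minimal sketch of driving the refactored flow outside the Django view. Every key value and path below is an illustrative assumption that mirrors the serializer fields this function reads:

# Hypothetical standalone invocation of get_llm_summary_answer_by_cursor.
serializer = {
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "system_prompt": "Summarize the following context:\n\n{context}",  # must contain {context}
    "model": "gpt-4o-mini",  # assumed equal to default_model, selecting ChatOpenAI
}
summaries = get_llm_summary_answer_by_cursor(serializer, ["/tmp/exemplo.pdf"])  # assumed path
for item in summaries:
    print(item["source"]["page"], "->", item["content"][:80])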
_utils/splitters/Splitter_class.py ADDED
@@ -0,0 +1,100 @@
+from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
+from typing import List, Dict, Tuple, Optional
+from _utils.models.gerar_relatorio import (
+    DocumentChunk,
+)
+import uuid
+
+
+class Splitter:
+    def __init__(
+        self,
+        chunk_size,
+        chunk_overlap,
+    ):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        self.chunk_metadata = {}  # Store chunk metadata for tracing
+
+    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
+        """Load PDF and split into chunks with metadata"""
+        loader = PyPDFLoader(pdf_path)
+        pages = (
+            loader.load()
+        )  # Produces a list of Document objects, one per full page of the PDF.
+        chunks = []
+        char_count = 0
+
+        for page in pages:
+            text = page.page_content
+            page_chunks = self.text_splitter.split_text(
+                text
+            )  # Breaks the single-page Document into a list of chunks, each smaller than a page.
+
+            for chunk in page_chunks:
+                chunk_id = str(uuid.uuid4())
+                start_char = text.find(
+                    chunk
+                )  # Position of the chunk within the full page text
+                end_char = start_char + len(chunk)
+
+                doc_chunk = DocumentChunk(  # Builds the chunk object with extra info such as its position and id
+                    content=chunk,
+                    page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
+                    chunk_id=chunk_id,
+                    start_char=char_count + start_char,
+                    end_char=char_count + end_char,
+                )
+                chunks.append(doc_chunk)
+
+                # Store metadata for later retrieval
+                self.chunk_metadata[chunk_id] = {
+                    "page": doc_chunk.page_number,
+                    "start_char": doc_chunk.start_char,
+                    "end_char": doc_chunk.end_char,
+                }
+
+            char_count += len(text)
+
+        return chunks
+
+    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
+        """Load text and split into chunks with metadata - created only for the ragas evaluation"""
+        page = Document(page_content=text, metadata={"page": 0})  # 0-based so the +1 below yields page 1
+        chunks = []
+        char_count = 0
+
+        text = page.page_content
+        page_chunks = self.text_splitter.split_text(
+            text
+        )  # Breaks the single-page Document into a list of chunks, each smaller than a page.
+        print("\n\n\n")
+        print("page_chunks: ", page_chunks)
+
+        for chunk in page_chunks:
+            chunk_id = str(uuid.uuid4())
+            start_char = text.find(
+                chunk
+            )  # Position of the chunk within the full page text
+            end_char = start_char + len(chunk)
+
+            doc_chunk = DocumentChunk(  # Builds the chunk object with extra info such as its position and id
+                content=chunk,
+                page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
+                chunk_id=chunk_id,
+                start_char=char_count + start_char,
+                end_char=char_count + end_char,
+            )
+            chunks.append(doc_chunk)
+
+            # Store metadata for later retrieval
+            self.chunk_metadata[chunk_id] = {
+                "page": doc_chunk.page_number,
+                "start_char": doc_chunk.start_char,
+                "end_char": doc_chunk.end_char,
+            }
+
+        char_count += len(text)
+
+        return chunks
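A quick sketch of using the extracted Splitter on its own; the path and sizes are illustrative:

splitter = Splitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.load_and_split_document("/tmp/exemplo.pdf")  # hypothetical file
print(len(chunks), chunks[0].page_number, chunks[0].start_char, chunks[0].end_char)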
_utils/vector_stores/Vector_store_class.py ADDED
@@ -0,0 +1,57 @@
+from typing import List, Dict, Tuple, Optional
+from _utils.models.gerar_relatorio import (
+    ContextualizedChunk,
+)
+from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
+import logging
+
+
+class VectorStore:
+    def __init__(self, embedding_model):
+        self.logger = logging.getLogger(__name__)
+        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+
+    def create_enhanced_vector_store(
+        self, chunks: List[ContextualizedChunk], is_contextualized_chunk
+    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
+        """Create vector store and BM25 index with contextualized chunks"""
+        try:
+            # Prepare texts with context
+            if is_contextualized_chunk:
+                texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
+            else:
+                texts = [f"{chunk.content}" for chunk in chunks]
+
+            # Create vector store
+            metadatas = []
+            for chunk in chunks:
+                if is_contextualized_chunk:
+                    context = chunk.context
+                else:
+                    context = ""
+                metadatas.append(
+                    {
+                        "chunk_id": chunk.chunk_id,
+                        "page": chunk.page_number,
+                        "start_char": chunk.start_char,
+                        "end_char": chunk.end_char,
+                        "context": context,
+                    }
+                )
+
+            vector_store = Chroma.from_texts(
+                texts=texts, metadatas=metadatas, embedding=self.embeddings
+            )
+
+            # Create BM25 index
+            tokenized_texts = [text.split() for text in texts]
+            bm25 = BM25Okapi(tokenized_texts)
+
+            # Get chunk IDs in order
+            chunk_ids = [chunk.chunk_id for chunk in chunks]
+
+            return vector_store, bm25, chunk_ids
+
+        except Exception as e:
+            self.logger.error(f"Error creating enhanced vector store: {str(e)}")
+            raise
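Since create_enhanced_vector_store hands back the Chroma store, the BM25 index, and the chunk ids in matching order, the three can feed a hybrid retriever. A hedged sketch; the blending helper and default weight are ours (the weight mirrors the serializer's embedding_weight field), not part of this commit:

# Hypothetical hybrid scorer: blend dense similarity with sparse BM25 scores.
def hybrid_search(query, vector_store, bm25, chunk_ids, k=5, embedding_weight=0.5):
    dense = vector_store.similarity_search_with_score(query, k=k)
    sparse = bm25.get_scores(query.split())  # one score per text, aligned with chunk_ids
    scores = {}
    for doc, distance in dense:
        # Chroma returns a distance, so invert it into a similarity-like value.
        scores[doc.metadata["chunk_id"]] = embedding_weight * (1.0 / (1.0 + distance))
    for cid, s in zip(chunk_ids, sparse):
        scores[cid] = scores.get(cid, 0.0) + (1 - embedding_weight) * s
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:k]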
gerar_documento/serializer.py CHANGED
@@ -1,8 +1,8 @@
 from rest_framework import serializers
 from _antigos.resumos.serializer import ResumoCursorSerializer
 from _utils.gerar_relatorio_modelo_usuario.prompts import (
-    system_prompt_modelo,
-    system_prompt_relatorio,
+    prompt_gerar_documento,
+    prompt_auxiliar,
 )
 
 user_message = "What are the main points of this document?"
@@ -10,10 +10,10 @@ user_message = "What are the main points of this document?"
 
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
     system_prompt = None
-    prompt_relatorio = serializers.CharField(
-        required=False, default=system_prompt_relatorio
-    )
-    prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)
+    prompt_auxiliar = serializers.CharField(required=False, default=prompt_auxiliar)
+    prompt_gerar_documento = serializers.CharField(
+        required=False, default=prompt_gerar_documento
+    )
     user_message = serializers.CharField(required=False, default=user_message)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)
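On the client side, only the renamed prompt fields change, and both stay optional thanks to the defaults. A hedged sketch of an override payload; the values are assumptions:

# Hypothetical request payload for ResumoCursorCompeltoSerializer.
payload = {
    "prompt_auxiliar": "...",           # replaces the old prompt_relatorio
    "prompt_gerar_documento": "...",    # replaces the old prompt_modelo
    "num_chunks_retrieval": 5,
    "embedding_weight": 0.5,
}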
gerar_documento/views.py CHANGED
@@ -1,7 +1,11 @@
-from rest_framework.views import APIView
-from adrf.views import APIView as AsyncAPIView
-from rest_framework.response import Response
-
+from setup.easy_imports import (
+    Response,
+    AsyncAPIView,
+    APIView,
+    MultiPartParser,
+    extend_schema,
+)
+from datetime import datetime
 from _utils.handle_files import handle_pdf_files_from_serializer, remove_pdf_temp_files
 from _utils.resumo_completo_cursor import (
     get_llm_summary_answer_by_cursor_complete,
@@ -9,9 +13,6 @@ from _utils.resumo_completo_cursor import (
 from .serializer import (
     ResumoCursorCompeltoSerializer,
 )
-from rest_framework.parsers import MultiPartParser
-from drf_spectacular.utils import extend_schema
-from datetime import datetime
 
 
 class ResumoSimplesCursorCompletoView(AsyncAPIView):
setup/easy_imports.py ADDED
@@ -0,0 +1,22 @@
+from adrf.views import APIView as AsyncAPIView
+from drf_spectacular.utils import extend_schema
+
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework.parsers import MultiPartParser
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.prompts import PromptTemplate
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.vectorstores import Chroma
+
+# from langchain_community.chat_models import ChatOpenAI
+from langchain_openai import ChatOpenAI
+from langchain.schema import Document
+from langchain.chains import create_extraction_chain
+
+from rank_bm25 import BM25Okapi
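With the shim in place, call sites collapse their framework imports into one line, and the commented-out variants (langchain_huggingface, langchain_community.chat_models) stay close at hand if the provider packages are swapped later. An illustrative call site:

# Hypothetical call site: a single import line instead of several per-package imports.
from setup.easy_imports import APIView, Response, MultiPartParser, extend_schema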