luanpoppe committed · Commit 12d3e1a · Parent(s): 12b0dd7
feat: code improvements and refactoring
Files changed:
- _utils/LLMs/LLM_class.py +9 -0
- _utils/chains/Chain_class.py +11 -0
- _utils/gerar_relatorio_modelo_usuario/DocumentSummarizer_simples.py +9 -95
- _utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py +21 -62
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +54 -3
- _utils/gerar_relatorio_modelo_usuario/llm_calls.py +24 -5
- _utils/gerar_relatorio_modelo_usuario/prompts.py +107 -16
- _utils/gerar_relatorio_modelo_usuario/utils.py +22 -0
- _utils/prompts/Prompt_class.py +12 -0
- _utils/resumo_completo_cursor.py +27 -101
- _utils/resumo_simples_cursor.py +212 -199
- _utils/splitters/Splitter_class.py +100 -0
- _utils/vector_stores/Vector_store_class.py +58 -0
- gerar_documento/serializer.py +5 -5
- gerar_documento/views.py +8 -7
- setup/easy_imports.py +22 -0
_utils/LLMs/LLM_class.py
ADDED
@@ -0,0 +1,9 @@
+from setup.environment import default_model
+
+
+class LLM:
+    def __init__(self):
+        pass
+
+    # def create_GPT_model(self, model=default_model):
+    #     return ChatOpen()
_utils/chains/Chain_class.py
ADDED
@@ -0,0 +1,11 @@
+class Chain:
+    def __init__(self, prompt, model):
+        self.prompt = prompt
+        self.model = model
+
+    def create_prompt_model_chain(self):
+        return self.prompt | self.model
+
+    def invoke_prompt_model_chain(self, invoke_params):
+        chain = self.create_prompt_model_chain()
+        return chain.invoke(invoke_params)
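For orientation, a minimal usage sketch of the new Chain wrapper (not part of the commit); the prompt text and model name are illustrative assumptions, and any LangChain prompt/model pair that supports the `|` pipe would work the same way.

```python
# Hypothetical usage of the new Chain class (illustrative only).
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from _utils.chains.Chain_class import Chain

# Any runnable prompt/model pair works, because Chain just pipes prompt | model.
prompt = ChatPromptTemplate.from_messages(
    [("system", "Você é um assistente jurídico."), ("user", "{pergunta}")]
)
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # assumed model name

chain = Chain(prompt, model)
resposta = chain.invoke_prompt_model_chain({"pergunta": "Resuma o processo."})
print(resposta.content)
```

Because create_prompt_model_chain only pipes the prompt into the model, the wrapper stays agnostic about which chat model is plugged in.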
_utils/gerar_relatorio_modelo_usuario/DocumentSummarizer_simples.py
CHANGED
@@ -1,18 +1,16 @@
-import os
 from typing import List, Dict, Tuple, Optional
-from
-from
-
-
-
-
-
+from _utils.splitters.Splitter_class import Splitter
+from setup.easy_imports import (
+    HuggingFaceEmbeddings,
+    Chroma,
+    ChatOpenAI,
+    PromptTemplate,
+)
 import logging
 from cohere import Client
 from _utils.models.gerar_relatorio import (
     DocumentChunk,
 )
-from langchain.schema import Document


 class DocumentSummarizer:
@@ -29,94 +27,10 @@ class DocumentSummarizer:
         self.openai_api_key = openai_api_key
         self.cohere_client = Client(cohere_api_key)
         self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-        self.text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap
-        )
-        self.chunk_metadata = {}  # Store chunk metadata for tracing
         self.num_k_rerank = num_k_rerank
         self.model_cohere_rerank = model_cohere_rerank
 
-
-        """Load PDF and split into chunks with metadata"""
-        loader = PyPDFLoader(pdf_path)
-        pages = (
-            loader.load()
-        )  # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
-        chunks = []
-        char_count = 0
-
-        for page in pages:
-            text = page.page_content
-            page_chunks = self.text_splitter.split_text(
-                text
-            )  # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
-
-            for chunk in page_chunks:
-                chunk_id = str(uuid.uuid4())
-                start_char = text.find(
-                    chunk
-                )  # Retorna a posição onde se encontra o chunk dentro da página inteira
-                end_char = start_char + len(chunk)
-
-                doc_chunk = DocumentChunk(  # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
-                    content=chunk,
-                    page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                    chunk_id=chunk_id,
-                    start_char=char_count + start_char,
-                    end_char=char_count + end_char,
-                )
-                chunks.append(doc_chunk)
-
-                # Store metadata for later retrieval
-                self.chunk_metadata[chunk_id] = {
-                    "page": doc_chunk.page_number,
-                    "start_char": doc_chunk.start_char,
-                    "end_char": doc_chunk.end_char,
-                }
-
-            char_count += len(text)
-
-        return chunks
-
-    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
-        """Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
-        page = Document(page_content=text, metadata={"page": 1})
-        chunks = []
-        char_count = 0
-
-        text = page.page_content
-        page_chunks = self.text_splitter.split_text(
-            text
-        )  # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
-        print("\n\n\n")
-        print("page_chunks: ", page_chunks)
-
-        for chunk in page_chunks:
-            chunk_id = str(uuid.uuid4())
-            start_char = text.find(
-                chunk
-            )  # Retorna a posição onde se encontra o chunk dentro da página inteira
-            end_char = start_char + len(chunk)
-
-            doc_chunk = DocumentChunk(  # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
-                content=chunk,
-                page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                chunk_id=chunk_id,
-                start_char=char_count + start_char,
-                end_char=char_count + end_char,
-            )
-            chunks.append(doc_chunk)
-
-            # Store metadata for later retrieval
-            self.chunk_metadata[chunk_id] = {
-                "page": doc_chunk.page_number,
-                "start_char": doc_chunk.start_char,
-                "end_char": doc_chunk.end_char,
-            }
-
-        char_count += len(text)
-
-        return chunks
+        self.splitter = Splitter(chunk_size, chunk_overlap)
 
     def create_vector_store(
         self, chunks: List[DocumentChunk]
@@ -233,7 +147,7 @@ class DocumentSummarizer:
             temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
         )
 
-        response = llm.
+        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content
 
         # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py
CHANGED
@@ -1,22 +1,23 @@
 import os
 from typing import List, Dict, Tuple, Optional
-from
-from
-
-
-
+from _utils.vector_stores.Vector_store_class import VectorStore
+from setup.easy_imports import (
+    Chroma,
+    ChatOpenAI,
+    PromptTemplate,
+    BM25Okapi,
+    Response,
+)
 import logging
 import requests
 from _utils.gerar_relatorio_modelo_usuario.DocumentSummarizer_simples import (
     DocumentSummarizer,
 )
 from _utils.models.gerar_relatorio import (
-    ContextualizedChunk,
     RetrievalConfig,
 )
 from modelos_usuarios.serializer import ModeloUsuarioSerializer
 from setup.environment import api_url
-from rest_framework.response import Response
 from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
     ContextualRetriever,
 )
@@ -24,6 +25,7 @@ from asgiref.sync import sync_to_async
 
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
+
     def __init__(
         self,
         openai_api_key: str,
@@ -35,12 +37,12 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         num_k_rerank,
         model_cohere_rerank,
         claude_context_model,
-
+        prompt_auxiliar,
         gpt_model,
         gpt_temperature,
         id_modelo_do_usuario,
-
-        reciprocal_rank_fusion
+        prompt_gerar_documento,
+        reciprocal_rank_fusion,
     ):
         super().__init__(
             openai_api_key,
@@ -56,58 +58,15 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             config, claude_api_key, claude_context_model
         )
         self.logger = logging.getLogger(__name__)
-        self.
+        self.prompt_auxiliar = prompt_auxiliar
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
         self.id_modelo_do_usuario = id_modelo_do_usuario
-        self.
+        self.prompt_gerar_documento = prompt_gerar_documento
         self.reciprocal_rank_fusion = reciprocal_rank_fusion
         self.resumo_gerado = ""
 
-
-        self, chunks: List[ContextualizedChunk], is_contextualized_chunk
-    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
-        """Create vector store and BM25 index with contextualized chunks"""
-        try:
-            # Prepare texts with context
-            if is_contextualized_chunk:
-                texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
-            else:
-                texts = [f"{chunk.content}" for chunk in chunks]
-
-            # Create vector store
-            metadatas = []
-            for chunk in chunks:
-                if is_contextualized_chunk:
-                    context = chunk.context
-                else:
-                    context = ""
-                metadatas.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "page": chunk.page_number,
-                        "start_char": chunk.start_char,
-                        "end_char": chunk.end_char,
-                        "context": context,
-                    }
-                )
-
-            vector_store = Chroma.from_texts(
-                texts=texts, metadatas=metadatas, embedding=self.embeddings
-            )
-
-            # Create BM25 index
-            tokenized_texts = [text.split() for text in texts]
-            bm25 = BM25Okapi(tokenized_texts)
-
-            # Get chunk IDs in order
-            chunk_ids = [chunk.chunk_id for chunk in chunks]
-
-            return vector_store, bm25, chunk_ids
-
-        except Exception as e:
-            self.logger.error(f"Error creating enhanced vector store: {str(e)}")
-            raise
+        self.vector_store = VectorStore(embedding_model)
 
     def retrieve_with_rank_fusion(
         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
@@ -254,25 +213,25 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         )
 
         prompt_gerar_relatorio = PromptTemplate(
-            template=self.
+            template=self.prompt_auxiliar, input_variables=["context"]
         )
 
-        relatorio_gerado = llm.
+        relatorio_gerado = llm.invoke(
             prompt_gerar_relatorio.format(context="\n\n".join(contexts))
         )
 
-        self.resumo_gerado = relatorio_gerado
+        self.resumo_gerado = relatorio_gerado.content
 
         prompt_gerar_modelo = PromptTemplate(
-            template=self.
+            template=self.prompt_gerar_documento,
             input_variables=["context", "modelo_usuario"],
         )
 
-        modelo_gerado = llm.
+        modelo_gerado = llm.invoke(
             prompt_gerar_modelo.format(
                 context=relatorio_gerado, modelo_usuario=serializer.data["modelo"]
             )
-        )
+        ).content
 
         # Split the response into paragraphs
         summaries = [p.strip() for p in modelo_gerado.split("\n\n") if p.strip()]
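The generation step now formats `prompt_auxiliar` and `prompt_gerar_documento` through `PromptTemplate` and reads `.content` from `llm.invoke(...)`. Below is a standalone sketch of that two-step flow (auxiliary report first, then the user-model document); the prompt texts and model name are placeholders, not the project's real values.

```python
# Minimal sketch of the two-step generation performed above (illustrative only).
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # assumed model

# Step 1: auxiliary report built from the retrieved context.
prompt_relatorio = PromptTemplate(
    template="Resuma o caso a partir do contexto:\n{context}",
    input_variables=["context"],
)
relatorio = llm.invoke(prompt_relatorio.format(context="...chunks recuperados...")).content

# Step 2: rewrite the report following the user's template.
prompt_modelo = PromptTemplate(
    template="Reescreva o relatório no formato do modelo do usuário.\n"
    "Relatório: {context}\nModelo: {modelo_usuario}",
    input_variables=["context", "modelo_usuario"],
)
documento = llm.invoke(
    prompt_modelo.format(context=relatorio, modelo_usuario="...modelo do usuário...")
).content
```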
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -1,4 +1,11 @@
 import os
+# from _utils.gerar_relatorio_modelo_usuario.prompts import (
+#     prompt_auxiliar_do_contextual_prompt,
+# )
+from _utils.chains.Chain_class import Chain
+from _utils.prompts.Prompt_class import Prompt
+from _utils.splitters.Splitter_class import Splitter
+from setup.easy_imports import PyPDFLoader
 from langchain_openai import ChatOpenAI
 from typing import List, Dict, Tuple, Optional
 from anthropic import Anthropic, AsyncAnthropic
@@ -12,7 +19,7 @@ from dataclasses import dataclass
 from langchain_core.messages import HumanMessage
 from asgiref.sync import sync_to_async
 
-from _utils.gerar_relatorio_modelo_usuario.llm_calls import
+from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
 from _utils.models.gerar_relatorio import (
     ContextualizedChunk,
@@ -39,11 +46,11 @@ class ContextualRetriever:
        try:
            print("COMEÇOU A REQUISIÇÃO")
            prompt = contextual_prompt(full_text, chunk.content)
-            # response = await
+            # response = await aclaude_answer(
            #     self.claude_client, self.claude_context_model, prompt
            # )
 
-            response = await
+            response = await agpt_answer(prompt)
            return response
        except Exception as e:
            self.logger.error(
@@ -51,6 +58,13 @@ class ContextualRetriever:
            )
            return ""
 
+    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
+    #     prompt = Prompt().create_prompt_template(
+    #         "", prompt_auxiliar_do_contextual_prompt
+    #     )
+    #     Chain(prompt, ChatOpenAI())
+    #     return
+
    async def create_contextualized_chunk(self, chunk, full_text):
        lista_contador.append(0)
        print("contador: ", len(lista_contador))
@@ -90,3 +104,40 @@ class ContextualRetriever:
        contextualized_chunks = [task.result() for task in tasks]
 
        return contextualized_chunks
+
+
+def get_full_text_and_all_PDFs_chunks(contexto, listaPDFs, splitterObject: Splitter):
+    all_PDFs_chunks = []
+    full_text = ""
+    if contexto:
+        full_text = contexto
+        chunks = splitterObject.load_and_split_text(full_text)
+        all_PDFs_chunks = chunks
+    else:
+        # Load and process document
+        for pdf in listaPDFs:
+            pdf_path = pdf
+            chunks = splitterObject.load_and_split_document(pdf_path)
+            all_PDFs_chunks = all_PDFs_chunks + chunks
+            # Get full text for contextualization
+            loader = PyPDFLoader(pdf_path)
+            pages = loader.load()
+            full_text = " ".join([page.page_content for page in pages])
+
+    return full_text, all_PDFs_chunks, pages
+
+
+async def contextualize_chunk_based_on_serializer(
+    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
+):
+    if serializer["should_have_contextual_chunks"]:
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            pages, all_PDFs_chunks
+        )
+        chunks_passados = contextualized_chunks
+        is_contextualized_chunk = True
+    else:
+        chunks_passados = all_PDFs_chunks
+        is_contextualized_chunk = False
+
+    return chunks_passados, is_contextualized_chunk
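A short, illustrative call sequence for the two module-level helpers added above; the chunk sizes, file name, and serializer contents are assumptions, not values from the project.

```python
# Illustrative driver for the new helpers (assumed values only).
from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    contextualize_chunk_based_on_serializer,
    get_full_text_and_all_PDFs_chunks,
)
from _utils.splitters.Splitter_class import Splitter


async def exemplo(contextual_retriever, serializer):
    # serializer is expected to expose "should_have_contextual_chunks".
    splitter = Splitter(chunk_size=1000, chunk_overlap=200)  # assumed sizes
    full_text, chunks, pages = get_full_text_and_all_PDFs_chunks(
        contexto=None, listaPDFs=["processo.pdf"], splitterObject=splitter
    )
    chunks_passados, is_contextualized = await contextualize_chunk_based_on_serializer(
        serializer, contextual_retriever, pages, chunks
    )
    return chunks_passados, is_contextualized
```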
_utils/gerar_relatorio_modelo_usuario/llm_calls.py
CHANGED
@@ -1,12 +1,11 @@
 import os
+from setup.environment import default_model
 from langchain_core.messages import HumanMessage
 from langchain_openai import ChatOpenAI
 
 
-async def
-    print("\n")
-    print("Começou uma requisição pelo Claude")
-    print("\n")
+async def aclaude_answer(claude_client, claude_context_model, prompt):
+    print("\n\nComeçou uma requisição pelo Claude")
     response = await claude_client.messages.create(
         model=claude_context_model,
         max_tokens=100,
@@ -17,7 +16,7 @@ async def claude_answer(claude_client, claude_context_model, prompt):
     ].text  # O response.content é uma lista pois é passada uma lista de mensagens, e também retornado uma lista de mensagens, sendo a primeira a mais recente, que é a resposta do model
 
 
-async def
+async def agpt_answer(prompt):
     gpt = ChatOpenAI(
         temperature=0,
         model="gpt-4o-mini",
@@ -26,3 +25,23 @@ async def gpt_answer(prompt):
     )
     response = await gpt.ainvoke([HumanMessage(content=prompt)])
     return response.content
+
+
+def gpt_answer(
+    prompt,
+    temperature=0,
+    model=default_model,
+    max_retries=5,
+    shouldReturnFullResponse=False,
+):
+    gpt = ChatOpenAI(
+        temperature=temperature,
+        model=model,
+        api_key=os.environ.get("OPENAI_API_KEY"),
+        max_retries=max_retries,
+    )
+    response = gpt.invoke([HumanMessage(content=prompt)])
+    if shouldReturnFullResponse:
+        return response
+    else:
+        return response.content
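Usage sketch for the renamed and added helpers; it assumes `OPENAI_API_KEY` is set in the environment and uses placeholder prompt text.

```python
# Quick usage of the llm_calls helpers (prompt text is a placeholder).
import asyncio

from _utils.gerar_relatorio_modelo_usuario.llm_calls import agpt_answer, gpt_answer

# Synchronous call: returns the message text by default.
texto = gpt_answer("Resuma a denúncia em uma frase.")


# Async call, e.g. when contextualizing many chunks concurrently.
async def main():
    return await agpt_answer("Dê o contexto deste trecho dentro do processo.")


resposta = asyncio.run(main())
```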
_utils/gerar_relatorio_modelo_usuario/prompts.py
CHANGED
@@ -17,12 +17,55 @@ Please return only the succinct context (without displaying your internal reasoning)
 ```
 """
 
-
-
+# Novo nome --> prompt-auxiliar --> Para gerar documentos (é usado como auxiliar no prompt final)
+prompt_auxiliar = """
+You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+**Instructions**:
+1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+2. **Reading the Context**: Extract the following information from `context`:
+   - The name of the defendant (réu).
+   - The crime they have been accused of (nome_do_crime).
+   - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+   - The date the accusation was accepted (data_do_recebimento).
+   - The ID of the decision document (id_do_documento).
+3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+```
+<relatorio>
+Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+Não há outras causas interruptivas ou suspensivas da prescrição.
+</relatorio>
+```
+5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+**Reminder**:
+- Do not include your chain of thought in the final output.
+- Do not add extra information or commentary beyond the specified format.
+- The final answer must be in Portuguese.
+---
+
+**Contextual Information (provided separately):**
+{context}
+---
+**Example with a given context**:
+- Input:
+`context` = 'Em 10/03/2021, o Ministério Público denunciou João da Silva, imputando-lhe o crime de furto qualificado, previsto no art. 155, §4º, inciso II, do Código Penal. A denúncia foi recebida em 12/03/2021, conforme Decisão nº 20210312-01. Não há menção a qualquer causa interruptiva ou suspensiva da prescrição.'
+- Expected final answer:
+```
+<formato>
+Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+Não há outras causas interruptivas ou suspensivas da prescrição.
+</formato>
+"""
+
+# Novo nome --> prompt-gerar-documento --> Para gerar documentos
+prompt_gerar_documento = """
+You are a large language model that must produce a single final document in **Portuguese**. To do this, you will follow a private chain of thought and then produce a final answer. The final answer must follow the formatting and stylistic conventions shown in the user-provided model `user's template`. The information to be included in the final document is derived from the `context` (a report describing a legal case).
 **Contextual Information (provided separately):**
 {context}
 **User Model (provided separately):**
-<user's_template>
+<user's_template>PROMPT DO MODELO DO USUÁRIO</user's_template>
 **Instructions:**
 1. **Goal:** Produce one single final sentence in Portuguese that matches the structure, format, and style given by `user's template`.
 2. **Chain of Thought (private to the assistant and not to be shown in the final answer):**
@@ -46,8 +89,7 @@ After composing the sentence, but before presenting it as the final answer, reflect:
 - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
 """
 
-
-You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
 **Instructions**:
 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
 2. **Reading the Context**: Extract the following information from `context`:
@@ -59,26 +101,17 @@ You are a language model specialized in producing concise and well-structured legal case summaries
 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
 ```
-<
+<formato>
 Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
 A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
 Não há outras causas interruptivas ou suspensivas da prescrição.
-</
+</formato>
 ```
 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
 **Reminder**:
 - Do not include your chain of thought in the final output.
 - Do not add extra information or commentary beyond the specified format.
 - The final answer must be in Portuguese.
----
-
-**Contextual Information (provided separately):**
-{context}
----
-**Example with a given context**:
-- Input:
-`context` = "Em 10/03/2021, o Ministério Público denunciou João da Silva, imputando-lhe o crime de furto qualificado, previsto no art. 155, §4º, inciso II, do Código Penal. A denúncia foi recebida em 12/03/2021, conforme Decisão nº 20210312-01. Não há menção a qualquer causa interruptiva ou suspensiva da prescrição."
-- Expected final answer:
 ```
 <formato>
 Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
@@ -86,3 +119,61 @@ A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
 Não há outras causas interruptivas ou suspensivas da prescrição.
 </formato>
 """
+
+prompt_auxiliar_do_contextual_prompt = """Você é um assistente jurídico especializado em direito brasileiro. Sua tarefa é criar um resumo conciso e informativo de um processo jurídico, de acordo com as leis do Brasil. O resumo deve focar nos momentos cruciais do processo, na última movimentação processual e nas principais movimentações que ocorreram.
+
+Aqui estão as 10 principais peças processuais em ordem cronológica do processo civil brasileiro que você deve priorizar em sua análise:
+1. Petição Inicial
+2. Contestação
+3. Réplica
+4. Decisão de Saneamento
+5. Sentença
+6. Recurso de Apelação
+7. Embargos de Declaração
+8. Cumprimento de Sentença
+9. Embargos à Execução
+10. Agravo de Instrumento
+
+Siga este passo a passo para criar o resumo:
+
+1. Leia atentamente todo o processo jurídico fornecido.
+<processo_juridico>
+{{PROCESSO_JURIDICO}}
+</processo_juridico>
+
+2. Identifique e anote as datas e conteúdos relevantes relacionados às 10 peças processuais listadas acima.
+
+3. Organize cronologicamente as informações coletadas.
+
+4. Destaque a última movimentação processual e seu significado para o andamento do processo.
+
+5. Resuma as principais movimentações, focando em seu impacto no processo.
+
+6. Elabore um texto coeso que apresente o fluxo do processo, destacando os pontos cruciais e as decisões mais importantes.
+
+Após criar o resumo inicial, utilize a técnica socrática de reflexão para garantir a precisão e completude do resumo. Faça a si mesmo as seguintes perguntas:
+
+1. O resumo abrange todas as 10 peças processuais principais?
+2. A última movimentação processual está claramente identificada e explicada?
+3. O texto apresenta uma visão clara do fluxo do processo?
+4. Todas as informações cruciais para o entendimento do caso estão incluídas?
+5. O resumo está livre de opiniões pessoais e se atém aos fatos do processo?
+6. A linguagem utilizada é clara e acessível, mesmo para quem não é especialista em direito?
+
+Revise e ajuste o resumo conforme necessário com base nessa reflexão.
+
+O resumo final deve ter no máximo 2 páginas de extensão (aproximadamente 1000 palavras).
+
+Formate sua resposta da seguinte maneira:
+
+<resumo_processo>
+[Insira aqui o resumo do processo jurídico]
+</resumo_processo>
+
+<reflexao_socratica>
+[Insira aqui suas respostas às perguntas da reflexão socrática]
+</reflexao_socratica>
+
+<resumo_final>
+[Insira aqui o resumo final revisado, se houver alterações após a reflexão]
+</resumo_final>"""
_utils/gerar_relatorio_modelo_usuario/utils.py
ADDED
@@ -0,0 +1,22 @@
+def gerar_resposta_compilada(serializer):
+    return {
+        "num_chunks_retrieval": serializer["num_chunks_retrieval"],
+        "embedding_weight": serializer["embedding_weight"],
+        "bm25_weight": serializer["bm25_weight"],
+        "context_window": serializer["context_window"],
+        "chunk_overlap": serializer["chunk_overlap"],
+        "num_k_rerank": serializer["num_k_rerank"],
+        "model_cohere_rerank": serializer["model_cohere_rerank"],
+        "more_initial_chunks_for_reranking": serializer[
+            "more_initial_chunks_for_reranking"
+        ],
+        "claude_context_model": serializer["claude_context_model"],
+        "gpt_temperature": serializer["gpt_temperature"],
+        "user_message": serializer["user_message"],
+        "model": serializer["model"],
+        "hf_embedding": serializer["hf_embedding"],
+        "chunk_size": serializer["chunk_size"],
+        "chunk_overlap": serializer["chunk_overlap"],
+        "prompt_auxiliar": serializer["prompt_auxiliar"],
+        "prompt_gerar_documento": serializer["prompt_gerar_documento"],
+    }
_utils/prompts/Prompt_class.py
ADDED
@@ -0,0 +1,12 @@
+from setup.easy_imports import ChatPromptTemplate
+
+
+class Prompt:
+    def __init__(self):
+        pass
+
+    def create_prompt_template(self, system_prompt, user_prompt):
+        prompt_template = ChatPromptTemplate.from_messages(
+            [("system", system_prompt), ("user", user_prompt)]
+        )
+        return prompt_template
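A hypothetical pairing of the new Prompt and Chain helpers (not taken from the commit); the system/user texts and the model are illustrative.

```python
# Sketch: build a ChatPromptTemplate with Prompt and run it through Chain.
from langchain_openai import ChatOpenAI

from _utils.chains.Chain_class import Chain
from _utils.prompts.Prompt_class import Prompt

template = Prompt().create_prompt_template(
    system_prompt="Você é um assistente jurídico.",
    user_prompt="Resuma o seguinte processo: {processo}",
)
chain = Chain(template, ChatOpenAI(model="gpt-4o-mini", temperature=0))  # assumed model
resultado = chain.invoke_prompt_model_chain({"processo": "..."})
print(resultado.content)
```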
_utils/resumo_completo_cursor.py
CHANGED
@@ -1,9 +1,13 @@
 import os
-from
-import json
+from _utils.gerar_relatorio_modelo_usuario.prompts import prompt_auxiliar_SEM_CONTEXT
 from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
     EnhancedDocumentSummarizer,
 )
+from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
+    contextualize_chunk_based_on_serializer,
+    get_full_text_and_all_PDFs_chunks,
+)
+from _utils.gerar_relatorio_modelo_usuario.utils import gerar_resposta_compilada
 from _utils.models.gerar_relatorio import (
     RetrievalConfig,
 )
@@ -38,7 +42,6 @@ async def get_llm_summary_answer_by_cursor_complete(
     serializer, listaPDFs=None, contexto=None
 ):
     """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
-    allPdfsChunks = []
     # Configuration
     config = RetrievalConfig(
         num_chunks=serializer["num_chunks_retrieval"],
@@ -59,82 +62,31 @@ async def get_llm_summary_answer_by_cursor_complete(
         num_k_rerank=serializer["num_k_rerank"],
         model_cohere_rerank=serializer["model_cohere_rerank"],
         claude_context_model=serializer["claude_context_model"],
-
+        prompt_auxiliar=serializer["prompt_auxiliar"],
         gpt_model=serializer["model"],
         gpt_temperature=serializer["gpt_temperature"],
         id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
-
+        prompt_gerar_documento=serializer["prompt_gerar_documento"],
         reciprocal_rank_fusion=reciprocal_rank_fusion,
     )
 
-    full_text =
-
-
-
-
-
-    # pdf_path = "./Im_a_storyteller.pdf"
-    # chunks = summarizer.load_and_split_document(pdf_path)
-
-    # Load and process document
-    for pdf in listaPDFs:
-        pdf_path = pdf
-        chunks = summarizer.load_and_split_document(pdf_path)
-        allPdfsChunks = allPdfsChunks + chunks
-
-        # Get full text for contextualization
-        loader = PyPDFLoader(pdf_path)
-        pages = loader.load()
-        full_text = " ".join([page.page_content for page in pages])
-    # Contextualize chunks
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = (
-            await summarizer.contextual_retriever.contextualize_all_chunks(
-                pages, allPdfsChunks
-            )
+    full_text, allPdfsChunks, pages = get_full_text_and_all_PDFs_chunks(
+        contexto, listaPDFs, summarizer.splitter
+    )
+
+    chunks_passados, is_contextualized_chunk = (
+        await contextualize_chunk_based_on_serializer(
+            serializer, summarizer.contextual_retriever, pages, allPdfsChunks
         )
-
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = allPdfsChunks
-        is_contextualized_chunk = False
+    )
 
     # Create enhanced vector store and BM25 index
-    vector_store, bm25, chunk_ids =
-
+    vector_store, bm25, chunk_ids = (
+        summarizer.vector_store.create_enhanced_vector_store(
+            chunks_passados, is_contextualized_chunk
+        )
     )
 
-    prompt_resumo_sem_context = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
-**Instructions**:
-1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
-2. **Reading the Context**: Extract the following information from `context`:
-    - The name of the defendant (réu).
-    - The crime they have been accused of (nome_do_crime).
-    - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
-    - The date the accusation was accepted (data_do_recebimento).
-    - The ID of the decision document (id_do_documento).
-3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
-4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
-A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
-Não há outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-```
-5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
-**Reminder**:
-- Do not include your chain of thought in the final output.
-- Do not add extra information or commentary beyond the specified format.
-- The final answer must be in Portuguese.
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
-A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
-Não há outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-"""
     # Generate enhanced summary
     structured_summaries = await summarizer.generate_enhanced_summary(
         vector_store,
@@ -142,7 +94,7 @@ Não há outras causas interruptivas ou suspensivas da prescrição.
         chunk_ids
         # , serializer["user_message"]
         ,
-
+        prompt_auxiliar_SEM_CONTEXT,
     )
 
     if not isinstance(structured_summaries, list):
@@ -150,41 +102,15 @@ Não há outras causas interruptivas ou suspensivas da prescrição.
 
         return Response({"erro": structured_summaries})
 
-
-
-    # print("\nStructured Summaries:")
-    # print(json_output)
-    texto_completo = ""
-    print("\n\n\n")
-    print("summarizer.resumo_gerado: ", summarizer.resumo_gerado)
-    texto_completo += summarizer.resumo_gerado
-    texto_completo += "\n\n"
-    print("\n\n\n")
-    print("structured_summaries: ", structured_summaries)
+    texto_completo = summarizer.resumo_gerado + "\n\n"
+
     for x in structured_summaries:
         texto_completo = texto_completo + x["content"] + "\n"
+
+    print("\n\ntexto_completo: ", texto_completo)
+
     return {
         "resultado": structured_summaries,
         "texto_completo": texto_completo,
-        "parametros-utilizados": {
-            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
-            "embedding_weight": serializer["embedding_weight"],
-            "bm25_weight": serializer["bm25_weight"],
-            "context_window": serializer["context_window"],
-            "chunk_overlap": serializer["chunk_overlap"],
-            "num_k_rerank": serializer["num_k_rerank"],
-            "model_cohere_rerank": serializer["model_cohere_rerank"],
-            "more_initial_chunks_for_reranking": serializer[
-                "more_initial_chunks_for_reranking"
-            ],
-            "claude_context_model": serializer["claude_context_model"],
-            "gpt_temperature": serializer["gpt_temperature"],
-            "user_message": serializer["user_message"],
-            "model": serializer["model"],
-            "hf_embedding": serializer["hf_embedding"],
-            "chunk_size": serializer["chunk_size"],
-            "chunk_overlap": serializer["chunk_overlap"],
-            "prompt_relatorio": serializer["prompt_relatorio"],
-            "prompt_modelo": serializer["prompt_modelo"],
-        },
+        "parametros-utilizados": gerar_resposta_compilada(serializer),
     }
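A rough sketch of driving the refactored pipeline end to end. The serializer keys mirror the ones read by the code above, but every value (model names, sizes, file path, id) is an assumption, and the required OpenAI/Anthropic/Cohere keys are expected in the environment.

```python
# Hypothetical end-to-end call of the refactored pipeline (all values assumed).
import asyncio

from _utils.gerar_relatorio_modelo_usuario.prompts import (
    prompt_auxiliar,
    prompt_gerar_documento,
)
from _utils.resumo_completo_cursor import get_llm_summary_answer_by_cursor_complete

serializer = {
    "num_chunks_retrieval": 5,
    "embedding_weight": 0.5,
    "bm25_weight": 0.5,
    "context_window": 3,
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "num_k_rerank": 5,
    "model_cohere_rerank": "rerank-multilingual-v3.0",
    "more_initial_chunks_for_reranking": 20,
    "claude_context_model": "claude-3-haiku-20240307",
    "gpt_temperature": 0,
    "model": "gpt-4o-mini",
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    "id_modelo_do_usuario": 1,
    "should_have_contextual_chunks": False,
    "user_message": "",
    "prompt_auxiliar": prompt_auxiliar,
    "prompt_gerar_documento": prompt_gerar_documento,
}

resultado = asyncio.run(
    get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=["processo.pdf"])
)
print(resultado["texto_completo"])
```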
_utils/resumo_simples_cursor.py
CHANGED
@@ -1,221 +1,234 @@
|
|
1 |
import os
|
2 |
from typing import List, Dict, Tuple
|
3 |
-
from
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
10 |
from dataclasses import dataclass
|
11 |
import uuid
|
12 |
import json
|
13 |
from langchain_huggingface import HuggingFaceEndpoint
|
14 |
from setup.environment import default_model
|
15 |
|
16 |
-
os.environ["LANGCHAIN_TRACING_V2"]="true"
|
17 |
-
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
|
18 |
os.environ.get("LANGCHAIN_API_KEY")
|
19 |
-
os.environ["LANGCHAIN_PROJECT"]="VELLA"
|
|
|
20 |
|
21 |
@dataclass
|
22 |
class DocumentChunk:
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
28 |
|
29 |
class DocumentSummarizer:
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
)
|
41 |
-
self.chunk_metadata = {} # Store chunk metadata for tracing
|
42 |
-
|
43 |
-
def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
|
44 |
-
"""Load PDF and split into chunks with metadata"""
|
45 |
-
loader = PyPDFLoader(pdf_path)
|
46 |
-
pages = loader.load()
|
47 |
-
chunks = []
|
48 |
-
char_count = 0
|
49 |
-
|
50 |
-
for page in pages:
|
51 |
-
text = page.page_content
|
52 |
-
# Split the page content
|
53 |
-
page_chunks = self.text_splitter.split_text(text)
|
54 |
-
|
55 |
-
for chunk in page_chunks:
|
56 |
-
chunk_id = str(uuid.uuid4())
|
57 |
-
start_char = text.find(chunk)
|
58 |
-
end_char = start_char + len(chunk)
|
59 |
-
|
60 |
-
doc_chunk = DocumentChunk(
|
61 |
-
content=chunk,
|
62 |
-
page_number=page.metadata.get('page') + 1, # 1-based page numbering
|
63 |
-
chunk_id=chunk_id,
|
64 |
-
start_char=char_count + start_char,
|
65 |
-
end_char=char_count + end_char
|
66 |
-
)
|
67 |
-
chunks.append(doc_chunk)
|
68 |
-
|
69 |
-
# Store metadata for later retrieval
|
70 |
-
self.chunk_metadata[chunk_id] = {
|
71 |
-
'page': doc_chunk.page_number,
|
72 |
-
'start_char': doc_chunk.start_char,
|
73 |
-
'end_char': doc_chunk.end_char
|
74 |
-
}
|
75 |
-
|
76 |
-
char_count += len(text)
|
77 |
-
|
78 |
-
return chunks
|
79 |
-
|
80 |
-
def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
|
81 |
-
"""Create vector store with metadata"""
|
82 |
-
texts = [chunk.content for chunk in chunks]
|
83 |
-
metadatas = [{
|
84 |
-
'chunk_id': chunk.chunk_id,
|
85 |
-
'page': chunk.page_number,
|
86 |
-
'start_char': chunk.start_char,
|
87 |
-
'end_char': chunk.end_char
|
88 |
-
} for chunk in chunks]
|
89 |
-
|
90 |
-
vector_store = Chroma.from_texts(
|
91 |
-
texts=texts,
|
92 |
-
metadatas=metadatas,
|
93 |
-
embedding=self.embeddings
|
94 |
-
)
|
95 |
-
return vector_store
|
96 |
-
|
97 |
-
def generate_summary_with_sources(
|
98 |
-
self,
|
99 |
-
vector_store: Chroma,
|
100 |
-
query: str = "Summarize the main points of this document"
|
101 |
-
) -> List[Dict]:
|
102 |
-
"""Generate summary with source citations, returning structured JSON data"""
|
103 |
-
# Retrieve relevant chunks with metadata
|
104 |
-
relevant_docs = vector_store.similarity_search_with_score(query, k=5)
|
105 |
-
|
106 |
-
# Prepare context and track sources
|
107 |
-
contexts = []
|
108 |
-
sources = []
|
109 |
-
|
110 |
-
for doc, score in relevant_docs:
|
111 |
-
chunk_id = doc.metadata['chunk_id']
|
112 |
-
context = doc.page_content
|
113 |
-
contexts.append(context)
|
114 |
-
|
115 |
-
sources.append({
|
116 |
-
'content': context,
|
117 |
-
'page': doc.metadata['page'],
|
118 |
-
'chunk_id': chunk_id,
|
119 |
-
'relevance_score': score
|
120 |
-
})
|
121 |
-
|
122 |
-
prompt = PromptTemplate(
|
123 |
-
template=self.system_prompt,
|
124 |
-
input_variables=["context"]
|
125 |
-
)
|
126 |
-
llm = ""
|
127 |
-
|
128 |
-
if (self.model == default_model):
|
129 |
-
llm = ChatOpenAI(
|
130 |
-
temperature=0,
|
131 |
-
model_name="gpt-4o-mini",
|
132 |
-
api_key=self.openai_api_key
|
133 |
)
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
-
|
144 |
-
response = llm.predict(prompt.format(context="\n\n".join(contexts)))
|
145 |
-
|
146 |
-
# Split the response into paragraphs
|
147 |
-
summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
|
148 |
-
|
149 |
-
# Create structured output
|
150 |
-
structured_output = []
|
151 |
-
for idx, summary in enumerate(summaries):
|
152 |
-
# Associate each summary with the most relevant source
|
153 |
-
structured_output.append({
|
154 |
-
"content": summary,
|
155 |
-
"source": {
|
156 |
-
"page": sources[min(idx, len(sources)-1)]['page'],
|
157 |
-
"text": sources[min(idx, len(sources)-1)]['content'][:200] + "...",
|
158 |
-
"relevance_score": sources[min(idx, len(sources)-1)]['relevance_score']
|
159 |
-
}
|
160 |
-
})
|
161 |
-
|
162 |
-
return structured_output
|
163 |
-
|
164 |
-
def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
|
165 |
-
"""Get extended context around a specific chunk"""
|
166 |
-
metadata = self.chunk_metadata.get(chunk_id)
|
167 |
-
if not metadata:
|
168 |
-
return None
|
169 |
-
|
170 |
-
return {
|
171 |
-
'page': metadata['page'],
|
172 |
-
'start_char': metadata['start_char'],
|
173 |
-
'end_char': metadata['end_char']
|
174 |
-
}
|
175 |
|
176 |
def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
219 |
|
220 |
if __name__ == "__main__":
|
221 |
get_llm_summary_answer_by_cursor()
|
|
|
 import os
 from typing import List, Dict, Tuple
+from setup.easy_imports import (
+    HuggingFaceEmbeddings,
+    PyPDFLoader,
+    Chroma,
+    ChatOpenAI,
+    create_extraction_chain,
+    PromptTemplate,
+    RecursiveCharacterTextSplitter,
+)
 from dataclasses import dataclass
 import uuid
 import json
 from langchain_huggingface import HuggingFaceEndpoint
 from setup.environment import default_model
 
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
 os.environ.get("LANGCHAIN_API_KEY")
+os.environ["LANGCHAIN_PROJECT"] = "VELLA"
+
 
 @dataclass
 class DocumentChunk:
+    content: str
+    page_number: int
+    chunk_id: str
+    start_char: int
+    end_char: int
+
 
 class DocumentSummarizer:
+
+    def __init__(
+        self, openai_api_key: str, model, embedding, chunk_config, system_prompt
+    ):
+        self.model = model
+        self.system_prompt = system_prompt
+        self.openai_api_key = openai_api_key
+        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_config["size"], chunk_overlap=chunk_config["overlap"]
         )
+        self.chunk_metadata = {}  # Store chunk metadata for tracing
+
+    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
+        """Load PDF and split into chunks with metadata"""
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()
+        chunks = []
+        char_count = 0
+
+        for page in pages:
+            text = page.page_content
+            # Split the page content
+            page_chunks = self.text_splitter.split_text(text)
+
+            for chunk in page_chunks:
+                chunk_id = str(uuid.uuid4())
+                start_char = text.find(chunk)
+                end_char = start_char + len(chunk)
+
+                doc_chunk = DocumentChunk(
+                    content=chunk,
+                    page_number=page.metadata.get("page") + 1,  # 1-based page numbering
+                    chunk_id=chunk_id,
+                    start_char=char_count + start_char,
+                    end_char=char_count + end_char,
+                )
+                chunks.append(doc_chunk)
+
+                # Store metadata for later retrieval
+                self.chunk_metadata[chunk_id] = {
+                    "page": doc_chunk.page_number,
+                    "start_char": doc_chunk.start_char,
+                    "end_char": doc_chunk.end_char,
+                }
+
+            char_count += len(text)
+
+        return chunks
+
+    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
+        """Create vector store with metadata"""
+        texts = [chunk.content for chunk in chunks]
+        metadatas = [
+            {
+                "chunk_id": chunk.chunk_id,
+                "page": chunk.page_number,
+                "start_char": chunk.start_char,
+                "end_char": chunk.end_char,
+            }
+            for chunk in chunks
+        ]
+
+        vector_store = Chroma.from_texts(
+            texts=texts, metadatas=metadatas, embedding=self.embeddings
         )
+        return vector_store
+
+    def generate_summary_with_sources(
+        self,
+        vector_store: Chroma,
+        query: str = "Summarize the main points of this document",
+    ) -> List[Dict]:
+        """Generate summary with source citations, returning structured JSON data"""
+        # Retrieve relevant chunks with metadata
+        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
+
+        # Prepare context and track sources
+        contexts = []
+        sources = []
+
+        for doc, score in relevant_docs:
+            chunk_id = doc.metadata["chunk_id"]
+            context = doc.page_content
+            contexts.append(context)
+
+            sources.append(
+                {
+                    "content": context,
+                    "page": doc.metadata["page"],
+                    "chunk_id": chunk_id,
+                    "relevance_score": score,
+                }
+            )
+
+        prompt = PromptTemplate(
+            template=self.system_prompt, input_variables=["context"]
+        )
+        llm = ""
+
+        if self.model == default_model:
+            llm = ChatOpenAI(
+                temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
+            )
+        else:
+            llm = HuggingFaceEndpoint(
+                repo_id=self.model,
+                task="text-generation",
+                max_new_tokens=1100,
+                do_sample=False,
+                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
+            )
+
+        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content
+
+        # Split the response into paragraphs
+        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
+
+        # Create structured output
+        structured_output = []
+        for idx, summary in enumerate(summaries):
+            # Associate each summary with the most relevant source
+            structured_output.append(
+                {
+                    "content": summary,
+                    "source": {
+                        "page": sources[min(idx, len(sources) - 1)]["page"],
+                        "text": sources[min(idx, len(sources) - 1)]["content"][:200]
+                        + "...",
+                        "relevance_score": sources[min(idx, len(sources) - 1)][
+                            "relevance_score"
+                        ],
+                    },
+                }
+            )
+
+        return structured_output
+
+    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
+        """Get extended context around a specific chunk"""
+        metadata = self.chunk_metadata.get(chunk_id)
+        if not metadata:
+            return None
+
+        return {
+            "page": metadata["page"],
+            "start_char": metadata["start_char"],
+            "end_char": metadata["end_char"],
+        }
 
 
 def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
+    # By Luan
+    allPdfsChunks = []
+
+    # Initialize summarizer
+    summarizer = DocumentSummarizer(
+        openai_api_key=os.environ.get("OPENAI_API_KEY"),
+        embedding=serializer["hf_embedding"],
+        chunk_config={
+            "size": serializer["chunk_size"],
+            "overlap": serializer["chunk_overlap"],
+        },
+        system_prompt=serializer["system_prompt"],
+        model=serializer["model"],
+    )
+
+    # Load and process document
+    for pdf in listaPDFs:
+        pdf_path = pdf
+        chunks = summarizer.load_and_split_document(pdf_path)
+        allPdfsChunks = allPdfsChunks + chunks
+
+    vector_store = summarizer.create_vector_store(allPdfsChunks)
+
+    # Generate structured summary
+    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
+
+    # Print or return the structured data
+    # print(structured_summaries)
+    json_data = json.dumps(structured_summaries)
+    print("\n\n")
+    print(json_data)
+    return structured_summaries
+    # If you need to send to frontend, you can just return structured_summaries
+    # It will be in the format:
+    # [
+    #     {
+    #         "content": "Summary point 1...",
+    #         "source": {
+    #             "page": 1,
+    #             "text": "Source text...",
+    #             "relevance_score": 0.95
+    #         }
+    #     },
+    #     ...
+    # ]
+
 
 if __name__ == "__main__":
     get_llm_summary_answer_by_cursor()
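For orientation, a minimal sketch (not part of the commit) of how the refactored entry point above could be driven. The module path, PDF path, chunk sizes and embedding model are assumptions for illustration; only the dictionary keys and the default_model import come from the code above, and the system prompt must expose the {context} placeholder used by PromptTemplate.

import os
from setup.environment import default_model
from _utils.resumo_simples_cursor import get_llm_summary_answer_by_cursor  # hypothetical module path

# Illustrative configuration; assumes OPENAI_API_KEY is set in the environment.
serializer = {
    "model": default_model,  # routes to ChatOpenAI instead of HuggingFaceEndpoint
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # example embedding model
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "system_prompt": "Summarize the following context:\n\n{context}",
}
listaPDFs = ["/tmp/exemplo.pdf"]  # hypothetical file

for item in get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    print(item["source"]["page"], "-", item["content"][:80])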
_utils/splitters/Splitter_class.py ADDED
@@ -0,0 +1,100 @@
from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
from typing import List, Dict, Tuple, Optional
from _utils.models.gerar_relatorio import (
    DocumentChunk,
)
import uuid


class Splitter:
    def __init__(
        self,
        chunk_size,
        chunk_overlap,
    ):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing

    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load PDF and split into chunks with metadata"""
        loader = PyPDFLoader(pdf_path)
        pages = (
            loader.load()
        )  # Produces a list of Document objects, one item per full page of the PDF.
        chunks = []
        char_count = 0

        for page in pages:
            text = page.page_content
            page_chunks = self.text_splitter.split_text(
                text
            )  # Breaks the single-page Document into a list of chunks, each smaller than a page.

            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                start_char = text.find(
                    chunk
                )  # Position where the chunk starts within the full page
                end_char = start_char + len(chunk)

                doc_chunk = DocumentChunk(  # Build the chunk object with extra information such as its position and id
                    content=chunk,
                    page_number=page.metadata.get("page") + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }

            char_count += len(text)

        return chunks

    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Load text and split into chunks with metadata - created only for the RAGAS evaluation"""
        page = Document(page_content=text, metadata={"page": 1})
        chunks = []
        char_count = 0

        text = page.page_content
        page_chunks = self.text_splitter.split_text(
            text
        )  # Breaks the single-page Document into a list of chunks, each smaller than a page.
        print("\n\n\n")
        print("page_chunks: ", page_chunks)

        for chunk in page_chunks:
            chunk_id = str(uuid.uuid4())
            start_char = text.find(
                chunk
            )  # Position where the chunk starts within the full page
            end_char = start_char + len(chunk)

            doc_chunk = DocumentChunk(  # Build the chunk object with extra information such as its position and id
                content=chunk,
                page_number=page.metadata.get("page") + 1,  # 1-based page numbering
                chunk_id=chunk_id,
                start_char=char_count + start_char,
                end_char=char_count + end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }

        char_count += len(text)

        return chunks
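As a quick illustration (again not part of the commit), the new Splitter can be exercised on its own; the PDF path and chunk sizes below are hypothetical, the class, constructor arguments and attributes come from the file above.

from _utils.splitters.Splitter_class import Splitter

splitter = Splitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.load_and_split_document("/tmp/exemplo.pdf")  # hypothetical PDF

first = chunks[0]
print(first.page_number, first.start_char, first.end_char)
# The same positions are also kept in the tracing map:
print(splitter.chunk_metadata[first.chunk_id])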
_utils/vector_stores/Vector_store_class.py ADDED
@@ -0,0 +1,58 @@
from typing import List, Dict, Tuple, Optional
from _utils.models.gerar_relatorio import (
    ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
import logging


class VectorStore:
    def __init__(self, embedding_model):
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        pass

    def create_enhanced_vector_store(
        self, chunks: List[ContextualizedChunk], is_contextualized_chunk
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks"""
        try:
            # Prepare texts with context
            if is_contextualized_chunk:
                texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
            else:
                texts = [f"{chunk.content}" for chunk in chunks]

            # Create vector store
            metadatas = []
            for chunk in chunks:
                if is_contextualized_chunk:
                    context = chunk.context
                else:
                    context = ""
                metadatas.append(
                    {
                        "chunk_id": chunk.chunk_id,
                        "page": chunk.page_number,
                        "start_char": chunk.start_char,
                        "end_char": chunk.end_char,
                        "context": context,
                    }
                )

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Create BM25 index
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Get chunk IDs in order
            chunk_ids = [chunk.chunk_id for chunk in chunks]

            return vector_store, bm25, chunk_ids

        except Exception as e:
            self.logger.error(f"Error creating enhanced vector store: {str(e)}")
            raise

gerar_documento/serializer.py CHANGED
@@ -1,8 +1,8 @@
 from rest_framework import serializers
 from _antigos.resumos.serializer import ResumoCursorSerializer
 from _utils.gerar_relatorio_modelo_usuario.prompts import (
-    [two removed import names, not captured in the extracted diff view]
+    prompt_gerar_documento,
+    prompt_auxiliar,
 )
 
 user_message = "What are the main points of this document?"
@@ -10,10 +10,10 @@ user_message = "What are the main points of this document?"
 
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
     system_prompt = None
-    [two removed field definitions, not captured in the extracted diff view]
+    prompt_auxiliar = serializers.CharField(required=False, default=prompt_auxiliar)
+    prompt_gerar_documento = serializers.CharField(
+        required=False, default=prompt_gerar_documento
     )
-    prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)
     user_message = serializers.CharField(required=False, default=user_message)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)

gerar_documento/views.py CHANGED
@@ -1,7 +1,11 @@
-from ...  [four removed import lines, only partially captured in the extracted diff view]
+from setup.easy_imports import (
+    Response,
+    AsyncAPIView,
+    APIView,
+    MultiPartParser,
+    extend_schema,
+)
+from datetime import datetime
 from _utils.handle_files import handle_pdf_files_from_serializer, remove_pdf_temp_files
 from _utils.resumo_completo_cursor import (
     get_llm_summary_answer_by_cursor_complete,
@@ -9,9 +13,6 @@ from _utils.resumo_completo_cursor import (
 from .serializer import (
     ResumoCursorCompeltoSerializer,
 )
-from rest_framework.parsers import MultiPartParser
-from drf_spectacular.utils import extend_schema
-from datetime import datetime
 
 
 class ResumoSimplesCursorCompletoView(AsyncAPIView):
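The Vector_store_class.py added above returns both a Chroma store and a BM25 index, which suggests a hybrid retrieval step along these lines. This is a sketch under the assumption that contextualized_chunks is a list of ContextualizedChunk objects already produced elsewhere; the query string and k value are made up.

from _utils.vector_stores.Vector_store_class import VectorStore

vs = VectorStore(embedding_model="sentence-transformers/all-MiniLM-L6-v2")
vector_store, bm25, chunk_ids = vs.create_enhanced_vector_store(
    contextualized_chunks, is_contextualized_chunk=True
)

query = "main points of the case"
dense_hits = vector_store.similarity_search_with_score(query, k=5)  # embedding similarity
bm25_scores = bm25.get_scores(query.split())  # lexical scores, aligned with chunk_ids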
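Call sites can then pull their third-party dependencies from the new setup/easy_imports.py shown below instead of importing each library directly; for example (illustrative only, the PDF path is hypothetical):

from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pages = PyPDFLoader("/tmp/exemplo.pdf").load()  # hypothetical PDF
docs = splitter.split_documents(pages)
print(len(docs), "chunks")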
setup/easy_imports.py ADDED
@@ -0,0 +1,22 @@
from adrf.views import APIView as AsyncAPIView
from drf_spectacular.utils import extend_schema

from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.parsers import MultiPartParser

from langchain.text_splitter import RecursiveCharacterTextSplitter

# from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma

# from langchain_community.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from langchain.chains import create_extraction_chain

from rank_bm25 import BM25Okapi
|