luanpoppe committed on
Commit
1286e81
·
1 Parent(s): c66a7e7

fix: temporarily adding an option to not use the contextualized chunks

_utils/gerar_relatorio_modelo_usuario/ContextualRetriever_original.py ADDED
@@ -0,0 +1,96 @@
+ from typing import List, Dict, Tuple, Optional
+ from anthropic import Anthropic
+ import logging
+ from _utils.models.gerar_relatorio import (
+     ContextualizedChunk,
+     DocumentChunk,
+     RetrievalConfig,
+ )
+ from langchain.schema import Document
+
+
+ class ContextualRetriever:
+
+     def __init__(
+         self, config: RetrievalConfig, claude_api_key: str, claude_context_model
+     ):
+         self.config = config  # self.config is currently not used for anything inside this class. Review whether it should be.
+         self.claude_client = Anthropic(api_key=claude_api_key)
+         self.logger = logging.getLogger(__name__)
+         self.bm25 = None
+         self.claude_context_model = claude_context_model
+
+     def generate_context(self, full_text: str, chunk: DocumentChunk) -> str:
+         """Generate contextual description using Claude"""
+         try:
+             # prompt = f"""<document>
+             # {full_text}
+             # </document>
+             # Here is the chunk we want to situate within the whole document
+             # <chunk>
+             # {chunk.content}
+             # </chunk>
+             # Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
+
+             prompt = f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
+ 1. **Chain of Thought (internal)**:
+ - Identify the document ID, which is the value between "NUM." and "- Pág".
+ - Identify the document name from the header.
+ 2. **Reflection (internal)**:
+ - Confirm the document ID and name are correctly identified.
+ - Ensure the final context is concise and helpful.
+ 3. **Final Response**:
+ - Provide a short context situating the *chunk* within the document, including the document ID and document name.
+ - Do not include any reasoning or reflection in your response.
+ **Example Usage:**
+ ```
+ <document> {full_text} </document>
+ <chunk> {chunk.content} </chunk>
+ Please return only the succinct context (without displaying your internal reasoning), including the document ID and the document name.
+ ```
+ """
+
+             response = self.claude_client.messages.create(
+                 model=self.claude_context_model,
+                 max_tokens=100,
+                 messages=[{"role": "user", "content": prompt}],
+             )
+             return response.content[
+                 0
+             ].text  # response.content is a list of content blocks; the first item holds the model's reply text
+         except Exception as e:
+             self.logger.error(
+                 f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
+             )
+             return ""
+
+     def contextualize_all_chunks(
+         self, full_text: List[Document], chunks: List[DocumentChunk]
+     ) -> List[
+         ContextualizedChunk
+     ]:  # Takes each chunk and just adds a context property to it; the context is the reply of the function above, which asks a Claude model to describe the chunk's context
+         """Add context to all chunks"""
+
+         smaller_context = ""
+         contextualized_chunks = []
+         print("\n\n")
+         print("len(chunks): ", len(chunks))
+         for chunk in chunks:
+             contador_pagina = -1
+             while contador_pagina <= 1:
+                 local_page = full_text[chunk.page_number + contador_pagina]
+                 if local_page:
+                     smaller_context += local_page.page_content
+                 contador_pagina += 1
+             print("chunk.page_number: ", chunk.page_number)
+             context = self.generate_context(smaller_context, chunk)
+             contextualized_chunk = ContextualizedChunk(
+                 content=chunk.content,
+                 page_number=chunk.page_number,
+                 chunk_id=chunk.chunk_id,
+                 start_char=chunk.start_char,
+                 end_char=chunk.end_char,
+                 context=context,
+             )
+             contextualized_chunks.append(contextualized_chunk)
+         return contextualized_chunks
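
A minimal usage sketch for the preserved class above (module path follows the file added in this commit; the API key, model name, and page contents are placeholders, not part of the commit):

```python
from langchain.schema import Document

from _utils.gerar_relatorio_modelo_usuario.ContextualRetriever_original import (
    ContextualRetriever,
)
from _utils.models.gerar_relatorio import DocumentChunk, RetrievalConfig

# Three fake pages so that a chunk on page_number=1 can look one page back and one ahead.
pages = [Document(page_content=f"NUM. 123 - Pág {i} ...") for i in range(3)]
chunk = DocumentChunk(
    content="trecho do processo", page_number=1, chunk_id="c1", start_char=0, end_char=18
)

retriever = ContextualRetriever(RetrievalConfig(), "sk-ant-...", "claude-3-5-sonnet-20241022")
enriched = retriever.contextualize_all_chunks(pages, [chunk])
print(enriched[0].context)
```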
_utils/gerar_relatorio_modelo_usuario/DocumentSummarizer_simples.py ADDED
@@ -0,0 +1,273 @@
+ import os
+ from typing import List, Dict, Tuple, Optional
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain.prompts import PromptTemplate
+ import uuid
+ import logging
+ from cohere import Client
+ from _utils.models.gerar_relatorio import (
+     DocumentChunk,
+ )
+ from langchain.schema import Document
+
+
+ class DocumentSummarizer:
+     def __init__(
+         self,
+         openai_api_key: str,
+         cohere_api_key: str,
+         embedding_model,
+         chunk_size,
+         chunk_overlap,
+         num_k_rerank,
+         model_cohere_rerank,
+     ):
+         self.openai_api_key = openai_api_key
+         self.cohere_client = Client(cohere_api_key)
+         self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size, chunk_overlap=chunk_overlap
+         )
+         self.chunk_metadata = {}  # Store chunk metadata for tracing
+         self.num_k_rerank = num_k_rerank
+         self.model_cohere_rerank = model_cohere_rerank
+
+     def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
+         """Load PDF and split into chunks with metadata"""
+         loader = PyPDFLoader(pdf_path)
+         pages = (
+             loader.load()
+         )  # Produces a list of Document objects, each item corresponding to ONE full page of the PDF.
+         chunks = []
+         char_count = 0
+
+         for page in pages:
+             text = page.page_content
+             page_chunks = self.text_splitter.split_text(
+                 text
+             )  # Splits the full-page Document into a list where each item is a chunk, i.e. a piece smaller than a page.
+
+             for chunk in page_chunks:
+                 chunk_id = str(uuid.uuid4())
+                 start_char = text.find(
+                     chunk
+                 )  # Returns the position of the chunk within the full page
+                 end_char = start_char + len(chunk)
+
+                 doc_chunk = DocumentChunk(  # Builds the chunk object with extra information such as its position and id
+                     content=chunk,
+                     page_number=page.metadata.get("page") + 1,  # 1-based page numbering
+                     chunk_id=chunk_id,
+                     start_char=char_count + start_char,
+                     end_char=char_count + end_char,
+                 )
+                 chunks.append(doc_chunk)
+
+                 # Store metadata for later retrieval
+                 self.chunk_metadata[chunk_id] = {
+                     "page": doc_chunk.page_number,
+                     "start_char": doc_chunk.start_char,
+                     "end_char": doc_chunk.end_char,
+                 }
+
+             char_count += len(text)
+
+         return chunks
+
+     def load_and_split_text(self, text: str) -> List[DocumentChunk]:
+         """Load text and split into chunks with metadata - created this function only for ragas"""
+         page = Document(page_content=text, metadata={"page": 1})
+         chunks = []
+         char_count = 0
+
+         text = page.page_content
+         page_chunks = self.text_splitter.split_text(
+             text
+         )  # Splits the full-page Document into a list where each item is a chunk, i.e. a piece smaller than a page.
+         print("\n\n\n")
+         print("page_chunks: ", page_chunks)
+
+         for chunk in page_chunks:
+             chunk_id = str(uuid.uuid4())
+             start_char = text.find(
+                 chunk
+             )  # Returns the position of the chunk within the full page
+             end_char = start_char + len(chunk)
+
+             doc_chunk = DocumentChunk(  # Builds the chunk object with extra information such as its position and id
+                 content=chunk,
+                 page_number=page.metadata.get("page") + 1,  # 1-based page numbering
+                 chunk_id=chunk_id,
+                 start_char=char_count + start_char,
+                 end_char=char_count + end_char,
+             )
+             chunks.append(doc_chunk)
+
+             # Store metadata for later retrieval
+             self.chunk_metadata[chunk_id] = {
+                 "page": doc_chunk.page_number,
+                 "start_char": doc_chunk.start_char,
+                 "end_char": doc_chunk.end_char,
+             }
+
+             char_count += len(text)
+
+         return chunks
+
+     def create_vector_store(
+         self, chunks: List[DocumentChunk]
+     ) -> Chroma:  # This function is never used
+         """Create vector store with metadata"""
+         texts = [chunk.content for chunk in chunks]
+         metadatas = [
+             {
+                 "chunk_id": chunk.chunk_id,
+                 "page": chunk.page_number,
+                 "start_char": chunk.start_char,
+                 "end_char": chunk.end_char,
+             }
+             for chunk in chunks
+         ]
+
+         vector_store = Chroma.from_texts(
+             texts=texts, metadatas=metadatas, embedding=self.embeddings
+         )
+         return vector_store
+
+     def rerank_chunks(  # This function is never used
+         self, chunks: List[Dict], query: str, k: int = 5
+     ) -> List[Dict]:
+         """
+         Rerank chunks using Cohere's reranking model.
+
+         Args:
+             chunks: List of dictionaries containing chunks and their metadata
+             query: Original search query
+             k: Number of top chunks to return
+
+         Returns:
+             List of reranked chunks with updated relevance scores
+         """
+         try:
+             # Prepare documents for reranking
+             documents = [chunk["content"] for chunk in chunks]
+
+             # Get reranking scores from Cohere
+             results = self.cohere_client.rerank(
+                 query=query,
+                 documents=documents,
+                 top_n=k,
+                 model=self.model_cohere_rerank,
+             )
+
+             # Create reranked results with original metadata
+             reranked_chunks = []
+             for hit in results:
+                 original_chunk = chunks[hit.index]
+                 reranked_chunks.append(
+                     {**original_chunk, "relevance_score": hit.relevance_score}
+                 )
+
+             return reranked_chunks
+
+         except Exception as e:
+             logging.error(f"Reranking failed: {str(e)}")
+             return chunks[:k]  # Fallback to original ordering
+
+     def generate_summary_with_sources(  # This function is never used
+         self,
+         vector_store: Chroma,
+         query: str = "Summarize the main points of this document",
+     ) -> List[Dict]:
+         """Generate summary with source citations using reranking"""
+         # Retrieve more initial chunks for reranking
+         relevant_docs = vector_store.similarity_search_with_score(query, k=20)
+
+         # Prepare chunks for reranking
+         chunks = []
+         for doc, score in relevant_docs:
+             chunks.append(
+                 {
+                     "content": doc.page_content,
+                     "page": doc.metadata["page"],
+                     "chunk_id": doc.metadata["chunk_id"],
+                     "relevance_score": score,
+                 }
+             )
+
+         # Rerank chunks
+         reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)
+
+         # Prepare context and sources from reranked chunks
+         contexts = []
+         sources = []
+
+         for chunk in reranked_chunks:
+             contexts.append(chunk["content"])
+             sources.append(
+                 {
+                     "content": chunk["content"],
+                     "page": chunk["page"],
+                     "chunk_id": chunk["chunk_id"],
+                     "relevance_score": chunk["relevance_score"],
+                 }
+             )
+
+         prompt_template = """
+         Based on the following context, provide multiple key points from the document.
+         For each point, create a new paragraph.
+         Each paragraph should be a complete, self-contained insight.
+
+         Context: {context}
+
+         Key points:
+         """
+
+         prompt = PromptTemplate(template=prompt_template, input_variables=["context"])
+
+         llm = ChatOpenAI(
+             temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
+         )
+
+         response = llm.predict(prompt.format(context="\n\n".join(contexts)))
+
+         # Split the response into paragraphs
+         summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
+
+         # Create structured output
+         structured_output = []
+         for idx, summary in enumerate(summaries):
+             # Associate each summary with the most relevant source
+             structured_output.append(
+                 {
+                     "content": summary,
+                     "source": {
+                         "page": sources[min(idx, len(sources) - 1)]["page"],
+                         "text": sources[min(idx, len(sources) - 1)]["content"][:200]
+                         + "...",
+                         "relevance_score": sources[min(idx, len(sources) - 1)][
+                             "relevance_score"
+                         ],
+                     },
+                 }
+             )
+
+         return structured_output
+
+     def get_source_context(
+         self, chunk_id: str, window: int = 100
+     ) -> Dict:  # This function is never used
+         """Get extended context around a specific chunk"""
+         metadata = self.chunk_metadata.get(chunk_id)
+         if not metadata:
+             return None
+
+         return {
+             "page": metadata["page"],
+             "start_char": metadata["start_char"],
+             "end_char": metadata["end_char"],
+         }
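
A short driver sketch for the splitting path above (the PDF path, embedding model, and rerank model names are illustrative, not part of this commit):

```python
import os

from _utils.gerar_relatorio_modelo_usuario.DocumentSummarizer_simples import (
    DocumentSummarizer,
)

summarizer = DocumentSummarizer(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    cohere_api_key=os.environ.get("COHERE_API_KEY"),
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # illustrative
    chunk_size=1000,
    chunk_overlap=200,
    num_k_rerank=5,
    model_cohere_rerank="rerank-multilingual-v3.0",  # illustrative
)

chunks = summarizer.load_and_split_document("processo.pdf")  # hypothetical file
print(len(chunks), chunks[0].page_number, chunks[0].chunk_id)
```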
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py ADDED
@@ -0,0 +1,260 @@
+ import os
+ from typing import List, Dict, Tuple, Optional
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain.chains import create_extraction_chain
+ from langchain.prompts import PromptTemplate
+ from rank_bm25 import BM25Okapi
+ import logging
+ import requests
+ from _utils.gerar_relatorio_modelo_usuario.DocumentSummarizer_simples import (
+     DocumentSummarizer,
+ )
+ from _utils.models.gerar_relatorio import (
+     ContextualizedChunk,
+     RetrievalConfig,
+ )
+ from setup.environment import api_url
+ from rest_framework.response import Response
+ from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
+     ContextualRetriever,
+ )
+
+
+ class EnhancedDocumentSummarizer(DocumentSummarizer):
+     def __init__(
+         self,
+         openai_api_key: str,
+         claude_api_key: str,
+         config: RetrievalConfig,
+         embedding_model,
+         chunk_size,
+         chunk_overlap,
+         num_k_rerank,
+         model_cohere_rerank,
+         claude_context_model,
+         prompt_relatorio,
+         gpt_model,
+         gpt_temperature,
+         id_modelo_do_usuario,
+         prompt_modelo,
+         reciprocal_rank_fusion,
+     ):
+         super().__init__(
+             openai_api_key,
+             os.environ.get("COHERE_API_KEY"),
+             embedding_model,
+             chunk_size,
+             chunk_overlap,
+             num_k_rerank,
+             model_cohere_rerank,
+         )
+         self.config = config
+         self.contextual_retriever = ContextualRetriever(
+             config, claude_api_key, claude_context_model
+         )
+         self.logger = logging.getLogger(__name__)
+         self.prompt_relatorio = prompt_relatorio
+         self.gpt_model = gpt_model
+         self.gpt_temperature = gpt_temperature
+         self.id_modelo_do_usuario = id_modelo_do_usuario
+         self.prompt_modelo = prompt_modelo
+         self.reciprocal_rank_fusion = reciprocal_rank_fusion
+
+     def create_enhanced_vector_store(
+         self, chunks: List[ContextualizedChunk], is_contextualized_chunk
+     ) -> Tuple[Chroma, BM25Okapi, List[str]]:
+         """Create vector store and BM25 index with contextualized chunks"""
+         try:
+             # Prepare texts with context
+             if is_contextualized_chunk:
+                 texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
+             else:
+                 texts = [f"{chunk.content}" for chunk in chunks]
+
+             # Create vector store
+             metadatas = []
+             for chunk in chunks:
+                 if is_contextualized_chunk:
+                     context = chunk.context
+                 else:
+                     context = ""
+                 metadatas.append(
+                     {
+                         "chunk_id": chunk.chunk_id,
+                         "page": chunk.page_number,
+                         "start_char": chunk.start_char,
+                         "end_char": chunk.end_char,
+                         "context": context,
+                     }
+                 )
+
+             vector_store = Chroma.from_texts(
+                 texts=texts, metadatas=metadatas, embedding=self.embeddings
+             )
+
+             # Create BM25 index
+             tokenized_texts = [text.split() for text in texts]
+             bm25 = BM25Okapi(tokenized_texts)
+
+             # Get chunk IDs in order
+             chunk_ids = [chunk.chunk_id for chunk in chunks]
+
+             return vector_store, bm25, chunk_ids
+
+         except Exception as e:
+             self.logger.error(f"Error creating enhanced vector store: {str(e)}")
+             raise
+
+     def retrieve_with_rank_fusion(
+         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
+     ) -> List[Dict]:
+         """Combine embedding and BM25 retrieval results"""
+         try:
+             # Get embedding results
+             embedding_results = vector_store.similarity_search_with_score(
+                 query, k=self.config.num_chunks
+             )
+
+             # Convert embedding results to list of (chunk_id, score)
+             embedding_list = [
+                 (doc.metadata["chunk_id"], 1 / (1 + score))
+                 for doc, score in embedding_results
+             ]
+
+             # Get BM25 results
+             tokenized_query = query.split()
+             bm25_scores = bm25.get_scores(tokenized_query)
+
+             # Convert BM25 scores to list of (chunk_id, score)
+             bm25_list = [
+                 (chunk_ids[i], float(score)) for i, score in enumerate(bm25_scores)
+             ]
+
+             # Sort bm25_list by score in descending order and limit to top N results
+             bm25_list = sorted(bm25_list, key=lambda x: x[1], reverse=True)[
+                 : self.config.num_chunks
+             ]
+
+             # Normalize BM25 scores
+             calculo_max = max(
+                 [score for _, score in bm25_list]
+             )  # Added this max() because scores of 0 sometimes came through and triggered a division-by-zero error
+             max_bm25 = calculo_max if bm25_list and calculo_max else 1
+             bm25_list = [(doc_id, score / max_bm25) for doc_id, score in bm25_list]
+
+             # Pass the lists to rank fusion
+             result_lists = [embedding_list, bm25_list]
+             weights = [self.config.embedding_weight, self.config.bm25_weight]
+
+             combined_results = self.reciprocal_rank_fusion(
+                 result_lists, weights=weights
+             )
+
+             return combined_results
+
+         except Exception as e:
+             self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
+             raise
+
+     def generate_enhanced_summary(
+         self,
+         vector_store: Chroma,
+         bm25: BM25Okapi,
+         chunk_ids: List[str],
+         query: str = "Summarize the main points of this document",
+     ) -> List[Dict]:
+         """Generate enhanced summary using both vector and BM25 retrieval"""
+         try:
+             # Get combined results using rank fusion
+             ranked_results = self.retrieve_with_rank_fusion(
+                 vector_store, bm25, chunk_ids, query
+             )
+
+             # Prepare context and track sources
+             contexts = []
+             sources = []
+
+             # Get full documents for top results
+             for chunk_id, score in ranked_results[: self.config.num_chunks]:
+                 results = vector_store.get(
+                     where={"chunk_id": chunk_id}, include=["documents", "metadatas"]
+                 )
+
+                 if results["documents"]:
+                     context = results["documents"][0]
+                     metadata = results["metadatas"][0]
+
+                     contexts.append(context)
+                     sources.append(
+                         {
+                             "content": context,
+                             "page": metadata["page"],
+                             "chunk_id": chunk_id,
+                             "relevance_score": score,
+                             "context": metadata.get("context", ""),
+                         }
+                     )
+
+             url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
+             resposta = requests.get(url_request)
+
+             if resposta.status_code != 200:
+                 return Response(
+                     {
+                         "error": "Ocorreu um problema. Pode ser que o modelo não tenha sido encontrado. Tente novamente e/ou entre em contato com a equipe técnica"
+                     }
+                 )
+
+             modelo_buscado = resposta.json()["modelo"]
+
+             llm = ChatOpenAI(
+                 temperature=self.gpt_temperature,
+                 model_name=self.gpt_model,
+                 api_key=self.openai_api_key,
+             )
+
+             prompt_gerar_relatorio = PromptTemplate(
+                 template=self.prompt_relatorio, input_variables=["context"]
+             )
+
+             relatorio_gerado = llm.predict(
+                 prompt_gerar_relatorio.format(context="\n\n".join(contexts))
+             )
+
+             prompt_gerar_modelo = PromptTemplate(
+                 template=self.prompt_modelo,
+                 input_variables=["context", "modelo_usuario"],
+             )
+
+             modelo_gerado = llm.predict(
+                 prompt_gerar_modelo.format(
+                     context=relatorio_gerado, modelo_usuario=modelo_buscado
+                 )
+             )
+
+             # Split the response into paragraphs
+             summaries = [p.strip() for p in modelo_gerado.split("\n\n") if p.strip()]
+
+             # Create structured output
+             structured_output = []
+             for idx, summary in enumerate(summaries):
+                 source_idx = min(idx, len(sources) - 1)
+                 structured_output.append(
+                     {
+                         "content": summary,
+                         "source": {
+                             "page": sources[source_idx]["page"],
+                             "text": sources[source_idx]["content"][:200] + "...",
+                             "context": sources[source_idx]["context"],
+                             "relevance_score": sources[source_idx]["relevance_score"],
+                             "chunk_id": sources[source_idx]["chunk_id"],
+                         },
+                     }
+                 )
+
+             return structured_output
+
+         except Exception as e:
+             self.logger.error(f"Error generating enhanced summary: {str(e)}")
+             raise
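
The new `is_contextualized_chunk` flag is what the commit message refers to: it lets callers temporarily skip the Claude-generated context. A sketch of both paths, assuming `summarizer` and `chunks` were built as in the previous files:

```python
# With contextualization: indexed texts are "<context> <content>" and the
# metadata keeps each chunk's generated context.
vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(chunks, True)

# Without contextualization (the temporary option): raw chunk content only,
# with an empty "context" field in the metadata.
vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(chunks, False)
```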
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py ADDED
@@ -0,0 +1,127 @@
+ import os
+ from langchain_openai import ChatOpenAI
+ from typing import List, Dict, Tuple, Optional
+ from anthropic import Anthropic
+ import logging
+ from langchain.schema import Document
+ import asyncio
+ from langchain.prompts import PromptTemplate
+ from typing import List
+ from multiprocessing import Process, Barrier, Queue
+ from dataclasses import dataclass
+ from langchain_core.messages import HumanMessage
+
+ from _utils.gerar_relatorio_modelo_usuario.llm_calls import claude_answer, gpt_answer
+ from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
+ from _utils.models.gerar_relatorio import (
+     ContextualizedChunk,
+     DocumentChunk,
+     RetrievalConfig,
+ )
+
+ lista_contador = []
+
+
+ def task(name, barrier, queue, chunk, full_text, config, claude_context_model):
+     """Standalone function to process one chunk."""
+     print(f"Process {name} ready")
+     barrier.wait()  # Wait for all processes to be ready
+     retriever = ContextualRetriever(config, None, claude_context_model)
+     result = retriever.create_contextualized_chunk(chunk, full_text)
+     queue.put(result)  # Store the result in the queue
+
+
+ class ContextualRetriever:
+     def __init__(
+         self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
+     ):
+         self.config = config
+         self.claude_client = Anthropic(api_key=claude_api_key)
+         self.logger = logging.getLogger(__name__)
+         self.bm25 = None
+         self.claude_context_model = claude_context_model
+
+     def llm_generate_context(self, full_text: str, chunk: DocumentChunk) -> str:
+         """Generate contextual description using ChatOpenAI"""
+         try:
+             prompt = contextual_prompt(full_text, chunk.content)
+             print("REQUEST STARTED")
+             # response = claude_answer(self.claude_client, self.claude_context_model, prompt)
+             response = gpt_answer(prompt)
+             return response
+         except Exception as e:
+             self.logger.error(
+                 f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
+             )
+             return ""
+
+     def create_contextualized_chunk(self, chunk, full_text):
+         lista_contador.append(0)
+         print("contador: ", len(lista_contador))
+         page_content = ""
+         # Concatenate the chunk's page plus one page before and after, clamped to the document bounds.
+         for i in range(
+             max(0, chunk.page_number - 1),
+             min(len(full_text), chunk.page_number + 2),
+         ):
+             page_content += full_text[i].page_content if full_text[i] else ""
+
+         context = self.llm_generate_context(page_content, chunk)
+         return ContextualizedChunk(
+             content=chunk.content,
+             page_number=chunk.page_number,
+             chunk_id=chunk.chunk_id,
+             start_char=chunk.start_char,
+             end_char=chunk.end_char,
+             context=context,
+         )
+
+     def contextualize_all_chunks(
+         self, full_text: List[Document], chunks: List[DocumentChunk]
+     ) -> List[ContextualizedChunk]:
+         """Add context to all chunks"""
+         contextualized_chunks = []
+
+         # tasks = [create_contextualized_chunk(chunk) for chunk in chunks]
+         # contextualized_chunks = await asyncio.gather(*tasks)
+
+         contextualized_chunks = self.main(chunks, full_text)
+
+         return contextualized_chunks
+
+     # def task(self, name, barrier, queue, chunk, full_text):
+     #     print(f"Process {name} ready")
+     #     barrier.wait()  # Wait for all processes to be ready
+     #     result = self.create_contextualized_chunk(chunk, full_text)
+     #     queue.put(result)  # Store the result in the queue
+
+     def main(self, chunks, full_text):
+         barrier = Barrier(1)
+         queue = Queue()
+         processes = []
+
+         for i in range(len(chunks)):
+             p = Process(
+                 target=task,
+                 args=(
+                     f"P{i+1}",
+                     barrier,
+                     queue,
+                     chunks[i],
+                     full_text,
+                     self.config,
+                     self.claude_context_model,
+                 ),
+             )
+             processes.append(p)
+             p.start()
+
+         results = []
+         for p in processes:
+             p.join()
+
+         # Collect results from the queue (read each item exactly once)
+         while not queue.empty():
+             result = queue.get()
+             print("queue.get(): ", result)
+             results.append(result)
+
+         return results
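
A sketch of driving the per-chunk multiprocessing above (inputs are placeholders; the `__main__` guard matters because spawn-based platforms re-import the module in every worker):

```python
from langchain.schema import Document

from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    ContextualRetriever,
)
from _utils.models.gerar_relatorio import DocumentChunk, RetrievalConfig

if __name__ == "__main__":
    pages = [Document(page_content=f"NUM. 123 - Pág {i} ...") for i in range(3)]
    chunks = [
        DocumentChunk(content="trecho", page_number=1, chunk_id="c1", start_char=0, end_char=6)
    ]
    # Mirrors task(), which passes None for the Claude key; the Anthropic client
    # then falls back to the ANTHROPIC_API_KEY env var, and llm_generate_context
    # currently routes through gpt_answer anyway (OPENAI_API_KEY must be set).
    retriever = ContextualRetriever(RetrievalConfig(), None, "claude-3-5-sonnet-20241022")
    print(retriever.contextualize_all_chunks(pages, chunks))
```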
_utils/gerar_relatorio_modelo_usuario/llm_calls.py ADDED
@@ -0,0 +1,24 @@
+ import os
+ from langchain_core.messages import HumanMessage
+ from langchain_openai import ChatOpenAI
+
+
+ def claude_answer(claude_client, claude_context_model, prompt):
+     response = claude_client.messages.create(
+         model=claude_context_model,
+         max_tokens=100,
+         messages=[{"role": "user", "content": prompt}],
+     )
+     return response.content[
+         0
+     ].text  # response.content is a list of content blocks; the first item holds the model's reply text
+
+
+ def gpt_answer(prompt):
+     gpt = ChatOpenAI(
+         temperature=0,
+         model="gpt-4o-mini",
+         api_key=os.environ.get("OPENAI_API_KEY"),
+     )
+     response = gpt.invoke([HumanMessage(content=prompt)])
+     return response.content
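
Usage sketch (assumes OPENAI_API_KEY is exported; note that `gpt_answer` constructs a fresh ChatOpenAI instance on every call, which is convenient for worker processes but adds overhead in a loop):

```python
from _utils.gerar_relatorio_modelo_usuario.llm_calls import gpt_answer

print(gpt_answer("Answer with one word: what is the capital of Brazil?"))
```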
_utils/gerar_relatorio_modelo_usuario/prompts.py ADDED
@@ -0,0 +1,111 @@
+ def contextual_prompt(full_text, chunk_content):
+     return f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
+ 1. **Chain of Thought (internal)**:
+ - Identify the document ID, which is the value between "NUM." and "- Pág".
+ - Identify the document name from the header.
+ 2. **Reflection (internal)**:
+ - Confirm the document ID and name are correctly identified.
+ - Ensure the final context is concise and helpful.
+ 3. **Final Response**:
+ - Provide a short context situating the *chunk* within the document, including the document ID and document name.
+ - Do not include any reasoning or reflection in your response.
+ **Example Usage:**
+ ```
+ <document> {full_text} </document>
+ <chunk> {chunk_content} </chunk>
+ Please return only the succinct context (without displaying your internal reasoning), including the document ID and the document name.
+ ```
+ """
+
+
+ # system_prompt_modelo = """
+ # Based on the following context, provide multiple key points from the document.
+ # For each point, create a new paragraph.
+ # Each paragraph should be a complete, self-contained insight.
+ # Include any relevant context provided.
+
+ # Context: {context}
+
+ # User model: {modelo_usuario}
+
+ # Key points:
+ # """
+ system_prompt_modelo = """
+ You are a large language model that must produce a single final sentence in **Portuguese**. To do this, you will follow a private chain of thought and then produce a final answer. The final answer must follow the formatting and stylistic conventions shown in the user-provided model `user's template`. The information to be included in the final sentence is derived from the `context` (a report describing a legal case).
+ **Contextual Information (provided separately):**
+ {context}
+ **User Model (provided separately):**
+ {modelo_usuario}
+ **Instructions:**
+ 1. **Goal:** Produce one single final sentence in Portuguese that matches the structure, format, and style given by `user's template`.
+ 2. **Chain of Thought (private to the assistant and not to be shown in the final answer):**
+ - Carefully review the `context` which is a legal report of a case.
+ - Identify:
+ - The defendant's name.
+ - The crime's name, its article, and any subsection (inciso).
+ - The date of receipt of the complaint (data do recebimento da denúncia).
+ - The document ID.
+ - Ensure these elements are correctly incorporated into the final sentence.
+ - Check compliance with the formatting style indicated by `user's template`.
+ - Compose the sentence following the structure from the user model.
+ - Use reflection: Before finalizing the answer, reassess if all required information is included, if the format matches the user model, and if the sentence is written correctly in Portuguese.
+ 3. **Reflection Technique (private):**
+ After composing the sentence, but before presenting it as the final answer, reflect if:
+ - All required details from the `context` are accurately included.
+ - The sentence format strictly matches the pattern of `user's template`.
+ - The sentence is grammatically correct in Portuguese.
+ 4. **Final Answer:**
+ - After completing the chain of thought and ensuring correctness through reflection, present only the final sentence in Portuguese.
+ - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
+ """
+
+ # system_prompt_relatorio = """
+ # Based on the following context, provide multiple key points from the document.
+ # For each point, create a new paragraph.
+ # Each paragraph should be a complete, self-contained insight.
+ # Include any relevant context provided.
+
+ # Context: {context}
+
+ # Key points:
+ # """
+ system_prompt_relatorio = """
+ You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+ **Instructions**:
+ 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+ 2. **Reading the Context**: Extract the following information from `context`:
+ - The name of the defendant (réu).
+ - The crime they have been accused of (nome_do_crime).
+ - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+ - The date the accusation was accepted (data_do_recebimento).
+ - The ID of the decision document (id_do_documento).
+ 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+ 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+ ```
+ <formato>
+ Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+ A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+ Não há outras causas interruptivas ou suspensivas da prescrição.
+ </formato>
+ ```
+ 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+ **Reminder**:
+ - Do not include your chain of thought in the final output.
+ - Do not add extra information or commentary beyond the specified format.
+ - The final answer must be in Portuguese.
+ ---
+
+ **Contextual Information (provided separately):**
+ {context}
+ ---
+ **Example with a given context**:
+ - Input:
+ `context` = "Em 10/03/2021, o Ministério Público denunciou João da Silva, imputando-lhe o crime de furto qualificado, previsto no art. 155, §4º, inciso II, do Código Penal. A denúncia foi recebida em 12/03/2021, conforme Decisão nº 20210312-01. Não há menção a qualquer causa interruptiva ou suspensiva da prescrição."
+ - Expected final answer:
+ ```
+ <formato>
+ Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+ A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+ Não há outras causas interruptivas ou suspensivas da prescrição.
+ </formato>
+ ```
+ """
_utils/models/gerar_relatorio.py ADDED
@@ -0,0 +1,29 @@
+ from typing import List, Dict, Tuple, Optional
+ from dataclasses import dataclass
+ import numpy as np
+
+
+ @dataclass
+ class DocumentChunk:
+     content: str
+     page_number: int
+     chunk_id: str
+     start_char: int
+     end_char: int
+
+
+ @dataclass
+ class RetrievalConfig:
+     num_chunks: int = 5
+     embedding_weight: float = 0.5
+     bm25_weight: float = 0.5
+     context_window: int = 3
+     chunk_overlap: int = 200
+     chunk_size: int = 1000
+
+
+ @dataclass
+ class ContextualizedChunk(DocumentChunk):
+     context: str = ""
+     embedding: Optional[np.ndarray] = None
+     bm25_score: Optional[float] = None
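
A quick construction sketch for the dataclasses above; ContextualizedChunk inherits DocumentChunk's required fields, so those must be filled first:

```python
from _utils.models.gerar_relatorio import ContextualizedChunk, RetrievalConfig

config = RetrievalConfig(num_chunks=10, embedding_weight=0.7, bm25_weight=0.3)
chunk = ContextualizedChunk(
    content="trecho do documento",
    page_number=2,
    chunk_id="7f3a...",  # normally a uuid4 string
    start_char=1200,
    end_char=1219,
    context="Documento NUM. 123: situa o trecho dentro da Decisão.",
)
```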
_utils/ragas.py ADDED
@@ -0,0 +1,168 @@
+ import os
+ from langchain_community.document_loaders import PyPDFLoader
+ from _utils.resumo_completo_cursor import EnhancedDocumentSummarizer, RetrievalConfig
+ from rest_framework.response import Response
+ from ragas import evaluate
+
+ from langchain.chains import SequentialChain
+ from langchain.prompts import PromptTemplate
+
+ # from langchain.schema import ChainResult
+ from langchain.memory import SimpleMemory
+
+
+ def test_ragas(serializer, listaPDFs):
+
+     # Step 2: Setup RetrievalConfig and EnhancedDocumentSummarizer
+     config = RetrievalConfig(
+         num_chunks=serializer["num_chunks_retrieval"],
+         embedding_weight=serializer["embedding_weight"],
+         bm25_weight=serializer["bm25_weight"],
+         context_window=serializer["context_window"],
+         chunk_overlap=serializer["chunk_overlap"],
+     )
+
+     summarizer = EnhancedDocumentSummarizer(
+         openai_api_key=os.environ.get("OPENAI_API_KEY"),
+         claude_api_key=os.environ.get("CLAUDE_API_KEY"),
+         config=config,
+         embedding_model=serializer["hf_embedding"],
+         chunk_overlap=serializer["chunk_overlap"],
+         chunk_size=serializer["chunk_size"],
+         num_k_rerank=serializer["num_k_rerank"],
+         model_cohere_rerank=serializer["model_cohere_rerank"],
+         claude_context_model=serializer["claude_context_model"],
+         prompt_relatorio=serializer["prompt_relatorio"],
+         gpt_model=serializer["model"],
+         gpt_temperature=serializer["gpt_temperature"],
+         id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
+         prompt_modelo=serializer["prompt_modelo"],
+     )
+
+     # Step 1: Define the components
+     def load_and_split_documents(pdf_list, summarizer):
+         """Loads and splits PDF documents into chunks."""
+         all_chunks = []
+         for pdf_path in pdf_list:
+             chunks = summarizer.load_and_split_document(pdf_path)
+             all_chunks.extend(chunks)
+         return {"chunks": all_chunks}
+
+     def get_full_text_from_pdfs(pdf_list):
+         """Gets the full text from PDFs for contextualization."""
+         full_text = []
+         for pdf_path in pdf_list:
+             loader = PyPDFLoader(pdf_path)
+             pages = loader.load()
+             text = " ".join([page.page_content for page in pages])
+             full_text.append(text)
+         return {"full_text": " ".join(full_text)}
+
+     def contextualize_all_chunks(full_text, chunks, contextual_retriever):
+         """Adds context to chunks using Claude."""
+         contextualized_chunks = contextual_retriever.contextualize_all_chunks(
+             full_text, chunks
+         )
+         return {"contextualized_chunks": contextualized_chunks}
+
+     def create_vector_store(contextualized_chunks, summarizer):
+         """Creates an enhanced vector store and BM25 index."""
+         vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
+             contextualized_chunks
+         )
+         return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}
+
+     def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
+         """Generates an enhanced summary using the vector store and BM25 index."""
+         structured_summaries = summarizer.generate_enhanced_summary(
+             vector_store, bm25, chunk_ids, query
+         )
+         return {"structured_summaries": structured_summaries}
+
+     # Step 3: Define Sequential Chain
+     chain = SequentialChain(
+         chains=[
+             lambda inputs: load_and_split_documents(inputs["pdf_list"], summarizer),
+             lambda inputs: get_full_text_from_pdfs(inputs["pdf_list"]),
+             lambda inputs: contextualize_all_chunks(
+                 inputs["full_text"], inputs["chunks"], summarizer.contextual_retriever
+             ),
+             lambda inputs: create_vector_store(
+                 inputs["contextualized_chunks"], summarizer
+             ),
+             lambda inputs: generate_summary(
+                 inputs["vector_store"],
+                 inputs["bm25"],
+                 inputs["chunk_ids"],
+                 inputs["user_message"],
+                 summarizer,
+             ),
+         ],
+         input_variables=["pdf_list", "user_message"],
+         output_variables=["structured_summaries"],
+     )
+
+     from ragas.langchain.evalchain import RagasEvaluatorChain
+     from ragas.metrics import (
+         LLMContextRecall,
+         Faithfulness,
+         FactualCorrectness,
+         SemanticSimilarity,
+     )
+     from ragas import evaluate
+     from ragas.llms import LangchainLLMWrapper
+
+     # from ragas.embeddings import LangchainEmbeddingsWrapper
+     # evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
+     evaluator_llm = LangchainLLMWrapper(chain)
+     # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
+     from datasets import load_dataset
+
+     dataset = load_dataset(
+         "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
+     )
+
+     from ragas import EvaluationDataset
+
+     eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])
+
+     metrics = [
+         LLMContextRecall(llm=evaluator_llm),
+         FactualCorrectness(llm=evaluator_llm),
+         Faithfulness(llm=evaluator_llm),
+         # SemanticSimilarity(embeddings=evaluator_embeddings)
+     ]
+     results = evaluate(dataset=eval_dataset, metrics=metrics)
+     print("results: ", results)
+
+     # Step 4: Run the Chain
+     inputs = {
+         "pdf_list": listaPDFs,
+         "user_message": serializer["user_message"],
+     }
+     # result = chain.run(inputs)
+     return Response({"msg": results})
+
+     # Step 5: Format the Output
+     # return {
+     #     "resultado": result["structured_summaries"],
+     #     "parametros-utilizados": {
+     #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
+     #         "embedding_weight": serializer["embedding_weight"],
+     #         "bm25_weight": serializer["bm25_weight"],
+     #         "context_window": serializer["context_window"],
+     #         "chunk_overlap": serializer["chunk_overlap"],
+     #         "num_k_rerank": serializer["num_k_rerank"],
+     #         "model_cohere_rerank": serializer["model_cohere_rerank"],
+     #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
+     #         "claude_context_model": serializer["claude_context_model"],
+     #         "gpt_temperature": serializer["gpt_temperature"],
+     #         "user_message": serializer["user_message"],
+     #         "model": serializer["model"],
+     #         "hf_embedding": serializer["hf_embedding"],
+     #         "chunk_size": serializer["chunk_size"],
+     #         "chunk_overlap": serializer["chunk_overlap"],
+     #         "prompt_relatorio": serializer["prompt_relatorio"],
+     #         "prompt_modelo": serializer["prompt_modelo"],
+     #     },
+     # }
_utils/resumo_completo_cursor.py CHANGED
@@ -1,26 +1,13 @@
  import os
- from typing import List, Dict, Tuple, Optional
- from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.document_loaders import PyPDFLoader
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_community.vectorstores import Chroma
- from langchain_community.chat_models import ChatOpenAI
- from langchain.chains import create_extraction_chain
- from langchain.prompts import PromptTemplate
- from dataclasses import dataclass
- import uuid
  import json
- from anthropic import Anthropic
- import numpy as np
- from rank_bm25 import BM25Okapi
- import logging
- from cohere import Client
- import requests
- from setup.environment import api_url
- from rest_framework.response import Response
- from langchain.schema import Document

- listaContador = []

  def reciprocal_rank_fusion(result_lists, weights=None):
      """Combine multiple ranked lists using reciprocal rank fusion"""
@@ -47,603 +34,6 @@ os.environ.get("LANGCHAIN_API_KEY")
  os.environ["LANGCHAIN_PROJECT"] = "VELLA"


- @dataclass
- class DocumentChunk:
-     content: str
-     page_number: int
-     chunk_id: str
-     start_char: int
-     end_char: int
-
-
- @dataclass
- class RetrievalConfig:
-     num_chunks: int = 5
-     embedding_weight: float = 0.5
-     bm25_weight: float = 0.5
-     context_window: int = 3
-     chunk_overlap: int = 200
-     chunk_size: int = 1000
-
-
- @dataclass
- class ContextualizedChunk(DocumentChunk):
-     context: str = ""
-     embedding: Optional[np.ndarray] = None
-     bm25_score: Optional[float] = None
-
-
- class DocumentSummarizer:
-
-     def __init__(
-         self,
-         openai_api_key: str,
-         cohere_api_key: str,
-         embedding_model,
-         chunk_size,
-         chunk_overlap,
-         num_k_rerank,
-         model_cohere_rerank,
-     ):
-         self.openai_api_key = openai_api_key
-         self.cohere_client = Client(cohere_api_key)
-         self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-         self.text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=chunk_size, chunk_overlap=chunk_overlap
-         )
-         self.chunk_metadata = {}  # Store chunk metadata for tracing
-         self.num_k_rerank = num_k_rerank
-         self.model_cohere_rerank = model_cohere_rerank
-
-     def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
-         """Load PDF and split into chunks with metadata"""
-         loader = PyPDFLoader(pdf_path)
-         pages = (
-             loader.load()
-         )  # Produces a list of Document objects, each item corresponding to ONE full page of the PDF.
-         chunks = []
-         char_count = 0
-
-         for page in pages:
-             text = page.page_content
-             page_chunks = self.text_splitter.split_text(
-                 text
-             )  # Splits the full-page Document into a list where each item is a chunk, i.e. a piece smaller than a page.
-
-             for chunk in page_chunks:
-                 chunk_id = str(uuid.uuid4())
-                 start_char = text.find(
-                     chunk
-                 )  # Returns the position of the chunk within the full page
-                 end_char = start_char + len(chunk)
-
-                 doc_chunk = DocumentChunk(  # Builds the chunk object with extra information such as its position and id
-                     content=chunk,
-                     page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                     chunk_id=chunk_id,
-                     start_char=char_count + start_char,
-                     end_char=char_count + end_char,
-                 )
-                 chunks.append(doc_chunk)
-
-                 # Store metadata for later retrieval
-                 self.chunk_metadata[chunk_id] = {
-                     "page": doc_chunk.page_number,
-                     "start_char": doc_chunk.start_char,
-                     "end_char": doc_chunk.end_char,
-                 }
-
-             char_count += len(text)
-
-         return chunks
-
-     def load_and_split_text(self, text: str) -> List[DocumentChunk]:
-         """Load text and split into chunks with metadata - created this function only for ragas"""
-         page = Document(page_content=text, metadata={"page": 1})
-         chunks = []
-         char_count = 0
-
-         text = page.page_content
-         page_chunks = self.text_splitter.split_text(
-             text
-         )  # Splits the full-page Document into a list where each item is a chunk, i.e. a piece smaller than a page.
-         print("\n\n\n")
-         print("page_chunks: ", page_chunks)
-
-         for chunk in page_chunks:
-             chunk_id = str(uuid.uuid4())
-             start_char = text.find(
-                 chunk
-             )  # Returns the position of the chunk within the full page
-             end_char = start_char + len(chunk)
-
-             doc_chunk = DocumentChunk(  # Builds the chunk object with extra information such as its position and id
-                 content=chunk,
-                 page_number=page.metadata.get("page") + 1,  # 1-based page numbering
-                 chunk_id=chunk_id,
-                 start_char=char_count + start_char,
-                 end_char=char_count + end_char,
-             )
-             chunks.append(doc_chunk)
-
-             # Store metadata for later retrieval
-             self.chunk_metadata[chunk_id] = {
-                 "page": doc_chunk.page_number,
-                 "start_char": doc_chunk.start_char,
-                 "end_char": doc_chunk.end_char,
-             }
-
-             char_count += len(text)
-
-         return chunks
-
-     def create_vector_store(
-         self, chunks: List[DocumentChunk]
-     ) -> Chroma:  # This function is never used
-         """Create vector store with metadata"""
-         texts = [chunk.content for chunk in chunks]
-         metadatas = [
-             {
-                 "chunk_id": chunk.chunk_id,
-                 "page": chunk.page_number,
-                 "start_char": chunk.start_char,
-                 "end_char": chunk.end_char,
-             }
-             for chunk in chunks
-         ]
-
-         vector_store = Chroma.from_texts(
-             texts=texts, metadatas=metadatas, embedding=self.embeddings
-         )
-         return vector_store
-
-     def rerank_chunks(  # This function is never used
-         self, chunks: List[Dict], query: str, k: int = 5
-     ) -> List[Dict]:
-         """
-         Rerank chunks using Cohere's reranking model.
-
-         Args:
-             chunks: List of dictionaries containing chunks and their metadata
-             query: Original search query
-             k: Number of top chunks to return
-
-         Returns:
-             List of reranked chunks with updated relevance scores
-         """
-         try:
-             # Prepare documents for reranking
-             documents = [chunk["content"] for chunk in chunks]
-
-             # Get reranking scores from Cohere
-             results = self.cohere_client.rerank(
-                 query=query,
-                 documents=documents,
-                 top_n=k,
-                 model=self.model_cohere_rerank,
-             )
-
-             # Create reranked results with original metadata
-             reranked_chunks = []
-             for hit in results:
-                 original_chunk = chunks[hit.index]
-                 reranked_chunks.append(
-                     {**original_chunk, "relevance_score": hit.relevance_score}
-                 )
-
-             return reranked_chunks
-
-         except Exception as e:
-             logging.error(f"Reranking failed: {str(e)}")
-             return chunks[:k]  # Fallback to original ordering
-
-     def generate_summary_with_sources(  # This function is never used
-         self,
-         vector_store: Chroma,
-         query: str = "Summarize the main points of this document",
-     ) -> List[Dict]:
-         """Generate summary with source citations using reranking"""
-         # Retrieve more initial chunks for reranking
-         relevant_docs = vector_store.similarity_search_with_score(query, k=20)
-
-         # Prepare chunks for reranking
-         chunks = []
-         for doc, score in relevant_docs:
-             chunks.append(
-                 {
-                     "content": doc.page_content,
-                     "page": doc.metadata["page"],
-                     "chunk_id": doc.metadata["chunk_id"],
-                     "relevance_score": score,
-                 }
-             )
-
-         # Rerank chunks
-         reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)
-
-         # Prepare context and sources from reranked chunks
-         contexts = []
-         sources = []
-
-         for chunk in reranked_chunks:
-             contexts.append(chunk["content"])
-             sources.append(
-                 {
-                     "content": chunk["content"],
-                     "page": chunk["page"],
-                     "chunk_id": chunk["chunk_id"],
-                     "relevance_score": chunk["relevance_score"],
-                 }
-             )
-
-         prompt_template = """
-         Based on the following context, provide multiple key points from the document.
-         For each point, create a new paragraph.
-         Each paragraph should be a complete, self-contained insight.
-
-         Context: {context}
-
-         Key points:
-         """
-
-         prompt = PromptTemplate(template=prompt_template, input_variables=["context"])
-
-         llm = ChatOpenAI(
-             temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
-         )
-
-         response = llm.predict(prompt.format(context="\n\n".join(contexts)))
-
-         # Split the response into paragraphs
-         summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
-
-         # Create structured output
-         structured_output = []
-         for idx, summary in enumerate(summaries):
-             # Associate each summary with the most relevant source
-             structured_output.append(
-                 {
-                     "content": summary,
-                     "source": {
-                         "page": sources[min(idx, len(sources) - 1)]["page"],
-                         "text": sources[min(idx, len(sources) - 1)]["content"][:200]
-                         + "...",
-                         "relevance_score": sources[min(idx, len(sources) - 1)][
-                             "relevance_score"
-                         ],
-                     },
-                 }
-             )
-
-         return structured_output
-
-     def get_source_context(
-         self, chunk_id: str, window: int = 100
-     ) -> Dict:  # This function is never used
-         """Get extended context around a specific chunk"""
-         metadata = self.chunk_metadata.get(chunk_id)
-         if not metadata:
-             return None
-
-         return {
-             "page": metadata["page"],
-             "start_char": metadata["start_char"],
-             "end_char": metadata["end_char"],
-         }
-
-
- class ContextualRetriever:
-
-     def __init__(
-         self, config: RetrievalConfig, claude_api_key: str, claude_context_model
-     ):
-         self.config = config  # self.config is currently not used for anything inside this class. Review whether it should be.
-         self.claude_client = Anthropic(api_key=claude_api_key)
-         self.logger = logging.getLogger(__name__)
-         self.bm25 = None
-         self.claude_context_model = claude_context_model
-
-     def generate_context(self, full_text: str, chunk: DocumentChunk) -> str:
-         """Generate contextual description using Claude"""
-         try:
-             # prompt = f"""<document>
-             # {full_text}
-             # </document>
-             # Here is the chunk we want to situate within the whole document
-             # <chunk>
-             # {chunk.content}
-             # </chunk>
-             # Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
-
-             prompt = f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
- 1. **Chain of Thought (internal)**:
- - Identify the document ID, which is the value between "NUM." and "- Pág".
- - Identify the document name from the header.
- 2. **Reflection (internal)**:
- - Confirm the document ID and name are correctly identified.
- - Ensure the final context is concise and helpful.
- 3. **Final Response**:
- - Provide a short context situating the *chunk* within the document, including the document ID and document name.
- - Do not include any reasoning or reflection in your response.
- **Example Usage:**
- ```
- <document> {full_text} </document>
- <chunk> {chunk.content} </chunk>
- Please return only the succinct context (without displaying your internal reasoning), including the document ID and the document name.
- ```
- """
-
-             response = self.claude_client.messages.create(
-                 model=self.claude_context_model,
-                 max_tokens=100,
-                 messages=[{"role": "user", "content": prompt}],
-             )
-             return response.content[
-                 0
-             ].text  # response.content is a list of content blocks; the first item holds the model's reply text
-         except Exception as e:
-             self.logger.error(
-                 f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-             )
-             return ""
-
-     def contextualize_chunks(
-         self, full_text: List[Document], chunks: List[DocumentChunk]
-     ) -> List[
-         ContextualizedChunk
-     ]:  # Takes each chunk and just adds a context property to it; the context is the reply of the function above, which asks a Claude model to describe the chunk's context
-         """Add context to all chunks"""
-
-         smaller_context = ""
-         contextualized_chunks = []
-         print("\n\n")
-         print("len(chunks): ", len(chunks))
-         for chunk in chunks:
-             contador_pagina = -1
-             while contador_pagina <= 1:
-                 local_page = full_text[chunk.page_number + contador_pagina]
-                 if local_page:
-                     smaller_context += local_page.page_content
-                 contador_pagina += 1
-             print("chunk.page_number: ", chunk.page_number)
-             context = self.generate_context(smaller_context, chunk)
-             contextualized_chunk = ContextualizedChunk(
-                 content=chunk.content,
-                 page_number=chunk.page_number,
-                 chunk_id=chunk.chunk_id,
-                 start_char=chunk.start_char,
-                 end_char=chunk.end_char,
-                 context=context,
-             )
-             contextualized_chunks.append(contextualized_chunk)
-         return contextualized_chunks
-
-
- class EnhancedDocumentSummarizer(DocumentSummarizer):
-
-     def __init__(
-         self,
-         openai_api_key: str,
-         claude_api_key: str,
-         config: RetrievalConfig,
-         embedding_model,
-         chunk_size,
-         chunk_overlap,
-         num_k_rerank,
-         model_cohere_rerank,
-         claude_context_model,
-         prompt_relatorio,
-         gpt_model,
-         gpt_temperature,
-         id_modelo_do_usuario,
-         prompt_modelo,
-     ):
-         super().__init__(
-             openai_api_key,
-             os.environ.get("COHERE_API_KEY"),
-             embedding_model,
-             chunk_size,
-             chunk_overlap,
-             num_k_rerank,
-             model_cohere_rerank,
-         )
-         self.config = config
-         self.contextual_retriever = ContextualRetriever(
-             config, claude_api_key, claude_context_model
-         )
-         self.logger = logging.getLogger(__name__)
-         self.prompt_relatorio = prompt_relatorio
-         self.gpt_model = gpt_model
-         self.gpt_temperature = gpt_temperature
-         self.id_modelo_do_usuario = id_modelo_do_usuario
-         self.prompt_modelo = prompt_modelo
-
-     def create_enhanced_vector_store(
-         self, chunks: List[ContextualizedChunk]
-     ) -> Tuple[Chroma, BM25Okapi, List[str]]:
-         """Create vector store and BM25 index with contextualized chunks"""
-         try:
-             # Prepare texts with context
-             texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
-
-             # Create vector store
-             metadatas = [
-                 {
-                     "chunk_id": chunk.chunk_id,
-                     "page": chunk.page_number,
-                     "start_char": chunk.start_char,
-                     "end_char": chunk.end_char,
-                     "context": chunk.context,
-                 }
-                 for chunk in chunks
-             ]
-
-             vector_store = Chroma.from_texts(
-                 texts=texts, metadatas=metadatas, embedding=self.embeddings
-             )
-
-             # Create BM25 index
-             tokenized_texts = [text.split() for text in texts]
-             bm25 = BM25Okapi(tokenized_texts)
-
-             # Get chunk IDs in order
-             chunk_ids = [chunk.chunk_id for chunk in chunks]
-
-             return vector_store, bm25, chunk_ids
-
-         except Exception as e:
-             self.logger.error(f"Error creating enhanced vector store: {str(e)}")
-             raise
-
-     def retrieve_with_rank_fusion(
-         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
-     ) -> List[Dict]:
-         """Combine embedding and BM25 retrieval results"""
-         try:
-             # Get embedding results
-             embedding_results = vector_store.similarity_search_with_score(
-                 query, k=self.config.num_chunks
-             )
-
-             # Convert embedding results to list of (chunk_id, score)
-             embedding_list = [
-                 (doc.metadata["chunk_id"], 1 / (1 + score))
-                 for doc, score in embedding_results
-             ]
-
-             # Get BM25 results
-             tokenized_query = query.split()
-             bm25_scores = bm25.get_scores(tokenized_query)
-
-             # Convert BM25 scores to list of (chunk_id, score)
-             bm25_list = [
-                 (chunk_ids[i], float(score)) for i, score in enumerate(bm25_scores)
-             ]
-
-             # Sort bm25_list by score in descending order and limit to top N results
524
- bm25_list = sorted(bm25_list, key=lambda x: x[1], reverse=True)[
525
- : self.config.num_chunks
526
- ]
527
-
528
- # Normalize BM25 scores
529
- max_bm25 = max([score for _, score in bm25_list]) if bm25_list else 1
530
- bm25_list = [(doc_id, score / max_bm25) for doc_id, score in bm25_list]
531
-
532
- # Pass the lists to rank fusion
533
- result_lists = [embedding_list, bm25_list]
534
- weights = [self.config.embedding_weight, self.config.bm25_weight]
535
-
536
- combined_results = reciprocal_rank_fusion(result_lists, weights=weights)
537
-
538
- return combined_results
539
-
540
- except Exception as e:
541
- self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
542
- raise
543
-
544
- def generate_enhanced_summary(
545
- self,
546
- vector_store: Chroma,
547
- bm25: BM25Okapi,
548
- chunk_ids: List[str],
549
- query: str = "Summarize the main points of this document",
550
- ) -> List[Dict]:
551
- """Generate enhanced summary using both vector and BM25 retrieval"""
552
- try:
553
- # Get combined results using rank fusion
554
- ranked_results = self.retrieve_with_rank_fusion(
555
- vector_store, bm25, chunk_ids, query
556
- )
557
-
558
- # Prepare context and track sources
559
- contexts = []
560
- sources = []
561
-
562
- # Get full documents for top results
563
- for chunk_id, score in ranked_results[: self.config.num_chunks]:
564
- results = vector_store.get(
565
- where={"chunk_id": chunk_id}, include=["documents", "metadatas"]
566
- )
567
-
568
- if results["documents"]:
569
- context = results["documents"][0]
570
- metadata = results["metadatas"][0]
571
-
572
- contexts.append(context)
573
- sources.append(
574
- {
575
- "content": context,
576
- "page": metadata["page"],
577
- "chunk_id": chunk_id,
578
- "relevance_score": score,
579
- "context": metadata.get("context", ""),
580
- }
581
- )
582
-
583
- url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
584
- resposta = requests.get(url_request)
585
-
586
- if resposta.status_code != 200:
587
- return Response(
588
- {
589
- "error": "Ocorreu um problema. Pode ser que o modelo não tenha sido encontrado. Tente novamente e/ou entre em contato com a equipe técnica"
590
- }
591
- )
592
-
593
- modelo_buscado = resposta.json()["modelo"]
594
-
595
- llm = ChatOpenAI(
596
- temperature=self.gpt_temperature,
597
- model_name=self.gpt_model,
598
- api_key=self.openai_api_key,
599
- )
600
-
601
- prompt_gerar_relatorio = PromptTemplate(
602
- template=self.prompt_relatorio, input_variables=["context"]
603
- )
604
-
605
- relatorio_gerado = llm.predict(
606
- prompt_gerar_relatorio.format(context="\n\n".join(contexts))
607
- )
608
-
609
- prompt_gerar_modelo = PromptTemplate(
610
- template=self.prompt_modelo,
611
- input_variables=["context", "modelo_usuario"],
612
- )
613
-
614
- modelo_gerado = llm.predict(
615
- prompt_gerar_modelo.format(
616
- context=relatorio_gerado, modelo_usuario=modelo_buscado
617
- )
618
- )
619
-
620
- # Split the response into paragraphs
621
- summaries = [p.strip() for p in modelo_gerado.split("\n\n") if p.strip()]
622
-
623
- # Create structured output
624
- structured_output = []
625
- for idx, summary in enumerate(summaries):
626
- source_idx = min(idx, len(sources) - 1)
627
- structured_output.append(
628
- {
629
- "content": summary,
630
- "source": {
631
- "page": sources[source_idx]["page"],
632
- "text": sources[source_idx]["content"][:200] + "...",
633
- "context": sources[source_idx]["context"],
634
- "relevance_score": sources[source_idx]["relevance_score"],
635
- "chunk_id": sources[source_idx]["chunk_id"],
636
- },
637
- }
638
- )
639
-
640
- return structured_output
641
-
642
- except Exception as e:
643
- self.logger.error(f"Error generating enhanced summary: {str(e)}")
644
- raise
645
-
646
-
647
  async def get_llm_summary_answer_by_cursor_complete(
648
  serializer, listaPDFs=None, contexto=None
649
  ):
@@ -675,6 +65,7 @@ async def get_llm_summary_answer_by_cursor_complete(
          gpt_temperature=serializer["gpt_temperature"],
          id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
          prompt_modelo=serializer["prompt_modelo"],
      )

      full_text = ""
@@ -699,18 +90,61 @@ async def get_llm_summary_answer_by_cursor_complete(
      full_text = " ".join([page.page_content for page in pages])

      # Contextualize chunks
-     contextualized_chunks = await summarizer.contextual_retriever.contextualize_chunks(
-         pages, allPdfsChunks
-     )

      # Create enhanced vector store and BM25 index
      vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
-         contextualized_chunks
      )

      # Generate enhanced summary
      structured_summaries = summarizer.generate_enhanced_summary(
-         vector_store, bm25, chunk_ids, serializer["user_message"]
      )
@@ -719,7 +153,7 @@ async def get_llm_summary_answer_by_cursor_complete(
      print(json_output)
      texto_completo = ""
      for x in structured_summaries:
-         texto_completo = texto_completo + x["content"]
      return {
          "resultado": structured_summaries,
          "texto_completo": texto_completo,
@@ -745,169 +179,3 @@ async def get_llm_summary_answer_by_cursor_complete(
              "prompt_modelo": serializer["prompt_modelo"],
          },
      }
-
-
- from ragas import evaluate
-
- from langchain.chains import SequentialChain
- from langchain.prompts import PromptTemplate
-
- # from langchain.schema import ChainResult
- from langchain.memory import SimpleMemory
-
-
- def test_ragas(serializer, listaPDFs):
-
-     # Step 2: Setup RetrievalConfig and EnhancedDocumentSummarizer
-     config = RetrievalConfig(
-         num_chunks=serializer["num_chunks_retrieval"],
-         embedding_weight=serializer["embedding_weight"],
-         bm25_weight=serializer["bm25_weight"],
-         context_window=serializer["context_window"],
-         chunk_overlap=serializer["chunk_overlap"],
-     )
-
-     summarizer = EnhancedDocumentSummarizer(
-         openai_api_key=os.environ.get("OPENAI_API_KEY"),
-         claude_api_key=os.environ.get("CLAUDE_API_KEY"),
-         config=config,
-         embedding_model=serializer["hf_embedding"],
-         chunk_overlap=serializer["chunk_overlap"],
-         chunk_size=serializer["chunk_size"],
-         num_k_rerank=serializer["num_k_rerank"],
-         model_cohere_rerank=serializer["model_cohere_rerank"],
-         claude_context_model=serializer["claude_context_model"],
-         prompt_relatorio=serializer["prompt_relatorio"],
-         gpt_model=serializer["model"],
-         gpt_temperature=serializer["gpt_temperature"],
-         id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
-         prompt_modelo=serializer["prompt_modelo"],
-     )
-
-     # Step 1: Define the components
-     def load_and_split_documents(pdf_list, summarizer):
-         """Loads and splits PDF documents into chunks."""
-         all_chunks = []
-         for pdf_path in pdf_list:
-             chunks = summarizer.load_and_split_document(pdf_path)
-             all_chunks.extend(chunks)
-         return {"chunks": all_chunks}
-
-     def get_full_text_from_pdfs(pdf_list):
-         """Gets the full text from PDFs for contextualization."""
-         full_text = []
-         for pdf_path in pdf_list:
-             loader = PyPDFLoader(pdf_path)
-             pages = loader.load()
-             text = " ".join([page.page_content for page in pages])
-             full_text.append(text)
-         return {"full_text": " ".join(full_text)}
-
-     def contextualize_chunks(full_text, chunks, contextual_retriever):
-         """Adds context to chunks using Claude."""
-         contextualized_chunks = contextual_retriever.contextualize_chunks(
-             full_text, chunks
-         )
-         return {"contextualized_chunks": contextualized_chunks}
-
-     def create_vector_store(contextualized_chunks, summarizer):
-         """Creates an enhanced vector store and BM25 index."""
-         vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
-             contextualized_chunks
-         )
-         return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}
-
-     def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
-         """Generates an enhanced summary using the vector store and BM25 index."""
-         structured_summaries = summarizer.generate_enhanced_summary(
-             vector_store, bm25, chunk_ids, query
-         )
-         return {"structured_summaries": structured_summaries}
-
-     # Step 3: Define Sequential Chain
-     chain = SequentialChain(
-         chains=[
-             lambda inputs: load_and_split_documents(inputs["pdf_list"], summarizer),
-             lambda inputs: get_full_text_from_pdfs(inputs["pdf_list"]),
-             lambda inputs: contextualize_chunks(
-                 inputs["full_text"], inputs["chunks"], summarizer.contextual_retriever
-             ),
-             lambda inputs: create_vector_store(
-                 inputs["contextualized_chunks"], summarizer
-             ),
-             lambda inputs: generate_summary(
-                 inputs["vector_store"],
-                 inputs["bm25"],
-                 inputs["chunk_ids"],
-                 inputs["user_message"],
-                 summarizer,
-             ),
-         ],
-         input_variables=["pdf_list", "user_message"],
-         output_variables=["structured_summaries"],
-     )
-
-     from ragas.langchain.evalchain import RagasEvaluatorChain
-     from ragas.metrics import (
-         LLMContextRecall,
-         Faithfulness,
-         FactualCorrectness,
-         SemanticSimilarity,
-     )
-     from ragas import evaluate
-     from ragas.llms import LangchainLLMWrapper
-
-     # from ragas.embeddings import LangchainEmbeddingsWrapper
-     # evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
-     evaluator_llm = LangchainLLMWrapper(chain)
-     # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
-     from datasets import load_dataset
-
-     dataset = load_dataset(
-         "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
-     )
-
-     from ragas import EvaluationDataset
-
-     eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])
-
-     metrics = [
-         LLMContextRecall(llm=evaluator_llm),
-         FactualCorrectness(llm=evaluator_llm),
-         Faithfulness(llm=evaluator_llm),
-         # SemanticSimilarity(embeddings=evaluator_embeddings)
-     ]
-     results = evaluate(dataset=eval_dataset, metrics=metrics)
-     print("results: ", results)
-
-     # Step 4: Run the Chain
-     inputs = {
-         "pdf_list": listaPDFs,
-         "user_message": serializer["user_message"],
-     }
-     # result = chain.run(inputs)
-     return Response({"msg": results})
-
-     # Step 5: Format the Output
-     # return {
-     #     "resultado": result["structured_summaries"],
-     #     "parametros-utilizados": {
-     #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
-     #         "embedding_weight": serializer["embedding_weight"],
-     #         "bm25_weight": serializer["bm25_weight"],
-     #         "context_window": serializer["context_window"],
-     #         "chunk_overlap": serializer["chunk_overlap"],
-     #         "num_k_rerank": serializer["num_k_rerank"],
-     #         "model_cohere_rerank": serializer["model_cohere_rerank"],
-     #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
-     #         "claude_context_model": serializer["claude_context_model"],
-     #         "gpt_temperature": serializer["gpt_temperature"],
-     #         "user_message": serializer["user_message"],
-     #         "model": serializer["model"],
-     #         "hf_embedding": serializer["hf_embedding"],
-     #         "chunk_size": serializer["chunk_size"],
-     #         "chunk_overlap": serializer["chunk_overlap"],
-     #         "prompt_relatorio": serializer["prompt_relatorio"],
-     #         "prompt_modelo": serializer["prompt_modelo"],
-     #     },
-     # }
 
  import os
  from langchain_community.document_loaders import PyPDFLoader
  import json
+ from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
+     EnhancedDocumentSummarizer,
+ )
+ from _utils.models.gerar_relatorio import (
+     RetrievalConfig,
+ )


  def reciprocal_rank_fusion(result_lists, weights=None):
      """Combine multiple ranked lists using reciprocal rank fusion"""
 
  os.environ["LANGCHAIN_PROJECT"] = "VELLA"

  async def get_llm_summary_answer_by_cursor_complete(
      serializer, listaPDFs=None, contexto=None
  ):

          gpt_temperature=serializer["gpt_temperature"],
          id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
          prompt_modelo=serializer["prompt_modelo"],
+         reciprocal_rank_fusion=reciprocal_rank_fusion,
      )

      full_text = ""

      full_text = " ".join([page.page_content for page in pages])

      # Contextualize chunks
+     if serializer["should_have_contextual_chunks"]:
+         contextualized_chunks = (
+             await summarizer.contextual_retriever.contextualize_all_chunks(
+                 pages, allPdfsChunks
+             )
+         )
+         chunks_passados = contextualized_chunks
+         is_contextualized_chunk = True
+     else:
+         chunks_passados = allPdfsChunks
+         is_contextualized_chunk = False

      # Create enhanced vector store and BM25 index
      vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
+         chunks_passados, is_contextualized_chunk
      )
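The reworked create_enhanced_vector_store lives in the new _utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py module, which this commit does not display. A plausible sketch of how the flag might be consumed there (method body assumed, not taken from the repo):

def create_enhanced_vector_store(self, chunks, is_contextualized_chunk):
    # Assumption: plain DocumentChunk objects carry no .context attribute, so
    # the contextual prefix can only be prepended when the flag is True.
    if is_contextualized_chunk:
        texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
    else:
        texts = [chunk.content for chunk in chunks]
    # ...then build the Chroma store and BM25 index from `texts` as before.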
+     prompt_relatorio_sem_context = """ You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+ **Instructions**:
+ 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+ 2. **Reading the Context**: Extract the following information from `context`:
+ - The name of the defendant (réu).
+ - The crime they have been accused of (nome_do_crime).
+ - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+ - The date the accusation was accepted (data_do_recebimento).
+ - The ID of the decision document (id_do_documento).
+ 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+ 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+ ```
+ <formato>
+ Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+ A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+ Não há outras causas interruptivas ou suspensivas da prescrição.
+ </formato>
+ ```
+ 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+ **Reminder**:
+ - Do not include your chain of thought in the final output.
+ - Do not add extra information or commentary beyond the specified format.
+ - The final answer must be in Portuguese.
+ ```
+ <formato>
+ Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+ A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+ Não há outras causas interruptivas ou suspensivas da prescrição.
+ </formato>
+ """

      # Generate enhanced summary
      structured_summaries = summarizer.generate_enhanced_summary(
+         vector_store,
+         bm25,
+         chunk_ids,
+         # serializer["user_message"],
+         prompt_relatorio_sem_context,
      )

      # Output results as JSON

      print(json_output)
      texto_completo = ""
      for x in structured_summaries:
+         texto_completo = texto_completo + x["content"] + "\n"
      return {
          "resultado": structured_summaries,
          "texto_completo": texto_completo,

              "prompt_modelo": serializer["prompt_modelo"],
          },
      }
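A minimal driver sketch for the new toggle, using a plain dict in place of the validated serializer data (values illustrative; the real fields and defaults come from ResumoCursorCompeltoSerializer, added below):

import asyncio

serializer_data = {
    "should_have_contextual_chunks": False,  # skip the Claude contextualization pass
    "user_message": "What are the main points of this document?",
    # ...plus the remaining fields normally filled in by the serializer defaults.
}
# resposta = asyncio.run(
#     get_llm_summary_answer_by_cursor_complete(serializer_data, listaPDFs=["caso.pdf"])
# )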
gerar_relatorio_modelo_usuario/__init__.py ADDED
File without changes

gerar_relatorio_modelo_usuario/admin.py ADDED
@@ -0,0 +1,3 @@
+ from django.contrib import admin
+
+ # Register your models here.

gerar_relatorio_modelo_usuario/apps.py ADDED
@@ -0,0 +1,6 @@
+ from django.apps import AppConfig
+
+
+ class GerarRelatorioModeloUsuarioConfig(AppConfig):
+     default_auto_field = 'django.db.models.BigAutoField'
+     name = 'gerar_relatorio_modelo_usuario'

gerar_relatorio_modelo_usuario/migrations/__init__.py ADDED
File without changes

gerar_relatorio_modelo_usuario/models.py ADDED
@@ -0,0 +1,3 @@
+ from django.db import models
+
+ # Create your models here.
gerar_relatorio_modelo_usuario/serializer.py ADDED
@@ -0,0 +1,33 @@
+ from rest_framework import serializers
+ from resumos.serializer import ResumoCursorSerializer
+ from _utils.gerar_relatorio_modelo_usuario.prompts import (
+     system_prompt_modelo,
+     system_prompt_relatorio,
+ )
+
+ user_message = "What are the main points of this document?"
+
+
+ class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
+     system_prompt = None
+     prompt_relatorio = serializers.CharField(
+         required=False, default=system_prompt_relatorio
+     )
+     prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)
+     user_message = serializers.CharField(required=False, default=user_message)
+     num_chunks_retrieval = serializers.IntegerField(default=5)
+     embedding_weight = serializers.FloatField(default=0.5)
+     bm25_weight = serializers.FloatField(default=0.5)
+     context_window = serializers.IntegerField(default=3)
+     chunk_overlap = serializers.IntegerField(default=200)
+     num_k_rerank = serializers.IntegerField(default=5)
+     model_cohere_rerank = serializers.CharField(
+         required=False, default="rerank-english-v2.0"
+     )
+     more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
+     claude_context_model = serializers.CharField(
+         required=False, default="claude-3-haiku-20240307"
+     )
+     gpt_temperature = serializers.FloatField(default=0)
+     id_modelo_do_usuario = serializers.IntegerField(required=True)
+     should_have_contextual_chunks = serializers.BooleanField(default=False)
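Because should_have_contextual_chunks defaults to False, contextualization is now opt-in per request. A quick validation sketch (payload illustrative; the parent ResumoCursorSerializer's required fields, such as the uploaded files, must also be present for validation to pass):

from gerar_relatorio_modelo_usuario.serializer import ResumoCursorCompeltoSerializer

serializer = ResumoCursorCompeltoSerializer(
    data={
        "id_modelo_do_usuario": 1,  # required field
        "should_have_contextual_chunks": True,  # opt back in to contextualization
    }
)
serializer.is_valid()  # defaults are applied to every omitted optional field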
gerar_relatorio_modelo_usuario/tests.py ADDED
@@ -0,0 +1,3 @@
+ from django.test import TestCase
+
+ # Create your tests here.

gerar_relatorio_modelo_usuario/views.py ADDED
@@ -0,0 +1,60 @@
+ from rest_framework.views import APIView
+ from adrf.views import APIView as AsyncAPIView
+ import tempfile, os
+ from rest_framework.response import Response
+
+ from _utils.resumo_completo_cursor import (
+     get_llm_summary_answer_by_cursor_complete,
+ )
+ from .serializer import (
+     ResumoCursorCompeltoSerializer,
+ )
+ from rest_framework.parsers import MultiPartParser
+ from drf_spectacular.utils import extend_schema
+
+
+ class ResumoSimplesCursorCompletoView(AsyncAPIView):
+     parser_classes = [MultiPartParser]
+
+     @extend_schema(
+         request=ResumoCursorCompeltoSerializer,
+     )
+     async def post(self, request):
+         serializer = ResumoCursorCompeltoSerializer(data=request.data)
+         if serializer.is_valid(raise_exception=True):
+             print("\n\n\n")
+             print("serializer.validated_data: ", serializer.validated_data)
+             print("\n\n\n")
+             listaPDFs = []
+             data = serializer.validated_data
+             print("\nserializer.validated_data: ", serializer.validated_data)
+
+             for file in serializer.validated_data["files"]:
+                 file.seek(0)
+                 # Create a temporary file to save the uploaded PDF
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                     # Write the uploaded file content to the temporary file
+                     for chunk in file.chunks():
+                         temp_file.write(chunk)
+                     # Get the path of the temporary file
+                     temp_file_path = temp_file.name
+                     listaPDFs.append(temp_file_path)
+             print("listaPDFs: ", listaPDFs)
+
+             resposta_llm = await get_llm_summary_answer_by_cursor_complete(
+                 data, listaPDFs
+             )
+
+             final = resposta_llm
+             print("\n\n\n")
+             print("final: ", final)
+
+             for file in listaPDFs:
+                 os.remove(file)
+
+             return Response({"resposta": final})
ragas_api/serializer.py CHANGED
@@ -1,5 +1,5 @@
  from rest_framework import serializers
- from resumos.serializer import ResumoCursorCompeltoSerializer
+ from gerar_relatorio_modelo_usuario.serializer import ResumoCursorCompeltoSerializer


  class RagasSerializer(ResumoCursorCompeltoSerializer):
ragas_api/views.py CHANGED
@@ -2,9 +2,9 @@ from rest_framework.views import APIView
  import tempfile, os
  from rest_framework.response import Response

+ from _utils.ragas import test_ragas
  from _utils.resumo_completo_cursor import (
      get_llm_summary_answer_by_cursor_complete,
-     test_ragas,
  )
  from .serializer import (
      RagasFromTextSerializer,
resumos/serializer.py CHANGED
@@ -27,127 +27,3 @@ class ResumoCursorSerializer(serializers.Serializer):
      hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
      chunk_size = serializers.IntegerField(required=False, default=1000)
      chunk_overlap = serializers.IntegerField(required=False, default=200)
-
-
- # system_prompt_relatorio = """
- # Based on the following context, provide multiple key points from the document.
- # For each point, create a new paragraph.
- # Each paragraph should be a complete, self-contained insight.
- # Include any relevant context provided.
-
- # Context: {context}
-
- # Key points:
- # """
- system_prompt_relatorio = """
- You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
- **Instructions**:
- 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
- 2. **Reading the Context**: Extract the following information from `context`:
- - The name of the defendant (réu).
- - The crime they have been accused of (nome_do_crime).
- - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
- - The date the accusation was accepted (data_do_recebimento).
- - The ID of the decision document (id_do_documento).
- 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
- 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
- ```
- <formato>
- Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
- A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
- Não há outras causas interruptivas ou suspensivas da prescrição.
- </formato>
- ```
- 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
- **Reminder**:
- - Do not include your chain of thought in the final output.
- - Do not add extra information or commentary beyond the specified format.
- - The final answer must be in Portuguese.
- ---
-
- **Contextual Information (provided separately):**
- {context}
- ---
- **Example with a given context**:
- - Input:
- `context` = "Em 10/03/2021, o Ministério Público denunciou João da Silva, imputando-lhe o crime de furto qualificado, previsto no art. 155, §4º, inciso II, do Código Penal. A denúncia foi recebida em 12/03/2021, conforme Decisão nº 20210312-01. Não há menção a qualquer causa interruptiva ou suspensiva da prescrição."
- - Expected final answer:
- ```
- <formato>
- Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
- A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
- Não há outras causas interruptivas ou suspensivas da prescrição.
- </formato>
- """
-
- user_message = "What are the main points of this document?"
-
- # system_prompt_modelo = """
- # Based on the following context, provide multiple key points from the document.
- # For each point, create a new paragraph.
- # Each paragraph should be a complete, self-contained insight.
- # Include any relevant context provided.
-
- # Context: {context}
-
- # Modelo do usuário: {modelo_usuario}
-
- # Key points:
- # """
- system_prompt_modelo = """
- You are a large language model that must produce a single final sentence in **Portuguese**. To do this, you will follow a private chain of thought and then produce a final answer. The final answer must follow the formatting and stylistic conventions shown in the user-provided model `user's template`. The information to be included in the final sentence is derived from the `context` (a report describing a legal case).
- **Contextual Information (provided separately):**
- {context}
- **User Model (provided separately):**
- {modelo_do_usuario}
- **Instructions:**
- 1. **Goal:** Produce one single final sentence in Portuguese that matches the structure, format, and style given by `user's template`.
- 2. **Chain of Thought (private to the assistant and not to be shown in the final answer):**
- - Carefully review the `context` which is a legal report of a case.
- - Identify:
- - The defendant’s name.
- - The crime’s name, its article, and any subsection (inciso).
- - The date of receipt of the complaint (data do recebimento da denúncia).
- - The document ID.
- - Ensure these elements are correctly incorporated into the final sentence.
- - Check compliance with the formatting style indicated by `user's template`.
- - Compose the sentence following the structure from the user model.
- - Use reflection: Before finalizing the answer, reassess if all required information is included, if the format matches the user model, and if the sentence is written correctly in Portuguese.
- 3. **Reflection Technique (private):**
- After composing the sentence, but before presenting it as the final answer, reflect if:
- - All required details from the `context` are accurately included.
- - The sentence format strictly matches the pattern of `user's template`.
- - The sentence is grammatically correct in Portuguese.
- 4. **Final Answer:**
- - After completing the chain of thought and ensuring correctness through reflection, present only the final sentence in Portuguese.
- - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
- """
- class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
-     system_prompt = None
-     prompt_relatorio = serializers.CharField(required=False, default=system_prompt_relatorio)
-     prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)
-     user_message = serializers.CharField(required=False, default=user_message)
-     num_chunks_retrieval = serializers.IntegerField(default=5)
-     embedding_weight = serializers.FloatField(default=0.5)
-     bm25_weight = serializers.FloatField(default=0.5)
-     context_window = serializers.IntegerField(default=3)
-     chunk_overlap = serializers.IntegerField(default=200)
-     num_k_rerank = serializers.IntegerField(default=5)
-     model_cohere_rerank = serializers.CharField(required=False, default="rerank-english-v2.0")
-     more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
-     claude_context_model = serializers.CharField(required=False, default="claude-3-haiku-20240307")
-     gpt_temperature = serializers.FloatField(default=0)
-     id_modelo_do_usuario = serializers.IntegerField(required=True)
-
-
- class RagasSerializer(ResumoCursorCompeltoSerializer):
-     files = serializers.ListField(child=serializers.FileField(), required=True)
-     id_modelo_do_usuario = serializers.IntegerField(required=False)
-     hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-
-
- class RagasFromTextSerializer(ResumoCursorCompeltoSerializer):
-     files = None
-     id_modelo_do_usuario = serializers.IntegerField(required=False, default=9)
-     user_message = serializers.CharField(required=True)
-     context_provided = serializers.CharField(required=False)
resumos/views.py CHANGED
@@ -1,20 +1,11 @@
  from rest_framework.views import APIView
- from adrf.views import APIView as AsyncAPIView
  import tempfile, os
  from rest_framework.response import Response
-
- from _utils.resumo_completo_cursor import (
-     get_llm_summary_answer_by_cursor_complete,
-     test_ragas,
- )
  from _utils.resumo_simples_cursor import get_llm_summary_answer_by_cursor
  from _utils.utils import DEFAULT_SYSTEM_PROMPT
  from .serializer import (
-     RagasFromTextSerializer,
-     ResumoCursorCompeltoSerializer,
      ResumoPDFSerializer,
      ResumoCursorSerializer,
-     RagasSerializer,
  )
  from _utils.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
  from setup.environment import default_model
@@ -151,164 +142,3 @@ class ResumoSimplesCursorView(APIView):
              os.remove(file)

          return Response({"resposta": resposta_llm})
-
-
- class ResumoSimplesCursorCompletoView(AsyncAPIView):
-     parser_classes = [MultiPartParser]
-
-     @extend_schema(
-         request=ResumoCursorCompeltoSerializer,
-     )
-     async def post(self, request):
-         serializer = ResumoCursorCompeltoSerializer(data=request.data)
-         if serializer.is_valid(raise_exception=True):
-             print("\n\n\n")
-             print("serializer.validated_data: ", serializer.validated_data)
-             print("\n\n\n")
-             listaPDFs = []
-             data = serializer.validated_data
-             print("\nserializer.validated_data: ", serializer.validated_data)
-
-             for file in serializer.validated_data["files"]:
-                 file.seek(0)
-                 # Create a temporary file to save the uploaded PDF
-                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-                     # Write the uploaded file content to the temporary file
-                     for chunk in file.chunks():
-                         temp_file.write(chunk)
-                     # Get the path of the temporary file
-                     temp_file_path = temp_file.name
-                     listaPDFs.append(temp_file_path)
-             print("listaPDFs: ", listaPDFs)
-
-             # resposta_llm = await get_llm_summary_answer_by_cursor_complete(
-             #     data, listaPDFs
-             # )
-
-             resposta_llm = await get_llm_summary_answer_by_cursor_complete(
-                 data, listaPDFs
-             )
-
-             final = resposta_llm
-             print("\n\n\n")
-             print("final: ", final)
-
-             for file in listaPDFs:
-                 os.remove(file)
-
-             return Response({"resposta": final})
-
-
- class RagasView(APIView):
-     parser_classes = [MultiPartParser]
-
-     @extend_schema(
-         request=RagasSerializer,
-     )
-     def post(self, request):
-         serializer = RagasSerializer(data=request.data)
-         print("\n\n\n")
-         print("\n\n\n")
-         print("serializer.data: ", serializer)
-         listaPDFs = []
-         if serializer.is_valid(raise_exception=True):
-             for file in serializer.validated_data["files"]:
-                 file.seek(0)
-                 # Create a temporary file to save the uploaded PDF
-                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-                     # Write the uploaded file content to the temporary file
-                     for chunk in file.chunks():
-                         temp_file.write(chunk)
-                     # Get the path of the temporary file
-                     temp_file_path = temp_file.name
-                     listaPDFs.append(temp_file_path)
-
-             result = test_ragas(serializer, listaPDFs)
-
-             for file in listaPDFs:
-                 os.remove(file)
-
-             return Response({"msg": result})
-
-
- class RagasFromTextView(APIView):
-     def post(self, request):
-         serializer = RagasFromTextSerializer(data=request.data)
-         if serializer.is_valid(raise_exception=True):
-             from datasets import Dataset
-             from ragas import evaluate
-             from ragas.metrics import (
-                 faithfulness,
-                 answer_relevancy,
-                 answer_correctness,
-                 context_precision,
-                 context_recall,
-             )
-             import os
-             from datasets import load_dataset
-             import pandas as pd
-
-             os.environ.get("OPENAI_API_KEY")
-
-             df_pandas = pd.read_csv(
-                 "D:/repositorios/projetos-pessoais/projeto-y-backend-hugginf-face-teste-01/vella-backend/_utils/files/ragas_testset.csv"
-             )
-             # print(df_pandas["position"]) # Print a specific column
-             data = {
-                 "user_input": [
-                     "What is the capital of France?",
-                 ],
-                 "response": [],
-                 "retrieved_contexts": [],
-             }
-
-             reference = [
-                 "Paris is the capital of France. It is a major European city known for its culture."
-             ]
-
-             for x in df_pandas["user_input"]:
-                 data["user_input"].append(x)
-
-             for x in df_pandas["reference"]:
-                 reference.append(x)
-
-             print("data: ", reference)
-
-             for i in range(len(reference)):
-                 serializer.validated_data["user_message"] = data["user_input"][i]
-                 resposta_llm = get_llm_summary_answer_by_cursor_complete(
-                     serializer.validated_data, contexto=reference[i]
-                 )
-                 data["response"].append(resposta_llm["texto_completo"])
-                 lista_reference_contexts = []
-                 for x in resposta_llm["resultado"]:
-                     lista_reference_contexts.append(x["source"]["text"])
-                 data["retrieved_contexts"].append(lista_reference_contexts)
-
-             # Convert the data to a Hugging Face Dataset
-             dataset = Dataset.from_dict(data)
-
-             # Define the metrics you want to evaluate
-             metrics = [
-                 faithfulness,
-                 # answer_relevancy,
-                 # answer_correctness,
-                 # context_precision,
-                 # context_recall,
-             ]
-
-             # Evaluate the dataset using the selected metrics
-             results = evaluate(dataset, metrics)
-
-             # results.to_pandas().to_csv("./result.csv")
-             return Response({"resposta": results.to_pandas().to_string()})
setup/settings.py CHANGED
@@ -52,6 +52,7 @@ INSTALLED_APPS = [
      "resumos",
      "modelos_usuarios",
      "ragas_api",
+     "gerar_relatorio_modelo_usuario",
  ]

  MIDDLEWARE = [
setup/urls.py CHANGED
@@ -4,11 +4,11 @@ from rest_framework import routers
  from drf_spectacular.views import SpectacularSwaggerView, SpectacularAPIView


+ from gerar_relatorio_modelo_usuario.views import ResumoSimplesCursorCompletoView
  from pdfs.views import getPDF
  from resumos.views import (
      ResumoView,
      ResumoSimplesCursorView,
-     ResumoSimplesCursorCompletoView,
  )
  from ragas_api.views import RagasFromTextView, RagasView
  from modelos_usuarios.views import (
@@ -31,7 +31,7 @@ urlpatterns = [
      path("resumo", ResumoView.as_view(), name="summary-pdf"),
      path("resumo/cursor", ResumoSimplesCursorView.as_view(), name="summary-cursor-pdf"),
      path(
-         "resumo/cursor-completo",
+         "gerar-relatorio",
          ResumoSimplesCursorCompletoView.as_view(),
          name="summary-cursor-completo-pdf",
      ),
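With the route renamed from resumo/cursor-completo to gerar-relatorio, the endpoint can be exercised with a multipart request; a sketch assuming a local dev server and urlpatterns mounted at the project root (file name and values illustrative):

import requests

with open("processo.pdf", "rb") as pdf:  # hypothetical input file
    resposta = requests.post(
        "http://localhost:8000/gerar-relatorio",
        files={"files": pdf},
        data={"id_modelo_do_usuario": 1, "should_have_contextual_chunks": False},
    )
print(resposta.json())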