# vella-backend/_utils/resumo_completo_cursor.py
import json
import os

from langchain_community.document_loaders import PyPDFLoader

from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
    EnhancedDocumentSummarizer,
)
from _utils.models.gerar_relatorio import (
    RetrievalConfig,
)


def reciprocal_rank_fusion(result_lists, weights=None):
    """Combine multiple ranked lists of (doc_id, score) pairs into one ranking.

    Note that, despite the name, this fuses the pre-computed scores with
    per-list weights rather than using the classic 1 / (k + rank)
    reciprocal-rank formulation.
    """
    fused_scores = {}
    num_lists = len(result_lists)
    if weights is None:
        weights = [1.0] * num_lists
    for i in range(num_lists):
        for doc_id, score in result_lists[i]:
            if doc_id not in fused_scores:
                fused_scores[doc_id] = 0
            fused_scores[doc_id] += weights[i] * score
    # Sort by fused score in descending order
    sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_results
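
# Example (hypothetical doc IDs and scores): fusing an embedding ranking with a
# BM25 ranking, weighting the embedding list 0.6 and the BM25 list 0.4:
#
#   embedding_hits = [("doc_a", 0.9), ("doc_b", 0.7)]
#   bm25_hits = [("doc_b", 0.8), ("doc_c", 0.5)]
#   reciprocal_rank_fusion([embedding_hits, bm25_hits], weights=[0.6, 0.4])
#   # -> [("doc_b", 0.74), ("doc_a", 0.54), ("doc_c", 0.2)]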
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(
    serializer, listaPDFs=None, contexto=None
):
    """Generate the complete summary answer.

    The "contexto" parameter should only be passed when running the ragas
    test, i.e. when you do not want to pass PDFs.
    """
    allPdfsChunks = []

    # Configuration
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )
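    # Typical values for these retrieval knobs (hypothetical, for illustration):
    #   "num_chunks_retrieval": 20  -> candidates fetched per retriever
    #   "embedding_weight": 0.6, "bm25_weight": 0.4  -> fusion weights
    #   "context_window": 3  -> neighboring chunks pulled in around each hit
    #   "chunk_overlap": 200  -> characters shared between adjacent chunks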

    # Initialize enhanced summarizer
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
        reciprocal_rank_fusion=reciprocal_rank_fusion,
    )
full_text = ""
if contexto:
full_text = contexto
chunks = summarizer.load_and_split_text(full_text)
allPdfsChunks = chunks
else:
# # Load and process document
# pdf_path = "./Im_a_storyteller.pdf"
# chunks = summarizer.load_and_split_document(pdf_path)
# Load and process document
for pdf in listaPDFs:
pdf_path = pdf
chunks = summarizer.load_and_split_document(pdf_path)
allPdfsChunks = allPdfsChunks + chunks
# Get full text for contextualization
loader = PyPDFLoader(pdf_path)
pages = loader.load()
full_text = " ".join([page.page_content for page in pages])

    # Contextualize chunks
    if serializer["should_have_contextual_chunks"]:
        contextualized_chunks = (
            await summarizer.contextual_retriever.contextualize_all_chunks(
                pages, allPdfsChunks
            )
        )
        chunks_passados = contextualized_chunks
        is_contextualized_chunk = True
    else:
        chunks_passados = allPdfsChunks
        is_contextualized_chunk = False

    # Create the enhanced vector store and BM25 index
    vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
        chunks_passados, is_contextualized_chunk
    )
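    # The two indexes are assumed to be queried independently downstream, with
    # their ranked hits merged via reciprocal_rank_fusion above, e.g. (a sketch,
    # not necessarily the summarizer's actual internals):
    #   fused = reciprocal_rank_fusion(
    #       [embedding_hits, bm25_hits],
    #       weights=[config.embedding_weight, config.bm25_weight],
    #   )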

    prompt_resumo_sem_context = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
**Instructions**:
1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
2. **Reading the Context**: Extract the following information from `context`:
- The name of the defendant (réu).
- The crime they have been accused of (nome_do_crime).
- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
- The date the accusation was accepted (data_do_recebimento).
- The ID of the decision document (id_do_documento).
3. **Prescription Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
```
5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
**Reminder**:
- Do not include your chain of thought in the final output.
- Do not add extra information or commentary beyond the specified format.
- The final answer must be in Portuguese.

**Example of a correct final answer**:
```
<formato>
Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
Não há outras causas interruptivas ou suspensivas da prescrição.
</formato>
"""

    # Generate the enhanced summary
    structured_summaries = await summarizer.generate_enhanced_summary(
        vector_store,
        bm25,
        chunk_ids,
        prompt_resumo_sem_context,  # serializer["user_message"] is deliberately not passed here
    )
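    # structured_summaries is expected to be a list of dicts, each carrying at
    # least a "content" field (see the concatenation loop below); anything else
    # is treated as an error payload, e.g.:
    #   [{"content": "Trata-se de Ação Penal ..."}, ...]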
    if not isinstance(structured_summaries, list):
        from rest_framework.response import Response

        return Response({"erro": structured_summaries})

    # Output results as JSON (debug)
    # json_output = json.dumps(structured_summaries, indent=2)
    # print("\nStructured Summaries:")
    # print(json_output)
texto_completo = ""
print("\n\n\n")
print("summarizer.resumo_gerado: ", summarizer.resumo_gerado)
texto_completo += summarizer.resumo_gerado
texto_completo += "\n\n"
print("\n\n\n")
print("structured_summaries: ", structured_summaries)
for x in structured_summaries:
texto_completo = texto_completo + x["content"] + "\n"

    return {
        "resultado": structured_summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": {
            "num_chunks_retrieval": serializer["num_chunks_retrieval"],
            "embedding_weight": serializer["embedding_weight"],
            "bm25_weight": serializer["bm25_weight"],
            "context_window": serializer["context_window"],
            "chunk_overlap": serializer["chunk_overlap"],
            "num_k_rerank": serializer["num_k_rerank"],
            "model_cohere_rerank": serializer["model_cohere_rerank"],
            "more_initial_chunks_for_reranking": serializer[
                "more_initial_chunks_for_reranking"
            ],
            "claude_context_model": serializer["claude_context_model"],
            "gpt_temperature": serializer["gpt_temperature"],
            "user_message": serializer["user_message"],
            "model": serializer["model"],
            "hf_embedding": serializer["hf_embedding"],
            "chunk_size": serializer["chunk_size"],
            "prompt_relatorio": serializer["prompt_relatorio"],
            "prompt_modelo": serializer["prompt_modelo"],
        },
    }
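

# A minimal local smoke test: a sketch only, with hypothetical parameter values
# and file names. It assumes OPENAI_API_KEY, CLAUDE_API_KEY, a Cohere key for
# the reranker, and LANGCHAIN_API_KEY are set in the environment, and that the
# keys below match what the deployed serializer would actually send.
if __name__ == "__main__":
    import asyncio

    serializer_exemplo = {
        "num_chunks_retrieval": 20,
        "embedding_weight": 0.6,
        "bm25_weight": 0.4,
        "context_window": 3,
        "chunk_overlap": 200,
        "chunk_size": 1000,
        "num_k_rerank": 5,
        "more_initial_chunks_for_reranking": 20,
        "model_cohere_rerank": "rerank-multilingual-v3.0",
        "claude_context_model": "claude-3-haiku-20240307",
        "model": "gpt-4o-mini",
        "gpt_temperature": 0.0,
        "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
        "id_modelo_do_usuario": 1,
        "prompt_relatorio": "Resuma o processo.",
        "prompt_modelo": "Siga o modelo do usuário.",
        "user_message": "",
        "should_have_contextual_chunks": False,
    }
    resultado = asyncio.run(
        get_llm_summary_answer_by_cursor_complete(
            serializer_exemplo, listaPDFs=["processo_exemplo.pdf"]
        )
    )
    print(resultado["texto_completo"])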