File size: 4,106 Bytes
1fd7b67
12d3e1a
1286e81
 
 
12d3e1a
 
 
 
 
1286e81
 
 
c625f4c
ca8a144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c625f4c
 
ca8a144
1fd7b67
c625f4c
 
 
1fd7b67
c625f4c
 
1fd7b67
b374298
c625f4c
ca8a144
 
 
 
 
 
c625f4c
ca8a144
 
 
 
 
c625f4c
ca8a144
 
 
 
 
 
 
12d3e1a
ca8a144
3143cff
cb23311
12d3e1a
1286e81
ca8a144
 
b374298
 
12d3e1a
 
 
 
 
baeaaa5
12d3e1a
ca8a144
 
12d3e1a
 
 
 
c625f4c
ca8a144
 
baeaaa5
1286e81
 
 
 
 
12d3e1a
ca8a144
1fd7b67
baeaaa5
 
 
 
 
12d3e1a
 
c625f4c
1286e81
12d3e1a
 
 
ca8a144
c625f4c
 
12d3e1a
c625f4c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
from _utils.gerar_relatorio_modelo_usuario.prompts import prompt_auxiliar_SEM_CONTEXT
from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
    EnhancedDocumentSummarizer,
)
from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    contextualize_chunk_based_on_serializer,
    get_full_text_and_all_PDFs_chunks,
)
from _utils.gerar_relatorio_modelo_usuario.utils import gerar_resposta_compilada
from _utils.models.gerar_relatorio import (
    RetrievalConfig,
)


def reciprocal_rank_fusion(result_lists, weights=None):
    """Merge several scored result lists into a single ranking.

    Despite the name, no reciprocal-rank term is computed here: the fused
    score of a document is the weighted sum of the scores it received in
    each input list.

    Args:
        result_lists: Sequence of lists of ``(doc_id, score)`` pairs.
        weights: Optional per-list multipliers, indexed in the same order
            as ``result_lists``. Every list gets ``1.0`` when omitted.

    Returns:
        A list of ``(doc_id, fused_score)`` tuples ordered by fused score,
        highest first. Ties keep the order in which the documents were
        first seen.
    """
    list_count = len(result_lists)
    if weights is None:
        weights = [1.0] * list_count

    totals = {}
    for position, ranked in enumerate(result_lists):
        for doc_id, score in ranked:
            # Start unseen documents at 0, then add this list's contribution.
            totals[doc_id] = totals.get(doc_id, 0) + weights[position] * score

    # Highest fused score first.
    return sorted(totals.items(), key=lambda entry: entry[1], reverse=True)


# LangChain/LangSmith tracing settings, applied as a side effect of importing
# this module. Each assignment overwrites any value already present in the
# process environment.
# LANGCHAIN_API_KEY is intentionally not assigned here: it is expected to be
# supplied by the deployment environment. (The previous bare
# `os.environ.get("LANGCHAIN_API_KEY")` call discarded its result and had no
# effect, so it was removed.)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
    """Run the document-summarization pipeline and return the generated text.

    The function builds a ``RetrievalConfig`` and an
    ``EnhancedDocumentSummarizer`` from ``serializer``, splits the PDFs into
    chunks, passes them through ``contextualize_chunk_based_on_serializer``,
    indexes the result (vector store + BM25) and asks the summarizer for the
    structured summaries, which are then concatenated into one text.

    Args:
        serializer: Mapping holding the request settings read below
            (retrieval, chunking, rerank and model options, prompts and
            ``should_use_llama_parse``). A missing key raises ``KeyError``.
        listaPDFs: Forwarded as-is to ``get_full_text_and_all_PDFs_chunks``.

    Returns:
        On success, a dict with ``"resultado"`` (the list of structured
        summaries), ``"texto_completo"`` (the summarizer's ``resumo_gerado``
        followed by every summary's ``"content"``) and
        ``"parametros-utilizados"``. When the summarizer returns anything
        other than a list, a DRF ``Response`` wrapping it under ``"erro"``.
    """
    # Retrieval settings taken from the request.
    retrieval_config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    # API keys come from the process environment; everything else from the
    # request.
    doc_summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=retrieval_config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_auxiliar=serializer["prompt_auxiliar"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        prompt_gerar_documento=serializer["prompt_gerar_documento"],
        reciprocal_rank_fusion=reciprocal_rank_fusion,
    )

    # Split every PDF into chunks using the summarizer's splitter.
    all_chunks, pages = await get_full_text_and_all_PDFs_chunks(
        listaPDFs,
        doc_summarizer.splitter,
        serializer["should_use_llama_parse"],
    )

    # The helper returns the chunks to index plus a flag that is handed to
    # the vector-store builder below.
    chunks_to_index, is_contextualized = await contextualize_chunk_based_on_serializer(
        serializer,
        doc_summarizer.contextual_retriever,
        pages,
        all_chunks,
    )

    # Build the vector store and the BM25 index over the same chunks.
    store_builder = doc_summarizer.vector_store
    vector_store, bm25_index, chunk_ids = store_builder.create_enhanced_vector_store(
        chunks_to_index, is_contextualized
    )

    summaries = await doc_summarizer.generate_enhanced_summary(
        vector_store,
        bm25_index,
        chunk_ids,
        prompt_auxiliar_SEM_CONTEXT,
    )

    # Anything other than a list is treated as an error payload.
    if not isinstance(summaries, list):
        from rest_framework.response import Response

        return Response({"erro": summaries})

    # Generated summary first, then one line per structured summary.
    heading = doc_summarizer.resumo_gerado + "\n\n"
    texto_completo = heading + "".join(item["content"] + "\n" for item in summaries)

    print("\n\ntexto_completo: ", texto_completo)

    resposta = {
        "resultado": summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": gerar_resposta_compilada(serializer),
    }
    return resposta