import os

from langchain.chains import SequentialChain, TransformChain
from langchain_community.document_loaders import PyPDFLoader
from rest_framework.response import Response

from _utils.resumo_completo_cursor import EnhancedDocumentSummarizer, RetrievalConfig


def test_ragas(serializer, listaPDFs):
    # Step 1: Set up RetrievalConfig and EnhancedDocumentSummarizer.
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
    )

    # Step 2: Define the pipeline components. Each one takes and returns a
    # dict so it can be wrapped in a TransformChain below.
    def load_and_split_documents(inputs):
        """Loads and splits the PDF documents into chunks."""
        all_chunks = []
        for pdf_path in inputs["pdf_list"]:
            all_chunks.extend(summarizer.load_and_split_document(pdf_path))
        return {"chunks": all_chunks}

    def get_full_text_from_pdfs(inputs):
        """Extracts the full text of every PDF for contextualization."""
        full_text = []
        for pdf_path in inputs["pdf_list"]:
            pages = PyPDFLoader(pdf_path).load()
            full_text.append(" ".join(page.page_content for page in pages))
        return {"full_text": " ".join(full_text)}

    def contextualize_all_chunks(inputs):
        """Adds context to each chunk using Claude."""
        contextualized_chunks = summarizer.contextual_retriever.contextualize_all_chunks(
            inputs["full_text"], inputs["chunks"]
        )
        return {"contextualized_chunks": contextualized_chunks}

    def create_vector_store(inputs):
        """Creates the enhanced vector store and the BM25 index."""
        vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
            inputs["contextualized_chunks"]
        )
        return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}

    def generate_summary(inputs):
        """Generates an enhanced summary using the vector store and BM25 index."""
        structured_summaries = summarizer.generate_enhanced_summary(
            inputs["vector_store"],
            inputs["bm25"],
            inputs["chunk_ids"],
            inputs["user_message"],
        )
        return {"structured_summaries": structured_summaries}

    # Step 3: Assemble the sequential chain. SequentialChain only accepts
    # Chain objects (raw lambdas fail its pydantic validation), so each step
    # is wrapped in a TransformChain.
    chain = SequentialChain(
        chains=[
            TransformChain(
                input_variables=["pdf_list"],
                output_variables=["chunks"],
                transform=load_and_split_documents,
            ),
            TransformChain(
                input_variables=["pdf_list"],
                output_variables=["full_text"],
                transform=get_full_text_from_pdfs,
            ),
            TransformChain(
                input_variables=["full_text", "chunks"],
                output_variables=["contextualized_chunks"],
                transform=contextualize_all_chunks,
            ),
            TransformChain(
                input_variables=["contextualized_chunks"],
                output_variables=["vector_store", "bm25", "chunk_ids"],
                transform=create_vector_store,
            ),
            TransformChain(
                input_variables=["vector_store", "bm25", "chunk_ids", "user_message"],
                output_variables=["structured_summaries"],
                transform=generate_summary,
            ),
        ],
        input_variables=["pdf_list", "user_message"],
        output_variables=["structured_summaries"],
    )
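    # --- Sketch (an assumption, not part of the original flow): RAGAS can also
    # score this pipeline's own answers instead of a public benchmark. Assuming
    # ragas >= 0.2, one chain run could be turned into an evaluation sample
    # roughly like this (`retrieved_contexts` and `reference` are hypothetical
    # placeholders you would have to supply):
    #
    #     from ragas import SingleTurnSample, EvaluationDataset
    #
    #     run = chain({"pdf_list": listaPDFs, "user_message": serializer["user_message"]})
    #     sample = SingleTurnSample(
    #         user_input=serializer["user_message"],
    #         response=str(run["structured_summaries"]),
    #         retrieved_contexts=["<chunks used for the answer>"],  # hypothetical
    #         reference="<gold answer>",  # hypothetical ground truth
    #     )
    #     own_dataset = EvaluationDataset(samples=[sample])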
    # Evaluate with RAGAS on the public amnesty_qa benchmark. Note that this
    # evaluation does not exercise the PDF chain assembled above.
    from ragas import EvaluationDataset, evaluate
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import (
        LLMContextRecall,
        Faithfulness,
        FactualCorrectness,
        SemanticSimilarity,
    )

    # from ragas.embeddings import LangchainEmbeddingsWrapper

    # LangchainLLMWrapper expects a LangChain chat model, not a chain, so a
    # dedicated judge model is used here (assumes the langchain-openai package
    # is installed); wrapping `chain` itself fails at evaluation time.
    from langchain_openai import ChatOpenAI

    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

    from datasets import load_dataset

    dataset = load_dataset(
        "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
    )
    eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])

    metrics = [
        LLMContextRecall(llm=evaluator_llm),
        FactualCorrectness(llm=evaluator_llm),
        Faithfulness(llm=evaluator_llm),
        # SemanticSimilarity(embeddings=evaluator_embeddings),
    ]

    results = evaluate(dataset=eval_dataset, metrics=metrics)
    print("results: ", results)

    # Step 4: Run the chain (currently disabled).
    inputs = {
        "pdf_list": listaPDFs,
        "user_message": serializer["user_message"],
    }
    # result = chain.run(inputs)

    # The ragas EvaluationResult object is not JSON-serializable, so it is
    # stringified for the DRF response.
    return Response({"msg": str(results)})

    # Step 5: Format the output.
    # return {
    #     "resultado": result["structured_summaries"],
    #     "parametros-utilizados": {
    #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
    #         "embedding_weight": serializer["embedding_weight"],
    #         "bm25_weight": serializer["bm25_weight"],
    #         "context_window": serializer["context_window"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "num_k_rerank": serializer["num_k_rerank"],
    #         "model_cohere_rerank": serializer["model_cohere_rerank"],
    #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
    #         "claude_context_model": serializer["claude_context_model"],
    #         "gpt_temperature": serializer["gpt_temperature"],
    #         "user_message": serializer["user_message"],
    #         "model": serializer["model"],
    #         "hf_embedding": serializer["hf_embedding"],
    #         "chunk_size": serializer["chunk_size"],
    #         "prompt_relatorio": serializer["prompt_relatorio"],
    #         "prompt_modelo": serializer["prompt_modelo"],
    #     },
    # }
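
# --- Usage sketch (an assumption, not part of the original module): one way a
# DRF view might invoke test_ragas. The view class and the "pdf_paths" field
# are hypothetical; only test_ragas itself comes from this file.
from rest_framework.views import APIView


class RagasTestView(APIView):
    """Hypothetical endpoint forwarding the request payload to test_ragas."""

    def post(self, request):
        # Assumes request.data already carries every key test_ragas reads
        # (num_chunks_retrieval, embedding_weight, user_message, ...).
        lista_pdfs = request.data.get("pdf_paths", [])  # hypothetical field
        return test_ragas(request.data, lista_pdfs)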