# vella-backend/_utils/resumo_simples_cursor.py
import os
import json
import uuid
from dataclasses import dataclass
from typing import Dict, List

from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from setup.environment import default_model

# LangSmith tracing configuration. LANGCHAIN_API_KEY is read from the
# environment by LangSmith itself, so it is not set here.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"

@dataclass
class DocumentChunk:
    """A chunk of a PDF page plus the metadata needed to cite it later."""
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int

class DocumentSummarizer:
    def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"],
            chunk_overlap=chunk_config["overlap"],
        )
        self.chunk_metadata = {}  # Store chunk metadata for source tracing
    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load a PDF and split it into chunks with page/offset metadata."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0
        for page in pages:
            text = page.page_content
            # Split the page content into overlapping chunks
            page_chunks = self.text_splitter.split_text(text)
            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # find() returns -1 if the splitter normalized the text; clamp to 0
                start_char = max(text.find(chunk), 0)
                end_char = start_char + len(chunk)
                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)
                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }
            char_count += len(text)
        return chunks
    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create a Chroma vector store whose entries carry citation metadata."""
        texts = [chunk.content for chunk in chunks]
        metadatas = [{
            "chunk_id": chunk.chunk_id,
            "page": chunk.page_number,
            "start_char": chunk.start_char,
            "end_char": chunk.end_char,
        } for chunk in chunks]
        vector_store = Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings,
        )
        return vector_store
    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate a summary with source citations, returned as structured data."""
        # Retrieve the most relevant chunks along with their metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
        # Prepare the context and track the sources it came from
        contexts = []
        sources = []
        for doc, score in relevant_docs:
            chunk_id = doc.metadata["chunk_id"]
            context = doc.page_content
            contexts.append(context)
            sources.append({
                "content": context,
                "page": doc.metadata["page"],
                "chunk_id": chunk_id,
                "relevance_score": score,
            })
        prompt = PromptTemplate(
            template=self.system_prompt,
            input_variables=["context"],
        )
        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0,
                model_name="gpt-4o-mini",
                api_key=self.openai_api_key,
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            )
        response = llm.invoke(prompt.format(context="\n\n".join(contexts)))
        # ChatOpenAI returns a message object; HuggingFaceEndpoint returns a string
        response_text = response.content if hasattr(response, "content") else response
        # Split the response into paragraphs
        summaries = [p.strip() for p in response_text.split("\n\n") if p.strip()]
        # Create the structured output, pairing each paragraph with a retrieved source
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Clamp the index so extra paragraphs reuse the last retrieved source
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append({
                "content": summary,
                "source": {
                    "page": source["page"],
                    "text": source["content"][:200] + "...",
                    "relevance_score": source["relevance_score"],
                },
            })
        return structured_output
    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
        """Get the stored location metadata for a specific chunk.

        Note: `window` is currently unused; the raw page text is not retained,
        so only the recorded page and character offsets can be returned.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
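
# Minimal end-to-end sketch of using DocumentSummarizer directly. The function
# name, embedding model, chunk sizes, prompt, and PDF path below are all
# illustrative assumptions, not values taken from vella-backend configuration.
def _example_direct_usage():
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        model=default_model,
        embedding="sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
        chunk_config={"size": 1000, "overlap": 200},          # assumed chunk sizes
        system_prompt="Summarize the main points of the document below.\n\n{context}",
    )
    chunks = summarizer.load_and_split_document("example.pdf")  # placeholder path
    vector_store = summarizer.create_vector_store(chunks)
    summaries = summarizer.generate_summary_with_sources(vector_store)
    # Each summary cites a chunk; its page/offset metadata can be recovered:
    print(summaries[0]["content"])
    print(summarizer.get_source_context(chunks[0].chunk_id))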

def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []
    # Initialize the summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )
    # Load and chunk every PDF, accumulating the chunks into a single list
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks += chunks
    vector_store = summarizer.create_vector_store(allPdfsChunks)
    # Generate the structured summary and print it as JSON for inspection
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)
    return structured_summaries
# If you need to send this to the frontend, you can simply return
# structured_summaries. It will be in the format:
# [
# {
# "content": "Summary point 1...",
# "source": {
# "page": 1,
# "text": "Source text...",
# "relevance_score": 0.95
# }
# },
# ...
# ]
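
# A minimal example configuration for running this module directly. These
# values are illustrative assumptions (embedding model, chunk sizes, and
# prompt), not the production settings used by vella-backend. The prompt must
# contain {context}, since generate_summary_with_sources formats it with the
# retrieved chunks.
example_serializer = {
    "model": default_model,
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
    "chunk_size": 1000,      # assumed chunk size in characters
    "chunk_overlap": 200,    # assumed overlap between consecutive chunks
    "system_prompt": "Summarize the main points of the document below.\n\n{context}",
}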
if __name__ == "__main__":
    # "example.pdf" is a placeholder path; replace it with real documents.
    get_llm_summary_answer_by_cursor(example_serializer, ["example.pdf"])