# vella-backend/_utils/resumo_simples_cursor.py
import os
import json
import uuid
from dataclasses import dataclass
from typing import Dict, List

from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from setup.environment import default_model

# LangSmith tracing configuration. LANGCHAIN_API_KEY is read from the
# environment by LangSmith itself, so it is not set here.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"

@dataclass
class DocumentChunk:
    """A chunk of a PDF page plus the metadata needed to cite it later."""
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int

class DocumentSummarizer:
    def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"],
            chunk_overlap=chunk_config["overlap"],
        )
        self.chunk_metadata = {}  # Store chunk metadata for source tracing
    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load a PDF and split it into chunks with page/offset metadata."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0
        for page in pages:
            text = page.page_content
            # Split the page content into overlapping chunks
            page_chunks = self.text_splitter.split_text(text)
            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # find() returns -1 if the splitter normalized the text; clamp to 0
                start_char = max(text.find(chunk), 0)
                end_char = start_char + len(chunk)
                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page.metadata.get("page", 0) + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)
                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    "page": doc_chunk.page_number,
                    "start_char": doc_chunk.start_char,
                    "end_char": doc_chunk.end_char,
                }
            char_count += len(text)
        return chunks
    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create a Chroma vector store whose entries carry citation metadata."""
        texts = [chunk.content for chunk in chunks]
        metadatas = [{
            "chunk_id": chunk.chunk_id,
            "page": chunk.page_number,
            "start_char": chunk.start_char,
            "end_char": chunk.end_char,
        } for chunk in chunks]
        vector_store = Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings,
        )
        return vector_store
    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate a summary with source citations, returned as structured data."""
        # Retrieve the most relevant chunks along with their metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
        # Prepare the context and track the sources it came from
        contexts = []
        sources = []
        for doc, score in relevant_docs:
            chunk_id = doc.metadata["chunk_id"]
            context = doc.page_content
            contexts.append(context)
            sources.append({
                "content": context,
                "page": doc.metadata["page"],
                "chunk_id": chunk_id,
                "relevance_score": score,
            })
        prompt = PromptTemplate(
            template=self.system_prompt,
            input_variables=["context"],
        )
        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0,
                model_name="gpt-4o-mini",
                api_key=self.openai_api_key,
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            )
        response = llm.invoke(prompt.format(context="\n\n".join(contexts)))
        # ChatOpenAI returns a message object; HuggingFaceEndpoint returns a string
        response_text = response.content if hasattr(response, "content") else response
        # Split the response into paragraphs
        summaries = [p.strip() for p in response_text.split("\n\n") if p.strip()]
        # Create the structured output, pairing each paragraph with a retrieved source
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Clamp the index so extra paragraphs reuse the last retrieved source
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append({
                "content": summary,
                "source": {
                    "page": source["page"],
                    "text": source["content"][:200] + "...",
                    "relevance_score": source["relevance_score"],
                },
            })
        return structured_output
    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
        """Get the stored location metadata for a specific chunk.

        Note: `window` is currently unused; the raw page text is not retained,
        so only the recorded page and character offsets can be returned.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
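
# Minimal end-to-end sketch of using DocumentSummarizer directly. The function
# name, embedding model, chunk sizes, prompt, and PDF path below are all
# illustrative assumptions, not values taken from vella-backend configuration.
def _example_direct_usage():
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        model=default_model,
        embedding="sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
        chunk_config={"size": 1000, "overlap": 200},          # assumed chunk sizes
        system_prompt="Summarize the main points of the document below.\n\n{context}",
    )
    chunks = summarizer.load_and_split_document("example.pdf")  # placeholder path
    vector_store = summarizer.create_vector_store(chunks)
    summaries = summarizer.generate_summary_with_sources(vector_store)
    # Each summary cites a chunk; its page/offset metadata can be recovered:
    print(summaries[0]["content"])
    print(summarizer.get_source_context(chunks[0].chunk_id))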

def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []
    # Initialize the summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )
    # Load and chunk every PDF, accumulating the chunks into a single list
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks += chunks
    vector_store = summarizer.create_vector_store(allPdfsChunks)
    # Generate the structured summary and print it as JSON for inspection
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)
    return structured_summaries
# If you need to send this to the frontend, you can simply return
# structured_summaries. It will be in the format:
# [
# {
# "content": "Summary point 1...",
# "source": {
# "page": 1,
# "text": "Source text...",
# "relevance_score": 0.95
# }
# },
# ...
# ]
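
# A minimal example configuration for running this module directly. These
# values are illustrative assumptions (embedding model, chunk sizes, and
# prompt), not the production settings used by vella-backend. The prompt must
# contain {context}, since generate_summary_with_sources formats it with the
# retrieved chunks.
example_serializer = {
    "model": default_model,
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
    "chunk_size": 1000,      # assumed chunk size in characters
    "chunk_overlap": 200,    # assumed overlap between consecutive chunks
    "system_prompt": "Summarize the main points of the document below.\n\n{context}",
}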
if __name__ == "__main__":
    # "example.pdf" is a placeholder path; replace it with real documents.
    get_llm_summary_answer_by_cursor(example_serializer, ["example.pdf"])