import os
from typing import List, Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from dataclasses import dataclass
import uuid
import json
from langchain_huggingface import HuggingFaceEndpoint
from setup.environment import default_model

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# LANGCHAIN_API_KEY is read from the environment; it must be set for tracing to work.
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


@dataclass
class DocumentChunk:
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int


class DocumentSummarizer:
    def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"],
            chunk_overlap=chunk_config["overlap"]
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing

    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load a PDF and split it into chunks with positional metadata."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0

        for page in pages:
            text = page.page_content
            # Split the page content
            page_chunks = self.text_splitter.split_text(text)

            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                # Locate the chunk within the page text; find() returns -1 if the
                # splitter altered the text, so clamp to 0 in that case.
                start_char = max(text.find(chunk), 0)
                end_char = start_char + len(chunk)

                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page.metadata.get('page', 0) + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    'page': doc_chunk.page_number,
                    'start_char': doc_chunk.start_char,
                    'end_char': doc_chunk.end_char
                }

            char_count += len(text)

        return chunks

    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create a vector store with per-chunk metadata."""
        texts = [chunk.content for chunk in chunks]
        metadatas = [{
            'chunk_id': chunk.chunk_id,
            'page': chunk.page_number,
            'start_char': chunk.start_char,
            'end_char': chunk.end_char
        } for chunk in chunks]

        vector_store = Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings
        )
        return vector_store

    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document"
    ) -> List[Dict]:
        """Generate a summary with source citations, returning structured JSON data."""
        # Retrieve the most relevant chunks along with their metadata
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)

        # Prepare context and track sources
        contexts = []
        sources = []
        for doc, score in relevant_docs:
            chunk_id = doc.metadata['chunk_id']
            context = doc.page_content
            contexts.append(context)
            sources.append({
                'content': context,
                'page': doc.metadata['page'],
                'chunk_id': chunk_id,
                # With Chroma this is typically a distance score (lower = more similar)
                'relevance_score': score
            })

        prompt = PromptTemplate(
            template=self.system_prompt,
            input_variables=["context"]
        )

        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0,
                model_name="gpt-4o-mini",
                api_key=self.openai_api_key
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
            )
        response = llm.predict(prompt.format(context="\n\n".join(contexts)))

        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split('\n\n') if p.strip()]

        # Create structured output. The pairing is positional: paragraph N is
        # matched to the Nth retrieved chunk (clamped to the last one), which is
        # a heuristic rather than a true attribution.
        structured_output = []
        for idx, summary in enumerate(summaries):
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append({
                "content": summary,
                "source": {
                    "page": source['page'],
                    "text": source['content'][:200] + "...",
                    "relevance_score": source['relevance_score']
                }
            })
        return structured_output

    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
        """Get extended context around a specific chunk."""
        # NOTE: `window` is currently unused; only the stored metadata is returned.
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            'page': metadata['page'],
            'start_char': metadata['start_char'],
            'end_char': metadata['end_char']
        }


def get_llm_summary_answer_by_cursor(serializer, listaPDFs):  # By Luan
    allPdfsChunks = []

    # Initialize summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={
            "size": serializer["chunk_size"],
            "overlap": serializer["chunk_overlap"]
        },
        system_prompt=serializer["system_prompt"],
        model=serializer["model"]
    )

    # Load and process every document, accumulating the chunks
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks.extend(chunks)

    vector_store = summarizer.create_vector_store(allPdfsChunks)

    # Generate structured summary
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)

    # Print the structured data as JSON
    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)

    # If you need to send this to the frontend, you can just return
    # structured_summaries. It will be in the format:
    # [
    #     {
    #         "content": "Summary point 1...",
    #         "source": {
    #             "page": 1,
    #             "text": "Source text...",
    #             "relevance_score": 0.95
    #         }
    #     },
    #     ...
    # ]
    return structured_summaries
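
# --- Example invocation (a minimal sketch) ---
# The serializer payload below is hypothetical: the keys match what
# get_llm_summary_answer_by_cursor() reads above, but every concrete value
# (embedding model, chunk sizes, prompt, PDF path) is an illustrative
# placeholder, not a project default. Note that the system prompt must contain
# a {context} placeholder, because generate_summary_with_sources() builds a
# PromptTemplate with input_variables=["context"].
if __name__ == "__main__":
    example_serializer = {
        "model": default_model,
        "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
        "chunk_size": 1000,      # placeholder value
        "chunk_overlap": 200,    # placeholder value
        "system_prompt": (
            "Summarize the main points of the following document:\n\n{context}"
        ),
    }
    get_llm_summary_answer_by_cursor(example_serializer, ["./example.pdf"])  # placeholder path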