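"""Document summarization with source citations.

Loads PDFs, splits them into chunks with location metadata, indexes the
chunks in a Chroma vector store using HuggingFace embeddings, and
generates a structured summary whose points cite their source page and
chunk. The LLM backend is either ChatOpenAI or a HuggingFace endpoint,
selected by the configured model.
"""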
import json
import os
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEndpoint

from setup.environment import default_model

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"]="VELLA"

@dataclass
class DocumentChunk:
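  """A chunk of a source document plus its location metadata."""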
  content: str
  page_number: int
  chunk_id: str
  start_char: int
  end_char: int

class DocumentSummarizer:
  def __init__(self, openai_api_key: str, model: str, embedding: str,
               chunk_config: Dict, system_prompt: str):
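      """Configure the embedding model, text splitter, and LLM choice.

      `model` selects ChatOpenAI when it equals `default_model`;
      otherwise it is treated as a HuggingFace Hub repo id.
      """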
      self.model = model
      self.system_prompt = system_prompt
      self.openai_api_key = openai_api_key
      self.embeddings = HuggingFaceEmbeddings(
          model_name=embedding
      )
      self.text_splitter = RecursiveCharacterTextSplitter(
          chunk_size=chunk_config["size"],
          chunk_overlap=chunk_config["overlap"]
      )
      self.chunk_metadata = {}  # Store chunk metadata for tracing
  
  def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
      """Load PDF and split into chunks with metadata"""
      loader = PyPDFLoader(pdf_path)
      pages = loader.load()
      chunks = []
      char_count = 0
      
      for page in pages:
          text = page.page_content
          # Split the page content
          page_chunks = self.text_splitter.split_text(text)
          
          for chunk in page_chunks:
              chunk_id = str(uuid.uuid4())
              start_char = text.find(chunk)
              if start_char == -1:
                  # The splitter can normalize whitespace, so fall back
                  # to 0 if the chunk is not found verbatim in the page.
                  start_char = 0
              end_char = start_char + len(chunk)
              
              doc_chunk = DocumentChunk(
                  content=chunk,
                  page_number=page.metadata.get('page', 0) + 1,  # 1-based page numbering
                  chunk_id=chunk_id,
                  start_char=char_count + start_char,
                  end_char=char_count + end_char
              )
              chunks.append(doc_chunk)
              
              # Store metadata for later retrieval
              self.chunk_metadata[chunk_id] = {
                  'page': doc_chunk.page_number,
                  'start_char': doc_chunk.start_char,
                  'end_char': doc_chunk.end_char
              }
          
          char_count += len(text)
      
      return chunks

  def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
      """Create vector store with metadata"""
      texts = [chunk.content for chunk in chunks]
      metadatas = [{
          'chunk_id': chunk.chunk_id,
          'page': chunk.page_number,
          'start_char': chunk.start_char,
          'end_char': chunk.end_char
      } for chunk in chunks]
      
      vector_store = Chroma.from_texts(
          texts=texts,
          metadatas=metadatas,
          embedding=self.embeddings
      )
      return vector_store

  def generate_summary_with_sources(
      self, 
      vector_store: Chroma, 
      query: str = "Summarize the main points of this document"
  ) -> List[Dict]:
      """Generate summary with source citations, returning structured JSON data"""
      # Retrieve relevant chunks with metadata
      relevant_docs = vector_store.similarity_search_with_score(query, k=5)
      
      # Prepare context and track sources
      contexts = []
      sources = []
      
      for doc, score in relevant_docs:
          chunk_id = doc.metadata['chunk_id']
          context = doc.page_content
          contexts.append(context)
          
          sources.append({
              'content': context,
              'page': doc.metadata['page'],
              'chunk_id': chunk_id,
              'relevance_score': score
          })
      
      prompt = PromptTemplate(
          template=self.system_prompt,
          input_variables=["context"]
      )
      llm = ""

      if (self.model == default_model):
        llm = ChatOpenAI(
            temperature=0,
            model_name="gpt-4o-mini",
            api_key=self.openai_api_key
        )
      else:
        llm = HuggingFaceEndpoint(
          repo_id=self.model,
          task="text-generation",
          max_new_tokens=1100,
          do_sample=False,
          huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        )

      
      response = llm.predict(prompt.format(context="\n\n".join(contexts)))
      
      # Split the response into paragraphs
      summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
      
      # Create structured output, pairing each summary paragraph with a
      # retrieved source (falling back to the last source when there
      # are more paragraphs than sources)
      structured_output = []
      for idx, summary in enumerate(summaries):
          source = sources[min(idx, len(sources) - 1)]
          structured_output.append({
              "content": summary,
              "source": {
                  "page": source['page'],
                  "text": source['content'][:200] + "...",
                  "relevance_score": source['relevance_score']
              }
          })
      
      return structured_output

  def get_source_context(self, chunk_id: str) -> Optional[Dict]:
      """Get the stored location metadata for a specific chunk."""
      metadata = self.chunk_metadata.get(chunk_id)
      if not metadata:
          return None

      return {
          'page': metadata['page'],
          'start_char': metadata['start_char'],
          'end_char': metadata['end_char']
      }

def get_llm_summary_answer_by_cursor(serializer: Dict, listaPDFs: List[str]) -> List[Dict]:
  # By Luan
  allPdfsChunks = []

  # Initialize summarizer
  summarizer = DocumentSummarizer(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    embedding=serializer["hf_embedding"],
    chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
    system_prompt=serializer["system_prompt"],
    model=serializer["model"]
  )
  
  # Load and process each document
  for pdf_path in listaPDFs:
    chunks = summarizer.load_and_split_document(pdf_path)
    allPdfsChunks.extend(chunks)

  vector_store = summarizer.create_vector_store(allPdfsChunks)
  
  # Generate structured summary
  structured_summaries = summarizer.generate_summary_with_sources(vector_store)
  
  # Print the structured data for inspection
  json_data = json.dumps(structured_summaries)
  print("\n\n")
  print(json_data)
  return structured_summaries
  # If you need to send to frontend, you can just return structured_summaries
  # It will be in the format:
  # [
  #     {
  #         "content": "Summary point 1...",
  #         "source": {
  #             "page": 1,
  #             "text": "Source text...",
  #             "relevance_score": 0.95
  #         }
  #     },
  #     ...
  # ]
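
# A minimal example configuration for running this module directly.
# The embedding model, prompt, and chunk values below are illustrative
# placeholders, not values required by the pipeline; note that the
# system prompt must contain a "{context}" variable, since the
# PromptTemplate above is built with input_variables=["context"].
EXAMPLE_SERIALIZER = {
    "model": default_model,
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "system_prompt": "Summarize the main points of this document:\n\n{context}",
}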

if __name__ == "__main__":
    # "example.pdf" is a placeholder path; point it at a real PDF.
    get_llm_summary_answer_by_cursor(EXAMPLE_SERIALIZER, ["example.pdf"])