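"""Document summarization with source citations.

Loads PDFs, splits them into chunks with location metadata, indexes the
chunks in a Chroma vector store using HuggingFace embeddings, and
generates a structured summary whose points cite their source page and
chunk. The LLM backend is either ChatOpenAI or a HuggingFace endpoint,
selected by the configured model.
"""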
import json
import os
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEndpoint

from setup.environment import default_model

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"]="VELLA"

@dataclass
class DocumentChunk:
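  """A chunk of a source document plus its location metadata."""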
  content: str
  page_number: int
  chunk_id: str
  start_char: int
  end_char: int

class DocumentSummarizer:
  def __init__(self, openai_api_key: str, model: str, embedding: str,
               chunk_config: Dict, system_prompt: str):
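      """Configure the embedding model, text splitter, and LLM choice.

      `model` selects ChatOpenAI when it equals `default_model`;
      otherwise it is treated as a HuggingFace Hub repo id.
      """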
      self.model = model
      self.system_prompt = system_prompt
      self.openai_api_key = openai_api_key
      self.embeddings = HuggingFaceEmbeddings(
          model_name=embedding
      )
      self.text_splitter = RecursiveCharacterTextSplitter(
          chunk_size=chunk_config["size"],
          chunk_overlap=chunk_config["overlap"]
      )
      self.chunk_metadata = {}  # Store chunk metadata for tracing
  
  def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
      """Load PDF and split into chunks with metadata"""
      loader = PyPDFLoader(pdf_path)
      pages = loader.load()
      chunks = []
      char_count = 0
      
      for page in pages:
          text = page.page_content
          # Split the page content
          page_chunks = self.text_splitter.split_text(text)
          
          for chunk in page_chunks:
              chunk_id = str(uuid.uuid4())
              start_char = text.find(chunk)
              if start_char == -1:
                  # The splitter can normalize whitespace, so fall back
                  # to 0 if the chunk is not found verbatim in the page.
                  start_char = 0
              end_char = start_char + len(chunk)
              
              doc_chunk = DocumentChunk(
                  content=chunk,
                  page_number=page.metadata.get('page', 0) + 1,  # 1-based page numbering
                  chunk_id=chunk_id,
                  start_char=char_count + start_char,
                  end_char=char_count + end_char
              )
              chunks.append(doc_chunk)
              
              # Store metadata for later retrieval
              self.chunk_metadata[chunk_id] = {
                  'page': doc_chunk.page_number,
                  'start_char': doc_chunk.start_char,
                  'end_char': doc_chunk.end_char
              }
          
          char_count += len(text)
      
      return chunks

  def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
      """Create vector store with metadata"""
      texts = [chunk.content for chunk in chunks]
      metadatas = [{
          'chunk_id': chunk.chunk_id,
          'page': chunk.page_number,
          'start_char': chunk.start_char,
          'end_char': chunk.end_char
      } for chunk in chunks]
      
      vector_store = Chroma.from_texts(
          texts=texts,
          metadatas=metadatas,
          embedding=self.embeddings
      )
      return vector_store

  def generate_summary_with_sources(
      self, 
      vector_store: Chroma, 
      query: str = "Summarize the main points of this document"
  ) -> List[Dict]:
      """Generate summary with source citations, returning structured JSON data"""
      # Retrieve relevant chunks with metadata
      relevant_docs = vector_store.similarity_search_with_score(query, k=5)
      
      # Prepare context and track sources
      contexts = []
      sources = []
      
      for doc, score in relevant_docs:
          chunk_id = doc.metadata['chunk_id']
          context = doc.page_content
          contexts.append(context)
          
          sources.append({
              'content': context,
              'page': doc.metadata['page'],
              'chunk_id': chunk_id,
              'relevance_score': score
          })
      
      prompt = PromptTemplate(
          template=self.system_prompt,
          input_variables=["context"]
      )
      llm = ""

      if (self.model == default_model):
        llm = ChatOpenAI(
            temperature=0,
            model_name="gpt-4o-mini",
            api_key=self.openai_api_key
        )
      else:
        llm = HuggingFaceEndpoint(
          repo_id=self.model,
          task="text-generation",
          max_new_tokens=1100,
          do_sample=False,
          huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        )

      
      response = llm.predict(prompt.format(context="\n\n".join(contexts)))
      
      # Split the response into paragraphs
      summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
      
      # Create structured output, pairing each summary paragraph with a
      # retrieved source (falling back to the last source when there
      # are more paragraphs than sources)
      structured_output = []
      for idx, summary in enumerate(summaries):
          source = sources[min(idx, len(sources) - 1)]
          structured_output.append({
              "content": summary,
              "source": {
                  "page": source['page'],
                  "text": source['content'][:200] + "...",
                  "relevance_score": source['relevance_score']
              }
          })
      
      return structured_output

  def get_source_context(self, chunk_id: str) -> Optional[Dict]:
      """Get the stored location metadata for a specific chunk."""
      metadata = self.chunk_metadata.get(chunk_id)
      if not metadata:
          return None

      return {
          'page': metadata['page'],
          'start_char': metadata['start_char'],
          'end_char': metadata['end_char']
      }

def get_llm_summary_answer_by_cursor(serializer: Dict, listaPDFs: List[str]) -> List[Dict]:
  # By Luan
  allPdfsChunks = []

  # Initialize summarizer
  summarizer = DocumentSummarizer(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    embedding=serializer["hf_embedding"],
    chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
    system_prompt=serializer["system_prompt"],
    model=serializer["model"]
  )
  
  # Load and process each document
  for pdf_path in listaPDFs:
    chunks = summarizer.load_and_split_document(pdf_path)
    allPdfsChunks.extend(chunks)

  vector_store = summarizer.create_vector_store(allPdfsChunks)
  
  # Generate structured summary
  structured_summaries = summarizer.generate_summary_with_sources(vector_store)
  
  # Print the structured data for inspection
  json_data = json.dumps(structured_summaries)
  print("\n\n")
  print(json_data)
  return structured_summaries
  # If you need to send to frontend, you can just return structured_summaries
  # It will be in the format:
  # [
  #     {
  #         "content": "Summary point 1...",
  #         "source": {
  #             "page": 1,
  #             "text": "Source text...",
  #             "relevance_score": 0.95
  #         }
  #     },
  #     ...
  # ]
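
# A minimal example configuration for running this module directly.
# The embedding model, prompt, and chunk values below are illustrative
# placeholders, not values required by the pipeline; note that the
# system prompt must contain a "{context}" variable, since the
# PromptTemplate above is built with input_variables=["context"].
EXAMPLE_SERIALIZER = {
    "model": default_model,
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "system_prompt": "Summarize the main points of this document:\n\n{context}",
}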

if __name__ == "__main__":
    # "example.pdf" is a placeholder path; point it at a real PDF.
    get_llm_summary_answer_by_cursor(EXAMPLE_SERIALIZER, ["example.pdf"])