import os
import uuid
import json
from typing import List, Dict
from dataclasses import dataclass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from setup.environment import default_model
os.environ["LANGCHAIN_TRACING_V2"]="true" | |
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com" | |
os.environ.get("LANGCHAIN_API_KEY") | |
os.environ["LANGCHAIN_PROJECT"]="VELLA" | |

@dataclass
class DocumentChunk:
    content: str
    page_number: int
    chunk_id: str
    start_char: int
    end_char: int
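
# Illustrative DocumentChunk (all values made up for illustration):
#   DocumentChunk(content="The parties agree...", page_number=3,
#                 chunk_id="e7b1...", start_char=4096, end_char=4608)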

class DocumentSummarizer:
    def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"],
            chunk_overlap=chunk_config["overlap"]
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing
    def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
        """Load a PDF and split it into chunks with positional metadata."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        chunks = []
        char_count = 0
        for page in pages:
            text = page.page_content
            # Split the page content into overlapping chunks
            page_chunks = self.text_splitter.split_text(text)
            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())
                start_char = text.find(chunk)
                if start_char == -1:
                    # The splitter may normalize whitespace; fall back to the
                    # start of the page rather than a bogus -1 offset.
                    start_char = 0
                end_char = start_char + len(chunk)
                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page.metadata.get('page', 0) + 1,  # 1-based page numbering
                    chunk_id=chunk_id,
                    start_char=char_count + start_char,
                    end_char=char_count + end_char
                )
                chunks.append(doc_chunk)
                # Store metadata for later retrieval
                self.chunk_metadata[chunk_id] = {
                    'page': doc_chunk.page_number,
                    'start_char': doc_chunk.start_char,
                    'end_char': doc_chunk.end_char
                }
            char_count += len(text)
        return chunks
    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create a vector store with per-chunk metadata."""
        texts = [chunk.content for chunk in chunks]
        metadatas = [{
            'chunk_id': chunk.chunk_id,
            'page': chunk.page_number,
            'start_char': chunk.start_char,
            'end_char': chunk.end_char
        } for chunk in chunks]
        return Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings
        )
    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document"
    ) -> List[Dict]:
        """Generate a summary with source citations as structured JSON data."""
        # Retrieve the most relevant chunks with their metadata; with Chroma
        # the returned score is a distance, so lower means more similar.
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
        # Prepare context and track sources
        contexts = []
        sources = []
        for doc, score in relevant_docs:
            context = doc.page_content
            contexts.append(context)
            sources.append({
                'content': context,
                'page': doc.metadata['page'],
                'chunk_id': doc.metadata['chunk_id'],
                'relevance_score': score
            })
        prompt = PromptTemplate(
            template=self.system_prompt,
            input_variables=["context"]
        )
        if self.model == default_model:
            llm = ChatOpenAI(
                temperature=0,
                model_name="gpt-4o-mini",
                api_key=self.openai_api_key
            )
        else:
            llm = HuggingFaceEndpoint(
                repo_id=self.model,
                task="text-generation",
                max_new_tokens=1100,
                do_sample=False,
                huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
            )
        response = llm.predict(prompt.format(context="\n\n".join(contexts)))
        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
        # Pair each summary paragraph with the most relevant retrieved source
        structured_output = []
        for idx, summary in enumerate(summaries):
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append({
                "content": summary,
                "source": {
                    "page": source['page'],
                    "text": source['content'][:200] + "...",
                    "relevance_score": source['relevance_score']
                }
            })
        return structured_output
    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
        """Return the stored location metadata for a specific chunk.

        Note: `window` is currently unused; extracting extended context
        around the chunk is not implemented yet.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            'page': metadata['page'],
            'start_char': metadata['start_char'],
            'end_char': metadata['end_char']
        }
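
# Sketch of using DocumentSummarizer on its own. The embedding model, chunk
# sizes, prompt, and file name below are illustrative assumptions, not
# defaults shipped with this project:
#
#   summarizer = DocumentSummarizer(
#       openai_api_key=os.environ.get("OPENAI_API_KEY"),
#       model=default_model,
#       embedding="sentence-transformers/all-MiniLM-L6-v2",
#       chunk_config={"size": 1000, "overlap": 200},
#       system_prompt="Summarize the following context:\n\n{context}",
#   )
#   chunks = summarizer.load_and_split_document("document.pdf")
#   store = summarizer.create_vector_store(chunks)
#   summaries = summarizer.generate_summary_with_sources(store)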

def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    # By Luan
    allPdfsChunks = []
    # Initialize summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={"size": serializer["chunk_size"], "overlap": serializer["chunk_overlap"]},
        system_prompt=serializer["system_prompt"],
        model=serializer["model"]
    )
    # Load and chunk every PDF, accumulating the chunks
    for pdf_path in listaPDFs:
        chunks = summarizer.load_and_split_document(pdf_path)
        allPdfsChunks.extend(chunks)
    vector_store = summarizer.create_vector_store(allPdfsChunks)
    # Generate structured summary
    structured_summaries = summarizer.generate_summary_with_sources(vector_store)
    # Print the structured data for debugging, then return it
    json_data = json.dumps(structured_summaries)
    print("\n\n")
    print(json_data)
    return structured_summaries

# If you need to send this to a frontend, just return structured_summaries.
# It will be in the format:
# [
#     {
#         "content": "Summary point 1...",
#         "source": {
#             "page": 1,
#             "text": "Source text...",
#             "relevance_score": 0.95
#         }
#     },
#     ...
# ]
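
# A minimal example payload for get_llm_summary_answer_by_cursor. The keys
# match what the function reads from `serializer` above; the values are
# illustrative assumptions, not project defaults:
example_serializer = {
    "model": default_model,
    "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "system_prompt": "Summarize the following context:\n\n{context}",
}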

if __name__ == "__main__":
    # "example.pdf" is a placeholder path; point this at real PDF files.
    get_llm_summary_answer_by_cursor(example_serializer, ["example.pdf"])