c2pdf / app.py
nileshhanotia's picture
Create app.py
9735353 verified
import os
import PyPDF2
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import gradio as gr
import requests
# Load your Anthropic API key from environment variable or set it here
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") # Make sure to set your key
# Alternatively, set it directly in the code for testing purposes:
# ANTHROPIC_API_KEY = "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"
# Step 1: Extract text from PDFs
def extract_text_from_pdf(pdf_file):
reader = PyPDF2.PdfReader(pdf_file)
text = ""
for page in reader.pages:
text += page.extract_text()
return text
# Step 2: Generate embeddings
def create_embeddings(text):
model = SentenceTransformer('all-MiniLM-L6-v2')
return model.encode(text.split('\n'), convert_to_tensor=True)
# Step 3: Create FAISS index
def create_faiss_index(embeddings):
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(np.array(embeddings))
return index
# Step 4: Query with Anthropic
def query_anthropic(prompt):
headers = {
"Authorization": f"Bearer {ANTHROPIC_API_KEY}",
"Content-Type": "application/json"
}
data = {
"prompt": prompt,
"max_tokens": 150,
"stop": None
}
response = requests.post('https://api.anthropic.com/v1/complete', json=data, headers=headers)
return response.json().get('completion', 'No response from model')
# Step 5: Complete workflow
def process_pdf_and_query(pdf_file, user_query):
# Step 1: Extract text
pdf_text = extract_text_from_pdf(pdf_file)
# Step 2: Generate embeddings
embeddings = create_embeddings(pdf_text)
# Step 3: Create FAISS index
faiss_index = create_faiss_index(embeddings)
# Step 4: Query with Anthropic
index_query_embedding = create_embeddings(user_query)
D, I = faiss_index.search(np.array([index_query_embedding]), k=1) # Searching for the closest match
closest_text = pdf_text.split('\n')[I[0][0]] # Get the closest text based on the index
response = query_anthropic(f"Answer the question based on this context: {closest_text}. Question: {user_query}")
return response
# Gradio interface
def run_gradio():
iface = gr.Interface(
fn=process_pdf_and_query,
inputs=[gr.File(label="Upload PDF File"), gr.Textbox(label="Ask a question")],
outputs="text",
title="PDF Query with Anthropic",
description="Upload a PDF file and ask questions related to its content."
)
iface.launch()
if __name__ == "__main__":
run_gradio()