Spaces:
Sleeping
Sleeping
nileshhanotia
commited on
Commit
•
9735353
1
Parent(s):
73afd28
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import PyPDF2
|
3 |
+
import faiss
|
4 |
+
import numpy as np
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
import gradio as gr
|
7 |
+
import requests
|
8 |
+
|
9 |
+
# Load your Anthropic API key from environment variable or set it here
|
10 |
+
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") # Make sure to set your key
|
11 |
+
# Alternatively, set it directly in the code for testing purposes:
|
12 |
+
# ANTHROPIC_API_KEY = "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"
|
13 |
+
|
14 |
+
# Step 1: Extract text from PDFs
|
15 |
+
def extract_text_from_pdf(pdf_file):
|
16 |
+
reader = PyPDF2.PdfReader(pdf_file)
|
17 |
+
text = ""
|
18 |
+
for page in reader.pages:
|
19 |
+
text += page.extract_text()
|
20 |
+
return text
|
21 |
+
|
22 |
+
# Step 2: Generate embeddings
|
23 |
+
def create_embeddings(text):
|
24 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
25 |
+
return model.encode(text.split('\n'), convert_to_tensor=True)
|
26 |
+
|
27 |
+
# Step 3: Create FAISS index
|
28 |
+
def create_faiss_index(embeddings):
|
29 |
+
dim = embeddings.shape[1]
|
30 |
+
index = faiss.IndexFlatL2(dim)
|
31 |
+
index.add(np.array(embeddings))
|
32 |
+
return index
|
33 |
+
|
34 |
+
# Step 4: Query with Anthropic
|
35 |
+
def query_anthropic(prompt):
|
36 |
+
headers = {
|
37 |
+
"Authorization": f"Bearer {ANTHROPIC_API_KEY}",
|
38 |
+
"Content-Type": "application/json"
|
39 |
+
}
|
40 |
+
data = {
|
41 |
+
"prompt": prompt,
|
42 |
+
"max_tokens": 150,
|
43 |
+
"stop": None
|
44 |
+
}
|
45 |
+
response = requests.post('https://api.anthropic.com/v1/complete', json=data, headers=headers)
|
46 |
+
return response.json().get('completion', 'No response from model')
|
47 |
+
|
48 |
+
# Step 5: Complete workflow
|
49 |
+
def process_pdf_and_query(pdf_file, user_query):
|
50 |
+
# Step 1: Extract text
|
51 |
+
pdf_text = extract_text_from_pdf(pdf_file)
|
52 |
+
|
53 |
+
# Step 2: Generate embeddings
|
54 |
+
embeddings = create_embeddings(pdf_text)
|
55 |
+
|
56 |
+
# Step 3: Create FAISS index
|
57 |
+
faiss_index = create_faiss_index(embeddings)
|
58 |
+
|
59 |
+
# Step 4: Query with Anthropic
|
60 |
+
index_query_embedding = create_embeddings(user_query)
|
61 |
+
D, I = faiss_index.search(np.array([index_query_embedding]), k=1) # Searching for the closest match
|
62 |
+
closest_text = pdf_text.split('\n')[I[0][0]] # Get the closest text based on the index
|
63 |
+
response = query_anthropic(f"Answer the question based on this context: {closest_text}. Question: {user_query}")
|
64 |
+
|
65 |
+
return response
|
66 |
+
|
67 |
+
# Gradio interface
|
68 |
+
def run_gradio():
|
69 |
+
iface = gr.Interface(
|
70 |
+
fn=process_pdf_and_query,
|
71 |
+
inputs=[gr.File(label="Upload PDF File"), gr.Textbox(label="Ask a question")],
|
72 |
+
outputs="text",
|
73 |
+
title="PDF Query with Anthropic",
|
74 |
+
description="Upload a PDF file and ask questions related to its content."
|
75 |
+
)
|
76 |
+
iface.launch()
|
77 |
+
|
78 |
+
if __name__ == "__main__":
|
79 |
+
run_gradio()
|