Merge pull request #39 from DL4DS/text_extraction
Browse files — SCRUM-57 LlamaParser patch, DataLoader restructure and more
- code/.chainlit/config.toml +19 -16
- code/main.py +2 -0
- code/modules/config/config.yml +1 -1
- code/modules/config/constants.py +6 -3
- code/modules/dataloader/data_loader.py +13 -110
- code/modules/dataloader/helpers.py +22 -2
- code/modules/dataloader/pdf_readers/base.py +14 -0
- code/modules/dataloader/pdf_readers/llama.py +92 -0
code/.chainlit/config.toml
CHANGED
@@ -23,7 +23,7 @@ allow_origins = ["*"]
|
|
23 |
unsafe_allow_html = false
|
24 |
|
25 |
# Process and display mathematical expressions. This can clash with "$" characters in messages.
|
26 |
-
latex =
|
27 |
|
28 |
# Automatically tag threads with the current chat profile (if a chat profile is used)
|
29 |
auto_tag_thread = true
|
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
|
|
85 |
# custom_build = "./public/build"
|
86 |
|
87 |
[UI.theme]
|
88 |
-
default = "
|
89 |
#layout = "wide"
|
90 |
#font_family = "Inter, sans-serif"
|
91 |
# Override default MUI light theme. (Check theme.ts)
|
92 |
[UI.theme.light]
|
93 |
-
background = "#FAFAFA"
|
94 |
-
paper = "#FFFFFF"
|
95 |
|
96 |
[UI.theme.light.primary]
|
97 |
-
main = "#
|
98 |
-
dark = "#
|
99 |
-
light = "#
|
100 |
[UI.theme.light.text]
|
101 |
-
primary = "#212121"
|
102 |
-
secondary = "#616161"
|
|
|
103 |
# Override default MUI dark theme. (Check theme.ts)
|
104 |
[UI.theme.dark]
|
105 |
-
background = "#
|
106 |
-
paper = "#
|
107 |
|
108 |
[UI.theme.dark.primary]
|
109 |
-
main = "#
|
110 |
-
dark = "#
|
111 |
-
light = "#
|
112 |
-
|
|
|
|
|
113 |
|
114 |
[meta]
|
115 |
-
generated_by = "1.1.
|
|
|
23 |
unsafe_allow_html = false
|
24 |
|
25 |
# Process and display mathematical expressions. This can clash with "$" characters in messages.
|
26 |
+
latex = true
|
27 |
|
28 |
# Automatically tag threads with the current chat profile (if a chat profile is used)
|
29 |
auto_tag_thread = true
|
|
|
85 |
# custom_build = "./public/build"
|
86 |
|
87 |
[UI.theme]
|
88 |
+
default = "dark"
|
89 |
#layout = "wide"
|
90 |
#font_family = "Inter, sans-serif"
|
91 |
# Override default MUI light theme. (Check theme.ts)
|
92 |
[UI.theme.light]
|
93 |
+
#background = "#FAFAFA"
|
94 |
+
#paper = "#FFFFFF"
|
95 |
|
96 |
[UI.theme.light.primary]
|
97 |
+
#main = "#F80061"
|
98 |
+
#dark = "#980039"
|
99 |
+
#light = "#FFE7EB"
|
100 |
[UI.theme.light.text]
|
101 |
+
#primary = "#212121"
|
102 |
+
#secondary = "#616161"
|
103 |
+
|
104 |
# Override default MUI dark theme. (Check theme.ts)
|
105 |
[UI.theme.dark]
|
106 |
+
#background = "#FAFAFA"
|
107 |
+
#paper = "#FFFFFF"
|
108 |
|
109 |
[UI.theme.dark.primary]
|
110 |
+
#main = "#F80061"
|
111 |
+
#dark = "#980039"
|
112 |
+
#light = "#FFE7EB"
|
113 |
+
[UI.theme.dark.text]
|
114 |
+
#primary = "#EEEEEE"
|
115 |
+
#secondary = "#BDBDBD"
|
116 |
|
117 |
[meta]
|
118 |
+
generated_by = "1.1.304"
|
code/main.py
CHANGED
@@ -173,4 +173,6 @@ async def main(message):
|
|
173 |
answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
|
174 |
processor._process(message.content, answer, sources_dict)
|
175 |
|
|
|
|
|
176 |
await cl.Message(content=answer_with_sources, elements=source_elements).send()
|
|
|
173 |
answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
|
174 |
processor._process(message.content, answer, sources_dict)
|
175 |
|
176 |
+
answer_with_sources = answer_with_sources.replace("$$", "$")
|
177 |
+
|
178 |
await cl.Message(content=answer_with_sources, elements=source_elements).send()
|
code/modules/config/config.yml
CHANGED
@@ -34,7 +34,7 @@ llm_params:
|
|
34 |
local_llm_params:
|
35 |
model: 'tiny-llama'
|
36 |
temperature: 0.7
|
37 |
-
pdf_reader: 'llama' # str [llama, pymupdf]
|
38 |
|
39 |
chat_logging:
|
40 |
log_chat: False # bool
|
|
|
34 |
local_llm_params:
|
35 |
model: 'tiny-llama'
|
36 |
temperature: 0.7
|
37 |
+
pdf_reader: 'llama' # str [llama, pymupdf, gpt]
|
38 |
|
39 |
chat_logging:
|
40 |
log_chat: False # bool
|
code/modules/config/constants.py
CHANGED
@@ -15,7 +15,9 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
|
|
15 |
# Prompt Templates
|
16 |
|
17 |
openai_prompt_template = """Use the following pieces of information to answer the user's question.
|
18 |
-
You are an intelligent chatbot designed to help students with questions regarding the course.
|
|
|
|
|
19 |
If you don't know the answer, just say that you don't know.
|
20 |
|
21 |
Context: {context}
|
@@ -26,8 +28,9 @@ Helpful answer:
|
|
26 |
"""
|
27 |
|
28 |
openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
|
29 |
-
You are an intelligent chatbot designed to help students with questions regarding the course.
|
30 |
-
|
|
|
31 |
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
32 |
|
33 |
Use the history to answer the question if you can.
|
|
|
15 |
# Prompt Templates
|
16 |
|
17 |
openai_prompt_template = """Use the following pieces of information to answer the user's question.
|
18 |
+
You are an intelligent chatbot designed to help students with questions regarding the course.
|
19 |
+
Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
|
20 |
+
Be sure to explain the parameters and variables in the equations.
|
21 |
If you don't know the answer, just say that you don't know.
|
22 |
|
23 |
Context: {context}
|
|
|
28 |
"""
|
29 |
|
30 |
openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
|
31 |
+
You are an intelligent chatbot designed to help students with questions regarding the course.
|
32 |
+
Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
|
33 |
+
Be sure to explain the parameters and variables in the equations.
|
34 |
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
35 |
|
36 |
Use the history to answer the question if you can.
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -25,101 +25,19 @@ import html2text
|
|
25 |
import bs4
|
26 |
import tempfile
|
27 |
import PyPDF2
|
|
|
|
|
28 |
|
29 |
try:
|
30 |
-
from modules.dataloader.helpers import get_metadata
|
31 |
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
32 |
-
|
33 |
-
|
34 |
except:
|
35 |
-
from dataloader.helpers import get_metadata
|
36 |
from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
37 |
|
38 |
logger = logging.getLogger(__name__)
|
39 |
BASE_DIR = os.getcwd()
|
40 |
|
41 |
-
class PDFReader:
|
42 |
-
def __init__(self):
|
43 |
-
pass
|
44 |
-
|
45 |
-
def get_loader(self, pdf_path):
|
46 |
-
loader = PyMuPDFLoader(pdf_path)
|
47 |
-
return loader
|
48 |
-
|
49 |
-
def get_documents(self, loader):
|
50 |
-
return loader.load()
|
51 |
-
|
52 |
-
class LlamaParser:
|
53 |
-
def __init__(self):
|
54 |
-
self.GPT_API_KEY = OPENAI_API_KEY
|
55 |
-
self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
|
56 |
-
self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
|
57 |
-
self.headers = {
|
58 |
-
'Accept': 'application/json',
|
59 |
-
'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
|
60 |
-
}
|
61 |
-
self.parser = LlamaParse(
|
62 |
-
api_key=LLAMA_CLOUD_API_KEY,
|
63 |
-
result_type="markdown",
|
64 |
-
verbose=True,
|
65 |
-
language="en",
|
66 |
-
gpt4o_mode=False,
|
67 |
-
# gpt4o_api_key=OPENAI_API_KEY,
|
68 |
-
parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
|
69 |
-
)
|
70 |
-
|
71 |
-
def parse(self, pdf_path):
|
72 |
-
pdf_name = os.path.basename(pdf_path)
|
73 |
-
|
74 |
-
documents = self.parser.load_data(pdf_path)
|
75 |
-
documents = [document.to_langchain_format() for document in documents]
|
76 |
-
|
77 |
-
os.remove(pdf_path) # cleanup, just in case
|
78 |
-
return documents
|
79 |
-
|
80 |
-
def make_request(self, pdf_url):
|
81 |
-
payload = {
|
82 |
-
"gpt4o_mode": "false",
|
83 |
-
"parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
|
84 |
-
}
|
85 |
-
|
86 |
-
files = [
|
87 |
-
('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
|
88 |
-
]
|
89 |
-
|
90 |
-
response = requests.request(
|
91 |
-
"POST", self.parse_url, headers=self.headers, data=payload, files=files)
|
92 |
-
|
93 |
-
return response.json()['id'], response.json()['status']
|
94 |
-
|
95 |
-
async def get_result(self, job_id):
|
96 |
-
url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
|
97 |
-
|
98 |
-
response = requests.request("GET", url, headers=self.headers, data={})
|
99 |
-
|
100 |
-
return response.json()['markdown']
|
101 |
-
|
102 |
-
async def _parse(self, pdf_path):
|
103 |
-
job_id, status = self.make_request(pdf_path)
|
104 |
-
|
105 |
-
while status != "SUCCESS":
|
106 |
-
url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
|
107 |
-
response = requests.request("GET", url, headers=self.headers, data={})
|
108 |
-
status = response.json()["status"]
|
109 |
-
|
110 |
-
result = await self.get_result(job_id)
|
111 |
-
|
112 |
-
documents = [
|
113 |
-
Document(
|
114 |
-
page_content=result,
|
115 |
-
metadata={"source": pdf_path}
|
116 |
-
)
|
117 |
-
]
|
118 |
-
|
119 |
-
return documents
|
120 |
-
|
121 |
-
async def _parse(self, pdf_path):
|
122 |
-
return await self._parse(pdf_path)
|
123 |
|
124 |
class HTMLReader:
|
125 |
def __init__(self):
|
@@ -186,18 +104,6 @@ class FileReader:
|
|
186 |
text += page.extract_text()
|
187 |
return text
|
188 |
|
189 |
-
@staticmethod
|
190 |
-
def download_pdf_from_url(pdf_url):
|
191 |
-
response = requests.get(pdf_url)
|
192 |
-
if response.status_code == 200:
|
193 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
194 |
-
temp_file.write(response.content)
|
195 |
-
temp_file_path = temp_file.name
|
196 |
-
return temp_file_path
|
197 |
-
else:
|
198 |
-
self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
|
199 |
-
return None
|
200 |
-
|
201 |
def read_pdf(self, temp_file_path: str):
|
202 |
if self.kind == "llama":
|
203 |
documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
|
@@ -383,22 +289,17 @@ class ChunkProcessor:
|
|
383 |
)
|
384 |
self.document_chunks_full.extend(document_chunks)
|
385 |
|
|
|
386 |
self.document_data[file_path] = file_data
|
387 |
self.document_metadata[file_path] = file_metadata
|
388 |
|
389 |
def process_file(self, file_path, file_index, file_reader, addl_metadata):
|
390 |
file_name = os.path.basename(file_path)
|
391 |
-
storage_dir = os.path.join(os.getcwd(), self.config["vectorstore"]["data_path"])
|
392 |
-
local_path = os.path.join(storage_dir, file_name)
|
393 |
-
|
394 |
-
if not os.path.exists(local_path):
|
395 |
-
local_path = FileReader.download_pdf_from_url(pdf_url=file_path)
|
396 |
|
397 |
if file_name in self.document_data:
|
398 |
return
|
399 |
|
400 |
-
file_type = file_name.split(".")[-1]
|
401 |
-
self.logger.info(f"Reading file {file_index + 1}: {local_path}")
|
402 |
|
403 |
read_methods = {
|
404 |
"pdf": file_reader.read_pdf,
|
@@ -412,9 +313,10 @@ class ChunkProcessor:
|
|
412 |
return
|
413 |
|
414 |
try:
|
415 |
-
documents = read_methods[file_type](
|
|
|
416 |
self.process_documents(
|
417 |
-
documents,
|
418 |
)
|
419 |
except Exception as e:
|
420 |
self.logger.error(f"Error processing file {file_name}: {str(e)}")
|
@@ -500,10 +402,11 @@ if __name__ == "__main__":
|
|
500 |
data_loader = DataLoader(config, logger=logger)
|
501 |
document_chunks, document_names, documents, document_metadata = (
|
502 |
data_loader.get_chunks(
|
503 |
-
|
504 |
-
[
|
505 |
)
|
506 |
)
|
507 |
|
508 |
-
print(document_names)
|
509 |
print(len(document_chunks))
|
|
|
|
25 |
import bs4
|
26 |
import tempfile
|
27 |
import PyPDF2
|
28 |
+
from modules.dataloader.pdf_readers.base import PDFReader
|
29 |
+
from modules.dataloader.pdf_readers.llama import LlamaParser
|
30 |
|
31 |
try:
|
32 |
+
from modules.dataloader.helpers import get_metadata, download_pdf_from_url
|
33 |
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
|
|
|
|
34 |
except:
|
35 |
+
from dataloader.helpers import get_metadata, download_pdf_from_url
|
36 |
from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
37 |
|
38 |
logger = logging.getLogger(__name__)
|
39 |
BASE_DIR = os.getcwd()
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
class HTMLReader:
|
43 |
def __init__(self):
|
|
|
104 |
text += page.extract_text()
|
105 |
return text
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def read_pdf(self, temp_file_path: str):
|
108 |
if self.kind == "llama":
|
109 |
documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
|
|
|
289 |
)
|
290 |
self.document_chunks_full.extend(document_chunks)
|
291 |
|
292 |
+
print(f"Processed {file_path}. File_data: {file_data}")
|
293 |
self.document_data[file_path] = file_data
|
294 |
self.document_metadata[file_path] = file_metadata
|
295 |
|
296 |
def process_file(self, file_path, file_index, file_reader, addl_metadata):
|
297 |
file_name = os.path.basename(file_path)
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
if file_name in self.document_data:
|
300 |
return
|
301 |
|
302 |
+
file_type = file_name.split(".")[-1]
|
|
|
303 |
|
304 |
read_methods = {
|
305 |
"pdf": file_reader.read_pdf,
|
|
|
313 |
return
|
314 |
|
315 |
try:
|
316 |
+
documents = read_methods[file_type](file_path)
|
317 |
+
|
318 |
self.process_documents(
|
319 |
+
documents, file_path, file_type, "file", addl_metadata
|
320 |
)
|
321 |
except Exception as e:
|
322 |
self.logger.error(f"Error processing file {file_name}: {str(e)}")
|
|
|
402 |
data_loader = DataLoader(config, logger=logger)
|
403 |
document_chunks, document_names, documents, document_metadata = (
|
404 |
data_loader.get_chunks(
|
405 |
+
["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
|
406 |
+
[],
|
407 |
)
|
408 |
)
|
409 |
|
410 |
+
print(document_names[:5])
|
411 |
print(len(document_chunks))
|
412 |
+
|
code/modules/dataloader/helpers.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
-
from
|
4 |
-
|
5 |
|
6 |
def get_urls_from_file(file_path: str):
|
7 |
"""
|
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
|
|
106 |
continue
|
107 |
|
108 |
return lecture_metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
+
from urllib.parse import urlparse
|
4 |
+
import tempfile
|
5 |
|
6 |
def get_urls_from_file(file_path: str):
|
7 |
"""
|
|
|
106 |
continue
|
107 |
|
108 |
return lecture_metadata
|
109 |
+
|
110 |
+
|
111 |
+
def download_pdf_from_url(pdf_url):
    """
    Download a PDF from a URL into a temporary file and return that file's path.

    The temporary file is created with ``delete=False``, so it stays on disk
    after this function returns; removing it is left to the caller.

    Args:
        pdf_url (str): The URL of the PDF file to download.

    Returns:
        str | None: Path of the temporary ``.pdf`` file, or ``None`` when the
        server answers with any status code other than 200.
    """
    reply = requests.get(pdf_url)
    if reply.status_code != 200:
        # Failure is reported through the return value rather than an exception.
        return None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as pdf_file:
        pdf_file.write(reply.content)
        return pdf_file.name
|
code/modules/dataloader/pdf_readers/base.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
2 |
+
|
3 |
+
|
4 |
+
class PDFReader:
    """Baseline PDF reader backed by LangChain's ``PyMuPDFLoader``."""

    def __init__(self):
        """Create the reader; it takes no configuration and keeps no state."""

    def get_loader(self, pdf_path):
        """Return a ``PyMuPDFLoader`` constructed for ``pdf_path``."""
        return PyMuPDFLoader(pdf_path)

    def parse(self, pdf_path):
        """Return whatever the loader from ``get_loader(pdf_path)`` loads."""
        return self.get_loader(pdf_path).load()
|
code/modules/dataloader/pdf_readers/llama.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
from llama_parse import LlamaParse
|
4 |
+
from langchain.schema import Document
|
5 |
+
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
6 |
+
from modules.dataloader.helpers import download_pdf_from_url
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
class LlamaParser:
    """Parse PDFs into LangChain ``Document`` objects using LlamaParse.

    ``parse`` is the synchronous entry point and goes through the
    ``llama_parse`` client. ``make_request``, ``get_result`` and ``_parse``
    call the LlamaParse REST endpoints directly and form an alternative
    asynchronous path.
    """

    # Marker on which ``parse`` splits the returned markdown into pages.
    PAGE_SEPARATOR = "\n---\n"
    # Seconds to wait between job-status polls in ``_parse``.
    POLL_INTERVAL_SECONDS = 1.0
    # Job statuses treated as terminal failures while polling.
    # NOTE(review): spellings assumed from the LlamaParse job API — confirm.
    FAILED_STATUSES = frozenset({"ERROR", "CANCELED", "CANCELLED"})

    def __init__(self):
        """Store the API keys and build the REST headers and LlamaParse client."""
        self.GPT_API_KEY = OPENAI_API_KEY
        self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
        self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
        self.headers = {
            'Accept': 'application/json',
            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
        }
        self.parser = LlamaParse(
            api_key=LLAMA_CLOUD_API_KEY,
            result_type="markdown",
            verbose=True,
            language="en",
            gpt4o_mode=False,
            # gpt4o_api_key=OPENAI_API_KEY,
            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
        )

    def parse(self, pdf_path):
        """Parse a PDF and return one ``Document`` per page.

        Args:
            pdf_path (str): Path of a local PDF. If no such file exists, the
                value is treated as a URL and downloaded to a temporary file.

        Returns:
            list[Document]: One document per page, with ``source`` and
            zero-based ``page`` metadata.

        Raises:
            ValueError: If the PDF cannot be downloaded, or if the parser
                returns no documents.
        """
        if not os.path.exists(pdf_path):
            # Not a local file: treat the argument as a URL and fetch a copy.
            downloaded_path = download_pdf_from_url(pdf_path)
            if downloaded_path is None:
                raise ValueError(f"Failed to download PDF from URL: {pdf_path}")
            # NOTE(review): from here on ``source`` metadata is the temporary
            # path, not the URL, and the temporary file is never removed.
            pdf_path = downloaded_path

        parsed = self.parser.load_data(pdf_path)
        if not parsed:
            # Fail with a clear message instead of an IndexError on ``[0]``.
            raise ValueError(f"LlamaParse returned no documents for: {pdf_path}")

        # Only the first returned document is used; its markdown is split
        # into pages on the separator.
        content = parsed[0].to_langchain_format().page_content

        return [
            Document(
                page_content=page.strip(),
                metadata={"source": pdf_path, "page": i}
            )
            for i, page in enumerate(content.split(self.PAGE_SEPARATOR))
        ]

    def make_request(self, pdf_url):
        """Download the PDF at ``pdf_url`` and upload it as a parsing job.

        Args:
            pdf_url (str): URL of the PDF to fetch and upload.

        Returns:
            tuple: The ``id`` and ``status`` fields of the upload response.

        Raises:
            requests.HTTPError: If the download or the upload returns an
                error status.
        """
        payload = {
            "gpt4o_mode": "false",
            "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
        }

        pdf_response = requests.get(pdf_url)
        # Without this check an HTML error page would be uploaded as the "PDF".
        pdf_response.raise_for_status()

        files = [
            ('file', ('file', pdf_response.content, 'application/octet-stream'))
        ]

        response = requests.request(
            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
        response.raise_for_status()

        job = response.json()  # parse the body once, not once per field
        return job['id'], job['status']

    async def get_result(self, job_id):
        """Return the ``markdown`` field of the result for job ``job_id``."""
        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"

        response = requests.request("GET", url, headers=self.headers, data={})

        return response.json()['markdown']

    async def _parse(self, pdf_path):
        """Submit ``pdf_path`` as a parsing job, wait for it, and wrap the result.

        ``pdf_path`` is handed to ``make_request``, which fetches it with
        ``requests.get``, so it has to be a URL.

        Returns:
            list[Document]: A single document holding the whole markdown.

        Raises:
            RuntimeError: If the job reports a status in ``FAILED_STATUSES``.
        """
        # Imported here so the module's import block does not need to change.
        import asyncio

        job_id, status = self.make_request(pdf_path)
        status_url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"

        while status != "SUCCESS":
            if status in self.FAILED_STATUSES:
                raise RuntimeError(
                    f"LlamaParse job {job_id} failed with status: {status}"
                )
            # Pause between polls instead of hammering the API in a tight loop.
            await asyncio.sleep(self.POLL_INTERVAL_SECONDS)
            response = requests.request(
                "GET", status_url, headers=self.headers, data={})
            status = response.json()["status"]

        result = await self.get_result(job_id)

        return [
            Document(
                page_content=result,
                metadata={"source": pdf_path}
            )
        ]
|
92 |
+
|