File size: 3,137 Bytes
0339679
 
 
 
229ace9
0339679
 
 
4fc2bf8
0339679
 
 
 
 
 
 
 
 
 
 
 
 
1e2550f
0339679
1e2550f
0339679
c26167a
0339679
 
 
 
 
229ace9
0339679
1e2550f
0339679
 
 
1e2550f
0339679
 
 
 
1e2550f
 
 
 
 
 
 
0339679
 
 
 
 
1e2550f
0339679
 
 
1e2550f
 
 
4fc2bf8
1e2550f
0339679
 
 
1e2550f
 
 
 
 
 
0339679
 
 
 
351c4c7
0339679
 
1e2550f
 
0339679
 
 
229ace9
 
 
1e2550f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import base64
import os
import re
from io import BytesIO

import requests
from langchain.schema import Document
from openai import OpenAI
from pdf2image import convert_from_path

from modules.config.constants import TIMEOUT


class GPTParser:
    """
    Parse PDFs into markdown ``Document`` objects with OpenAI's GPT-4o mini model.

    Every page of the PDF is rendered to an image, the images are sent to the
    chat completions endpoint in small batches together with ``self.prompt``,
    and the markdown that comes back is split on ``---`` separator lines into
    one ``Document`` per non-empty section.
    """

    # Chat completions endpoint that receives the page images.
    API_URL = "https://api.openai.com/v1/chat/completions"
    # Model named in every request payload.
    MODEL = "gpt-4o-mini"
    # Number of page images sent in a single request.
    PAGES_PER_REQUEST = 5
    # Separator line the prompt asks the model to emit between pages.
    PAGE_SEPARATOR = "\n---\n"
    # A run of three or more backticks, plus a directly attached "markdown"
    # language tag. Anchoring the tag to the fence keeps the word "markdown"
    # intact when it appears in the extracted slide text itself.
    _FENCE_RE = re.compile(r"`{3,}(?:markdown\b)?")

    def __init__(self):
        self.client = OpenAI()
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.prompt = """
         The provided documents are images of PDFs of lecture slides of deep learning material.
         They contain LaTeX equations, images, and text.
         The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
         The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
         For images, give a description and if you can, a source. Separate each page with '---'.
         Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
         """

    def parse(self, pdf_path):
        """
        Convert the PDF at ``pdf_path`` into a list of markdown ``Document`` objects.

        Args:
            pdf_path: Path of the PDF file to parse. It is also stored as the
                ``source`` metadata of every returned document.

        Returns:
            One ``Document`` per non-empty section of the model output, with
            ``page`` metadata set to the section's index in that output.

        Raises:
            requests.HTTPError: If the API answers with an error status code.
        """
        images = convert_from_path(pdf_path)
        encoded_images = [self.encode_image(image) for image in images]

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

        chunk_outputs = []
        step = self.PAGES_PER_REQUEST
        for start in range(0, len(encoded_images), step):
            # The prompt goes first, followed by this batch's page images.
            content = [{"type": "text", "text": self.prompt}]
            content.extend(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
                }
                for image in encoded_images[start : start + step]
            )

            payload = {
                "model": self.MODEL,
                "messages": [{"role": "user", "content": content}],
            }

            response = requests.post(
                self.API_URL,
                headers=headers,
                json=payload,
                timeout=TIMEOUT,
            )
            # Fail with the HTTP status instead of an opaque KeyError on the
            # missing "choices" key of an error payload.
            response.raise_for_status()

            message = response.json()["choices"][0]["message"]["content"]
            # Guard against a null message content before cleaning it.
            chunk_outputs.append(self._strip_code_fences(message or ""))

        pages = self._split_pages(chunk_outputs)
        return [
            Document(page_content=page, metadata={"source": pdf_path, "page": i})
            for i, page in enumerate(pages)
        ]

    @classmethod
    def _strip_code_fences(cls, text):
        """Remove code-fence markers (and an attached ``markdown`` tag) from ``text``."""
        return cls._FENCE_RE.sub("", text)

    @classmethod
    def _split_pages(cls, chunk_outputs):
        """Join the per-request outputs and split them into non-blank pages."""
        separator = cls.PAGE_SEPARATOR
        # Terminate every chunk with the separator so that pages from
        # consecutive requests never run together.
        combined = "".join(text + separator for text in chunk_outputs)
        return [page for page in combined.split(separator) if page.strip()]

    def encode_image(self, image):
        """
        Return ``image`` serialized as JPEG and encoded as a base64 string.

        Args:
            image: An object with a ``save(fp, format=...)`` method; on the
                fallback path it also needs a ``convert(mode)`` method.

        Returns:
            The base64 text of the JPEG bytes.
        """
        buffered = BytesIO()
        try:
            image.save(buffered, format="JPEG")
        except OSError:
            # Presumably the image mode cannot be written as JPEG (for
            # example RGBA or palette images); retry once as RGB with a fresh
            # buffer so no partial output leaks into the result.
            buffered = BytesIO()
            image.convert("RGB").save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")