Farid Karimli committed on
Commit
229ace9
·
1 Parent(s): 39c29a9

HTML Reader fix and more changes

Browse files
code/modules/dataloader/data_loader.py CHANGED
@@ -27,6 +27,7 @@ import tempfile
27
  import PyPDF2
28
  from modules.dataloader.pdf_readers.base import PDFReader
29
  from modules.dataloader.pdf_readers.llama import LlamaParser
 
30
 
31
  try:
32
  from modules.dataloader.helpers import get_metadata, download_pdf_from_url
@@ -89,9 +90,12 @@ class FileReader:
89
  self.kind = kind
90
  if kind == "llama":
91
  self.pdf_reader = LlamaParser()
 
 
92
  else:
93
  self.pdf_reader = PDFReader()
94
  self.web_reader = HTMLReader()
 
95
 
96
 
97
  def extract_text_from_pdf(self, pdf_path):
@@ -130,8 +134,7 @@ class FileReader:
130
  return loader.load()
131
 
132
  def read_html(self, url: str):
133
- loader = WebBaseLoader(url)
134
- return loader.load()
135
 
136
  def read_tex_from_url(self, tex_url):
137
  response = requests.get(tex_url)
 
27
  import PyPDF2
28
  from modules.dataloader.pdf_readers.base import PDFReader
29
  from modules.dataloader.pdf_readers.llama import LlamaParser
30
+ from modules.dataloader.pdf_readers.gpt import GPTParser
31
 
32
  try:
33
  from modules.dataloader.helpers import get_metadata, download_pdf_from_url
 
90
  self.kind = kind
91
  if kind == "llama":
92
  self.pdf_reader = LlamaParser()
93
+ elif kind == "gpt":
94
+ self.pdf_reader = GPTParser()
95
  else:
96
  self.pdf_reader = PDFReader()
97
  self.web_reader = HTMLReader()
98
+ self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
99
 
100
 
101
  def extract_text_from_pdf(self, pdf_path):
 
134
  return loader.load()
135
 
136
  def read_html(self, url: str):
137
+ return [Document(page_content=self.web_reader.read_html(url))]
 
138
 
139
  def read_tex_from_url(self, tex_url):
140
  response = requests.get(tex_url)
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -2,6 +2,7 @@ import base64
2
  import os
3
  import requests
4
 
 
5
  from openai import OpenAI
6
  from pdf2image import convert_from_path
7
  from langchain.schema import Document
@@ -27,11 +28,8 @@ class GPTParser:
27
 
28
  def parse(self, pdf_path):
29
  images = convert_from_path(pdf_path)
30
- for i, image in enumerate(images):
31
- image.save(f'output/images/page{i}.jpg', 'JPEG')
32
 
33
- encoded_images = [self.encode_image(
34
- f'output/images/page{im}.jpg') for im in range(len(images))]
35
 
36
  chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
37
 
@@ -42,8 +40,6 @@ class GPTParser:
42
 
43
  output = ""
44
  for chunk_num, chunk in enumerate(chunks):
45
- print(f"Processing chunk {chunk_num + 1}/{len(chunks)})")
46
-
47
  content = [{"type": "image_url", "image_url": {
48
  "url": f"data:image/jpeg;base64,{image}"}} for image in chunk]
49
 
@@ -63,9 +59,8 @@ class GPTParser:
63
  "https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
64
 
65
  resp = response.json()
66
- print("Response", resp)
67
 
68
- chunk_output = resp['choices'][0]['message']['content']
69
 
70
  output += chunk_output + "\n---\n"
71
 
@@ -79,6 +74,7 @@ class GPTParser:
79
  ]
80
  return documents
81
 
82
- def encode_image(self, image_path):
83
- with open(image_path, "rb") as image_file:
84
- return base64.b64encode(image_file.read()).decode('utf-8')
 
 
2
  import os
3
  import requests
4
 
5
+ from io import BytesIO
6
  from openai import OpenAI
7
  from pdf2image import convert_from_path
8
  from langchain.schema import Document
 
28
 
29
  def parse(self, pdf_path):
30
  images = convert_from_path(pdf_path)
 
 
31
 
32
+ encoded_images = [self.encode_image(image) for image in images]
 
33
 
34
  chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
35
 
 
40
 
41
  output = ""
42
  for chunk_num, chunk in enumerate(chunks):
 
 
43
  content = [{"type": "image_url", "image_url": {
44
  "url": f"data:image/jpeg;base64,{image}"}} for image in chunk]
45
 
 
59
  "https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
60
 
61
  resp = response.json()
 
62
 
63
+ chunk_output = resp['choices'][0]['message']['content'].replace("```", "").replace("markdown", "").replace("````", "")
64
 
65
  output += chunk_output + "\n---\n"
66
 
 
74
  ]
75
  return documents
76
 
77
+ def encode_image(self, image):
78
+ buffered = BytesIO()
79
+ image.save(buffered, format="JPEG")
80
+ return base64.b64encode(buffered.getvalue()).decode('utf-8')