Rauhan committed on
Commit
e6a7560
·
1 Parent(s): adad220

UPDATE: ThreadPoolExecutor

Browse files
Files changed (2) hide show
  1. functions.py +9 -5
  2. requirements.txt +1 -0
functions.py CHANGED
@@ -23,6 +23,7 @@ from langchain_groq import ChatGroq
23
  from pdf2image import convert_from_bytes
24
  import numpy as np
25
  import easyocr
 
26
  from bs4 import BeautifulSoup
27
  from urllib.parse import urlparse, urljoin
28
  from supabase import create_client
@@ -291,12 +292,15 @@ def getLinks(url: str, timeout = 30):
291
  return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
292
 
293
 
 
 
 
 
294
  def getTextFromImagePDF(pdfBytes):
295
- global reader
296
- allImages = convert_from_bytes(pdfBytes)
297
- allImages = [np.array(image) for image in allImages]
298
- text = "\n\n\n".join(["\n".join([text[1] for text in reader.readtext(image, paragraph=True)]) for image in allImages])
299
- return text
300
 
301
  def getTranscript(urls: str):
302
  urls = urls.split(",")
 
23
  from pdf2image import convert_from_bytes
24
  import numpy as np
25
  import easyocr
26
+ from concurrent.futures import ThreadPoolExecutor
27
  from bs4 import BeautifulSoup
28
  from urllib.parse import urlparse, urljoin
29
  from supabase import create_client
 
292
  return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
293
 
294
 
295
+ def getText(image):
296
+ global reader
297
+ return "\n".join([text[1] for text in reader.readtext(np.array(image.resize((500, 500))), paragraph=True)])
298
+
299
  def getTextFromImagePDF(pdfBytes):
300
+ allImages = convert_from_bytes(pdfBytes)
301
+ with ThreadPoolExecutor() as p:
302
+ texts = list(p.map(getText, allImages))
303
+ return "\n\n\n".join(texts)
 
304
 
305
  def getTranscript(urls: str):
306
  urls = urls.split(",")
requirements.txt CHANGED
@@ -80,6 +80,7 @@ langchain-groq
80
  lxml
81
  PyPDF2
82
  python-dotenv
 
83
  pandas
84
  sentence-transformers
85
  supabase
 
80
  lxml
81
  PyPDF2
82
  python-dotenv
83
+ pillow
84
  pandas
85
  sentence-transformers
86
  supabase