Spaces:
Sleeping
Sleeping
UPDATE: ThreadPoolExecutor
Browse files
- functions.py +9 -5
- requirements.txt +1 -0
functions.py
CHANGED
@@ -23,6 +23,7 @@ from langchain_groq import ChatGroq
|
|
23 |
from pdf2image import convert_from_bytes
|
24 |
import numpy as np
|
25 |
import easyocr
|
|
|
26 |
from bs4 import BeautifulSoup
|
27 |
from urllib.parse import urlparse, urljoin
|
28 |
from supabase import create_client
|
@@ -291,12 +292,15 @@ def getLinks(url: str, timeout = 30):
|
|
291 |
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
292 |
|
293 |
|
|
|
|
|
|
|
|
|
294 |
def getTextFromImagePDF(pdfBytes):
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
return text
|
300 |
|
301 |
def getTranscript(urls: str):
|
302 |
urls = urls.split(",")
|
|
|
23 |
from pdf2image import convert_from_bytes
|
24 |
import numpy as np
|
25 |
import easyocr
|
26 |
+
from concurrent.futures import ThreadPoolExecutor
|
27 |
from bs4 import BeautifulSoup
|
28 |
from urllib.parse import urlparse, urljoin
|
29 |
from supabase import create_client
|
|
|
292 |
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
293 |
|
294 |
|
295 |
+
def getText(image):
    """Run OCR on one image and return the recognized text as a string.

    The image is resized to a fixed 500x500 size, converted to a NumPy
    array and handed to the module-level ``reader`` with ``paragraph=True``.
    The element at index 1 of every result entry is taken as its text, and
    the entries are joined with single newlines.

    NOTE(review): ``reader`` is defined elsewhere in this file, presumably
    an ``easyocr.Reader``; ``image`` is presumably a PIL image as produced
    by ``pdf2image``. Confirm against the rest of the module.
    """
    pixels = np.array(image.resize((500, 500)))
    detections = reader.readtext(pixels, paragraph=True)
    return "\n".join(entry[1] for entry in detections)
|
298 |
+
|
299 |
def getTextFromImagePDF(pdfBytes):
    """Extract text from a PDF by running ``getText`` on each rendered page.

    ``pdfBytes`` is passed to ``convert_from_bytes`` to obtain the page
    images. ``getText`` is then mapped over those images on a
    ``ThreadPoolExecutor`` with default settings. ``Executor.map`` yields
    results in input order, so the texts stay in the order the images were
    returned. The per-image texts are joined with three newlines.
    """
    pages = convert_from_bytes(pdfBytes)
    pageSeparator = "\n\n\n"
    with ThreadPoolExecutor() as pool:
        # join() consumes the whole map iterator before the pool shuts down.
        return pageSeparator.join(pool.map(getText, pages))
|
|
|
304 |
|
305 |
def getTranscript(urls: str):
|
306 |
urls = urls.split(",")
|
requirements.txt
CHANGED
@@ -80,6 +80,7 @@ langchain-groq
|
|
80 |
lxml
|
81 |
PyPDF2
|
82 |
python-dotenv
|
|
|
83 |
pandas
|
84 |
sentence-transformers
|
85 |
supabase
|
|
|
80 |
lxml
|
81 |
PyPDF2
|
82 |
python-dotenv
|
83 |
+
pillow
|
84 |
pandas
|
85 |
sentence-transformers
|
86 |
supabase
|