XThomasBU
committed on
Commit
·
4fc2bf8
1
Parent(s):
1ef2150
added timeout
Browse files
code/modules/config/constants.py
CHANGED
@@ -3,6 +3,8 @@ import os
|
|
3 |
|
4 |
load_dotenv()
|
5 |
|
|
|
|
|
6 |
# API Keys - Loaded from the .env file
|
7 |
|
8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
|
3 |
|
4 |
load_dotenv()
|
5 |
|
6 |
+
TIMEOUT = 60
|
7 |
+
|
8 |
# API Keys - Loaded from the .env file
|
9 |
|
10 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -22,6 +22,7 @@ from modules.dataloader.pdf_readers.base import PDFReader
|
|
22 |
from modules.dataloader.pdf_readers.llama import LlamaParser
|
23 |
from modules.dataloader.pdf_readers.gpt import GPTParser
|
24 |
from modules.dataloader.helpers import get_metadata
|
|
|
25 |
|
26 |
logger = logging.getLogger(__name__)
|
27 |
BASE_DIR = os.getcwd()
|
@@ -32,7 +33,7 @@ class HTMLReader:
|
|
32 |
pass
|
33 |
|
34 |
def read_url(self, url):
|
35 |
-
response = requests.get(url)
|
36 |
if response.status_code == 200:
|
37 |
return response.text
|
38 |
else:
|
@@ -52,7 +53,7 @@ class HTMLReader:
|
|
52 |
absolute_url = urljoin(base_url, href)
|
53 |
link["href"] = absolute_url
|
54 |
|
55 |
-
resp = requests.head(absolute_url)
|
56 |
if resp.status_code != 200:
|
57 |
logger.warning(
|
58 |
f"Link {absolute_url} is broken. Status code: {resp.status_code}"
|
@@ -127,7 +128,7 @@ class FileReader:
|
|
127 |
return [Document(page_content=self.web_reader.read_html(url))]
|
128 |
|
129 |
def read_tex_from_url(self, tex_url):
|
130 |
-
response = requests.get(tex_url)
|
131 |
if response.status_code == 200:
|
132 |
return [Document(page_content=response.text)]
|
133 |
else:
|
|
|
22 |
from modules.dataloader.pdf_readers.llama import LlamaParser
|
23 |
from modules.dataloader.pdf_readers.gpt import GPTParser
|
24 |
from modules.dataloader.helpers import get_metadata
|
25 |
+
from modules.config.constants import TIMEOUT
|
26 |
|
27 |
logger = logging.getLogger(__name__)
|
28 |
BASE_DIR = os.getcwd()
|
|
|
33 |
pass
|
34 |
|
35 |
def read_url(self, url):
|
36 |
+
response = requests.get(url, timeout=TIMEOUT)
|
37 |
if response.status_code == 200:
|
38 |
return response.text
|
39 |
else:
|
|
|
53 |
absolute_url = urljoin(base_url, href)
|
54 |
link["href"] = absolute_url
|
55 |
|
56 |
+
resp = requests.head(absolute_url, timeout=TIMEOUT)
|
57 |
if resp.status_code != 200:
|
58 |
logger.warning(
|
59 |
f"Link {absolute_url} is broken. Status code: {resp.status_code}"
|
|
|
128 |
return [Document(page_content=self.web_reader.read_html(url))]
|
129 |
|
130 |
def read_tex_from_url(self, tex_url):
|
131 |
+
response = requests.get(tex_url, timeout=TIMEOUT)
|
132 |
if response.status_code == 200:
|
133 |
return [Document(page_content=response.text)]
|
134 |
else:
|
code/modules/dataloader/helpers.py
CHANGED
@@ -2,6 +2,7 @@ import requests
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
from urllib.parse import urlparse
|
4 |
import tempfile
|
|
|
5 |
|
6 |
|
7 |
def get_urls_from_file(file_path: str):
|
@@ -27,11 +28,11 @@ def get_metadata(lectures_url, schedule_url):
|
|
27 |
lecture_metadata = {}
|
28 |
|
29 |
# Get the main lectures page content
|
30 |
-
r_lectures = requests.get(lectures_url)
|
31 |
soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
|
32 |
|
33 |
# Get the main schedule page content
|
34 |
-
r_schedule = requests.get(schedule_url)
|
35 |
soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
|
36 |
|
37 |
# Find all lecture blocks
|
@@ -119,7 +120,7 @@ def download_pdf_from_url(pdf_url):
|
|
119 |
Returns:
|
120 |
str: The local file path of the downloaded PDF file.
|
121 |
"""
|
122 |
-
response = requests.get(pdf_url)
|
123 |
if response.status_code == 200:
|
124 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
125 |
temp_file.write(response.content)
|
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
from urllib.parse import urlparse
|
4 |
import tempfile
|
5 |
+
from modules.config.constants import TIMEOUT
|
6 |
|
7 |
|
8 |
def get_urls_from_file(file_path: str):
|
|
|
28 |
lecture_metadata = {}
|
29 |
|
30 |
# Get the main lectures page content
|
31 |
+
r_lectures = requests.get(lectures_url, timeout=TIMEOUT)
|
32 |
soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
|
33 |
|
34 |
# Get the main schedule page content
|
35 |
+
r_schedule = requests.get(schedule_url, timeout=TIMEOUT)
|
36 |
soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
|
37 |
|
38 |
# Find all lecture blocks
|
|
|
120 |
Returns:
|
121 |
str: The local file path of the downloaded PDF file.
|
122 |
"""
|
123 |
+
response = requests.get(pdf_url, timeout=TIMEOUT)
|
124 |
if response.status_code == 200:
|
125 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
126 |
temp_file.write(response.content)
|
code/modules/dataloader/pdf_readers/gpt.py
CHANGED
@@ -6,6 +6,7 @@ from io import BytesIO
|
|
6 |
from openai import OpenAI
|
7 |
from pdf2image import convert_from_path
|
8 |
from langchain.schema import Document
|
|
|
9 |
|
10 |
|
11 |
class GPTParser:
|
@@ -59,6 +60,7 @@ class GPTParser:
|
|
59 |
"https://api.openai.com/v1/chat/completions",
|
60 |
headers=headers,
|
61 |
json=payload,
|
|
|
62 |
)
|
63 |
|
64 |
resp = response.json()
|
|
|
6 |
from openai import OpenAI
|
7 |
from pdf2image import convert_from_path
|
8 |
from langchain.schema import Document
|
9 |
+
from modules.config.constants import TIMEOUT
|
10 |
|
11 |
|
12 |
class GPTParser:
|
|
|
60 |
"https://api.openai.com/v1/chat/completions",
|
61 |
headers=headers,
|
62 |
json=payload,
|
63 |
+
timeout=TIMEOUT,
|
64 |
)
|
65 |
|
66 |
resp = response.json()
|
code/modules/dataloader/pdf_readers/llama.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import requests
|
3 |
from llama_parse import LlamaParse
|
4 |
from langchain.schema import Document
|
5 |
-
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
6 |
from modules.dataloader.helpers import download_pdf_from_url
|
7 |
|
8 |
|
@@ -52,7 +52,11 @@ class LlamaParser:
|
|
52 |
files = [
|
53 |
(
|
54 |
"file",
|
55 |
-
(
|
|
|
|
|
|
|
|
|
56 |
)
|
57 |
]
|
58 |
|
|
|
2 |
import requests
|
3 |
from llama_parse import LlamaParse
|
4 |
from langchain.schema import Document
|
5 |
+
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY, TIMEOUT
|
6 |
from modules.dataloader.helpers import download_pdf_from_url
|
7 |
|
8 |
|
|
|
52 |
files = [
|
53 |
(
|
54 |
"file",
|
55 |
+
(
|
56 |
+
"file",
|
57 |
+
requests.get(pdf_url, timeout=TIMEOUT).content,
|
58 |
+
"application/octet-stream",
|
59 |
+
),
|
60 |
)
|
61 |
]
|
62 |
|
code/modules/dataloader/webpage_crawler.py
CHANGED
@@ -4,6 +4,7 @@ import asyncio
|
|
4 |
import requests
|
5 |
from bs4 import BeautifulSoup
|
6 |
from urllib.parse import urljoin, urldefrag
|
|
|
7 |
|
8 |
|
9 |
class WebpageCrawler:
|
@@ -19,7 +20,7 @@ class WebpageCrawler:
|
|
19 |
|
20 |
def url_exists(self, url: str) -> bool:
|
21 |
try:
|
22 |
-
response = requests.head(url)
|
23 |
return response.status_code == 200
|
24 |
except requests.ConnectionError:
|
25 |
return False
|
@@ -89,7 +90,7 @@ class WebpageCrawler:
|
|
89 |
|
90 |
def is_webpage(self, url: str) -> bool:
|
91 |
try:
|
92 |
-
response = requests.head(url, allow_redirects=True)
|
93 |
content_type = response.headers.get("Content-Type", "").lower()
|
94 |
return "text/html" in content_type
|
95 |
except requests.RequestException:
|
|
|
4 |
import requests
|
5 |
from bs4 import BeautifulSoup
|
6 |
from urllib.parse import urljoin, urldefrag
|
7 |
+
from modules.config.constants import TIMEOUT
|
8 |
|
9 |
|
10 |
class WebpageCrawler:
|
|
|
20 |
|
21 |
def url_exists(self, url: str) -> bool:
|
22 |
try:
|
23 |
+
response = requests.head(url, timeout=TIMEOUT)
|
24 |
return response.status_code == 200
|
25 |
except requests.ConnectionError:
|
26 |
return False
|
|
|
90 |
|
91 |
def is_webpage(self, url: str) -> bool:
|
92 |
try:
|
93 |
+
response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
|
94 |
content_type = response.headers.get("Content-Type", "").lower()
|
95 |
return "text/html" in content_type
|
96 |
except requests.RequestException:
|