added stuff
- app.py +3 -1
- requirements.txt +2 -1
- scraper.py +35 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from fastapi import FastAPI, File, UploadFile
 from fastapi.responses import JSONResponse
 import docx
 import fitz
+from scraper import scrapeCourse
 import asyncio
 from google import genai
 from pydantic import BaseModel
@@ -57,7 +58,7 @@ def get_course(query):
     content_structure["Course_Title"]=title
     content_structure["Course_Link"]=link
     content_structure["Course_Snippet"]= snippet
-
+    content_structure["Scraped_Course_Details"]= scrapeCourse(url=link)
     content.append(content_structure)
 
 
@@ -82,6 +83,7 @@ def get_course_func(query):
     content_structure["Course_Link"]=link
     content_structure["Course_Snippet"]= snippet
 
+
     content.append(content_structure)
 
 
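For context: the new Scraped_Course_Details field is only wired into get_course; get_course_func merely gains a blank line. A minimal sketch of the entry each iteration now builds, assuming title, link, and snippet come from the surrounding search-result loop that this diff does not show (build_entry is a hypothetical helper, not code from the commit):

# Sketch of the enriched result entry built inside get_course.
# title/link/snippet are assumed to come from the search loop above
# the hunk, which is not visible in this diff.
from scraper import scrapeCourse

def build_entry(title, link, snippet):
    return {
        "Course_Title": title,
        "Course_Link": link,
        "Course_Snippet": snippet,
        # New in this commit; may be None if the request or selector fails.
        "Scraped_Course_Details": scrapeCourse(url=link),
    }

Worth noting: scrapeCourse issues one blocking requests.get per result, so get_course's latency now grows linearly with the number of results returned.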
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pinecone
 sentence-transformers
 einops
 google-genai
-python-docx
+python-docx
+beautifulsoup4
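One gap in the dependency list: scraper.py also imports requests, which this commit does not pin. It usually arrives as a transitive dependency of the packages already listed, but that is an assumption worth verifying at startup. A small sketch (not part of the commit):

# Sketch: fail fast if scraper.py's runtime imports are missing.
# 'bs4' is the import name for beautifulsoup4; 'requests' is assumed
# to arrive transitively since requirements.txt does not pin it.
import importlib.util

for pkg in ("requests", "bs4"):
    if importlib.util.find_spec(pkg) is None:
        raise ImportError(f"scraper.py needs '{pkg}'; add it to requirements.txt")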
scraper.py ADDED
@@ -0,0 +1,35 @@
+
+
+def scrapeCourse(url):
+    import requests
+    from bs4 import BeautifulSoup
+    webcontent=[]
+
+    # URL of the page you want to scrape
+
+
+    # Send a GET request to fetch the raw HTML content
+    response = requests.get(url)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        # Parse the HTML content using BeautifulSoup
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find the content based on the given CSS selector
+        selector = "#main-content-anchor > div.paid-course-landing-page__body > div > div.ud-text-sm.component-margin.styles--description--AfVWV > div > div > div > div:nth-child(1) > ul"
+        content = soup.select(selector)
+
+        # Check if any elements are found
+        if content:
+            # Extract the text content from the first matched element
+            for item in content[0].find_all('li'):  # Assuming the list items <li> are the ones you're interested in
+                # print(item.get_text(strip=True))
+
+                webcontent.append(item.get_text(strip=True))
+            return webcontent
+        else:
+            print("No content found for the selector.")
+    else:
+        print(f"Failed to retrieve the page. Status code: {response.status_code}")
+
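A usage sketch for the new function (the URL is hypothetical). Two caveats follow from the code above: both failure branches print a message and implicitly return None rather than raising, and the CSS selector hard-codes a hashed Udemy class (styles--description--AfVWV) that can change with any frontend deploy, so callers should treat the result as optional:

from scraper import scrapeCourse

# Hypothetical course URL; the selector targets Udemy's landing-page markup,
# so other sites will simply miss and return None.
details = scrapeCourse(url="https://www.udemy.com/course/some-course/")

if details:  # None when the request or selector fails, [] when the <ul> has no <li>
    for bullet in details:
        print("-", bullet)
else:
    print("No scraped details available; fall back to Course_Snippet.")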