Nattyboi committed on
Commit
98b1ce4
·
1 Parent(s): 070c1bf

added stuff

Browse files
Files changed (3) hide show
  1. app.py +3 -1
  2. requirements.txt +2 -1
  3. scraper.py +35 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from fastapi import FastAPI, File, UploadFile
6
  from fastapi.responses import JSONResponse
7
  import docx
8
  import fitz
 
9
  import asyncio
10
  from google import genai
11
  from pydantic import BaseModel
@@ -57,7 +58,7 @@ def get_course(query):
57
  content_structure["Course_Title"]=title
58
  content_structure["Course_Link"]=link
59
  content_structure["Course_Snippet"]= snippet
60
-
61
  content.append(content_structure)
62
 
63
 
@@ -82,6 +83,7 @@ def get_course_func(query):
82
  content_structure["Course_Link"]=link
83
  content_structure["Course_Snippet"]= snippet
84
 
 
85
  content.append(content_structure)
86
 
87
 
 
6
  from fastapi.responses import JSONResponse
7
  import docx
8
  import fitz
9
+ from scraper import scrapeCourse
10
  import asyncio
11
  from google import genai
12
  from pydantic import BaseModel
 
58
  content_structure["Course_Title"]=title
59
  content_structure["Course_Link"]=link
60
  content_structure["Course_Snippet"]= snippet
61
+ content_structure["Scraped_Course_Details"]= scrapeCourse(url=link)
62
  content.append(content_structure)
63
 
64
 
 
83
  content_structure["Course_Link"]=link
84
  content_structure["Course_Snippet"]= snippet
85
 
86
+
87
  content.append(content_structure)
88
 
89
 
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pinecone
6
  sentence-transformers
7
  einops
8
  google-genai
9
- python-docx
 
 
6
  sentence-transformers
7
  einops
8
  google-genai
9
+ python-docx
10
+ beautifulsoup4
scraper.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
def scrapeCourse(url):
    """Scrape the course-details bullet list from a Udemy course landing page.

    Parameters
    ----------
    url : str
        The course landing-page URL to fetch.

    Returns
    -------
    list[str]
        The stripped text of each <li> element found under the hard-coded
        Udemy CSS selector. Always a list: an empty list is returned when
        the request fails, times out, or the selector matches nothing
        (previously those paths implicitly returned None, forcing callers
        to None-check the result).
    """
    # Local imports keep scraper deps out of module import time; they mirror
    # the original's function-scoped imports.
    import requests
    from bs4 import BeautifulSoup

    webcontent = []

    try:
        # timeout prevents a slow/unresponsive page from hanging the whole
        # API request that calls this per search result.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page: {exc}")
        return webcontent

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return webcontent

    # Parse the HTML content using BeautifulSoup.
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): this selector is Udemy-specific and contains a hashed
    # class name (styles--description--AfVWV) that will silently break when
    # Udemy redeploys their frontend — expect empty results then.
    selector = "#main-content-anchor > div.paid-course-landing-page__body > div > div.ud-text-sm.component-margin.styles--description--AfVWV > div > div > div > div:nth-child(1) > ul"
    matches = soup.select(selector)

    if not matches:
        print("No content found for the selector.")
        return webcontent

    # Extract the text of each bullet point in the first matched <ul>.
    for item in matches[0].find_all("li"):
        webcontent.append(item.get_text(strip=True))
    return webcontent