added stuff
- app.py +3 -1
- requirements.txt +2 -1
- scraper.py +35 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from fastapi import FastAPI, File, UploadFile
 from fastapi.responses import JSONResponse
 import docx
 import fitz
+from scraper import scrapeCourse
 import asyncio
 from google import genai
 from pydantic import BaseModel
@@ -57,7 +58,7 @@ def get_course(query):
     content_structure["Course_Title"]=title
     content_structure["Course_Link"]=link
     content_structure["Course_Snippet"]= snippet
-
+    content_structure["Scraped_Course_Details"]= scrapeCourse(url=link)
     content.append(content_structure)
 
 
@@ -82,6 +83,7 @@ def get_course_func(query):
     content_structure["Course_Link"]=link
     content_structure["Course_Snippet"]= snippet
 
+
     content.append(content_structure)
 
 
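For context: the new Scraped_Course_Details field is only wired into get_course; get_course_func merely gains a blank line. A minimal sketch of the entry each iteration now builds, assuming title, link, and snippet come from the surrounding search-result loop that this diff does not show (build_entry is a hypothetical helper, not code from the commit):

# Sketch of the enriched result entry built inside get_course.
# title/link/snippet are assumed to come from the search loop above
# the hunk, which is not visible in this diff.
from scraper import scrapeCourse

def build_entry(title, link, snippet):
    return {
        "Course_Title": title,
        "Course_Link": link,
        "Course_Snippet": snippet,
        # New in this commit; may be None if the request or selector fails.
        "Scraped_Course_Details": scrapeCourse(url=link),
    }

Worth noting: scrapeCourse issues one blocking requests.get per result, so get_course's latency now grows linearly with the number of results returned.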
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pinecone
 sentence-transformers
 einops
 google-genai
-python-docx
+python-docx
+beautifulsoup4
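One gap in the dependency list: scraper.py also imports requests, which this commit does not pin. It usually arrives as a transitive dependency of the packages already listed, but that is an assumption worth verifying at startup. A small sketch (not part of the commit):

# Sketch: fail fast if scraper.py's runtime imports are missing.
# 'bs4' is the import name for beautifulsoup4; 'requests' is assumed
# to arrive transitively since requirements.txt does not pin it.
import importlib.util

for pkg in ("requests", "bs4"):
    if importlib.util.find_spec(pkg) is None:
        raise ImportError(f"scraper.py needs '{pkg}'; add it to requirements.txt")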
scraper.py ADDED
@@ -0,0 +1,35 @@
+
+
+def scrapeCourse(url):
+    import requests
+    from bs4 import BeautifulSoup
+    webcontent=[]
+
+    # URL of the page you want to scrape
+
+
+    # Send a GET request to fetch the raw HTML content
+    response = requests.get(url)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        # Parse the HTML content using BeautifulSoup
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find the content based on the given CSS selector
+        selector = "#main-content-anchor > div.paid-course-landing-page__body > div > div.ud-text-sm.component-margin.styles--description--AfVWV > div > div > div > div:nth-child(1) > ul"
+        content = soup.select(selector)
+
+        # Check if any elements are found
+        if content:
+            # Extract the text content from the first matched element
+            for item in content[0].find_all('li'):  # Assuming the list items <li> are the ones you're interested in
+                # print(item.get_text(strip=True))
+
+                webcontent.append(item.get_text(strip=True))
+            return webcontent
+        else:
+            print("No content found for the selector.")
+    else:
+        print(f"Failed to retrieve the page. Status code: {response.status_code}")
+
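A usage sketch for the new function (the URL is hypothetical). Two caveats follow from the code above: both failure branches print a message and implicitly return None rather than raising, and the CSS selector hard-codes a hashed Udemy class (styles--description--AfVWV) that can change with any frontend deploy, so callers should treat the result as optional:

from scraper import scrapeCourse

# Hypothetical course URL; the selector targets Udemy's landing-page markup,
# so other sites will simply miss and return None.
details = scrapeCourse(url="https://www.udemy.com/course/some-course/")

if details:  # None when the request or selector fails, [] when the <ul> has no <li>
    for bullet in details:
        print("-", bullet)
else:
    print("No scraped details available; fall back to Course_Snippet.")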