"""Smart search over Analytics Vidhya's free courses.

The module scrapes the public free-course catalogue once at start-up, asks a
Groq-hosted LLM to rank the course titles against a user query, and renders
the matching courses as HTML cards inside a Gradio interface.
"""

import html
import logging
import os
import re
from urllib.parse import urljoin

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from groq import Groq

# Load .env before any os.getenv() call below so local overrides are honoured.
load_dotenv()

logger = logging.getLogger(__name__)

COURSES_URL = "https://courses.analyticsvidhya.com/pages/all-free-courses"
REQUEST_TIMEOUT = 30  # seconds; requests waits forever without an explicit timeout
COURSE_COLUMNS = ["title", "image_url", "course_link"]
MIN_RELEVANCE = 0.5
MAX_RESULTS = 10
# Overridable through the environment so the model can be swapped without a
# code change (the default is the value this app has always used).
GROQ_MODEL = os.getenv("GROQ_MODEL", "llama-3.2-1b-preview")

# A relevance score between 0 and 1, with or without a fractional part.
_SCORE = r"([01](?:\.\d+)?)"
# Markdown style some models answer with: **Title** (Relevance Score: 0.9)
_BOLD_PATTERN = re.compile(r"\*\*(.+?)\*\*\s*\(Relevance Score:\s*" + _SCORE + r"\)")
# The style the prompt actually requests: Title: ... / Relevance: 0.9
_PLAIN_PATTERN = re.compile(
    r"Title:\s*([^\n]+?)\s*Relevance(?:\s+Score)?:\s*" + _SCORE + r"\b",
    re.IGNORECASE,
)

NO_RESULTS_HTML = (
    '<div class="no-results">'
    "<p>No results found. Please try a different query.</p>"
    "</div>"
)


def scrape_courses(page_url=COURSES_URL, timeout=REQUEST_TIMEOUT):
    """Return the courses listed on *page_url* as a list of dicts.

    Each dict has the keys ``title``, ``image_url`` and ``course_link``.
    Cards without an image tag or without a usable link are skipped. A
    network or HTTP failure is logged and yields an empty list so the UI can
    still start.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Could not fetch the course catalogue from %s: %s", page_url, exc)
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    found = []
    # Extracting course title, image, and course link
    for card in soup.find_all("header", class_="course-card__img-container"):
        img_tag = card.find("img", class_="course-card__img")
        link_tag = card.find_previous("a")
        if img_tag is None or link_tag is None:
            continue
        href = link_tag.get("href")
        if not href:
            # An anchor without href cannot be turned into a course link.
            continue
        found.append(
            {
                "title": img_tag.get("alt"),
                "image_url": img_tag.get("src"),
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the catalogue page.
                "course_link": urljoin(page_url, href),
            }
        )
    return found


def _build_prompt(query):
    """Build the ranking prompt sent to the LLM for *query*."""
    return f"""Given the following query: "{query}"
Please analyze the query and rank the following courses based on their relevance to the query.
Prioritize courses from Analytics Vidhya.
Provide a relevance score from 0 to 1 for each course.
Only return courses with a relevance score of 0.5 or higher.
Return the results in the following format:
Title: [Course Title]
Relevance: [Score]

Courses:
{df['title'].to_string(index=False)}
"""


def _parse_ranked_titles(text):
    """Extract ``(title, score)`` pairs from the model's answer.

    Both the plain ``Title: ... Relevance: ...`` layout requested by the
    prompt and the markdown ``**Title** (Relevance Score: ...)`` layout are
    recognised. Scores are clamped to 1.0; entries with an empty title are
    dropped.
    """
    pairs = []
    for pattern in (_BOLD_PATTERN, _PLAIN_PATTERN):
        for raw_title, raw_score in pattern.findall(text):
            # Strip markdown/bracket decoration the model may wrap titles in.
            title = raw_title.strip().strip("*[]\"'-| ")
            if not title:
                # An empty needle would match every row of the catalogue.
                continue
            pairs.append((title, min(float(raw_score), 1.0)))
    return pairs


def search_courses(query):
    """Rank the scraped courses against *query* using the Groq LLM.

    Returns a list of at most ``MAX_RESULTS`` dicts with the keys ``title``,
    ``image_url``, ``course_link`` and ``score``, sorted by descending score.
    Only courses scored ``MIN_RELEVANCE`` or higher that can be matched back
    to the scraped catalogue are included. Any failure while talking to Groq
    is logged and results in an empty list.
    """
    logger.info("Searching for: %s", query)
    logger.info("Number of courses in database: %d", len(df))
    if df.empty:
        logger.warning("Course catalogue is empty; nothing to search.")
        return []

    try:
        logger.info("Sending request to Groq...")
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI assistant specialized in course recommendations.",
                },
                {"role": "user", "content": _build_prompt(query)},
            ],
            temperature=0.2,
            max_tokens=1000,
        )
        content = completion.choices[0].message.content or ""
    except Exception:
        # UI boundary: a failed LLM call must not take the interface down.
        logger.exception("An error occurred in search_courses")
        return []

    logger.info("Groq response content:\n%s", content)

    results = []
    seen_links = set()
    for title, score in _parse_ranked_titles(content):
        if score < MIN_RELEVANCE:
            continue
        # regex=False: titles such as "C++" or "(Beginner)" are literal text.
        hits = df[
            df["title"].str.contains(title[:30], case=False, na=False, regex=False)
        ]
        if hits.empty:
            logger.warning("Course not found in database: %s", title)
            continue
        course = hits.iloc[0]
        if course["course_link"] in seen_links:
            # The model may list the same course more than once.
            continue
        seen_links.add(course["course_link"])
        results.append(
            {
                "title": course["title"],  # Use the full title from the database
                "image_url": course["image_url"],
                "course_link": course["course_link"],
                "score": score,
            }
        )
        logger.info("Added course: %s", course["title"])

    logger.info("Number of results found: %d", len(results))
    results.sort(key=lambda item: item["score"], reverse=True)
    return results[:MAX_RESULTS]


def _render_card(item):
    """Render one search result as an HTML card styled by ``custom_css``."""
    # Scraped values are untrusted text: escape before embedding in HTML.
    title = html.escape(str(item["title"] or ""))
    image = html.escape(str(item["image_url"] or ""), quote=True)
    link = html.escape(str(item["course_link"] or ""), quote=True)
    relevance = round(item["score"] * 100, 2)
    return (
        '<div class="course-card">'
        f'<img src="{image}" alt="{title}" class="course-image">'
        '<div class="course-info">'
        f"<h3>{title}</h3>"
        f"<p>Relevance: {relevance}%</p>"
        f'<a href="{link}" target="_blank" rel="noopener noreferrer" '
        'class="course-link">View Course</a>'
        "</div>"
        "</div>"
    )


def gradio_search(query):
    """Gradio callback: return the HTML for the results of *query*.

    A blank query or a search with no matches yields the "no results"
    message.
    """
    if not query or not query.strip():
        return NO_RESULTS_HTML
    result_list = search_courses(query.strip())
    if not result_list:
        return NO_RESULTS_HTML
    cards = "".join(_render_card(item) for item in result_list)
    return f'<div class="results-container">{cards}</div>'


# Step 1: Scrape the free courses from Analytics Vidhya
url = COURSES_URL
courses = scrape_courses(url)

# Step 2: Create DataFrame
# Explicit columns keep df['title'] valid even when nothing was scraped.
df = pd.DataFrame(courses, columns=COURSE_COLUMNS)

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

custom_css = """
body {
    font-family: Arial, Helvetica, sans-serif;
    background-color: #f0f2f5;
}
.container {
    max-width: 600px;
    margin: 0 auto;
    padding: 20px;
}
.results-container {
    display: flex;
    flex-direction: column;
}
.course-card {
    background-color: white;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
    margin-bottom: 20px;
    overflow: hidden;
    width: 100%;
    transition: transform 0.2s;
}
.course-card:hover {
    transform: translateY(-5px);
}
.course-image {
    width: 100%;
    height: 200px;
    object-fit: cover;
}
.course-info {
    padding: 15px;
}
.course-info h3 {
    margin-top: 0;
    font-size: 18px;
    color: #333;
}
.course-info p {
    color: #666;
    font-size: 14px;
    margin-bottom: 10px;
}
.course-link {
    display: inline-block;
    background-color: #007bff;
    color: white;
    padding: 8px 12px;
    text-decoration: none;
    border-radius: 4px;
    font-size: 14px;
    transition: background-color 0.2s;
}
.course-link:hover {
    background-color: #0056b3;
}
.no-results {
    text-align: center;
    color: #666;
    font-style: italic;
}
"""

# Gradio interface
iface = gr.Interface(
    fn=gradio_search,
    inputs=gr.Textbox(
        label="Enter your search query",
        placeholder="e.g., machine learning, data science, python",
    ),
    outputs=gr.HTML(label="Search Results"),
    title="Analytics Vidhya Smart Search Tool🔍🌐",
    description="Find the most relevant courses from Analytics Vidhya Website based on your query.",
    theme="huggingface",
    css=custom_css,
    examples=[
        ["Tableau Course"],
        ["Machine Learning/Deep Learning with Python"],
        ["Business Analytics"],
    ],
)

if __name__ == "__main__":
    # Surface the progress messages that search_courses logs at INFO level.
    logging.basicConfig(level=logging.INFO)
    iface.launch(debug=True)