Spaces:

pratham0011
/

Smart_Search_Tool

Sleeping

File size: 6,798 Bytes

import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr
from groq import Groq
import os
from dotenv import load_dotenv

# Step 1: Scrape the free courses from Analytics Vidhya
url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
# Without a timeout a stalled connection would block start-up indefinitely.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

courses = []

# Extracting course title, image, and course link.
# Each card's <header> wraps the thumbnail <img>; the image's alt text is
# used as the course title and the closest preceding <a> supplies the link.
for course_card in soup.find_all('header', class_='course-card__img-container'):
    img_tag = course_card.find('img', class_='course-card__img')
    if not img_tag:
        continue

    link_tag = course_card.find_previous('a')
    # An anchor without an href yields None; skip such cards instead of
    # crashing on None.startswith(...).
    course_link = link_tag.get('href') if link_tag else None
    if not course_link:
        continue

    # Relative links are resolved against the catalog host.
    if not course_link.startswith('http'):
        course_link = 'https://courses.analyticsvidhya.com' + course_link

    courses.append({
        'title': img_tag.get('alt'),
        'image_url': img_tag.get('src'),
        'course_link': course_link,
    })

# Step 2: Create DataFrame
# Naming the columns explicitly keeps df['title'] etc. valid even when the
# scrape found no courses at all.
df = pd.DataFrame(courses, columns=['title', 'image_url', 'course_link'])


# Run dotenv loading first so that GROQ_API_KEY can be picked up from the
# process environment, then hand that key to the Groq client.
load_dotenv()
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Relevance score as emitted by the model: 0, 1, or a decimal such as 0.85 / 1.0.
_SCORE_PATTERN = r'([01](?:\.\d+)?)'

# Markdown layout: **Course Title** (Relevance Score: 0.9)
_MARKDOWN_RESULT = re.compile(
    r'\*\*(.+?)\*\*\s*\(Relevance Score:\s*' + _SCORE_PATTERN + r'\)'
)

# Layout requested by the prompt (optionally with markdown bold markers):
#   Title: Course Title
#   Relevance: 0.9
_PLAIN_RESULT = re.compile(
    r'Title:\**\s*(.+?)\s*\n\s*\**Relevance(?: Score)?:\**\s*' + _SCORE_PATTERN,
    re.IGNORECASE,
)


def _extract_scored_titles(reply):
    """Return ``(title, score)`` pairs parsed from the model's reply text."""
    pairs = []
    for pattern in (_MARKDOWN_RESULT, _PLAIN_RESULT):
        for raw_title, raw_score in pattern.findall(reply):
            # Drop markdown emphasis, literal "[...]" placeholders and quotes.
            title = raw_title.strip().strip('*[]"\' ').strip()
            if title:
                pairs.append((title, min(float(raw_score), 1.0)))
    return pairs


def search_courses(query):
    """Rank the scraped courses against *query* using the Groq chat model.

    The course titles held in the module-level ``df`` are sent to the model
    together with the query; the reply is parsed for title/score pairs and
    each title is mapped back to a row of ``df``.

    Args:
        query: Free-text search string entered by the user.

    Returns:
        A list of at most 10 dicts with keys ``title``, ``image_url``,
        ``course_link`` and ``score``, sorted by descending score. An empty
        list is returned for a blank query, an empty catalog, or when any
        error occurs (the error is printed, not raised).
    """
    try:
        # A blank query cannot be ranked; avoid a pointless API call.
        if query is None or not str(query).strip():
            print("Empty query received; skipping search")
            return []

        print(f"Searching for: {query}")
        print(f"Number of courses in database: {len(df)}")

        if df.empty:
            print("No courses available to search")
            return []

        # Prepare the prompt for Groq
        prompt = f"""Given the following query: "{query}"

        Please analyze the query and rank the following courses based on their relevance to the query. 

        Prioritize courses from Analytics Vidhya. Provide a relevance score from 0 to 1 for each course.

        Only return courses with a relevance score of 0.5 or higher.

        Return the results in the following format:

        Title: [Course Title]

        Relevance: [Score]

        

        Courses:

        {df['title'].to_string(index=False)}

        """

        print("Sending request to Groq...")
        # Get response from Groq
        response = client.chat.completions.create(
            model="llama-3.2-1b-preview",
            messages=[
                {"role": "system", "content": "You are an AI assistant specialized in course recommendations."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=1000
        )
        print("Received response from Groq")

        # The message content can be None; treat that as an empty reply.
        reply = response.choices[0].message.content or ""
        print("Groq response content:")
        print(reply)

        # Keyed by course link so a course mentioned twice appears only once,
        # keeping its highest score.
        best_by_link = {}
        for title, score in _extract_scored_titles(reply):
            if score < 0.5:
                continue

            # Match on a 30-character prefix, as a literal substring:
            # regex=False stops titles such as "C++" or "Intro (Free)" from
            # being interpreted as regular expressions.
            matching_courses = df[
                df['title'].str.contains(title[:30], case=False, na=False, regex=False)
            ]
            if matching_courses.empty:
                print(f"Warning: Course not found in database: {title}")
                continue

            course = matching_courses.iloc[0]
            link = course['course_link']
            previous = best_by_link.get(link)
            if previous is not None and previous['score'] >= score:
                continue

            best_by_link[link] = {
                'title': course['title'],  # Use the full title from the database
                'image_url': course['image_url'],
                'course_link': link,
                'score': score
            }
            print(f"Added course: {course['title']}")

        results = list(best_by_link.values())
        print(f"Number of results found: {len(results)}")
        return sorted(results, key=lambda x: x['score'], reverse=True)[:10]  # Return top 10 results

    except Exception as e:
        # Boundary handler: the UI should show "no results" rather than crash.
        print(f"An error occurred in search_courses: {str(e)}")
        return []
    
def gradio_search(query):
    """Render the search results for *query* as an HTML fragment.

    Args:
        query: Free-text search string passed straight to ``search_courses``.

    Returns:
        An HTML string: one ``course-card`` block per result wrapped in a
        ``results-container`` div, or a ``no-results`` paragraph when
        ``search_courses`` returns nothing.
    """
    # Imported locally so the function does not depend on a file-level import.
    from html import escape

    result_list = search_courses(query)
    if not result_list:
        return '<p class="no-results">No results found. Please try a different query.</p>'

    cards = []
    for item in result_list:
        # Titles and URLs come from a scraped page, so escape them before
        # embedding in markup; a stray quote or "<" would otherwise break the
        # attribute/element or inject HTML.
        course_title = escape(str(item['title']))
        course_image = escape(str(item['image_url']))
        course_link = escape(str(item['course_link']))
        relevance_score = round(item['score'] * 100, 2)

        cards.append(f'''
            <div class="course-card">
                <img src="{course_image}" alt="{course_title}" class="course-image"/>
                <div class="course-info">
                    <h3>{course_title}</h3>
                    <p>Relevance: {relevance_score}%</p>
                    <a href="{course_link}" target="_blank" class="course-link">View Course</a>
                </div>
            </div>''')

    return '<div class="results-container">' + ''.join(cards) + '</div>'

# Stylesheet handed to the Gradio interface through its ``css`` argument.
# The .results-container, .course-card, .course-image, .course-info,
# .course-link and .no-results rules style the class names emitted by
# gradio_search's HTML output.
custom_css = """

body {

    font-family: Arial, Helvetica, sans-serif;

    background-color: #f0f2f5;

}

.container {

    max-width: 600px;

    margin: 0 auto;

    padding: 20px;

}

.results-container {

    display: flex;

    flex-direction: column;

}

.course-card {

    background-color: white;

    border-radius: 8px;

    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);

    margin-bottom: 20px;

    overflow: hidden;

    width: 100%;

    transition: transform 0.2s;

}

.course-card:hover {

    transform: translateY(-5px);

}

.course-image {

    width: 100%;

    height: 200px;

    object-fit: cover;

}

.course-info {

    padding: 15px;

}

.course-info h3 {

    margin-top: 0;

    font-size: 18px;

    color: #333;

}

.course-info p {

    color: #666;

    font-size: 14px;

    margin-bottom: 10px;

}

.course-link {

    display: inline-block;

    background-color: #007bff;

    color: white;

    padding: 8px 12px;

    text-decoration: none;

    border-radius: 4px;

    font-size: 14px;

    transition: background-color 0.2s;

}

.course-link:hover {

    background-color: #0056b3;

}

.no-results {

    text-align: center;

    color: #666;

    font-style: italic;

}

"""
# Gradio interface: a single text box feeding gradio_search, whose HTML
# output is shown in an HTML component.
query_box = gr.Textbox(
    label="Enter your search query",
    placeholder="e.g., machine learning, data science, python",
)
results_view = gr.HTML(label="Search Results")

# Sample queries offered to the user, one input value per example row.
example_queries = [
    ["Tableau Course"],
    ["Machine Learning/Deep Learning with Python"],
    ["Business Analytics"],
]

iface = gr.Interface(
    fn=gradio_search,
    inputs=query_box,
    outputs=results_view,
    title="Analytics Vidhya Smart Search Tool🔍🌐",
    description="Find the most relevant courses from Analytics Vidhya Website based on your query.",
    theme="huggingface",
    css=custom_css,
    examples=example_queries,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(debug=True)