"""Scrape AIMS-related websites (HTML + linked PDFs), index the text in a
Chroma vector store, and serve a streaming RAG chatbot via Gradio.

Pipeline (runs only when executed as a script):
    scrape -> clean -> chunk -> embed/index -> retrieve -> prompt -> stream LLM
"""

import os
import time  # retained from original file (currently unused)
from io import BytesIO
from typing import Iterator
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from openai import OpenAI

# NOTE(review): several of these langchain imports are unused in this file
# (ChatGroq, ResponseSchema, StructuredOutputParser, StrOutputParser,
# RunnablePassthrough, ChatPromptTemplate); retained in case other tooling
# relies on them, but they are candidates for removal.
from langchain_groq import ChatGroq
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings

# OpenRouter API key, supplied through the V1 environment variable.
getmod = os.environ.get('V1')

# Embedding model used for both indexing and query-time retrieval.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")


def scrape_websites(base_urls):
    """Crawl each base URL plus its same-domain links; return {url: text}.

    HTML pages are cleaned to plain text; links ending in ``.pdf`` have
    their text extracted instead. Returns {} on any unexpected error.
    """
    try:
        visited_links = set()   # To avoid revisiting the same link
        content_by_url = {}     # Store content from each URL

        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if html_content:
                cleaned_content = clean_body_content(html_content)
                content_by_url[base_url] = cleaned_content
                visited_links.add(base_url)

                # Extract and process all internal links
                soup = BeautifulSoup(html_content, "html.parser")
                links = extract_internal_links(base_url, soup)
                for link in links:
                    if link not in visited_links:
                        print(f"Scraping link: {link}")
                        page_content = fetch_page_content(link)
                        if page_content:
                            cleaned_content = clean_body_content(page_content)
                            content_by_url[link] = cleaned_content
                            visited_links.add(link)

                        # If the link is a PDF file, extract its content
                        # (overwrites the HTML-cleaned text stored above).
                        if link.lower().endswith('.pdf'):
                            print(f"Extracting PDF content from: {link}")
                            pdf_content = extract_pdf_text(link)
                            if pdf_content:
                                content_by_url[link] = pdf_content

        return content_by_url
    except Exception as e:
        print(f"Error during scraping: {e}")
        return {}


def fetch_page_content(url):
    """Return the raw HTML body of *url*, or None on any request failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def extract_internal_links(base_url, soup):
    """Collect absolute same-domain links found in *soup*'s anchor tags."""
    links = set()
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        full_url = urljoin(base_url, href)
        if is_internal_link(base_url, full_url):
            links.add(full_url)
    return links


def is_internal_link(base_url, link_url):
    """True when *link_url* shares *base_url*'s network location (domain)."""
    base_netloc = urlparse(base_url).netloc
    link_netloc = urlparse(link_url).netloc
    return base_netloc == link_netloc


def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text, or None."""
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            pdf_text = ""
            for page in reader.pages:
                # extract_text() may return None for image-only pages;
                # guard against TypeError on concatenation.
                pdf_text += page.extract_text() or ""
        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None


def clean_body_content(html_content):
    """Strip <script>/<style> tags and collapse the page to trimmed text lines."""
    soup = BeautifulSoup(html_content, "html.parser")
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )
    return cleaned_content


def chunk_string(s, chunk_size=1000):
    """Split *s* into fixed-size character chunks (last one may be shorter)."""
    return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]


# Prompt template: {context} comes from retrieval, {question} from the user.
template = ("""
You are a friendly and intelligent chatbot designed to assist users in a
conversational and human-like manner. Your goal is to provide accurate,
helpful, and engaging responses based on the provided context: {context}.
Follow these guidelines:

1. **Contextual Interaction**
   - Extract precise details from provided context: {context}
   - Respond directly to user's question: {question}

2. **Communication Guidelines**
   - Maintain warm, conversational tone
   - Use occasional emojis for engagement
   - Provide clear, concise information

3. **Response Strategies**
   - Greet users naturally (e.g., "Hello! \U0001F60A How can I help?")
   - Deliver only relevant information
   - Avoid generating content beyond context
   - Handle missing information transparently

4. **No Extra Content**
   - If no information matches user's request:
     * Respond politely: "I don't have that information at the moment. \U0001F60A"
     * Offer alternative assistance options
   - Strictly avoid generating unsupported content
   - Prevent information padding or speculation

5. **Specialized Handling**
   - Prioritize direct data extraction
   - Provide most relevant URL if link requested
   - Personalize responses using available interaction history

6. **Real-Time Awareness**
   - Acknowledge current context when appropriate
   - Stay focused on user's immediate needs

**Context:** {context}
**User's Question:** {question}
**Response Approach:** Precise, helpful, context-driven
""")


class OpenRouterLLM:
    """Thin streaming wrapper around the OpenRouter chat-completions API."""

    def __init__(self, api_key: str):
        try:
            self.client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=api_key,
            )
            self.headers = {
                "HTTP-Referer": "http://localhost:3000",
                "X-Title": "Local Development",
            }
        except Exception as e:
            print(f"Initialization error: {e}")
            raise

    def stream(self, prompt: str) -> Iterator[str]:
        """Yield response text deltas; on failure, yield one error string."""
        try:
            completion = self.client.chat.completions.create(
                extra_headers=self.headers,
                model="google/gemini-2.0-flash-thinking-exp:free",
                messages=[{"role": "user", "content": prompt}],
                stream=True,
            )
            for chunk in completion:
                if chunk.choices[0].delta.content is not None:
                    yield chunk.choices[0].delta.content
        except Exception as e:
            yield f"Streaming error: {str(e)}"


def create_rag_chain(retriever, template):
    """Build a callable: {"question": str} -> iterator of response chunks.

    Retrieval errors are surfaced as a single-item error stream rather
    than raising, so the UI always gets something to display.
    """
    rag_prompt = PromptTemplate.from_template(template)

    def stream_generator(input_dict):
        try:
            context = retriever.invoke(input_dict["question"])
            context_str = "\n".join([doc.page_content for doc in context])
            prompt = rag_prompt.format(
                context=context_str,
                question=input_dict["question"],
            )
            llm = OpenRouterLLM(getmod)
            return llm.stream(prompt)
        except Exception as e:
            def error_stream():
                yield f"RAG chain error: {str(e)}"
            return error_stream()

    return stream_generator


def rag_memory_stream(message, history):
    """Gradio streaming callback: yield the growing partial answer."""
    partial_text = ""
    # BUG FIX: rag_chain is a plain function returned by create_rag_chain,
    # not an object with a .stream() method — call it with the expected dict.
    for new_text in rag_chain({"question": message}):
        partial_text += new_text
        yield partial_text


# Title with emojis
title = "\U0001F9EE African Institute for Mathematical Sciences (AIMS Chatbot) \U0001F52C"

# Short description for the examples section
examples = [
    "What are the admission requirements for AIMS?",
    "Tell me about ongoing research at AIMS.",
    "Where can I find study materials for mathematics?",
]

# Custom CSS for styling the interface
custom_css = """
body {
    font-family: "Arial", serif;
}
.gradio-container {
    font-family: "Times New Roman", serif;
}
.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove outline focus for a cleaner look */
}
/* Custom CSS for the examples section */
.gr-examples {
    font-size: 30px; /* Increase font size of examples */
    background-color: #f9f9f9; /* Light background color */
    border-radius: 30px; /* Rounded corners */
}
.gr-examples .example {
    background-color: white; /* White background for each example */
    cursor: pointer; /* Change cursor to pointer on hover */
    transition: background-color 0.3s ease; /* Smooth hover effect */
}
.gr-examples .example:hover {
    background-color: #f1f1f1; /* Light gray background on hover */
}
"""


if __name__ == "__main__":
    # --- Scrape and flatten website content ---------------------------------
    website = ["https://aims.ac.rw/", "https://nexteinstein.org/"]
    all_content = scrape_websites(website)

    # all_content maps url -> text; render each pair as one indexable string.
    processed_texts = [
        f"url: {url}, content: {content}" for url, content in all_content.items()
    ]

    # --- Chunk and index in Chroma ------------------------------------------
    chunked_texts = []
    for text in processed_texts:
        chunked_texts.extend(chunk_string(text))

    vectorstore = Chroma(
        collection_name="AIMS_Chat_DATABASE",
        embedding_function=embed_model,
        persist_directory="./",
    )
    # NOTE(review): add_texts appends on every run; re-running the script
    # duplicates documents in the persisted collection.
    vectorstore.add_texts(chunked_texts)

    retriever = vectorstore.as_retriever()

    # Create RAG chain with error handling
    rag_chain = create_rag_chain(retriever, template)

    # --- Launch the Gradio chat interface -----------------------------------
    demo = gr.ChatInterface(
        fn=rag_memory_stream,
        title=title,
        examples=examples,  # Display the example questions
        fill_height=True,
        theme="soft",
        css=custom_css,  # Apply the custom CSS
    )
    demo.launch(share=True)