# Spaces: Paused  (hosting-page status text captured along with the file; not part of the module)
import ast
import asyncio
import json
import os
import time
from pathlib import Path
from typing import List, Dict, Any, Optional

from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain_community.tools import BraveSearch

from src.helpers.helper import remove_markdown
from src.utils.api_key_manager import with_api_manager
# Prompt used by SearchEngine.generate_optimized_query when past context is supplied.
# Placeholders: {user_query}, {context}.
_OPTIMIZE_QUERY_WITH_CONTEXT_TEMPLATE = """Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query and relevant past context.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.
Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.
Instructions:
1. Understand the Inputs:
- User Query: This is the current question or statement provided by the user.
- Past Context: This includes any relevant previous interactions, preferences, or information that can inform the understanding of the user's intent.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Combine the insights from the user query and past context.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.
Example 1:
- User Query:
'Best vegan restaurants in New York'
- Past Context:
'User has previously shown interest in healthy eating and sustainability.'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'
Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- Past Context:
'User has a preference for classic science fiction films. Previous searches include "Blade Runner" and "2001: A Space Odyssey."'
- SEO-Optimized Search Query:
'What are the top classic science fiction movies to watch that are similar to Blade Runner and 2001: A Space Odyssey?'
Input:
- User Query:
{user_query}
- Past Context:
{context}
Output:
(The generated SEO-friendly query based on the inputs in plain text format without any markdown)"""

# Prompt used by SearchEngine.generate_optimized_query when no context is supplied.
# Placeholder: {user_query}.
_OPTIMIZE_QUERY_TEMPLATE = """Objective:
Create a search engine optimized (SEO) query that accurately reflects the user's intent by utilizing their current query.
The generated SEO query should enhance visibility, relevance, and ranking on search engines.
Information:
The search engine being used is semantic in nature and requires a query that aligns with the user's intent while incorporating SEO best practices.
Instructions:
1. Understand the Input:
- User Query: This is the current question or statement provided by the user.
2. Analyze the User Intent:
- Determine what the user is seeking to find or achieve with their query.
- Identify keywords and phrases that are central to the user's intent.
3. Incorporate SEO Best Practices:
- Keyword Optimization: Use relevant keywords that users are likely to search for. Include both primary and secondary keywords.
- Long-Tail Keywords: Incorporate longer, more specific keyword phrases that reflect the user's intent more precisely.
- Clarity and Relevance: Ensure the query is clear, concise, and directly related to the user's needs.
- Search Intent Alignment: Align the query with the type of content the user is likely seeking (informational, navigational, transactional, or commercial investigation).
- Optimal Length: Keep the query within 5-12 words to maintain effectiveness and avoid keyword stuffing.
4. Generate the SEO-Optimized Query:
- Utilize the insights from the user query.
- Formulate a search query that maximizes SEO potential while staying true to the user's intent.
5. Review and Refine:
- Ensure the generated query is free from grammatical errors.
- Verify that the query does not include unnecessary or irrelevant keywords.
- Confirm that the query is tailored to improve search engine rankings for the intended content.
6. Format [IMPORTANT]:
- If the user query is a question, the SEO-optimized query should also be a question.
- If the user query is a statement, the SEO-optimized query should be a clear and concise statement.
- Unless search results would be more accurate if the optimized query was a question.
Example 1:
- User Query:
'Best vegan restaurants in New York'
- SEO-Optimized Search Query:
'Top Vegan Restaurants in New York City for Healthy Dining'
Example 2:
- User Query:
'Give me a list of the best sci-fi movies'
- SEO-Optimized Search Query:
'What are the top science fiction movies to watch?'
Input:
- User Query:
{user_query}
Output:
(The generated SEO-friendly query based on the input in plain text format without any markdown)"""

# Prompt used by SearchEngine.filter_urls to rerank and trim search results.
# Placeholders: {category}, {query}, {link_info}.
_RERANK_TEMPLATE = """[IMPORTANT]
This is a very important task.
Please take a deep breath, read the instructions VERY carefully, and think step-by-step before responding.
[PROMPT]
You are an expert at determining the relevance of search results to a given query.
Your task is to re-rank the given search results based on their relevance to the original query.
Use a hybrid of semantic and keyword matching to determine relevance
Consider factors such as:
1. How well the title and snippet match the query intent
3. The credibility and authority of the source
4. The recency of the information (if applicable)
Rules:
1. Rerank the URLs based on their relevance to the query according to the criteria listed above, from best match to worst match.
2. Once reranked, select the top best matched results according to the category of the query as defined below:
- Simple External Lookup: Select upto 3 top best matched results
- Complex Moderate Decomposition: Select upto 4 top best matched results
- Complex Advanced Decomposition: Select upto 5 top best matched results
- Extensive Research Dynamic Structuring: Select upto 6 top best matched results
3. [IMPORTANT] Select the MINIMUM number of results (based on the categories above) that are required to answer the query.
4. The response should only contain a JSON array of objects, each containing 'link', 'title' and 'snippet' keys after reranking and filtering.
Note: Do not include ANY markdown in your response.
[INPUT]
Query Category:
{category}
Query:
{query}
Dictionary Containing Link, Titles and Snippets:
{link_info}
Ranked URLs (JSON array of objects):"""


class SearchEngine:
    """Brave-backed web search with LLM-assisted query rewriting and reranking.

    The class wraps three steps: rewriting a user query with an LLM
    (``generate_optimized_query``), fetching paginated results from the Brave
    Search tool (``search``), and asking an LLM to rerank/trim those results
    (``filter_urls``).
    """

    # Largest page index the loop will request (the original loop used the same cap).
    _MAX_PAGE_OFFSET = 9
    # Largest number of results requested per page (same cap as the original loop).
    _MAX_PAGE_SIZE = 20
    # Consecutive rate-limit (HTTP 429) retries tolerated before giving up.
    _MAX_RATE_LIMIT_RETRIES = 5

    def __init__(
        self,
        brave_api_key: Optional[str] = None,
    ):
        """Store the Brave API key.

        Args:
            brave_api_key: Explicit API key. When ``None``, the
                ``BRAVE_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no key is passed and ``BRAVE_API_KEY`` is unset.
        """
        if brave_api_key is None:
            # Read the environment once instead of twice.
            brave_api_key = os.getenv("BRAVE_API_KEY")
            if brave_api_key is None:
                raise ValueError("BRAVE_API_KEY is not set")
        self.brave_api_key = brave_api_key

    async def generate_optimized_query(self, user_query: str, context: Optional[str] = None, *, llm) -> str:
        """Ask ``llm`` to rewrite ``user_query`` as a search-optimized query.

        Args:
            user_query: The user's raw question or statement.
            context: Optional past context; when truthy, the context-aware
                prompt is used, otherwise the context-free prompt.
            llm: Object exposing an async ``ainvoke(prompt)`` whose result has
                a string ``content`` attribute.

        Returns:
            The LLM's answer with surrounding whitespace stripped.
        """
        template = _OPTIMIZE_QUERY_WITH_CONTEXT_TEMPLATE if context else _OPTIMIZE_QUERY_TEMPLATE
        prompt_template = ChatPromptTemplate.from_template(template)
        # `context` is passed unconditionally, exactly as before; the
        # context-free template simply has no {context} placeholder.
        prompt = prompt_template.format(context=context, user_query=user_query)
        optimized_query = await llm.ainvoke(prompt)
        return optimized_query.content.strip()

    @staticmethod
    def _build_query(query: str, exclude_filetypes: Optional[List[str]]) -> str:
        """Append a ``NOT filetype:<ext>`` clause for every excluded file type."""
        if not exclude_filetypes:
            return query
        exclusion = ' '.join(f"NOT filetype:{ft}" for ft in exclude_filetypes)
        return f"{query} {exclusion}"

    @staticmethod
    def _parse_page_results(results_str: str) -> List[Dict[str, Any]]:
        """Decode one page of results returned by the Brave search tool.

        The tool is expected to return a JSON-encoded list; JSON is tried
        first so that ``null``/``true``/``false`` are handled. A Python
        literal is accepted as a fallback for backward compatibility. Nothing
        is ever executed, unlike the ``eval`` call this replaces.

        Raises:
            ValueError: If the payload is neither form, or is not a list.
        """
        try:
            parsed = json.loads(results_str)
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(results_str)
            except (ValueError, SyntaxError, TypeError) as err:
                raise ValueError("Could not parse Brave search results") from err
        if not isinstance(parsed, list):
            raise ValueError("Brave search results are not a list")
        return parsed

    async def search(
        self,
        query: str,
        num_results: int = 10,
        gl: str = 'us',
        hl: str = 'en',
        safe: str = 'off',
        exclude_filetypes: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """Fetch up to ``num_results`` results for ``query`` from Brave Search.

        Args:
            query: Search text.
            num_results: Maximum number of results to return.
            gl: Value sent as the ``country`` search parameter.
            hl: Value sent as the ``search_lang`` search parameter.
            safe: Value sent as the ``safesearch`` search parameter.
            exclude_filetypes: File extensions to exclude via
                ``NOT filetype:<ext>`` clauses appended to the query.

        Returns:
            A list of result dicts, truncated to ``num_results``.

        Raises:
            ValueError: If a response page cannot be parsed.
            Exception: Any error raised by the search tool other than a
                rate-limit error, or a rate-limit error that persists after
                ``_MAX_RATE_LIMIT_RETRIES`` consecutive retries.
        """
        modified_query = self._build_query(query, exclude_filetypes)
        print(f"Performing search with query: '{modified_query}', num_results: {num_results}, gl: {gl}, hl: {hl}, safe: {safe}")

        all_results: List[Dict[str, Any]] = []
        # The page size stays constant for the whole call: Brave's `offset`
        # counts pages of `count` results, so shrinking `count` between
        # requests would make later pages overlap earlier ones.
        page_size = min(num_results, self._MAX_PAGE_SIZE)
        offset = 0
        rate_limit_retries = 0

        while len(all_results) < num_results and offset <= self._MAX_PAGE_OFFSET:
            # The tool is rebuilt per page because `offset` lives in its kwargs.
            brave_search = BraveSearch.from_api_key(
                api_key=self.brave_api_key,
                search_kwargs={
                    "count": page_size,
                    "offset": offset,
                    "country": gl,
                    "search_lang": hl,
                    "safesearch": safe
                }
            )
            try:
                # `run` is blocking, so it is pushed to a worker thread.
                results_str = await asyncio.to_thread(brave_search.run, modified_query)
            except Exception as e:
                # Rate limiting is detected from the error text, as before;
                # anything else, or too many consecutive 429s, is re-raised.
                if "429" not in str(e) or rate_limit_retries >= self._MAX_RATE_LIMIT_RETRIES:
                    raise
                rate_limit_retries += 1
                print("Brave API rate limit hit, waiting 1 second...")
                await asyncio.sleep(1)
                continue
            rate_limit_retries = 0

            page_results = self._parse_page_results(results_str)
            if not page_results:  # No more results available
                break
            all_results.extend(page_results)
            offset += 1
            # Add a delay to avoid hitting the rate limit
            await asyncio.sleep(1)

        print(f"Total results fetched: {len(all_results)}")
        return all_results[:num_results]  # Ensure we don't return more than requested

    async def filter_urls(
        self,
        query: str,
        category: str,
        search_results: List[Dict[str, Any]],
        num_results: int = 3,
        *,
        llm
    ) -> List[Dict[str, str]]:
        """Rerank ``search_results`` with ``llm`` and keep the best matches.

        Args:
            query: The query the results should be relevant to.
            category: Query category name inserted into the prompt; the prompt
                maps it to how many results the LLM may select.
            search_results: Result dicts; only entries with truthy ``link``,
                ``title`` and ``snippet`` are considered.
            num_results: Size of the fallback list used when the LLM answer is
                unusable. It does not cap a successful LLM answer.
            llm: Object exposing an async ``ainvoke(messages)`` whose result
                has a string ``content`` attribute.

        Returns:
            The list decoded from the LLM's JSON answer, or the first
            ``num_results`` usable results in their original order when the
            answer is not a JSON array. An empty list if no result is usable.
        """
        link_info: Dict[str, Dict[str, Any]] = {}
        for result in search_results:
            link = result.get("link")
            title = result.get("title")
            snippet = result.get("snippet")
            if link and title and snippet:
                link_info[link] = {"title": title, "snippet": snippet}

        if not link_info:
            # Nothing to rank: skip the LLM round-trip.
            return []

        prompt = ChatPromptTemplate.from_template(_RERANK_TEMPLATE)
        response = await llm.ainvoke(prompt.format_messages(category=category, query=query, link_info=link_info))
        # remove_markdown is a project helper; presumably it strips markdown
        # fences/formatting from the answer - confirm against its definition.
        cleaned_response = remove_markdown(response.content.strip())

        try:
            ranked_links = json.loads(cleaned_response)
        except json.JSONDecodeError:
            print("Error decoding JSON response from LLM")
            ranked_links = None
        else:
            if not isinstance(ranked_links, list):
                # Valid JSON but not the requested array; treat as unusable.
                print("LLM response was not a JSON array")
                ranked_links = None

        if ranked_links is None:
            return [
                {"link": link, "title": info["title"], "snippet": info["snippet"]}
                for link, info in list(link_info.items())[:num_results]
            ]

        print(f"Number of search results after reranking and filtering: {len(ranked_links)}")
        return ranked_links
if __name__ == "__main__":
    # Manual smoke test: run a handful of sample queries through Brave search
    # and print the results with timing information.

    # Load environment variables
    load_dotenv()

    required_env_vars = ["BRAVE_API_KEY"]
    missing_vars = [var for var in required_env_vars if os.getenv(var) is None]
    if missing_vars:
        print(f"Environment variables are not set: {missing_vars}")
        # Exit with a non-zero status so the failure is visible to the caller;
        # `exit()` is an interactive helper and reported success.
        raise SystemExit(1)
    print("All environment variables are set!")

    search_engine = SearchEngine()

    queries = [
        "Compare the benefits and drawbacks of AI in healthcare",
        "What is the impact of AI on healthcare?",
        "How is AI used in healthcare?",
        "What are the ethical considerations of AI in healthcare?",
        "What are the economic and social impacts of artificial intelligence on the job market?",
        "How can cold fusion be achieved without violating the laws of thermodynamics? And how can AGI help with that?",
        "What are the major obstacles to achieving carbon neutrality in heavy industries like steel and cement? What are the potential solutions?"
    ]

    async def main(queries: List[str], llm: Optional[Any] = None) -> None:
        """Search for each query and print the results.

        Args:
            queries: Queries to run, in order.
            llm: Optional model passed to ``generate_optimized_query``. That
                method requires an ``llm`` keyword argument, so when none is
                given the query is searched as written instead of failing.
        """
        for query in queries:
            if llm is not None:
                optimized_query = await search_engine.generate_optimized_query(query, llm=llm)
            else:
                optimized_query = query
            print(f"\nOriginal Query: {query}")
            print(f"Optimized Query: {optimized_query}\n")

            start = time.perf_counter()
            search_results = await search_engine.search(optimized_query, num_results=2, exclude_filetypes=["pdf"])
            end = time.perf_counter()
            print(f"Time taken to fetch search results: {end - start:.2f} seconds")

            # Reranking via search_engine.filter_urls is not exercised here;
            # it must be awaited and also needs an `llm`.
            print("Search Results:")
            for result in search_results:
                print(f"- {result['title']}: {result['link']}: {result['snippet']}")
            print("-"*20)

    asyncio.run(main(queries))