from sentence_transformers import CrossEncoder import math import numpy as np from search_client import SearchClient import os from dotenv import load_dotenv load_dotenv() GOOGLE_SEARCH_ENGINE_ID = os.getenv("GOOGLE_SEARCH_ENGINE_ID") GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY") BING_SEARCH_API_KEY = os.getenv("BING_SEARCH_API_KEY") reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") googleSearchClient = SearchClient( "google", api_key=GOOGLE_SEARCH_API_KEY, engine_id=GOOGLE_SEARCH_ENGINE_ID ) bingSearchClient = SearchClient("bing", api_key=BING_SEARCH_API_KEY, engine_id=None) def rerank(query, top_k, search_results, chunk_size): """Chunks and reranks the documents using a specified reranker, Args: query (str): the query for reranking top_k (int): the number of top reranked results to return search_results (list[dict]): a list of dictionaries containing "link" and "text" keys for each search result chunk_size (int): the size of each chunk for reranking Returns: list[dict]: a list of dictionaries containing the top reranked results with "link" and "text" keys """ chunks = [] for result in search_results: text = result["text"] words = text.split() num_chunks = math.ceil(len(words) / chunk_size) for i in range(num_chunks): start = i * chunk_size end = (i + 1) * chunk_size chunk = " ".join(words[start:end]) chunks.append((result["link"], chunk)) # Create sentence combinations with the query sentence_combinations = [[query, chunk[1]] for chunk in chunks] # Compute similarity scores for these combinations similarity_scores = reranker.predict(sentence_combinations) # Sort scores indexes in decreasing order sim_scores_argsort = reversed(np.argsort(similarity_scores)) # Rearrange search_results based on the reranked scores reranked_results = [] for idx in sim_scores_argsort: link = chunks[idx][0] chunk = chunks[idx][1] reranked_results.append({"link": link, "text": chunk}) # Return the top K ranks return reranked_results[:top_k] def gen_augmented_prompt_via_websearch( prompt, vendor, n_crawl, top_k, pre_context="", post_context="", pre_prompt="", post_prompt="", pass_prev=False, prev_output="", chunk_size=512, ): """ Generates an augmented prompt and a list of links by performing a web search and reranking the results. Args: prompt (str, required): The prompt for the web search. vendor (str): The search engine to use, either 'Google' or 'Bing'. n_crawl (int): The number of search results to retrieve. top_k (int): The number of top reranked results to return. pre_context (str): The pre-context to be included in the augmented prompt. post_context (str): The post-context to be included in the augmented prompt. pre_prompt (str, optional): The pre-prompt to be included in the augmented prompt. Defaults to "". post_prompt (str, optional): The post-prompt to be included in the augmented prompt. Defaults to "". pass_prev (bool, optional): Whether to include the previous output in the augmented prompt. Defaults to False. prev_output (str, optional): The previous output to be included in the augmented prompt. Defaults to "". chunk_size (int, optional): The size of each chunk for reranking. Defaults to 512. Returns: tuple: A tuple containing the augmented prompt and a list of links. """ search_results = [] if vendor == "Google": search_results = googleSearchClient.search(prompt, n_crawl) elif vendor == "Bing": search_results = bingSearchClient.search(prompt, n_crawl) reranked_results = rerank(prompt, top_k, search_results, chunk_size) links = [] context = "" for res in reranked_results: context += res["text"] + "\n\n" link = res["link"] links.append(link) # remove duplicate links links = list(set(links)) # print(reranked_results) prev_output = prev_output if pass_prev else "" augmented_prompt = f""" {pre_context} {context} {post_context} {pre_prompt} {prompt} {post_prompt} {prev_output} """ return augmented_prompt, links inital_prompt_engineering_dict = { "SYSTEM_INSTRUCTION": """ You are a knowledgeable author on medical conditions, with a deep expertise in Huntington's disease. You provide extensive, clear information on complex medical topics, treatments, new research and developments. You avoid giving personal medical advice or diagnoses but offers general advice and underscores the importance of consulting healthcare professionals. Your goal is to inform, engage and enlighten users that enquire about Huntington's disease, offering factual data and real-life perspectives with anempathetic tone. You use every search available including web search together with articles and information from * Journal of Huntington's disease, * Movement Disorders, * Neurology, * Journal of Neurology, * Neurosurgery & Psychiatry, * HDBuzz, * PubMed, * Huntington's disease Society of America (HDSA), * Huntington Study Group (HSG), * Nature Reviews Neurology * ScienceDirect The information you provide should be understandable to laypersons, well-organized, and include credible sources, citations, and an empathetic tone. It should educate on the scientific aspects and personal challenges of living with Huntington's Disease.""", "SYSTEM_RESPONSE": """Hello! I'm an assistant trained to provide detailed and accurate information on medical conditions, including Huntington's Disease. I'm here to help answer your questions and provide resources to help you better understand this disease and its impact on individuals and their families. If you have any questions about HD or related topics, feel free to ask!""", "PRE_CONTEXT": """NOW YOU ARE SEARCHING THE WEB, AND HERE ARE THE CHUNKS RETRIEVED FROM THE WEB.""", "POST_CONTEXT": """ """, # EMPTY "PRE_PROMPT": """NOW ACCORDING TO THE CONTEXT RETRIEVED FROM THE GENERATE THE CONTENT FOR THE FOLLOWING SUBJECT""", "POST_PROMPT": """ Do not repeat yourself """, }