ebook-gen / utils.py
pragneshbarik's picture
added doc string to function rerank and gen_augmented_prompt_via_websearch
d466514
raw
history blame
6.52 kB
from sentence_transformers import CrossEncoder
import math
import numpy as np
from search_client import SearchClient
import os
from dotenv import load_dotenv
# Load settings via python-dotenv before any of the keys below are read.
load_dotenv()

# Search credentials; each is None when the variable is not set.
GOOGLE_SEARCH_ENGINE_ID = os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
GOOGLE_SEARCH_API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
BING_SEARCH_API_KEY = os.environ.get("BING_SEARCH_API_KEY")

# Cross-encoder used by rerank() to score (query, chunk) pairs.
# Instantiated once at import time and shared by every call.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# One client per supported search vendor.
googleSearchClient = SearchClient(
    "google",
    api_key=GOOGLE_SEARCH_API_KEY,
    engine_id=GOOGLE_SEARCH_ENGINE_ID,
)
bingSearchClient = SearchClient(
    "bing",
    api_key=BING_SEARCH_API_KEY,
    engine_id=None,  # no engine id is supplied for Bing
)
def rerank(query, top_k, search_results, chunk_size):
    """Chunk the search results and rank the chunks against the query.

    Each result's text is split on whitespace and cut into consecutive chunks
    of at most ``chunk_size`` words. Every chunk is scored against ``query``
    with the module-level ``reranker`` and the highest-scoring chunks are
    returned.

    Args:
        query (str): the query for reranking
        top_k (int): the number of top reranked results to return
        search_results (list[dict]): a list of dictionaries containing "link"
            and "text" keys for each search result
        chunk_size (int): the maximum number of words per chunk; must be
            positive

    Returns:
        list[dict]: up to ``top_k`` dictionaries with "link" and "text" keys,
        ordered from highest to lowest score. Empty when the search results
        yield no chunks at all.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    # Flatten every result into (link, chunk_text) pairs.
    chunks = []
    for result in search_results:
        words = result["text"].split()
        chunks.extend(
            (result["link"], " ".join(words[start : start + chunk_size]))
            for start in range(0, len(words), chunk_size)
        )

    # Nothing to score: skip the model call, which may not accept an empty
    # batch, and report no results.
    if not chunks:
        return []

    # Score each (query, chunk) pair with the cross-encoder.
    sentence_combinations = [[query, text] for _, text in chunks]
    similarity_scores = reranker.predict(sentence_combinations)

    # Indexes in decreasing score order, trimmed to the requested count so
    # only the returned entries are materialised.
    best_first = np.argsort(similarity_scores)[::-1][:top_k]

    return [{"link": chunks[i][0], "text": chunks[i][1]} for i in best_first]
def gen_augmented_prompt_via_websearch(
    prompt,
    vendor,
    n_crawl,
    top_k,
    pre_context="",
    post_context="",
    pre_prompt="",
    post_prompt="",
    pass_prev=False,
    prev_output="",
    chunk_size=512,
):
    """
    Generates an augmented prompt and a list of links by performing a web search and reranking the results.

    Args:
        prompt (str, required): The prompt for the web search.
        vendor (str): The search engine to use, either 'Google' or 'Bing'
            (matched case-insensitively). Any other value performs no search,
            so the augmented prompt carries no retrieved context.
        n_crawl (int): The number of search results to retrieve.
        top_k (int): The number of top reranked results to return.
        pre_context (str): The pre-context to be included in the augmented prompt.
        post_context (str): The post-context to be included in the augmented prompt.
        pre_prompt (str, optional): The pre-prompt to be included in the augmented prompt. Defaults to "".
        post_prompt (str, optional): The post-prompt to be included in the augmented prompt. Defaults to "".
        pass_prev (bool, optional): Whether to include the previous output in the augmented prompt. Defaults to False.
        prev_output (str, optional): The previous output to be included in the augmented prompt. Defaults to "".
        chunk_size (int, optional): The size of each chunk for reranking. Defaults to 512.

    Returns:
        tuple: A tuple containing the augmented prompt and a list of links.
        The links are de-duplicated and kept in reranked (best-first) order.
    """
    vendor_key = vendor.strip().lower() if isinstance(vendor, str) else vendor
    if vendor_key == "google":
        search_results = googleSearchClient.search(prompt, n_crawl)
    elif vendor_key == "bing":
        search_results = bingSearchClient.search(prompt, n_crawl)
    else:
        search_results = []

    # Only invoke the reranker when the search produced something to rank.
    if search_results:
        reranked_results = rerank(prompt, top_k, search_results, chunk_size)
    else:
        reranked_results = []

    # Each retrieved chunk is followed by a blank line in the context section.
    context = "".join(res["text"] + "\n\n" for res in reranked_results)

    # Remove duplicate links; dict.fromkeys keeps first-seen order, so the
    # links stay sorted by the rank of their best chunk.
    links = list(dict.fromkeys(res["link"] for res in reranked_results))

    prev_output = prev_output if pass_prev else ""

    # One section per line, wrapped by a leading and a trailing newline.
    augmented_prompt = (
        f"\n{pre_context}\n"
        f"{context}\n"
        f"{post_context}\n"
        f"{pre_prompt}\n"
        f"{prompt}\n"
        f"{post_prompt}\n"
        f"{prev_output}\n"
    )
    return augmented_prompt, links
# Default prompt fragments: the system instruction, the canned first
# assistant reply, and the text placed around the retrieved context and the
# user prompt.
# NOTE(review): the prompt text contains typos ("anempathetic", "but offers",
# "RETRIEVED FROM THE GENERATE"). They are preserved verbatim here because
# changing them changes what is sent to the model; fix them deliberately.
inital_prompt_engineering_dict = {
    # The value starts with a single leading space.
    "SYSTEM_INSTRUCTION": (
        " You are a knowledgeable author on medical conditions, with a deep expertise in Huntington's disease.\n"
        "You provide extensive, clear information on complex medical topics, treatments, new research and developments.\n"
        "You avoid giving personal medical advice or diagnoses but offers general advice and underscores the importance of consulting healthcare professionals.\n"
        "Your goal is to inform, engage and enlighten users that enquire about Huntington's disease, offering factual data and real-life perspectives with anempathetic tone.\n"
        "You use every search available including web search together with articles and information from\n"
        "* Journal of Huntington's disease,\n"
        "* Movement Disorders,\n"
        "* Neurology,\n"
        "* Journal of Neurology,\n"
        "* Neurosurgery & Psychiatry,\n"
        "* HDBuzz,\n"
        "* PubMed,\n"
        "* Huntington's disease Society of America (HDSA),\n"
        "* Huntington Study Group (HSG),\n"
        "* Nature Reviews Neurology\n"
        "* ScienceDirect\n"
        "The information you provide should be understandable to laypersons, well-organized, and include credible sources, citations, and an empathetic tone.\n"
        "It should educate on the scientific aspects and personal challenges of living with Huntington's Disease."
    ),
    "SYSTEM_RESPONSE": (
        "Hello! I'm an assistant trained to provide detailed and accurate information on medical conditions, including Huntington's Disease.\n"
        "I'm here to help answer your questions and provide resources to help you better understand this disease and its impact on individuals and their families.\n"
        "If you have any questions about HD or related topics, feel free to ask!"
    ),
    "PRE_CONTEXT": "NOW YOU ARE SEARCHING THE WEB, AND HERE ARE THE CHUNKS RETRIEVED FROM THE WEB.",
    # Effectively empty: the value is a single space, not "".
    "POST_CONTEXT": " ",
    "PRE_PROMPT": "NOW ACCORDING TO THE CONTEXT RETRIEVED FROM THE GENERATE THE CONTENT FOR THE FOLLOWING SUBJECT",
    # Wrapped in newlines so the instruction sits on its own line.
    "POST_PROMPT": "\nDo not repeat yourself\n",
}