Spaces:

nand-tmp
/

GoogleSearchWithLLM

Sleeping

App Files Files Community

GoogleSearchWithLLM / model.py

8bitnand

returning the query itself, IDK why

c575b59 5 months ago

raw

history blame

No virus

2.67 kB

	from search import SemanticSearch, GoogleSearch, Document
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from transformers import BitsAndBytesConfig
	from transformers.utils import is_flash_attn_2_available
	import yaml
	import torch


	def load_configs(config_file: str) -> dict:
	    """Read a YAML configuration file and return its parsed contents.

	    Args:
	        config_file: Path of the YAML file to read.

	    Returns:
	        The object produced by ``yaml.safe_load`` for the file (a mapping for
	        a typical config file; ``None`` if the file is empty).

	    Raises:
	        OSError: If the file cannot be opened.
	        yaml.YAMLError: If the file does not contain valid YAML.
	    """
	    # Decode explicitly as UTF-8 so parsing does not depend on the
	    # platform's locale encoding.
	    with open(config_file, "r", encoding="utf-8") as f:
	        return yaml.safe_load(f)


	class RAGModel:
	    """Causal language model that answers a query from retrieved context items.

	    The model and tokenizer are loaded once in ``__init__``. ``answer_query``
	    wraps the query and the context items in a chat-formatted prompt and
	    returns the text the model generates for it.
	    """

	    def __init__(self, configs) -> None:
	        """Load the generation model and its tokenizer.

	        Args:
	            configs: Mapping read as ``configs["model"]["device"]`` (the
	                device the model and the inputs are moved to) and
	                ``configs["model"]["genration_model"]`` (the identifier
	                passed to ``from_pretrained``).
	        """
	        self.configs = configs
	        self.device = configs["model"]["device"]
	        # NOTE: the key is misspelt ("genration_model"), but it is the key
	        # this class reads from the config, so it is left unchanged here.
	        model_url = configs["model"]["genration_model"]
	        # quantization_config = BitsAndBytesConfig(
	        # load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
	        # )

	        self.model = AutoModelForCausalLM.from_pretrained(
	            model_url,
	            torch_dtype=torch.float16,
	            # quantization_config=quantization_config,
	            low_cpu_mem_usage=False,
	            attn_implementation="sdpa",
	        ).to(self.device)
	        self.tokenizer = AutoTokenizer.from_pretrained(
	            model_url,
	        )

	    def create_prompt(self, query, topk_items: list[str]):
	        """Build the chat-formatted prompt for one query.

	        Args:
	            query: The user's question.
	            topk_items: Context passages to place in the prompt.

	        Returns:
	            The prompt text rendered by the tokenizer's chat template
	            (``tokenize=False``), ending with the generation prompt.
	        """
	        # One "- " bullet per context item, each on its own line.
	        context = "\n".join(f"- {item}" for item in topk_items)

	        base_prompt = f"""Give time for yourself to read the context and then answer the query.
	Do not return thinking process, just return the answer.
	If you do not find the answer, or if the query is offesnsive or in any other way harmfull just return "I'm not aware of it"
	Now use the following context items to answer the user query.
	context: {context}.
	user query : {query}
	"""

	        dialog_template = [{"role": "user", "content": base_prompt}]

	        # add_generation_prompt=True appends the tokens that open the
	        # assistant turn, so the model starts answering instead of
	        # continuing the user message.
	        prompt = self.tokenizer.apply_chat_template(
	            conversation=dialog_template,
	            tokenize=False,
	            add_generation_prompt=True,
	        )
	        return prompt

	    def answer_query(self, query: str, topk_items: list[str]):
	        """Generate an answer to ``query`` using ``topk_items`` as context.

	        Args:
	            query: The user's question.
	            topk_items: Context passages to place in the prompt.

	        Returns:
	            The newly generated text only (at most 512 new tokens); the
	            prompt and special tokens are not included.
	        """
	        prompt = self.create_prompt(query, topk_items)
	        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
	        output = self.model.generate(**inputs, max_new_tokens=512)

	        # For a causal LM the generated sequence starts with the prompt
	        # tokens; slice them off so the caller does not get the prompt
	        # (and therefore the query) echoed back.
	        prompt_length = inputs["input_ids"].shape[-1]
	        answer_ids = output[0][prompt_length:]
	        return self.tokenizer.decode(answer_ids, skip_special_tokens=True)


	if __name__ == "__main__":
	    # Manual end-to-end run with a fixed question: load the config, collect
	    # page data for the question, split it into chunks, pick the top chunks
	    # and print the model's answer.
	    settings = load_configs(config_file="rag.configs.yml")
	    question = "what is computer vision"

	    page_data = GoogleSearch(question).all_page_data
	    chunks = Document(page_data, 512).doc()
	    searcher = SemanticSearch(chunks, "all-mpnet-base-v2", "mps")
	    # semantic_search returns two values; only the first is used here.
	    top_chunks, _ = searcher.semantic_search(query=question, k=32)

	    rag = RAGModel(settings)
	    print(rag.answer_query(query=question, topk_items=top_chunks))