import os
import random

import requests
import torch
from bs4 import BeautifulSoup
from datasets import Dataset
from langchain.docstore.document import Document
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from peft import PeftConfig, PeftModel
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
                          GenerationConfig, pipeline)

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

generation_config = GenerationConfig(temperature=0.8, top_p=0.75, top_k=40)
device = 'cuda'

# Module-level state shared between the different steps of the app.
shared = {
    'answer_context': None,
    'embeddings_dataset': None,
    'full_text': None,
}

text_splitter = CharacterTextSplitter()


def get_nearest_examples(question: str, k: int):
    """Return the k text fragments closest to the question in embedding space."""
    print(['get_nearest_examples', 'start'])
    question_embedding = get_embeddings([question]).cpu().detach().numpy()
    embeddings_dataset = shared['embeddings_dataset']
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k)
    print(['get_nearest_examples', 'scores and samples'])
    print(scores)
    print(samples['id'])
    print(['get_nearest_examples', 'end'])
    return samples


def get_embeddings(text):
    """Embed a list of strings with the sentence-embedding model (CLS pooling)."""
    print(['get_embeddings', 'start'])
    encoded_input = emb_tokenizer(text, padding=True, truncation=True,
                                  return_tensors="pt")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = emb_model(**encoded_input)
    # Use the [CLS] token representation as the sentence embedding.
    model_output = model_output.last_hidden_state[:, 0]
    print(['get_embeddings', 'end'])
    return model_output


def build_faiss_index(text):
    """Split the text into fragments, embed them and build a FAISS index."""
    print(['build_faiss_index', 'start'])
    text_list = split_text(text)
    emb_list = []
    for i, item in enumerate(text_list):
        emb_list.append({
            "embeddings": get_embeddings(item).cpu().detach().numpy()[0],
            'id': i
        })
    dataset = Dataset.from_list(emb_list)
    dataset.add_faiss_index(column="embeddings")
    shared['embeddings_dataset'] = dataset
    print(['build_faiss_index', 'end'])


def extract_text(url: str):
    """Download a web page and keep the text of its <p> elements."""
    print(['extract_text', 'start'])
    if url is None or url.strip() == '':
        return ''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    text = '\n\n'.join(p.text for p in soup.find_all('p'))
    shared['full_text'] = text
    print(['extract_text', 'end'])
    return text


def split_text(text: str):
    """Split text into non-empty, stripped lines."""
    lines = text.split('\n')
    return [line.strip() for line in lines if line.strip()]


def remove_prompt(text: str) -> str:
    """Strip the instruction prompt from a generated completion."""
    output_prompt = 'Output: '
    idx = text.index(output_prompt)
    res = text[idx + len(output_prompt):].strip()
    res = res.replace('Input: ', '')
    return res


def summarize_text(text: str) -> str:
    """Summarize the text chunk by chunk with the instruction-tuned model."""
    print(['summarize_text', 'start'])

    print(['summarize_text', 'splitting text'])
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]
    # Spanish prompt: "Write a summary of the following text."
    prompts = [
        f'Instruction: Elabora un resume del siguiente texto.\nInput: {d.page_content}\nOutput: '
        for d in docs
    ]

    print(['summarize_text', 'generating'])
    cleaned_summaries = [remove_prompt(s['generated_text'])
                         for s in pipe(prompts)]
    summaries = '\n\n'.join(cleaned_summaries)

    print(['summarize_text', 'end'])
    return summaries


def summarize_text_v1(text: str):
    """Earlier single-pass summarization that calls model.generate directly."""
    print(['summarize_text', 'start'])
    # Spanish prompt: "Write a summary of the following text."
    input_text = f'Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)

    print(['summarize_text', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=512,
                                       generation_config=generation_config)

    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    output = output.replace(input_text, '')
    print(['summarize_text', 'end'])
    return output


def generate_question(text: str):
    """Generate a question whose answer is contained in a random fragment of the text."""
    print(['generate_question', 'start'])

    # Get a random section of the whole text to generate a question
    fragments = split_text(text)
    rnd_text = random.choice(fragments)
    shared['answer_context'] = rnd_text

    # Spanish prompt: "Given the following text, generate a question whose
    # answer can be found in it."
    input_text = f'Instruction: Dado el siguiente texto quiero que generes una pregunta cuya respuesta se encuentre en él.\nInput: {rnd_text}\nOutput: '
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)  # keep inputs on the same device as the model

    print(['generate_question', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=256,
                                       generation_config=generation_config)

    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    output = output.replace(input_text, '')
    print(['generate_question', 'end'])
    return output


def get_answer_context():
    return shared['answer_context']


def answer_question(question: str):
    """Answer a question using the most relevant fragments retrieved via FAISS."""
    print(['answer_question', 'start'])
    full_text = shared['full_text']

    # Build the FAISS index lazily, on the first question.
    if not shared['embeddings_dataset']:
        build_faiss_index(full_text)

    top_k_samples = get_nearest_examples(question, k=3)
    index_text = {i: t for i, t in enumerate(split_text(full_text))}
    context = '\n'.join(index_text[idx] for idx in top_k_samples['id'])

    # Spanish prompt: "I am going to give you a text and I want you to answer
    # a question about it. The text is the following: ..."
    input_text = f"""Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta. El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)  # keep inputs on the same device as the model

    print(['answer_question', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=256,
                                       generation_config=generation_config)

    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    output = output.replace(input_text, '')
    print(['answer_question', 'end'])
    return output


def load_model(peft_model_id):
    """Load the 8-bit base model and apply the LoRA adapter on top of it."""
    print(['load_model', 'start'])
    config = PeftConfig.from_pretrained(peft_model_id)

    print(['load_model', 'loading model'])
    model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                                 return_dict=True,
                                                 load_in_8bit=True,
                                                 device_map='auto')

    print(['load_model', 'loading tokenizer'])
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

    model = PeftModel.from_pretrained(model, peft_model_id)
    model.config.use_cache = True
    print(['load_model', 'end'])
    return model, tokenizer


def load_embeddings_model(model_ckpt: str):
    """Load the sentence-embedding model and tokenizer used for retrieval."""
    print(['load_embeddings_model', 'start'])

    print(['load_embeddings_model', 'loading tokenizer'])
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    print(['load_embeddings_model', 'loading model'])
    model = AutoModel.from_pretrained(model_ckpt)
    model = model.to(device)

    print(['load_embeddings_model', 'end'])
    return model, tokenizer


model, tokenizer = load_model(
    "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")

pipe = pipeline("text2text-generation", model=model,
                tokenizer=tokenizer, max_new_tokens=100)
llm = HuggingFacePipeline(pipeline=pipe)

# Sentence Transformers models
# - paraphrase-multilingual-MiniLM-L12-v2
# - multi-qa-mpnet-base-dot-v1
emb_model, emb_tokenizer = load_embeddings_model(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
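
# ---------------------------------------------------------------------------
# Example usage: a minimal sketch of how these helpers chain together
# (extract_text -> summarize_text -> generate_question -> answer_question).
# This block is not part of the original flow; the URL below is a hypothetical
# placeholder and the whole section is only an assumed smoke test.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Fetch an article, summarize it, then run the question/answer loop once.
    article = extract_text('https://example.com/some-article')  # hypothetical URL
    if article:
        print(summarize_text(article))
        question = generate_question(article)
        print(question)
        print(get_answer_context())
        print(answer_question(question))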