import os
import torch
import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

load_dotenv()

LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
hf_token = os.environ["HF_TOKEN"]
login(hf_token, add_to_git_credential=True)

system_message = "You are an assistant that produces datasets based on the description provided."

# 4-bit NF4 quantization so the 8B model fits on a single consumer GPU
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

# Loaded lazily on the first request so the app starts quickly
tokenizer = None
model = None
streamer = None


def run_llama(description):
    global tokenizer, model, streamer

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(LLAMA)
        tokenizer.pad_token = tokenizer.eos_token
        # streamer = TextStreamer(tokenizer)  # uncomment to also stream tokens to the console
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)

    user_prompt = f"Below is the description for which you need to generate a dataset.\n{description}"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
    outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)
    # Return only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    return response


# Gradio Interface
iface = gr.Interface(
    fn=run_llama,
    inputs=gr.Textbox(label="Enter dataset description", value="Film critics of 1900s"),
    outputs=gr.Markdown(label="Generated Dataset"),
    title="Dataset Generator",
    description="Describe the dataset you want to generate."
)
iface.launch(share=True, debug=True)
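
# Usage sketch (an assumption, not part of the original script): run_llama can also be
# called directly without the Gradio UI, provided a CUDA-capable GPU is available,
# HF_TOKEN is set in the environment, and the gated Llama 3.1 repo has been granted.
#
#     sample = run_llama("Film critics of 1900s")
#     print(sample)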