"""Gradio chat UI that streams text-generation from a Hugging Face Inference Endpoint."""

import os
import time

import gradio as gr
from huggingface_hub import InferenceClient

# HF Inference Endpoint to query.
endpoint_url = "https://qrh4fv8e7x3fw9w3.us-east-1.aws.endpoints.huggingface.cloud"

# Equivalent raw call, for reference:
#   curl <endpoint_url> -X POST \
#     -d '{"inputs":"My name is Teven and I am"}' \
#     -H "Authorization: Bearer <token>" \
#     -H "Content-Type: application/json"

hf_token = os.getenv("TOKEN_HF")

# Streaming client bound to the endpoint.
client = InferenceClient(endpoint_url, token=hf_token)


########################################################################
# Prompt construction
########################################################################
def generate_prompt_with_history(text, history, tokenizer=None, max_length=2048):
    """Flatten the chat history plus an optional new user turn into one prompt.

    Args:
        text: The new user message ("" if it is already the last history entry).
        history: List of [user_message, bot_message] pairs; a bot_message of
            None/"" marks the turn currently being generated.
        tokenizer: Accepted for call-site compatibility; unused here.
            NOTE(review): a real implementation would token-truncate — confirm.
        max_length: Maximum number of characters kept (truncated from the left
            so the most recent turns survive).

    Returns:
        A "User:/Assistant:" formatted prompt string ending with "Assistant:".
    """
    lines = []
    for user_turn, bot_turn in history:
        if user_turn:
            lines.append(f"User: {user_turn}")
        if bot_turn:
            lines.append(f"Assistant: {bot_turn}")
    if text:
        lines.append(f"User: {text}")
    lines.append("Assistant:")
    prompt = "\n".join(lines)
    # Naive character-level truncation from the left; keeps the newest context.
    return prompt[-max_length:]


########################################################################
# Use the chat model to generate text
########################################################################
def predict(text, chatbotGr, history, top_p, temperature, max_length_tokens, max_context_length_tokens,):
    """Stream a completion for `text`, yielding (chatbot, history, status) tuples.

    Args:
        text: New user message.
        chatbotGr: Current gr.Chatbot value (list of [user, bot] pairs).
        history: Conversation history as [user, bot] pairs.
        top_p, temperature, max_length_tokens, max_context_length_tokens:
            Sampling / length parameters forwarded to the endpoint.

    Yields:
        (chatbot_pairs, history_pairs, status_message) after each token.
    """
    if text == "":
        yield chatbotGr, history, "Empty context."
        return
    try:
        client
    except NameError:
        yield [[text, "No Model Found"]], [], "No Endpoint Found"
        return

    # Generation parameters for the endpoint.
    gen_kwargs = dict(
        max_new_tokens=max_length_tokens,
        top_k=30,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=1.02,
        stop_sequences=["\nUser:", "<|endoftext|>", ""],
    )

    # Build the prompt from the accumulated history plus the new turn.
    prompt = generate_prompt_with_history(text, history, max_length=max_context_length_tokens)

    # Stream tokens and surface partial output after each one.
    history = history + [[text, ""]]
    for r in client.text_generation(prompt, stream=True, details=True, **gen_kwargs):
        if r.token.special:
            continue  # skip BOS/EOS-style special tokens
        if r.token.text in gen_kwargs["stop_sequences"]:
            break  # client-side stop in case the server streams the stop text
        history[-1][1] += r.token.text
        yield history, history, "Generating..."
    yield history, history, "Done."


########################################################################
# Gradio UI
########################################################################
with open("custom.css", "r", encoding="utf-8") as f:
    customCSS = f.read()

with gr.Blocks(css=customCSS) as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Append the user's turn (bot reply pending) and clear the textbox."""
        return "", history + [[user_message, None]]

    def bot(history):
        """Stream the model's reply into the last history entry, token by token."""
        # Generation parameters for the endpoint.
        gen_kwargs = dict(
            max_new_tokens=512,
            top_k=30,
            top_p=0.9,
            temperature=0.2,
            repetition_penalty=1.02,
            stop_sequences=["\nUser:", "<|endoftext|>", ""],
        )

        # The pending user turn is already the last history entry, so no new text.
        prompt = generate_prompt_with_history("", history)

        stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)

        history[-1][1] = ""
        # With details=True the stream yields response objects, not plain
        # strings — read the text off each token.
        for r in stream:
            if r.token.special:
                continue  # skip special tokens
            if r.token.text in gen_kwargs["stop_sequences"]:
                break  # stop when a stop sequence is emitted
            history[-1][1] += r.token.text
            time.sleep(0.05)  # slow the stream slightly for a typing effect
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()