import os
import time

import gradio as gr
from huggingface_hub import InferenceClient

bloom_repo = "bigscience/bloom"

bloom_template = """Text translation.
{source} text: {query}
{target} translated text: """

bloom_model_kwargs = dict(
    max_new_tokens=1000,
    temperature=0.3,
    # truncate=1512,
    seed=42,
    stop_sequences=["</s>", "<|endoftext|>", "<|end|>"],
    top_p=0.95,
    repetition_penalty=1.1,
)

client = InferenceClient(model=bloom_repo, token=os.environ.get("HUGGINGFACEHUB_API_TOKEN", None))


def split_text_into_chunks(text, chunk_size=1000):
    """Split the input into newline-delimited chunks of roughly chunk_size characters."""
    lines = text.split('\n')
    chunks = []
    chunk = ""
    for line in lines:
        # If adding the current line doesn't exceed the chunk size, add the line to the chunk
        if len(chunk) + len(line) <= chunk_size:
            chunk += line + "\n"
        else:
            # If adding the line exceeds the chunk size, store the current chunk and start a new one
            chunks.append(chunk)
            chunk = line + "\n"
    # Don't forget the last chunk
    chunks.append(chunk)
    return chunks


def translation(source, target, text):
    output = ""
    result = ""
    chunks = split_text_into_chunks(text)
    for chunk in chunks:
        try:
            # Fill the prompt template with the language names and the current chunk
            input_prompt = bloom_template.replace("{source}", source)
            input_prompt = input_prompt.replace("{target}", target)
            input_prompt = input_prompt.replace("{query}", chunk)
            stream = client.text_generation(input_prompt, stream=True, details=True,
                                            return_full_text=False, **bloom_model_kwargs)
            for response in stream:
                output += response.token.text
                # Trim any stop sequence the model emitted at the end of the text
                for stop_str in bloom_model_kwargs['stop_sequences']:
                    if output.endswith(stop_str):
                        output = output[:-len(stop_str)]
                # Stream the translation accumulated so far
                yield output.replace("</s>", "\n")
            result += output
        except Exception as e:
            print(f"ERROR: LLM show {e}")
            time.sleep(1)
    # Fall back to the original text if every chunk failed
    if result == "":
        result = text
    return result.replace("</s>", "\n").strip()


gr.Interface(translation, inputs=["text", "text", "text"], outputs="text").queue(concurrency_count=100).launch()
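# Minimal usage sketch (not from the original app): `translation` is a generator, so it can also be
# exercised outside the Gradio UI, assuming HUGGINGFACEHUB_API_TOKEN is set and the Inference API
# is reachable. The language names below are illustrative only. Kept commented out so it never
# runs alongside the server:
#
#   for partial in translation("English", "French", "The weather is nice today."):
#       print(partial)
#
# Each yielded string is the translation accumulated so far; the last one is the full result.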