Update app.py
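Caps max_new_tokens at 10, truncates the prompt to 10 tokens, and streams the generated text back as newline-delimited JSON in 10-character chunks.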
app.py CHANGED
@@ -41,7 +41,7 @@ class GenerateRequest(BaseModel):
     input_text: str = ""
     task_type: str
     temperature: float = 1.0
-    max_new_tokens: int = 200
+    max_new_tokens: int = 200  # this will be limited to 10
     stream: bool = True
     top_p: float = 1.0
     top_k: int = 50
@@ -130,7 +130,7 @@ async def generate(request: GenerateRequest):
     input_text = request.input_text
     task_type = request.task_type
     temperature = request.temperature
-    max_new_tokens = request.max_new_tokens
+    max_new_tokens = request.max_new_tokens  # this value will be used to constrain the output
     stream = request.stream
     top_p = request.top_p
     top_k = request.top_k
@@ -147,7 +147,7 @@ async def generate(request: GenerateRequest):
     if "text-to-text" == task_type:
         generation_config = GenerationConfig(
             temperature=temperature,
-            max_new_tokens=max_new_tokens,
+            max_new_tokens=min(max_new_tokens, 10),  # Constrain max_new_tokens to 10
             top_p=top_p,
             top_k=top_k,
             repetition_penalty=repetition_penalty,
@@ -158,7 +158,7 @@ async def generate(request: GenerateRequest):
         return StreamingResponse(
             stream_text(model, tokenizer, input_text,
                         generation_config, stop_sequences,
-                        device),
+                        device, max_length=10),
             media_type="text/plain"
         )
     else:
@@ -172,7 +172,7 @@ async def generate(request: GenerateRequest):
 
 async def stream_text(model, tokenizer, input_text,
                       generation_config, stop_sequences,
-                      device, max_length
+                      device, max_length):
     encoded_input = tokenizer(
         input_text, return_tensors="pt",
         truncation=True, max_length=max_length
@@ -186,6 +186,7 @@ async def stream_text(model, tokenizer, input_text,
     generation_config.max_new_tokens = min(
         remaining_tokens, generation_config.max_new_tokens
     )
+
 
     def find_stop(output_text, stop_sequences):
         for seq in stop_sequences:
@@ -222,23 +223,28 @@ async def stream_text(model, tokenizer, input_text,
 
         if stop_index != -1:
             final_output = output_text[:stop_index]
+            chunked_output = [final_output[i:i+10]
+                              for i in range(0, len(final_output), 10)]
 
-            for
-            yield json.dumps({"text":
-
+            for chunk in chunked_output:
+                yield json.dumps({"text": chunk, "is_end": False}) + "\n"
 
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
 
         else:
+            chunked_output = [new_text[i:i+10]
+                              for i in range(0, len(new_text), 10)]
 
-
-
+            for chunk in chunked_output:
+                yield json.dumps({"text": chunk, "is_end": False}) + "\n"
 
         if len(output_text) >= generation_config.max_new_tokens:
-
-
+            chunked_output = [output_text[i:i+10]
+                              for i in range(0, len(output_text), 10)]
 
+            for chunk in chunked_output:
+                yield json.dumps({"text": chunk, "is_end": False}) + "\n"
             yield json.dumps({"text": "", "is_end": True}) + "\n"
             break
 
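For reference, a minimal sketch of how a client might consume the resulting stream. The /generate route, host, and port are assumptions (they are not shown in the diff); the request fields and the {"text": ..., "is_end": ...} chunk format come from app.py above.

import json

import requests

payload = {
    "input_text": "Hello",
    "task_type": "text-to-text",
    "temperature": 1.0,
    "max_new_tokens": 200,  # the server now caps this at 10
    "stream": True,
}

# Hypothetical endpoint; the actual route and port are not shown in the diff.
url = "http://localhost:7860/generate"

# stream=True keeps the connection open so chunks can be read as they arrive.
with requests.post(url, json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk["is_end"]:
            break
        print(chunk["text"], end="", flush=True)

Each printed piece should be at most 10 characters, mirroring the chunking introduced in the diff.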