NexaAIDev
/

Octopus-v2-gguf-awq

@@ -31,12 +31,37 @@ Run with [Ollama](https://github.com/ollama/ollama)
 ollama run NexaAIDev/octopus-v2-Q4_K_M
 ```
-# AWQ Quantization
-Python example:
 ```python
 from awq import AutoAWQForCausalLM
-from transformers import AutoTokenizer, GemmaForCausalLM
 import torch
 import time
 import numpy as np
@@ -51,28 +76,25 @@ def inference(input_text):
     start_time = time.time()
     generation_output = model.generate(
         tokens,
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.95,
-        top_k=40,
         max_new_tokens=512
     )
     end_time = time.time()
-    res = tokenizer.decode(generation_output[0])
-    res = res.split(input_text)
     latency = end_time - start_time
-    output_tokens = tokenizer.encode(res)
-    num_output_tokens = len(output_tokens)
     throughput = num_output_tokens / latency
-    return {"output": res[-1], "latency": latency, "throughput": throughput}
-model_id = "path/to/Octopus-v2-AWQ"
 model = AutoAWQForCausalLM.from_quantized(model_id, fuse_layers=True,
                                           trust_remote_code=False, safetensors=True)
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=False)
 prompts = ["Below is the query from the users, please call the correct function and generate the parameters to call the function.\n\nQuery: Can you take a photo using the back camera and save it to the default location? \n\nResponse:"]

 ollama run NexaAIDev/octopus-v2-Q4_K_M
 ```
+Input example:
+```text
+"Below is the query from the users, please call the correct function and generate the parameters to call the function.\n\nQuery: Take a selfie for me with front camera \n\nResponse:"
+```
+Output function example:
+```python
+def get_trending_news(category=None, region='US', language='en', max_results=5):
+    """
+    Fetches trending news articles based on category, region, and language.
+    Parameters:
+    - category (str, optional): News category to filter by, by default use None for all categories. Optional to provide.
+    - region (str, optional): ISO 3166-1 alpha-2 country code for region-specific news, by default, uses 'US'. Optional to provide.
+    - language (str, optional): ISO 639-1 language code for article language, by default uses 'en'. Optional to provide.
+    - max_results (int, optional): Maximum number of articles to return, by default, uses 5. Optional to provide.
+    Returns:
+    - list[str]: A list of strings, each representing an article. Each string contains the article's heading and URL.
+    """
+```
+## AWQ Quantization
+Input Python example:
 ```python
+from transformers import AutoTokenizer
 from awq import AutoAWQForCausalLM
 import torch
 import time
 import numpy as np
     start_time = time.time()
     generation_output = model.generate(
         tokens,
+        do_sample=False,
+        temperature=0,
         max_new_tokens=512
     )
     end_time = time.time()
+    generated_sequence = generation_output[:, input_length:].tolist()
+    res = tokenizer.decode(generated_sequence[0])
     latency = end_time - start_time
+    num_output_tokens = len(generated_sequence[0])
     throughput = num_output_tokens / latency
+    return {"output": res, "latency": latency, "throughput": throughput}
+model_id = "NexaAIDev/Octopus-v2-gguf-awq"
+tokenizer = AutoTokenizer.from_pretrained(model_id,
+trust_remote_code=False)
 model = AutoAWQForCausalLM.from_quantized(model_id, fuse_layers=True,
                                           trust_remote_code=False, safetensors=True)
 prompts = ["Below is the query from the users, please call the correct function and generate the parameters to call the function.\n\nQuery: Can you take a photo using the back camera and save it to the default location? \n\nResponse:"]