Spaces:
Running
Running
Commit
·
371c49c
1
Parent(s):
3795233
fix(quant): add torchao to quantization option
Browse files
- hardware.yaml +1 -0
- src/kernels.py +8 -1
hardware.yaml
CHANGED
@@ -32,6 +32,7 @@
|
|
32 |
- awq
|
33 |
- bnb
|
34 |
- gptq
|
|
|
35 |
backends:
|
36 |
- pytorch
|
37 |
|
|
|
32 |
- awq
|
33 |
- bnb
|
34 |
- gptq
|
35 |
+
- torchao
|
36 |
backends:
|
37 |
- pytorch
|
38 |
|
src/kernels.py
CHANGED
@@ -38,6 +38,7 @@ def get_quant_df(llm_perf_df):
|
|
38 |
exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
|
39 |
gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
|
40 |
gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
|
|
|
41 |
# merge the three dataframes
|
42 |
exllamav1_df = pd.merge(
|
43 |
vanilla_df,
|
@@ -63,8 +64,14 @@ def get_quant_df(llm_perf_df):
|
|
63 |
on=["Model 🤗"],
|
64 |
suffixes=["", " Custom Kernel"],
|
65 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# concat the two dataframes row-wise
|
67 |
-
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
68 |
# compute speedups
|
69 |
quant_df["Prefill Speedup (%)"] = (
|
70 |
(quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
|
|
|
38 |
exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
|
39 |
gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
|
40 |
gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
|
41 |
+
torchao_df = copy_df[(copy_df["Quantization 🗜️"] == "torchao.4bit")]
|
42 |
# merge the three dataframes
|
43 |
exllamav1_df = pd.merge(
|
44 |
vanilla_df,
|
|
|
64 |
on=["Model 🤗"],
|
65 |
suffixes=["", " Custom Kernel"],
|
66 |
)
|
67 |
+
torchao_df = pd.merge(
|
68 |
+
vanilla_df,
|
69 |
+
torchao_df,
|
70 |
+
on=["Model 🤗"],
|
71 |
+
suffixes=["", " Custom Kernel"],
|
72 |
+
)
|
73 |
# concat the two dataframes row-wise
|
74 |
+
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df, torchao_df])
|
75 |
# compute speedups
|
76 |
quant_df["Prefill Speedup (%)"] = (
|
77 |
(quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
|