Microsoft Open Source

bapatra committed on May 2, 2024

Commit

47e359f

1 Parent(s): cd3817b

chore(root): Initial files upload.

Browse files

- Upload 3 files (25ca391cc729c0dadf60afa925f7d6c5f3cac5fe)

Co-authored-by: Barun <[email protected]>

Files changed (23) hide show

.gitattributes +35 -0
.gitignore +0 -160
CODE_OF_CONDUCT.md +9 -0
LICENSE +21 -0
README.md +276 -0
SECURITY.md +41 -0
SUPPORT.md +25 -0
cl100k_base.tiktoken +0 -0
config.json +47 -0
configuration_phi3_small.py +250 -0
generation_config.json +9 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +426 -0
modeling_phi3_small.py +1140 -0
positional_embedding.py +288 -0
special_tokens_map.json +5 -0
tokenization_phi3_small.py +315 -0
tokenizer_config.json +16 -0
triton_blocksparse_attention_layer.py +176 -0
triton_flash_blocksparse_attn.py +1943 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,160 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-.pybuilder/
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# pytype static type analyzer
-.pytype/
-# Cython debug symbols
-cython_debug/
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/

CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,9 @@

+# Microsoft Open Source Code of Conduct
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+Resources:
+- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+    MIT License
+    Copyright (c) Microsoft Corporation.
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,276 @@

+---
+license: mit
+license_link: https://huggingface.co/microsoft/Phi-3-small-8k-instruct/resolve/main/LICENSE
+language:
+- multilingual
+pipeline_tag: text-generation
+tags:
+- nlp
+- code
+inference:
+  parameters:
+    temperature: 0.7
+widget:
+  - messages:
+      - role: user
+        content: Can you provide ways to eat combinations of bananas and dragonfruits?
+---
+## Model Summary
+The Phi-3-Small-8K-Instruct is a 7B-parameter, lightweight, state-of-the-art open model trained with the Phi-3 datasets, which include both synthetic data and filtered publicly available website data, with a focus on high-quality and reasoning-dense properties.
+The model belongs to the Phi-3 family with the Small version in two variants [8K](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) and [128K](https://huggingface.co/microsoft/Phi-3-small-128k-instruct) which is the context length (in tokens) that it can support.
+The model has undergone a post-training process that incorporates both supervised fine-tuning and direct preference optimization for instruction following and safety measures.
+When assessed against benchmarks testing common sense, language understanding, math, code, long context and logical reasoning, Phi-3-Small-8K-Instruct showcased a robust and state-of-the-art performance among models of the same-size and next-size-up.
+Resources and Technical Documentation:
++ [Phi-3 Microsoft Blog](https://aka.ms/Phi-3Build2024)
++ [Phi-3 Technical Report](https://aka.ms/phi3-tech-report)
++ [Phi-3 on Azure AI Studio](https://aka.ms/phi3-azure-ai)
+|         | Short Context | Long Context |
+| ------- | ------------- | ------------ |
+| Mini    | 4K [[HF]](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) ; [[GGUF]](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx)|
+| Small   | 8K [[HF]](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-small-8k-instruct-onnx-cuda) | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-small-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-small-128k-instruct-onnx-cuda)|
+| Medium  | 4K [[HF]](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-cuda) | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-cuda)|
+| Vision  |  | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
+## Intended Uses
+**Primary use cases**
+The model is intended for broad commercial and research use in English. The model provides uses for general purpose AI systems and applications which require:
+1) Memory/compute constrained environments
+2) Latency bound scenarios
+3) Strong reasoning (especially code, math and logic)
+Our model is designed to accelerate research on language and multimodal models, for use as a building block for generative AI powered features.
+**Use case considerations**
+Our models are not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of language models as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness before using within a specific downstream use case, particularly for high-risk scenarios. Developers should be aware of and adhere to applicable laws or regulations (including privacy, trade compliance laws, etc.) that are relevant to their use case.
+Nothing contained in this Model Card should be interpreted as or deemed a restriction or modification to the license the model is released under.
+## How to Use
+Phi-3-Small-8K-Instruct has been integrated in the development version (4.40.2) of `transformers`. Until the official version is released through `pip`, ensure that you are doing one of the following:
+* Install tiktoken (0.6.0) and triton (2.3.0)
+* When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function.
+* Update your local `transformers` to the development version: `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`. The previous command is an alternative to cloning and installing from the source.
+The current `transformers` version can be verified with: `pip list | grep transformers`.
+Phi-3-Small-8K-Instruct is also available in [Azure AI](https://ai.azure.com/explore/models?&selectedCollection=phi).
+### Tokenizer
+Phi-3-Small-8K-Instruct supports a vocabulary size of up to `100352` tokens.
+### Chat Format
+Given the nature of the training data, the Phi-3-Small-8K-Instruct model is best suited for prompts using the chat format as follows.
+You can provide the prompt as a question with a generic template as follows:
+```markdown
+<|endoftext|><|user|>\nQuestion <|end|>\n<|assistant|>
+```
+For example:
+```markdown
+<|endoftext|><|user|>
+How to explain Internet for a medieval knight?<|end|>
+<|assistant|>
+```
+where the model generates the text after `<|assistant|>`. In the case of a few-shot prompt, the prompt can be formatted as follows:
+```markdown
+<|endoftext|><|user|>
+I am going to Paris, what should I see?<|end|>
+<|assistant|>
+Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:\n\n1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city.\n2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa.\n3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.\n\nThese are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world."<|end|>
+<|user|>
+What is so great about #1?<|end|>
+<|assistant|>
+```
+### Sample inference code
+This code snippet shows how to quickly get started with running the model on a GPU:
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+torch.random.manual_seed(0)
+model_id = "microsoft/Phi-3-small-8k-instruct"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype="auto",
+    trust_remote_code=True,
+)
+assert torch.cuda.is_available(), "This model needs a GPU to run ..."
+device = torch.cuda.current_device()
+model = model.to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+messages = [
+    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
+    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
+    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
+]
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device=device
+)
+generation_args = {
+    "max_new_tokens": 500,
+    "return_full_text": False,
+    "temperature": 0.0,
+    "do_sample": False,
+}
+output = pipe(messages, **generation_args)
+print(output[0]['generated_text'])
+```
+*Some applications/frameworks might not include a BOS token (`<|endoftext|>`) at the start of the conversation. Please ensure that it is included since it provides more reliable results.*
+## Responsible AI Considerations
+Like other language models, the Phi series models can potentially behave in ways that are unfair, unreliable, or offensive. Some of the limiting behaviors to be aware of include:
++ Quality of Service: the Phi models are trained primarily on English text. Languages other than English will experience worse performance. English language varieties with less representation in the training data might experience worse performance than standard American English.
++ Representation of Harms & Perpetuation of Stereotypes: These models can over- or under-represent groups of people, erase representation of some groups, or reinforce demeaning or negative stereotypes. Despite safety post-training, these limitations may still be present due to differing levels of representation of different groups or prevalence of examples of negative stereotypes in training data that reflect real-world patterns and societal biases.
++ Inappropriate or Offensive Content: these models may produce other types of inappropriate or offensive content, which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the use case.
++ Information Reliability: Language models can generate nonsensical content or fabricate content that might sound reasonable but is inaccurate or outdated.
++ Limited Scope for Code: The majority of Phi-3 training data is based in Python and uses common packages such as "typing, math, random, collections, datetime, itertools". If the model generates Python scripts that utilize other packages or scripts in other languages, we strongly recommend users manually verify all API uses.
+Developers should apply responsible AI best practices and are responsible for ensuring that a specific use case complies with relevant laws and regulations (e.g. privacy, trade, etc.). Important areas for consideration include:
++ Allocation: Models may not be suitable for scenarios that could have consequential impact on legal status or the allocation of resources or life opportunities (ex: housing, employment, credit, etc.) without further assessments and additional debiasing techniques.
++ High-Risk Scenarios: Developers should assess suitability of using models in high-risk scenarios where unfair, unreliable or offensive outputs might be extremely costly or lead to harm. This includes providing advice in sensitive or expert domains where accuracy and reliability are critical (ex: legal or health advice). Additional safeguards should be implemented at the application level according to the deployment context.
++ Misinformation: Models may produce inaccurate information. Developers should follow transparency best practices and inform end-users they are interacting with an AI system. At the application level, developers can build feedback mechanisms and pipelines to ground responses in use-case specific, contextual information, a technique known as Retrieval Augmented Generation (RAG).
++ Generation of Harmful Content: Developers should assess outputs for their context and use available safety classifiers or custom solutions appropriate for their use case.
++ Misuse: Other forms of misuse such as fraud, spam, or malware production may be possible, and developers should ensure that their applications do not violate applicable laws and regulations.
+## Training
+### Model
+* Architecture: Phi-3 Small-8K-Instruct has 7B parameters and is a dense decoder-only Transformer model. The model is fine-tuned with Supervised fine-tuning (SFT) and Direct Preference Optimization (DPO) to ensure alignment with human preferences and safety guidelines.
+* Inputs: Text. It is best suited for prompts using chat format.
+* Context length: 8K tokens
+* GPUs: 1024 H100-80G
+* Training time: 18 days
+* Training data: 4.8T tokens
+* Outputs: Generated text in response to the input
+* Dates: Our models were trained between February and April 2024
+* Status: This is a static model trained on an offline dataset with cutoff date October 2023. Future versions of the tuned models may be released as we improve models.
+* Release date: The model weights are released on May 21, 2024.
+### Datasets
+Our training data includes a wide variety of sources, totaling 4.8 trillion tokens (including 10% multilingual), and is a combination of
+1) Publicly available documents filtered rigorously for quality, selected high-quality educational data, and code;
+2) Newly created synthetic, “textbook-like” data for the purpose of teaching math, coding, common sense reasoning, general knowledge of the world (science, daily activities, theory of mind, etc.);
+3) High quality chat format supervised data covering various topics to reflect human preferences on different aspects such as instruct-following, truthfulness, honesty and helpfulness.
+We are focusing on the quality of data that could potentially improve the reasoning ability of the model, and we filter the publicly available documents to contain the correct level of knowledge. As an example, the result of a Premier League game on a particular day might be good training data for frontier models, but we need to remove such information to leave more model capacity for reasoning in the small-size models. More details about data can be found in the [Phi-3 Technical Report](https://aka.ms/phi3-tech-report).
+## Benchmarks
+We report the results for Phi-3-Small-8K-Instruct on standard open-source benchmarks measuring the model's reasoning ability (both common sense reasoning and logical reasoning). We compare to Mixtral-8x7b, Gemini-Pro, Gemma 7B, Llama-3-8B-Instruct, GPT-3.5-Turbo-1106, and GPT-4-Turbo-1106.
+All the reported numbers are produced with the exact same pipeline to ensure that the numbers are comparable. These numbers might differ from other published numbers due to slightly different choices in the evaluation.
+As is now standard, we use few-shot prompts to evaluate the models, at temperature 0.
+The prompts and number of shots are part of a Microsoft internal tool to evaluate language models, and in particular we did no optimization to the pipeline for Phi-3.
+More specifically, we do not change prompts, pick different few-shot examples, change prompt format, or do any other form of optimization for the model.
+The number of k–shot examples is listed per-benchmark.
+|Benchmark|Phi-3-Small-8K-Instruct<br>7b|Gemma<br>7B|Mixtral<br>8x7B|Llama-3-Instruct<br>8b|GPT-3.5-Turbo<br>version 1106|Gemini<br>Pro|GPT-4-Turbo<br>version 1106 (Chat)|
+|---------|-----------------------|--------|-------------|-------------------|-----------------|----------|------------------------|
+|AGI Eval<br>5-shot|45.1|42.1|45.2|42.0|48.4|49.0|59.6|
+|MMLU<br>5-shot|75.7|63.6|70.5|66.5|71.4|66.7|84.0|
+|BigBench Hard<br>3-shot|79.1|59.6|69.7|51.5|68.3|75.6|87.7|
+|ANLI<br>7-shot|58.1|48.7|55.2|57.3|58.1|64.2|71.7|
+|HellaSwag<br>5-shot|77.0|49.8|70.4|71.1|78.8|76.2|88.3|
+|ARC Challenge<br>10-shot|90.7|78.3|87.3|82.8|87.4|88.3|95.6|
+|ARC Easy<br>10-shot|97.0|91.4|95.6|93.4|96.3|96.1|98.8|
+|BoolQ<br>2-shot|84.8|66.0|76.6|80.9|79.1|86.4|91.3|
+|CommonsenseQA<br>10-shot|80.0|76.2|78.1|79.0|79.6|81.8|86.7|
+|MedQA<br>2-shot|65.4|49.6|62.2|60.5|63.4|58.2|83.7|
+|OpenBookQA<br>10-shot|88.0|78.6|85.8|82.6|86.0|86.4|93.4|
+|PIQA<br>5-shot|86.9|78.1|86.0|75.7|86.6|86.2|90.1|
+|Social IQA<br>5-shot|79.2|65.5|75.9|73.9|68.3|75.4|81.7|
+|TruthfulQA (MC2)<br>10-shot|70.2|52.1|60.1|63.2|67.7|72.6|85.2|
+|WinoGrande<br>5-shot|81.5|55.6|62.0|65.0|68.8|72.2|86.7|
+|TriviaQA<br>5-shot|58.1|72.3|82.2|67.7|85.8|80.2|73.3|
+|GSM8K Chain of Thought<br>8-shot|89.6|59.8|64.7|77.4|78.1|80.4|94.2|
+|HumanEval<br>0-shot|61.0|34.1|37.8|60.4|62.2|64.4|79.9|
+|MBPP<br>3-shot|71.7|51.5|60.2|67.7|77.8|73.2|86.7|
+|Average|75.7|61.8|69.8|69.4|74.3|75.4|85.2|
+We take a closer look at different categories across 80 public benchmark datasets in the table below:
+|Benchmark|Phi-3-Small-8K-Instruct<br>7b|Gemma<br>7B|Mixtral<br>8x7B|Llama-3-Instruct<br>8b|GPT-3.5-Turbo<br>version 1106|Gemini<br>Pro|GPT-4-Turbo<br>version 1106 (Chat)|
+|--------|------------------------|--------|-------------|-------------------|-------------------|----------|------------------------|
+|Popular aggregated benchmark|71.1|59.4|66.2|59.9|67.0|67.5|80.5|
+|Reasoning|82.4|69.1|77.0|75.7|78.3|80.4|89.3|
+|Language understanding|70.6|58.4|64.9|65.4|70.4|75.3|81.6|
+|Code generation|60.7|45.6|52.7|56.4|70.4|66.7|76.1|
+|Math|51.6|35.8|40.3|41.1|52.8|50.9|67.1|
+|Factual knowledge|38.6|46.7|58.6|43.1|63.4|54.6|45.9|
+|Multilingual|62.5|63.2|63.4|65.0|69.1|76.5|82.0|
+|Robustness|72.9|38.4|51.0|64.5|69.3|69.7|84.6|
+## Software
+* [PyTorch](https://github.com/pytorch/pytorch)
+* [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+* [Transformers](https://github.com/huggingface/transformers)
+* [Flash-Attention](https://github.com/HazyResearch/flash-attention)
+* [Tiktoken](https://github.com/openai/tiktoken)
+* [Triton](https://github.com/openai/triton)
+## Hardware
+Note that by default, the Phi-3-Small model uses flash attention, which requires certain types of GPU hardware to run. We have tested on the following GPU types:
+* NVIDIA A100
+* NVIDIA A6000
+* NVIDIA H100
+If you want to run the model on:
++ Optimized inference on GPU, CPU, and Mobile: use the **ONNX** models [8K](https://huggingface.co/microsoft/Phi-3-small-8k-instruct-onnx-cuda)
+## Cross Platform Support
+The ONNX Runtime ecosystem now supports Phi-3 Small models across platforms and hardware.
+Optimized phi-3 models are also published here in ONNX format, to run with ONNX Runtime on CPU and GPU across devices, including server platforms, Windows, Linux and Mac desktops, and mobile CPUs, with the precision best suited to each of these targets. DirectML GPU acceleration is supported for Windows desktops GPUs (AMD, Intel, and NVIDIA).
+Along with DML, ONNX Runtime provides cross-platform support for Phi-3 Small across a range of devices, including CPU, GPU, and mobile.
+Here are some of the optimized configurations we have added:
+1. ONNX models for int4 DML: Quantized to int4 via AWQ
+2. ONNX model for fp16 CUDA
+3. ONNX model for int4 CUDA: Quantized to int4 via RTN
+4. ONNX model for int4 CPU and Mobile: Quantized to int4 via RTN
+## License
+The model is licensed under the [MIT license](https://huggingface.co/microsoft/Phi-3-small-8k-instruct/resolve/main/LICENSE).
+## Trademarks
+This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft’s Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party’s policies.

SECURITY.md ADDED Viewed

	@@ -0,0 +1,41 @@

+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+## Security
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+## Reporting Security Issues
+**Please do not report security vulnerabilities through public GitHub issues.**
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+This information will help us triage your report more quickly.
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+## Preferred Languages
+We prefer all communications to be in English.
+## Policy
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+<!-- END MICROSOFT SECURITY.MD BLOCK -->

SUPPORT.md ADDED Viewed

	@@ -0,0 +1,25 @@

+# TODO: The maintainer of this repo has not yet edited this file
+**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
+- **No CSS support:** Fill out this template with information about how to file issues and get help.
+- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
+- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
+*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
+# Support
+## How to file issues and get help
+This project uses GitHub Issues to track bugs and feature requests. Please search the existing
+issues before filing new issues to avoid duplicates.  For new issues, file your bug or
+feature request as a new Issue.
+For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
+FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
+CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
+## Microsoft Support Policy
+Support for this **PROJECT or PRODUCT** is limited to the resources listed above.

cl100k_base.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "_name_or_path": "Phi-3-small-8k-instruct",
+  "architectures": [
+    "Phi3SmallForCausalLM"
+  ],
+  "attention_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
+    "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
+    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
+  },
+  "blocksparse_block_size": 64,
+  "blocksparse_homo_head_pattern": false,
+  "blocksparse_num_local_blocks": 16,
+  "blocksparse_triton_kernel_block_size": 64,
+  "blocksparse_vert_stride": 8,
+  "bos_token_id": 100257,
+  "dense_attention_every_n_layers": 2,
+  "embedding_dropout_prob": 0.1,
+  "eos_token_id": 100257,
+  "ff_dim_multiplier": null,
+  "ff_intermediate_size": 14336,
+  "ffn_dropout_prob": 0.1,
+  "gegelu_limit": 20.0,
+  "gegelu_pad_to_256": true,
+  "hidden_act": "gegelu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 8192,
+  "model_type": "phi3small",
+  "mup_attn_multiplier": 1.0,
+  "mup_embedding_multiplier": 10.0,
+  "mup_use_scaling": true,
+  "mup_width_multiplier": 8.0,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_sequence_to_multiple_of_64": true,
+  "reorder_and_upcast_attn": false,
+  "rope_embedding_base": 1000000,
+  "rope_position_scale": 1.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.1",
+  "use_cache": true,
+  "vocab_size": 100352
+}

configuration_phi3_small.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Union
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from functools import cached_property
+# NOTE: this string literal comes after the imports, so Python does not treat it
+# as the module docstring (only a string that is the first statement of a module
+# becomes `__doc__`); at runtime it is a no-op expression statement.
+""" Phi3Small model configuration """
+# Module-level logger, created through the `transformers.utils.logging` helper imported above.
+logger = logging.get_logger(__name__)
+def next_mult(x, y):
+    return (x + y - 1) // y * y
+class Phi3SmallConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
+    instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
+    [phi3](https://arxiv.org/pdf/2404.14219) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling `Phi3Small`.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might safely be used with.
+        rope_embedding_base (`float`, *optional*, defaults to 10^6):
+            The base value for the RoPE (Relative Position Encoding) embedding.
+        rope_position_scale (`float`, *optional*, defaults to 1.0):
+            The scale factor for the RoPE position encoding.
+        rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
+            The scaling configuration used for LongRoPE.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            The size of the hidden layers in the model.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            The number of layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            The number of query heads in the model.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            The number of key-value heads in the model.
+        hidden_act (`str`, *optional*, defaults to "gegelu"):
+            The activation function used in the model.
+        gegelu_limit (`float`, *optional*, defaults to 20.0):
+            The limit value for the GELU activation function (for numerical stability).
+        gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
+            Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
+        ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
+            The dimension multiplier for the feed-forward layers.
+        ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
+            The intermediate size for the feed-forward layers.
+            One of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
+        blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
+            Whether to use a homogeneous head pattern for block-sparse attention.
+        blocksparse_block_size (`int`, *optional*, defaults to 64):
+            The block size for block-sparse attention.
+        blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
+            The number of local blocks for block-sparse attention.
+            The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
+        blocksparse_vert_stride (`int`, *optional*, defaults to 8):
+            The vertical stride for block-sparse attention.
+        blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
+            The kernel block size for block-sparse attention.
+        dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
+            The frequency of all dense attention layers in the model
+        embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the embedding layer.
+        attention_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the feed-forward layers.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon value for layer normalization.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The range for weight initialization.
+        mup_use_scaling (`bool`, *optional*, defaults to True):
+            Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
+        mup_width_multiplier (`bool`, *optional*, defaults to 8.0):
+            The width multiplier for MuP.
+        mup_embedding_multiplier (`bool`, *optional*, defaults to 10.0):
+            The embedding multiplier for MuP.
+        mup_attn_multiplier (`bool`, *optional*, defaults to 1.0):
+            The attention multiplier for MuP.
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether to use cache for the model.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the beginning of sentence.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The token ID for the end of sentence.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
+            Whether to reorder and upcast attention.
+        pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
+            Whether to pad the sequence length to a multiple of 64.
+        **kwargs:
+            Additional keyword arguments.
+    Example:
+    ```python
+    >>> from transformers import Phi3SmallConfig, Phi3SmallModel
+    >>> # Initializing a Phi3Small configuration
+    >>> configuration = Phi3SmallConfig()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = Phi3SmallModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "phi3small"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        # General information about the model
+        vocab_size: int =100352,
+        max_position_embeddings: int = 8192,
+        # RoPE Related Parameters
+        rope_embedding_base: float = 10**6,
+        rope_position_scale: float = 1.0,
+        rope_scaling: Optional[Dict[str, Union[float, List[float], int]]] = None,
+        # General Model Parameters
+        hidden_size: int = 4096,
+        num_hidden_layers: int = 32,
+        # KV Shared Attention Configurations
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        # GEGELU Related Parameters
+        hidden_act: str = "gegelu",
+        gegelu_limit: float = 20.0,
+        gegelu_pad_to_256: bool = True,
+        ff_dim_multiplier: Optional[int] = None,
+        ff_intermediate_size: Optional[int] = 14336,
+        # Block Sparse Attention Parameters
+        blocksparse_homo_head_pattern: bool = False,
+        blocksparse_block_size: int = 64,
+        blocksparse_num_local_blocks: int = 16,
+        blocksparse_vert_stride: int = 8,
+        blocksparse_triton_kernel_block_size: int = 64,
+        # Frequency of block-sparsity
+        dense_attention_every_n_layers: Optional[int] = 2,
+        # Reegularization parameters
+        embedding_dropout_prob: float =0.1,
+        attention_dropout_prob: float = 0.0,
+        ffn_dropout_prob: float = 0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        # MuP parameters
+        mup_use_scaling: bool = True,
+        mup_width_multiplier: bool = 8.0,
+        mup_embedding_multiplier: bool = 10.0,
+        mup_attn_multiplier: bool =1.0,
+        use_cache=True,
+        # The model does not have a bos token id
+        # However, in order for some of the downstream libraries to not break
+        # we set this to be the same as the eos_token_id
+        bos_token_id: int = 100257,
+        eos_token_id: int = 100257,
+        reorder_and_upcast_attn=False,
+        # Configuration to pad sequence length to a multiple of 64
+        pad_sequence_to_multiple_of_64: bool = True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_embedding_base = rope_embedding_base
+        self.rope_position_scale = rope_position_scale
+        self.rope_scaling = rope_scaling
+        self.hidden_size = hidden_size
+        # QK Shared Attention
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        # Block Sparse Attention Pattern
+        self.blocksparse_homo_head_pattern = blocksparse_homo_head_pattern
+        self.blocksparse_block_size = blocksparse_block_size
+        self.blocksparse_num_local_blocks = blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = blocksparse_vert_stride
+        self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
+        # Frequency of block sparsity
+        self.dense_attention_every_n_layers = dense_attention_every_n_layers
+        # Activation function
+        self.hidden_act = hidden_act
+        self.gegelu_limit = gegelu_limit
+        self.gegelu_pad_to_256 = gegelu_pad_to_256
+        self.ff_dim_multiplier = ff_dim_multiplier
+        self.ff_intermediate_size = ff_intermediate_size
+        if self.ff_dim_multiplier is None and self.ff_intermediate_size is None:
+            raise ValueError(f"Cannot have both {self.ff_dim_multiplier} and {self.ff_intermediate_size} as None")
+        if self.ff_dim_multiplier is not None and self.ff_intermediate_size is not None:
+            raise ValueError(f"Cannot specify both {self.ff_dim_multiplier} and {self.ff_intermediate_size}.")
+        # General regularization
+        self.embedding_dropout_prob = embedding_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.ffn_dropout_prob = ffn_dropout_prob
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        # MuP parameters
+        self.mup_use_scaling = mup_use_scaling
+        self.mup_width_multiplier = mup_width_multiplier
+        self.mup_embedding_multiplier = mup_embedding_multiplier
+        self.mup_attn_multiplier = mup_attn_multiplier
+        self.use_cache = use_cache
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.pad_sequence_to_multiple_of_64 = pad_sequence_to_multiple_of_64
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # Importing here to avoid circular imports
+        from .tokenization_phi3_small import Phi3SmallTokenizer
+        tokenizer = Phi3SmallTokenizer()
+        return tokenizer.dummy_token_indices
+    @property
+    def intermediate_size(self) -> int:
+        if self.ff_intermediate_size is not None:
+            return self.ff_intermediate_size
+        intermediate_size = (self.ff_dim_multiplier) * (self.hidden_size // 3) * 2
+        if self.gegelu_pad_to_256:
+            intermediate_size = next_mult(intermediate_size, 256)
+        return intermediate_size

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 100257,
+  "eos_token_id": [
+    100257,
+    100266
+  ],
+  "transformers_version": "4.38.1"
+}

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a8435e8fd0cc2a302f057814bb7e2650f16a4812a9b34339e3769e213276797
+size 4832943104

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0be58e1371e8630fff0f8655d6be99a2dfc6ccfb4e00bc4fa85e831b8042eac6
+size 4799608224

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77aa243e7aa0a19eb37eb8dabc6f30de9a779c606cb476e5ac432d742fe7e917
+size 4799608240

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb5772a577868e7e794bed074c19b0d5284a5f9a0a89b537c33623873940f3a
+size 352437304

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,426 @@

+{
+  "metadata": {
+    "total_size": 14784548864
+  },
+  "weight_map": {
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.final_layernorm.bias": "model-00004-of-00004.safetensors",
+    "model.final_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.0.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.dense.bias": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.dense.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.query_key_value.bias": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.query_key_value.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.rotary_emb.inv_freq": "model-00002-of-00004.safetensors",
+    "model.layers.21.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.bias": "model-00004-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.31.self_attn.dense.bias": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.dense.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.query_key_value.bias": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.query_key_value.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.rotary_emb.inv_freq": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.dense.bias": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.dense.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.query_key_value.bias": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.query_key_value.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.rotary_emb.inv_freq": "model-00001-of-00004.safetensors"
+  }
+}

modeling_phi3_small.py ADDED Viewed

	@@ -0,0 +1,1140 @@

+import math
+from typing import Any, Dict, Optional, List, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers.modeling_outputs import SequenceClassifierOutputWithPast, CausalLMOutputWithPast, BaseModelOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from transformers.cache_utils import Cache, DynamicCache
+from .triton_flash_blocksparse_attn import BlockSparseParams
+from .triton_blocksparse_attention_layer import BlockSparseAttentionLayer
+from .positional_embedding import RotaryEmbedding
+from .configuration_phi3_small import Phi3SmallConfig
+# Flash Attention Related Imports
+is_flash_attention_available = False
+try:
+    import flash_attn
+    if int(flash_attn.__version__.split('.')[0]) < 2:
+        from flash_attn.flash_attn_interface import (
+            flash_attn_func,
+            flash_attn_unpadded_kvpacked_func as flash_attn_varlen_kvpacked_func,
+            )
+        # rename `max_seqlen`
+        def flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens, max_seqlen, dropout_p=0.0, **kwargs):
+            return flash_attn_func(qkv, cu_seqlens, dropout_p=dropout_p, max_s=max_seqlen, **kwargs)
+    else:
+        from flash_attn.flash_attn_interface import (
+            flash_attn_varlen_kvpacked_func,
+        )
+        from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
+    is_flash_attention_available = True
+except ImportError:
+    pass
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Type alias: nested tuples of float tensors.
+# NOTE(review): presumably the tuple-based (pre-`Cache`) past key/value layout -- confirm against callers.
+LegacyCache = Tuple[Tuple[torch.FloatTensor]]
+# Taken from https://github.com/allenai/allennlp/blob/main/allennlp/nn/util.py
+def info_value_of_dtype(dtype: torch.dtype):
+    """
+    Returns the `finfo` or `iinfo` object of a given PyTorch data type. Does not allow torch.bool.
+    """
+    if dtype == torch.bool:
+        raise TypeError("Does not support torch.bool")
+    elif dtype.is_floating_point:
+        return torch.finfo(dtype)
+    else:
+        return torch.iinfo(dtype)
+# Taken from https://github.com/allenai/allennlp/blob/main/allennlp/nn/util.py
+def min_value_of_dtype(dtype: torch.dtype):
+    """
+    Returns the minimum value of a given PyTorch data type. Does not allow torch.bool.
+    """
+    return info_value_of_dtype(dtype).min
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+@torch.jit.script
+def quick_gelu(x):
+    return x * torch.sigmoid(1.702 * x)
+@torch.jit.script
+def gegelu(input, limit: Optional[float] = None):
+    a_gelu, a_linear = input[..., ::2], input[..., 1::2]
+    if limit is not None:
+        a_gelu = torch.where(
+            torch.isinf(a_gelu), a_gelu, a_gelu.clamp(min=None, max=limit)
+        )
+        a_linear = torch.where(
+            torch.isinf(a_linear), a_linear, a_linear.clamp(min=-limit, max=limit)
+        )
+    out_gelu = quick_gelu(a_gelu)
+    return out_gelu * (a_linear + 1)
+def collapse_first_n_dims(x: torch.Tensor, n: int) -> torch.Tensor:
+    """
+    Collapse the first `n` dimensions of a tensor into a single dimension.
+    Args:
+        x (torch.Tensor): The input tensor.
+        n (int): The number of dimensions to collapse.
+    Returns:
+        torch.Tensor: The output tensor.
+    """
+    return x.view(-1, *x.shape[n:])
+def pad_tensor_to_next_mult_of(
+    tensor: torch.Tensor,
+    dim: int,
+    n: int,
+) -> Tuple[torch.Tensor, int]:
+    """
+    Pads a tensor along a specified dimension to the next multiple of a given number.
+    Args:
+        tensor (torch.Tensor): The input tensor.
+        dim (int): The dimension along which to pad the tensor.
+        n (int): The number to pad the tensor to the next multiple of.
+    Returns:
+        Tuple[torch.Tensor, int]: A tuple containing the padded tensor and the amount of padding added.
+    """
+    residual = tensor.size(dim) % n
+    if residual == 0:
+        return tensor, 0
+    padding = n - residual
+    padding_tensor = torch.zeros((*tensor.size()[:dim], padding, *tensor.size()[dim + 1:]), device=tensor.device, dtype=tensor.dtype)
+    return torch.cat([tensor, padding_tensor], dim=dim), padding
+def strip_padding_from_tensor(
+    tensor: torch.Tensor,
+    dim: int,
+    residual: int,
+) -> torch.Tensor:
+    """
+    Removes padding from a tensor along a specified dimension.
+    Args:
+        tensor (torch.Tensor): The input tensor.
+        dim (int): The dimension along which to remove padding.
+        residual (int): The amount of padding to remove.
+    Returns:
+        torch.Tensor: The tensor with padding removed along the specified dimension.
+    """
+    return torch.narrow(tensor, dim, 0, tensor.size(dim) - residual)
+class Phi3SmallMLP(nn.Module):
+    def __init__(self, config: Phi3SmallConfig):
+        super().__init__()
+        self.config = config
+        assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the Phi-3-small model .."
+        self.hidden_size = config.hidden_size
+        self.gegelu_limit = config.gegelu_limit
+        self.intermediate_size = config.intermediate_size
+        self.up_proj = nn.Linear(self.hidden_size, 2 * self.intermediate_size)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size)
+        self.dropout = nn.Dropout(config.ffn_dropout_prob)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.dropout(
+            self.down_proj(
+                gegelu(self.up_proj(x), limit=self.gegelu_limit)
+            )
+        )
+class Phi3SmallSelfAttention(nn.Module):
+    def __init__(self, config: Phi3SmallConfig, layer_idx: Optional[int] = None) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.hidden_size = config.hidden_size
+        # Number of Query Heads
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        # Number of Key Value Heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_embedding_base = config.rope_embedding_base
+        self.rope_position_scale = config.rope_position_scale
+        self.is_causal = True
+        self.attention_dropout_rate = config.attention_dropout_prob
+        norm_factor = None
+        if config.mup_use_scaling:
+            norm_factor = self.head_dim / config.mup_attn_multiplier
+        else:
+            norm_factor = math.sqrt(self.head_dim)
+        self.softmax_scale = 1.0 / norm_factor
+        self.query_key_value = nn.Linear(self.hidden_size, (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim)
+        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
+        self.blocksparse_params = None
+        # layer_idx is 0 indexed because that's what the KV Cache expects.
+        if self.config.dense_attention_every_n_layers and ((self.layer_idx + 1) % self.config.dense_attention_every_n_layers == 0):
+            logger.info(
+                f"Layer {layer_idx + 1} is using dense attention since it is divisible by "
+                f"{self.config.dense_attention_every_n_layers}"
+            )
+            assert is_flash_attention_available, "Flash Attention is not available, but is needed for dense attention"
+        else:
+            # BlockSparse related Parameters
+            self.blocksparse_params = BlockSparseParams.from_config(config)
+        if self.blocksparse:
+            active_head_range = None
+            """
+                ... note(bapatra)::
+                    In case of tensor parallelism and while using the heterogeneous head patterns,
+                    the active head range needs to be modified based on the tensor parallel rank
+                    and the tensor parallel world size.
+                    This is because in the case of heterogeneous head patterns, the kernel needs to know
+                    which head is on which device, so that it can pick the corresponding blocksparse head
+                    pattern correctly.
+                    Example:
+                    ```python
+                        if not self.blocksparse_params.homo_head_pattern:
+                            tp_rank = torch.distributed.get_rank() % tp_world_size
+                            num_heads_per_partition = num_heads // tp_world_size
+                            active_head_range = (tp_rank * num_heads_per_partition, (tp_rank + 1) * num_heads_per_partition)
+                    ```
+            """
+            self._blocksparse_layer = BlockSparseAttentionLayer(
+                n_heads=self.num_heads,
+                max_seq_len=self.max_position_embeddings,
+                sparse_block_size=self.blocksparse_params.block_size,
+                local_blocks=self.blocksparse_params.num_local_blocks,
+                vert_stride=self.blocksparse_params.vert_stride,
+                kernel_block_size=self.blocksparse_params.kernel_block_size,
+                homo_head=self.blocksparse_params.homo_head_pattern,
+                active_head_range=active_head_range,
+            )
+        self.rotary_emb = RotaryEmbedding.from_config(config)
+    @property
+    def blocksparse(self):
+        return self.blocksparse_params is not None
+    def _split_heads(self, mixed_x_layer: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        bs, sq, _ = mixed_x_layer.size()
+        r"""
+        The main idea is that we group tensors as
+        [bs, sq, (q00, q01, ... q0m, k0, v0), (q10, q11, ... q1m, k1, v1), ... (qn0, qn1, ... qnm, kn, vn)]
+        That ways, when the MP column sharding happens, this tensor will be sharded keeping all the
+        queries and keys intact. In order to get the correct qkv, we first break into groups, and then
+        index into the groups.
+        """
+        intermediate_shape = (bs, sq, -1, (self.num_q_per_kv + 2), self.head_dim)
+        mixed_x_layer = mixed_x_layer.view(*intermediate_shape)
+        q = mixed_x_layer[:, :, :, :-2]
+        k = mixed_x_layer[:, :, :, [-2]]
+        v = mixed_x_layer[:, :, :, [-1]]
+        q, k, v = [
+            rearrange(
+                x,
+                "bs sq group nh hn -> bs sq (group nh) hn"
+            ) for x in (q, k, v)
+        ]
+        return q, k, v
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._unpad_input
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+    def _apply_blocksparse_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor],
+        return_attention_probs: bool = False,
+    ) -> torch.Tensor:
+        """
+        Applies blocksparse attention to the input tensors.
+        Args:
+            q (torch.Tensor): The query tensor of shape (bs, nqp, seq_len, hn).
+            k (torch.Tensor): The key tensor of shape (bs, nkp, seq_len, hn).
+            v (torch.Tensor): The value tensor of shape (bs, nkp, seq_len, hn).
+            attention_mask (Optional[torch.LongTensor]): The attention mask tensor of shape (bs, seq_len).
+            return_attention_probs (bool, optional): Whether to return attention probabilities. Defaults to False.
+        Returns:
+            torch.Tensor: The context layer tensor of shape (bs, nqp, seq_len, hn).
+        """
+        assert not return_attention_probs, "return_attention_probs is not supported for blocksparse attention"
+        q, k, v = q.contiguous(), k.contiguous(), v.contiguous()
+        # shape: (bs, nqp, seq_len, hn)
+        if torch.is_grad_enabled():
+            # Training or non-batched inference
+            context_layer = self._blocksparse_layer(
+                q=q, k=k, v=v, sm_scale=self.softmax_scale
+            )
+        elif attention_mask is None:
+            if q.size(0) != 1:
+                logger.warning_once(
+                    "You are attempting to do batched inference without passing the attention mask.\n"
+                    "This is okay if you are running loglikelihood requests. However, if you want to do generation, "
+                    "this probably won't work as expected. Please pass the attention mask to the forward function."
+                )
+            context_layer = self._blocksparse_layer(
+                q=q, k=k, v=v, sm_scale=self.softmax_scale
+            )
+        else:
+            """
+                Shapes of tensors are as follows:
+                    q: (bs, nqp, seq_len, hdim)
+                    k: (bs, nkp, seq_len, hdim)
+                    v: (bs, nkp, seq_len, hdim)
+                We first need to transpose the shapes to fit what the
+                kernel needs, and the reinvert it back at the end of the operations
+            """
+            assert attention_mask.ndim == 2, "The kernel, like flash-attention-2, only supports 2d attention masks ..."
+            left_paddings = attention_mask.shape[1] - attention_mask.sum(dim=-1)
+            # shape: (bs, seq_len, nqp, hdim)
+            q = q.transpose(1, 2).contiguous()
+            # shape: (bs, seq_len, nkp, hdim)
+            k = k.transpose(1, 2).contiguous()
+            # shape: (bs, seq_len, nkp, hdim)
+            v = v.transpose(1, 2).contiguous()
+            context_layer = self._blocksparse_layer(
+                q=q, k=k, v=v, sm_scale=self.softmax_scale, left_paddings=left_paddings.to(torch.int32)
+            )
+            # shape: (bs, nqp, seq_len, hdim)
+            context_layer = context_layer.transpose(1, 2).contiguous()
+        return context_layer
+    def _apply_dense_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attention_mask: torch.Tensor,
+        return_attention_probs: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Apply dense attention
+        Args:
+            q (torch.Tensor):
+                The query tensor, shape: (bs, num_query_heads, seq_len, head_size)
+            k (torch.Tensor):
+                The key tensor, shape: (bs, num_query_heads, seq_len, head_size)
+            v (torch.Tensor):
+                The value tensor, shape: (bs, num_query_heads, seq_len, head_size)
+            return_attention_probs (bool, optional):
+                Return the attention probabilities. Defaults to False.
+        Returns:
+            Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+                Return the output of the attention aggregation. If `return_attention_probs` is True, then
+                also return the attention probabilities
+        .. note::
+            Right now, am assuming the expansion for the query key values is already done
+            outside. But ideally, since Flash attention handles the GQA correctly, we can
+            avoid doing that.
+        """
+        attention_dropout_prob = self.attention_dropout_rate if self.training else 0.0
+        # Get into the correct shape for the Flash Attention API
+        # shape: (bs, seq_len, nqp, hn)
+        q = q.transpose(1, 2).contiguous()
+        query_length = q.size(1)
+        # shape: (bs, seq_len, npq, hn)
+        k = k.transpose(1, 2).contiguous()
+        # shape: (bs, seq_len, npq, hn)
+        v = v.transpose(1, 2).contiguous()
+        if attention_mask is not None:
+            causal = q.size(2) == k.size(2)
+            batch_size = q.shape[0]
+            flat_q, flat_k, flat_v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                q, k, v, attention_mask, query_length
+            )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_q, max_seqlen_k = max_seq_lens
+            flat_kv = torch.cat((flat_k.unsqueeze(1), flat_v.unsqueeze(1)), dim=1)
+            attn_output_unpad = flash_attn_varlen_kvpacked_func(
+                q=flat_q,
+                kv=flat_kv,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlen_k,
+                dropout_p=attention_dropout_prob,
+                softmax_scale=self.softmax_scale,
+                causal=causal,
+                return_attn_probs=return_attention_probs
+            )
+            attention_output = pad_input(
+                attn_output_unpad, indices_q, batch_size, query_length
+            )
+        else:
+            kv = torch.cat((k.unsqueeze(2), v.unsqueeze(2)), dim=2)
+            cu_seqlens_q = torch.arange(
+                0, (q.size(0) + 1), device=q.device, dtype=torch.int32
+            ) * q.size(1)
+            cu_seqlens_kv = torch.arange(
+                0, (kv.size(0) + 1), device=kv.device, dtype=torch.int32
+            ) * kv.size(1)
+            max_seqlen_q = q.size(1)
+            max_seqlen_k = kv.size(1)
+            attention_output = flash_attn_varlen_kvpacked_func(
+                q=collapse_first_n_dims(q, 2),
+                kv=collapse_first_n_dims(kv, 2),
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlen_k,
+                dropout_p=attention_dropout_prob,
+                softmax_scale=self.softmax_scale,
+                causal=q.size(1) == kv.size(1),
+                return_attn_probs=return_attention_probs
+            )
+        if return_attention_probs:
+            (context_layer, attn_probs) = attention_output
+            context_layer = context_layer.view(q.size(0), q.size(1), -1, q.size(3)).transpose(1, 2).contiguous()
+            return (context_layer, attn_probs)
+        context_layer = attention_output
+        context_layer = context_layer.view(q.size(0), q.size(1), -1, q.size(3)).transpose(1, 2).contiguous()
+        return context_layer
+    def expand_kv_to_q_size(self, kv: torch.Tensor, num_q_per_kv: int) -> torch.Tensor:
+        """
+        Expand the key-value tensor to match the size of the query tensor.
+        Args:
+            kv (torch.Tensor): The key-value tensor of shape (bsz, nkp, 2, seq_len, hdim).
+            num_q_per_kv (int): The number of queries per key-value.
+        Returns:
+            torch.Tensor: The expanded key-value tensor of shape (bsz, nqp, 2, seq_len, hdim).
+            Where nqp = num_q_per_kv * nkp
+        .. note(bapatra)::
+            Right now, I am using a repeat_interleave to expand the kv to the size of q.
+            This incurs a memory penalty, since the tensors are actually copied.
+            TODO: If this does yield benefits, then potentially we can use the re-written
+            flash attention kernel that can handle GQA.
+        """
+        repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
+        total = repeats.sum()
+        expanded_kv = torch.repeat_interleave(
+            kv,
+            repeats=repeats,
+            dim=1,
+            output_size=total
+        )
+        return expanded_kv
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        The forward function of the Self Attention Layer.
+        Args:
+            hidden_states (torch.Tensor):
+                The input tensor of shape (bs, q_len, h).
+            attention_mask (Optional[torch.Tensor], optional):
+                The attention mask tensor of shape (bs, seq_len). This is the 2D attention mask tensor as is standard in the flash-attention
+                kernel.
+                Defaults to None.
+            position_ids (Optional[torch.LongTensor], optional):
+                The position ids tensor of shape (bs, q_len). Defaults to None. Unused by the function.
+            past_key_value (Optional[Cache], optional):
+                The previous kv cache values. Defaults to None.
+            output_attentions (bool, optional):
+                Whether to return the attention scores. Defaults to False.
+                    .. note::
+                        For the blocksparse attention kernel, we do not support returning the attention scores.
+            use_cache (bool, optional):
+                Whether to use the cache for storing the kv. Defaults to False.
+        Returns:
+            Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+                The output tensor of shape (bs, q_len, h),
+                the attention scores tensor of shape (bs, nqp, q_len, seq_len) if `output_attentions` is True,
+                and the updated cache values if `use_cache` is True.
+        Notations:
+        ------------
+            bs: batch size
+            sq_len: sequence length of the entire sequence
+            q_len: sequence length of the query
+            cache_sq: sequence length in the cache
+                If there is no cache then cache_sq = 0
+                and sq_len = q_len
+                otherwise sq_len = q_len + cache_sq
+            h: hidden size
+            nq: number of query heads
+            nkv: number of key heads
+            hn: hidden size per head
+                hn = h // nq
+            nqp: number of query heads (per MP partition)
+                nqp = nq // (num mp partitions)
+            nkvp: number of key-value heads (per MP partition)
+                nkvp = nk // (num mp partitions)
+        """
+        # shape: (bs, q_len, h)
+        bsz, q_len, _ = hidden_states.size()
+        # shape: (bs, q_len, (nqp + 2 * nkvp) * hn)
+        mixed_x_layer = self.query_key_value(hidden_states)
+        # shape: (bs, q_len, nqp, hn), shape: (bs, q_len, nkvp, hn), shape: (bs, q_len, nkvp, hn)
+        q, k, v = self._split_heads(mixed_x_layer)
+        # shape: (bs, qnp, q_len, hn)
+        query_states = q.permute(0, 2, 1, 3).contiguous()
+        # shape: (bs, nkvp, q_len, hn)
+        key_states = k.permute(0, 2, 1, 3).contiguous()
+        # shape: (bs, nkvp, q_len, hn)
+        value_states = v.permute(0, 2, 1, 3).contiguous()
+        kv_seq_len = key_states.shape[-2]
+        if past_key_values is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            if self.rotary_emb is not None:
+                seqlen_offset = past_key_values.get_usable_length(kv_seq_len, layer_idx=self.layer_idx)
+                # shape: (bs, nqp, q_len, hn), shape: (bs, nkvp, q_len, hn)
+                query_states, key_states = self.rotary_emb(
+                    query_states, key_states, seq_dimension=2, seqlen_offset=seqlen_offset
+                )
+                key_states, value_states = past_key_values.update(key_states=key_states, value_states=value_states, layer_idx=self.layer_idx)
+        else:
+            # In this case seq_len = q_len and cache_sq = 0
+            if self.rotary_emb is not None:
+                # shape: (bs, nqp, seq_len, hn), shape: (bs, nkvp, seq_len, hn)
+                query_states, key_states = self.rotary_emb(query_states, key_states, seq_dimension=2)
+        # shape: (bs, nkvp, 2, seq_len, hn)
+        kv_states = torch.cat((key_states.unsqueeze(2), value_states.unsqueeze(2)), dim=2)
+        # shape: (bs, nqp, 2, seq_len, hn)
+        expanded_kv_states = self.expand_kv_to_q_size(kv_states, num_q_per_kv=self.num_q_per_kv)
+        # shape: (bs, nqp, seq_len, hn), shape: (bs, nqp, seq_len, hn)
+        expanded_key_states, expanded_value_states = expanded_kv_states[:, :, 0], expanded_kv_states[:, :, 1]
+        if self.blocksparse:
+            attn_function_output = self._apply_blocksparse_attention(
+                q=query_states,
+                k=expanded_key_states,
+                v=expanded_value_states,
+                attention_mask=attention_mask,
+                return_attention_probs=output_attentions
+            )
+        else:
+            attn_function_output = self._apply_dense_attention(
+                q=query_states,
+                k=expanded_key_states,
+                v=expanded_value_states,
+                attention_mask=attention_mask,
+                return_attention_probs=output_attentions
+            )
+        attn_weights = None
+        if output_attentions:
+            attn_output, attn_weights = attn_function_output
+        else:
+            # shape: (bs, nqp, seq_len, hn)
+            attn_output = attn_function_output
+        # shape: (bs, seq_len, nqp, hn)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        # shape: (bs, seq_len, h)
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.dense(attn_output)
+        return attn_output, attn_weights, past_key_values
+class Phi3SmallDecoderLayer(nn.Module):
+    def __init__(self, config: Phi3SmallConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Phi3SmallSelfAttention(config, layer_idx)
+        self.mlp = Phi3SmallMLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Cache]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_values = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_values,)
+        return outputs
+class Phi3SmallPreTrainedModel(PreTrainedModel):
+    config_class = Phi3SmallConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Phi3SmallDecoderLayer"]
+    skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = False
+    _supports_cache_class = True
+    def _init_weights(self, module: nn.Module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        # The output projection on the decoder attention layer as well as the down_proj in the MLP are scaled
+        # differently (dubbed `output_layer_init_method` in the Megatron code). This is replicated here
+        for name, p in module.named_parameters():
+            if any(x in name for x in ("c_proj.weight", "down_proj.weight", "o_proj.weight")):
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers)))
+class Phi3SmallModel(Phi3SmallPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        # Embedding Dropout
+        self.embedding_dropout = nn.Dropout(config.embedding_dropout_prob)
+        # MuP Embedding scaling
+        self.mup_embedding_multiplier = config.mup_embedding_multiplier
+        self.layers = nn.ModuleList([Phi3SmallDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
+        self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    @property
+    def pad_sequence_to_multiple_of_64(self):
+        # We only need to do this for the backward pass. So only required
+        # when we are in the context of generating gradients
+        return self.config.pad_sequence_to_multiple_of_64 and torch.is_grad_enabled()
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, LegacyCache]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if attention_mask is not None:
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        inputs_embeds = self.embedding_dropout(inputs_embeds)
+        if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0:
+            inputs_embeds = inputs_embeds * self.mup_embedding_multiplier
+        residual = 0
+        if self.pad_sequence_to_multiple_of_64:
+            # note(bapatra): Since we don't particularly use the position_ids and the attention mask
+            # we don't need to pad them
+            inputs_embeds, residual = pad_tensor_to_next_mult_of(tensor=inputs_embeds, dim=1, n=64)
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_values=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                # Following the Mistral schema for layer return values
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.final_layernorm(hidden_states)
+        if residual > 0:
+            hidden_states = strip_padding_from_tensor(tensor=hidden_states, dim=1, residual=residual)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class Phi3SmallForCausalLM(Phi3SmallPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Phi3SmallModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False)
+        self.mup_width_multiplier = config.mup_width_multiplier
+        # Create the mask for the dummy tokens in the vocabulary
+        dummy_token_indices = config.dummy_token_indices
+        dummy_tokens_mask = torch.zeros(self.vocab_size).bool()
+        dummy_tokens_mask[dummy_token_indices] = True
+        # shape: (vocab_size,)
+        self.register_buffer("dummy_tokens_mask", dummy_tokens_mask, persistent=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, value):
+        self.lm_head = value
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+        if self.mup_width_multiplier:
+            logits = logits / self.mup_width_multiplier
+        logits = logits.masked_fill(self.dummy_tokens_mask, min_value_of_dtype(logits.dtype))
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        # only last token for inputs_ids if past is defined in kwargs
+        if past_key_values:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        else:
+            position_ids = None
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "position_ids": position_ids,
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+# Copied from transformers.models.mistral.modeling_mistral.MistralForSequenceClassification with Mistral -> Phi3Small
+class Phi3SmallForSequenceClassification(Phi3SmallPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Phi3SmallModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = nn.MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = nn.BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

positional_embedding.py ADDED Viewed

	@@ -0,0 +1,288 @@

+"""
+Originally taken verbatim from the xformers library
+https://github.com/facebookresearch/xformers/blob/bcb707576c6a80eaf850aa80e8643d3497ec2bc4/xformers/components/positional_embedding/rotary.py
+The difference is that xformers seems to assume the inputs to be
+(bs, head, seq_len, dim) while we assume (bs, seq_len, head, dim)
+"""
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+# CREDITS: This implementation is inspired by GPT-NeoX https://github.com/EleutherAI/gpt-neox
+# NOTE: Almost the same right now, moving parts to Triton is the next step
+import math
+from typing import List, Optional, Tuple, Dict, Union
+import torch
+import dataclasses
+from transformers.utils import logging
+from transformers import PretrainedConfig
+is_dacite_available = False
+try:
+    import dacite
+    is_dacite_available = True
+except ImportError:
+    pass
+logger = logging.get_logger(__name__)
+@dataclasses.dataclass
+class LongRopeConfig(object):
+    short_factor: List[float]
+    long_factor: List[float]
+    original_max_position_embeddings: int
+    type: str = "longrope"
+    short_mscale: float = -1
+    long_mscale: float = -1
+    def __post_init__(self):
+        assert self.type in ("longrope", "su"), f"Invalid type {self.type} for LongRopeConfig. Expected longrope / su"
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Union[float, List[float], int]]) -> "LongRopeConfig":
+        if is_dacite_available:
+            # Preferred since we can also type check the input
+            return dacite.from_dict(data_class=cls, data=config_dict)
+        kwargs = {}
+        for field in dataclasses.fields(cls):
+            if field.name in config_dict:
+                if field.init:
+                    kwargs[field.name] = config_dict[field.name]
+                else:
+                    raise ValueError(f"Field {field.name} is not initiable")
+            else:
+                if field.default is dataclasses.MISSING:
+                    raise ValueError(f"Field {field.name} is required")
+        extra_keys = set(config_dict.keys()) - set(kwargs.keys())
+        if len(extra_keys) > 0:
+            for key in extra_keys:
+                logger.error(f"Unrecognized key {key} in config_dict")
+            raise ValueError(f"Unrecognized keys in config_dict")
+        return cls(**kwargs)
+def rotate_half(x):
+    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=x1.ndim - 1)
+@torch.jit.script
+def apply_rotary_pos_emb(x, cos, sin, seq_dimension: int):
+    # Rotate `x` with the given cos/sin tables: x * cos + rotate_half(x) * sin.
+    # `seq_dimension` names the axis of `x` that holds the sequence; cos/sin are cut to that
+    # length along their first axis and given singleton axes so they broadcast against `x`.
+    # For any other value of `seq_dimension`, cos/sin are used exactly as passed in.
+    # NOTE(review): the indexing below assumes cos/sin are 2-D (positions, head_dim) - confirm with callers.
+    # NOTE: This could probably be moved to Triton
+    if seq_dimension == 0:
+        cos = cos[: x.shape[0], None, None, :]
+        sin = sin[: x.shape[0], None, None, :]
+    elif seq_dimension == 1:
+        # Handle a possible sequence length mismatch in between q and k
+        cos = cos[None, : x.shape[1], None, :]
+        sin = sin[None, : x.shape[1], None, :]
+    elif seq_dimension == 2:
+        cos = cos[None, None, : x.shape[2], :]
+        sin = sin[None, None, : x.shape[2], :]
+    return (x * cos) + (rotate_half(x) * sin)
+class RotaryEmbedding(torch.nn.Module):
+    """
+    Adapted from the xformers library
+    The rotary position embeddings from RoFormer_ (Su et. al).
+    A crucial insight from the method is that the query and keys are
+    transformed by rotation matrices which depend on the relative positions.
+    Other implementations are available in the Rotary Transformer repo_ and in
+    GPT-NeoX_, GPT-NeoX was an inspiration
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
+        (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
+    # Arguments
+    :param dim_mode: head dimention
+    :param max_seq_len:
+    :param default_seq_dimension: which dim is the sequence length
+    :param dtype: cos/sin dtype
+    :param use_fused_kernel: if to use customized fused kernel.
+        Note: if used, q, k will be modified inplace. Ok for both forward & backward.
+    """
+    def __init__(
+        self,
+        dim_model: int,
+        *,
+        max_seq_len: Optional[int] = None,
+        dtype: Optional[torch.dtype] = None,
+        base=10000,
+        position_scale=1,
+        device: Optional[torch.device] = None,
+        longrope_config: Optional[LongRopeConfig] = None,
+    ):
+        super().__init__()
+        self.base = base
+        self.dim_model = dim_model
+        self.max_seq_len = max_seq_len
+        self.longrope_config = longrope_config
+        if self.is_longrope:
+            # Keep the maximum range vector, and slice from it as needed
+            self.register_buffer(
+                "range_vector",
+                torch.arange(max_seq_len, device=device, dtype=torch.float32),
+                persistent=False
+            )
+            self.register_buffer(
+                "short_factors",
+                torch.tensor(self.longrope_config.short_factor, dtype=torch.float32),
+                persistent=False
+            )
+            self.register_buffer(
+                "long_factors",
+                torch.tensor(self.longrope_config.long_factor, dtype=torch.float32),
+                persistent=False
+            )
+        else:
+            # Generate and save the inverse frequency buffer (non trainable)
+            inv_freq = 1.0 / (base ** (torch.arange(0, dim_model, 2).float().to(device) / self.dim_model))
+            self.register_buffer("inv_freq", inv_freq)
+        self.position_scale = position_scale
+        if not self.is_longrope:
+            dtype = dtype or torch.get_default_dtype()
+            self._set_cos_sin_cache(
+                seq_len=max_seq_len,
+                device=self.inv_freq.device,
+                dtype=dtype,
+            )
+    @property
+    def is_longrope(self):
+        return self.longrope_config is not None
+    @property
+    def original_max_seq_len(self):
+        if self.longrope_config is not None:
+            return self.longrope_config.original_max_position_embeddings
+        logger.warning_once(
+            (
+                "``original_max_seq_len'' is being accessed, but longrope_config has not been set. "
+                "Please only do this if you are sure about the context."
+            )
+        )
+        return self.max_seq_len
+    def get_range_vector(self, seq_len: int, device: torch.device):
+        if self.is_longrope:
+            assert seq_len < self.range_vector.shape[0], f"Found seq_len {seq_len} greater than max_seq_len {self.range_vector.shape[0]}"
+            if self.range_vector.device != device:
+                self.range_vector = self.range_vector.to(device)
+            return self.range_vector[:seq_len]
+        return torch.arange(seq_len, device=device, dtype=torch.float32)
+    def _calc_mscale(self, scale: torch.Tensor) -> torch.Tensor:
+        if scale <= 1.0:
+            return 1.0
+        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_seq_len))
+    def _set_cos_sin_cache(
+        self,
+        seq_len: int,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> None:
+        dtype = dtype or torch.get_default_dtype()
+        self.max_seq_len_cached = seq_len
+        t = (torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32) * self.position_scale).type_as(self.inv_freq)
+        device_type = device.type if device is not None else "cpu"
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            # shape: (seq_len, dim_model // 2)
+            freqs = torch.outer(t, self.inv_freq)
+            # shape: (seq_len, dim_model)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        self.register_buffer("cos_cached", cos.to(dtype), persistent=False)
+        self.register_buffer("sin_cached", sin.to(dtype), persistent=False)
+    def forward(
+        self, q: torch.Tensor,
+        k: torch.Tensor,
+        seq_dimension: int = 1,
+        seqlen_offset: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """q, k does not include `seqlen_offset`
+        q: Either (bs, seq_len, num_heads, head_dim) or (seq_len, bs, num_heads, head_dim)
+        k: Either (bs, seq_len, num_heads, head_dim) or (seq_len, bs, num_heads, head_dim)
+        """
+        if seq_dimension < 0:
+            seq_dimension = k.ndim + seq_dimension
+        assert seq_dimension in (0, 1, 2)
+        seq_len = k.shape[seq_dimension] + seqlen_offset
+        if self.is_longrope:
+            if seq_len > self.original_max_seq_len:
+                t = self.get_range_vector(seq_len, device=q.device)
+                rescale_factors = self.long_factors.to(q.device)
+                long_mscale = self.longrope_config.long_mscale
+                mscale = long_mscale if long_mscale > 0 else self._calc_mscale(self.max_seq_len / self.original_max_seq_len)
+            else:
+                t = self.get_range_vector(self.original_max_seq_len, device=q.device)
+                rescale_factors = self.short_factors.to(q.device)
+                short_mscale = self.longrope_config.short_mscale
+                mscale = short_mscale if short_mscale > 0 else 1.0
+            assert rescale_factors.shape == (self.dim_model // 2, ), (
+                f"misaligned shape for LongRoPE rescale factors:\n"
+                f"\tExpected {(self.dim_model // 2, )}, got {rescale_factors.shape}."
+            )
+            inv_freq = 1.0 / (rescale_factors * (self.base ** (torch.arange(0, self.dim_model, 2).float().to(q.device) / self.dim_model)))
+            device_type = q.device.type if q.device is not None else "cpu"
+            device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+            with torch.autocast(device_type=device_type, enabled=False):
+                freqs = torch.outer(t, inv_freq)
+                emb = torch.cat((freqs, freqs), dim=-1)
+                cos = emb.cos() * mscale
+                sin = emb.sin() * mscale
+            cos_cached = cos.to(q.dtype)
+            sin_cached = sin.to(q.dtype)
+        else:
+            if seq_len > self.max_seq_len_cached:
+                self._set_cos_sin_cache(
+                    seq_len=seq_len,
+                    device=k.device,
+                    dtype=k.dtype,
+                )
+            cos_cached = self.cos_cached
+            sin_cached = self.sin_cached
+        return (
+            apply_rotary_pos_emb(
+                q, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
+            ),
+            apply_rotary_pos_emb(
+                k, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
+            ),
+        )
+    @classmethod
+    def from_config(cls, config: PretrainedConfig) -> "RotaryEmbedding":
+        kwargs = dict(
+            dim_model=config.hidden_size // config.num_attention_heads,
+            max_seq_len=config.max_position_embeddings,
+            base=config.rope_embedding_base,
+            position_scale=config.rope_position_scale,
+        )
+        if config.rope_scaling is not None:
+            kwargs["longrope_config"] = LongRopeConfig.from_dict(config.rope_scaling)
+        return cls(**kwargs)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>"
+}

tokenization_phi3_small.py ADDED Viewed

	@@ -0,0 +1,315 @@

+# Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
+import os
+from typing import Collection, List, Optional, Dict, Set, Tuple, Union
+from functools import cached_property
+import base64
+from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
+import tiktoken
+"""
+    This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
+    with a few additional special tokens to support the ChatML format.
+    TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
+    Maybe in the future, that would be useful? Can add that support later.
+"""
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+# On the megatron codebase, we pad vocabularies to ensure matrix multiplication is fast.
+# This in turn causes some indices to be empty. We account for these empty indices by adding
+# dummy tokens to the tokenizer.
+# Size of the padded vocabulary: valid ids are 0 .. EFFECTIVE_PADDED_VOCAB_SIZE - 1.
+EFFECTIVE_PADDED_VOCAB_SIZE = 100352
+# Highest id that is assigned explicitly in SPECIAL_TOKENS below (<|endofprompt|>);
+# the padding dummies start right after it.
+ACTUAL_VOCAB_SIZE = 100276
+# Padding dummies: <|dummy_id_12|> .. <|dummy_id_86|> mapped to ids 100277 .. 100351.
+# The name index continues after <|dummy_id_11|>, the last dummy listed by hand below.
+DUMMY_TOKENS = {
+    f"<|dummy_id_{11 + offset}|>": 100276 + offset
+    for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
+}
+# Complete special-token table (surface form -> id) handed to tiktoken.Encoding.
+SPECIAL_TOKENS = {
+    # tiktoken.get_encoding("cl100k_base")._special_tokens
+    '<|endoftext|>': 100257,
+    '<|fim_prefix|>': 100258,
+    '<|fim_middle|>': 100259,
+    '<|fim_suffix|>': 100260,
+    # Special tokens for post-training
+    "<|system|>": 100261,
+    "<|user|>": 100262,
+    "<|assistant|>": 100263,
+    # Dummy unused tokens
+    "<|dummy_id_0|>": 100264,
+    "<|dummy_id_1|>": 100265,
+    # Special tokens for post-training continued
+    "<|end|>": 100266,
+    # Some dummy tokens, so that tokenization is contiguous and does not cause issues
+    # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
+    # actually map to anything. So we use a dummy token here.
+    "<|dummy_id_2|>": 100256,
+    # Likewise, tokens from 100267 to 100275 are also unused
+    "<|dummy_id_3|>": 100267,
+    "<|dummy_id_4|>": 100268,
+    "<|dummy_id_5|>": 100269,
+    "<|dummy_id_6|>": 100270,
+    "<|dummy_id_7|>": 100271,
+    "<|dummy_id_8|>": 100272,
+    "<|dummy_id_9|>": 100273,
+    "<|dummy_id_10|>": 100274,
+    "<|dummy_id_11|>": 100275,
+    # The final end of prompt token
+    # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
+    '<|endofprompt|>': 100276,
+    # Dummy tokens to account for padding of the tokenizer
+    # We pad to ensure tensor cores are used for vocab multiplication
+    **DUMMY_TOKENS
+}
+class Phi3SmallTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {
+        "vocab_file": "cl100k_base.tiktoken"
+    }
+    model_input_names: List[str] = ["input_ids", "attention_mask"]
+    padding_side = "left"
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        errors: str = "replace",
+        **kwargs
+    ) -> None:
+        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
+        # if the token is present in `self.special_tokens``. Hence instantiating it here.
+        # The way Qwen gets around this is by checking against SPECIAL_TOKENS
+        # But I think it's better to check against the objects own `special_tokens`
+        # in case we eventually want to allow the tokenizer to have special tokens.
+        self.special_tokens = SPECIAL_TOKENS
+        super().__init__(**kwargs)
+        self.errors = errors
+        base = tiktoken.get_encoding("cl100k_base")
+        if vocab_file is None:
+            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
+        else:
+            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
+        self.pat_str = base._pat_str
+        enc = tiktoken.Encoding(
+            name="phi3small",
+            pat_str=self.pat_str,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+        self.decoder: Dict[int, bytes] = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.eod_id = self.tokenizer.eot_token
+        self._eos_token = self._convert_id_to_token(self.eod_id)
+        # Setting the bos_token to be the same as the eos_token
+        # Note that this is **not** the correct thing to do, and is done
+        # just so that some of the downstream libraries do not break.
+        self._bos_token = self._eos_token
+        # Assign the special tokens to class variables
+        self.system_id = self.special_tokens["<|system|>"]
+        self.user_id = self.special_tokens["<|user|>"]
+        self.assistant_id = self.special_tokens["<|assistant|>"]
+        self.end_id = self.special_tokens["<|end|>"]
+    @cached_property
+    def dummy_token_indices(self) -> List[int]:
+        # There are some additional special tokens in the cl100k_base tokenizer
+        # that we do not use. Hence, we also consider them to be dummy tokens.
+        additional_tokens = [
+            "<|fim_prefix|>",
+            "<|fim_middle|>",
+            "<|fim_suffix|>",
+            "<|endofprompt|>"
+        ]
+        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
+        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
+        return sorted(dummy_token_indices)
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["tokenizer"]
+        return state
+    def __setstate__(self, state):
+        self.__dict__ = state
+        enc = tiktoken.Encoding(
+            name="cl100k_im",
+            pat_str=self.pat_str,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+    def __len__(self):
+        return self.tokenizer.n_vocab
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        *init_inputs,
+        **kwargs,
+    ):
+        cls_kwargs = kwargs
+        # First try to load from the tokenization config if it exists
+        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
+        if tokenization_config:
+            cls_kwargs.update(
+                dict(
+                    model_max_length=tokenization_config["model_max_length"],
+                    chat_template=tokenization_config.get("chat_template", None)
+                )
+            )
+        else:
+            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+            cls_kwargs["model_max_length"] = config.max_position_embeddings
+        return cls(**cls_kwargs)
+    def get_vocab(self) -> Dict[Union[str, bytes], int]:
+        return {**self.mergeable_ranks, **self.special_tokens}
+    def convert_tokens_to_ids(
+        self,
+        tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> Union[int, List[int]]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        ids: List[int] = []
+        for token in tokens:
+            ids.append(self.convert_tokens_to_ids(token))
+        return ids
+    def _add_tokens(
+            self,
+            new_tokens: Union[List[str], List[AddedToken]],
+            special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Only special tokens can be added to this tokenizer")
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in self.special_tokens:
+                raise ValueError(
+                    "For now, we do not support unknown special tokens\n"
+                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
+                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
+                    "And finally, we can re-construct the enc object back\n"
+                )
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
+        with open(file_path, "w") as f:
+            for token, rank in self.mergeable_ranks.items():
+                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
+                f.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs
+    ) -> List[Union[bytes, str]]:
+        tokens: List[Union[bytes, str]] = []
+        for token_id in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[token_id])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    @property
+    def eos_token_id(self) -> int:
+        return self.eod_id
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_phi3_small.Phi3SmallTokenizer",
+      "tokenization_phi3_small.Phi3SmallTokenizer"
+    ]
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "Phi3SmallTokenizer"
+}

triton_blocksparse_attention_layer.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import math
+from typing import Optional, Tuple, TypeVar
+import torch.nn as nn
+import torch
+import triton
+from functools import lru_cache
+from .triton_flash_blocksparse_attn import get_local_strided_sparse_attention_op, _get_sparse_attn_mask, blocksparse_flash_attn_padded_fwd, blocksparse_flash_attn_varlen_fwd
+Layout = Tuple[torch.LongTensor, torch.LongTensor]
+def create_sparse_attn_mask(
+    n_heads: int,
+    max_seq_len: int,
+    max_seq_len_k: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    BLOCK: int,
+    local_blocks: int,
+    vert_stride: int,
+    homo_head: bool,
+    return_dense: bool
+) -> Tuple[Layout, torch.Tensor, Optional[torch.Tensor]]:
+    layout, block_sparse_pattern, _ = _get_sparse_attn_mask(
+        n_heads=n_heads,
+        q_len=max_seq_len,
+        N_CTX=max_seq_len_k,
+        dtype=dtype,
+        device=device,
+        BLOCK=BLOCK,
+        local_blocks=local_blocks,
+        vert_stride=vert_stride,
+        homo_head=homo_head,
+        return_dense=return_dense
+    )
+    return layout, block_sparse_pattern
+class BlockSparseAttentionLayer(nn.Module):
+    def __init__(
+        self,
+        n_heads: int,
+        max_seq_len: int,
+        sparse_block_size: int,
+        local_blocks: int,
+        vert_stride: int,
+        kernel_block_size: Optional[int] = None,
+        homo_head: bool = False,
+        active_head_range: Optional[Tuple[int]] = None
+    ) -> None:
+        super().__init__()
+        self.n_heads = n_heads
+        self.max_seq_len = max_seq_len
+        self.sparse_block_size = sparse_block_size
+        self.kernel_block_size = kernel_block_size or sparse_block_size
+        self.local_blocks = local_blocks
+        self.vert_stride = vert_stride
+        self.homo_head = homo_head
+        self.active_head_range = active_head_range
+        # Internal Parameters used by the layer
+        self._sparse_block_mask = None
+        self._sparse_layout = None
+        self._dtype = None
+        self._device = None
+        # TODO(bapatra): Ideally, I'd want to keep all the code for
+        # forward to be handled here, and not branch for training and inference.
+        # However, that refactor would need a lot of testing. For now, using the
+        # training op as is, and will refactor again later.
+    def prune_blocksparse_layout_to_heads(self, h_start: int, h_end: int) -> None:
+        self._sparse_block_mask = self._sparse_block_mask[h_start: h_end]
+        self._sparse_layout[0] = self._sparse_layout[0][h_start: h_end]
+        self._sparse_layout[1] = self._sparse_layout[1][h_start: h_end]
+    def _initialize_internals(
+        self,
+        dtype: torch.dtype,
+        device: torch.device
+    ) -> None:
+        self._dtype, self._device = dtype, device
+        self._sparse_layout, self._sparse_block_mask = create_sparse_attn_mask(
+            n_heads=self.n_heads,
+            max_seq_len=self.max_seq_len,
+            max_seq_len_k=self.max_seq_len,
+            dtype=dtype,
+            device=device,
+            BLOCK=self.sparse_block_size,
+            local_blocks=self.local_blocks,
+            vert_stride=self.vert_stride,
+            homo_head=self.homo_head,
+            return_dense=False,
+        )
+        if (not self.homo_head) and (self.active_head_range is not None):
+            assert len(self.active_head_range) == 2, "\"active_head_range\" should be a tuple of start/end index of the heads."
+            h_start, h_end = self.active_head_range
+            self.prune_blocksparse_layout_to_heads(h_start=h_start, h_end=h_end)
+        assert self.sparse_block_size % self.kernel_block_size == 0,  f"The sparse block size must be a multiple of {self.kernel_block_size}. Found {self.sparse_block_size}."
+        assert self.kernel_block_size >=16 and math.log2(self.kernel_block_size) % 1 == 0, f"block_size must be power of 2 and at least 16, but {self.kernel_block_size} is given"
+        if self.sparse_block_size // self.kernel_block_size > 1:
+            _mul = self.sparse_block_size // self.kernel_block_size
+            # need to consider if block_m and block_n are different
+            self._sparse_block_mask = torch.kron(self._sparse_block_mask, self._sparse_block_mask.new_ones(_mul, _mul))
+            num_sparse_blocks = self._sparse_block_mask.size(-1)
+            block_causal_mask = torch.arange(0, num_sparse_blocks)[:, None] >= torch.arange(0, num_sparse_blocks)[None]
+            self._sparse_block_mask *= block_causal_mask.type_as(self._sparse_block_mask)
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        sm_scale: float,
+        *,
+        # Arguments Related to Block Attention Inference
+        left_paddings: Optional[torch.LongTensor] = None,
+        seqlens: Optional[torch.LongTensor] = None,
+        # Arguements Related to Variable Length Inference
+        cu_seqlens_k: Optional[torch.LongTensor] = None,
+        cu_seqlens_q: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        if left_paddings is None and seqlens is None and cu_seqlens_k is None and cu_seqlens_q is None:
+            blocksparse_op = get_local_strided_sparse_attention_op(
+                n_heads=self.n_heads,
+                max_seq_len=self.max_seq_len,
+                sparse_block_size=self.sparse_block_size,
+                kernel_block_size=self.kernel_block_size,
+                local_blocks=self.local_blocks,
+                vert_stride=self.vert_stride,
+                homo_head=self.homo_head,
+                device=q.device,
+                inference=not self.training
+            )
+            return blocksparse_op(q, k, v, sm_scale)
+        assert not torch.is_grad_enabled(), "Variable Length Inference / Batched inference is not supported during training. Please run it in a torch.no_grad() context"
+        # First set internals if they have not been set
+        if self._sparse_block_mask is None or (self._dtype != q.dtype) or (self._device != q.device):
+            self._initialize_internals(dtype=q.dtype, device=q.device)
+        if k.dim() == 3:
+            assert cu_seqlens_k is not None
+            return blocksparse_flash_attn_varlen_fwd(
+                q=q,
+                k=k,
+                v=v,
+                cu_seqlens_k=cu_seqlens_k,
+                cu_seqlens_q=cu_seqlens_q,
+                sm_scale=sm_scale,
+                sparse_layout=self._sparse_layout,
+                block_size=self.kernel_block_size,
+                max_seqlen=self.max_seq_len,
+            )
+        if k.dim() == 4:
+            assert not (left_paddings is None and seqlens is None), "Either left_paddings or seqlens must be provided for batched inference."
+            return blocksparse_flash_attn_padded_fwd(
+                q=q,
+                k=k,
+                v=v,
+                sm_scale=sm_scale,
+                sparse_layout=self._sparse_layout,
+                left_paddings=left_paddings,
+                seqlens=seqlens,
+                block_size=self.kernel_block_size,
+                max_seqlen=self.max_seq_len,
+            )
+        raise ValueError('q/k/v must be either 3 dim for variable-length input or 4 dim for fixed-length.')

triton_flash_blocksparse_attn.py ADDED Viewed

	@@ -0,0 +1,1943 @@

+"""
+    Author: Eric Lin (xihlin)
+"""
+"""
+    ... note(bapatra)::
+        This is written as one big file, instead of splitting into logical components because I was running into issues with transformers auto module
+        imports when splitting into different files. I've tried keeping the logical partitions demarcated with comment blocks, but it is not ideal.
+        In the future, would be really good to revisit this and refactor into a more readable file structure.
+"""
+from typing import TypeVar
+from functools import lru_cache
+import math
+import pytest
+import torch
+import numpy as np
+import triton
+import triton.language as tl
+import os
+import dataclasses
+Phi3SmallConfig = TypeVar('Phi3SmallConfig')
+# triton 2.0.0: fail at backward on A100, for the examples, if h_dim=128.
+# Done
+#  1. strided of qkv
+#  2. seq len not power of 2
+#  3. bf16 with Triton May, 2023
+# TODO:
+#  1. wip: support non-contiguous backward, also help reduce memory allocation in training (q, k, v split)
+#  2. block sparse with different BLOCK_M, BLOCK_N?
+#  3. for Lq not divisible by BLOCK_M, BLOCK_N, only apply mask to K/V on last batch, still need to apply mask on Q.
+#     Attempted, failed to compile
+#  4. For 2nd iter of inference,  BLOCK_M=1, how to make things work?  K/V may not be divisible by BLOCK_N.
+#  5. The inner loop can also be parallelized via bigger num_stage(better) or on different thread-block (via m/L and atomic update, but this no-comm/sync between blocks)
+###########################################################
+################### Kernel Parameters #####################
+###########################################################
+@dataclasses.dataclass
+class BlockSparseParams(object):
+    block_size: int
+    kernel_block_size: int
+    num_local_blocks: int
+    vert_stride: int
+    homo_head_pattern: bool = False
+    @classmethod
+    def from_config(cls, config: Phi3SmallConfig) -> "BlockSparseParams":
+        return cls(
+            block_size=config.blocksparse_block_size,
+            kernel_block_size=config.blocksparse_triton_kernel_block_size,
+            num_local_blocks=config.blocksparse_num_local_blocks,
+            vert_stride=config.blocksparse_vert_stride,
+            homo_head_pattern=config.blocksparse_homo_head_pattern,
+        )
+###########################################################
+###########################################################
+###########################################################
+################### Utility Functions #####################
+###########################################################
+# helper functions for 3D sparse pattern
+# these functions are not optimized and are very inefficient. Avoid calling them too frequently.
+# currently, it is only called within `get_local_strided_sparse_attention_op`, which is cached.
+def dense_to_crow_col(x):
+    ''' Turning a 2D/3D torch tensor (x) to CSR rows/cols indexing.
+    param:
+    TODO:
+        1. improve efficiency, is it faster if done in CPU, or customize a cuda kernel for it?
+    NOTE: col_indices padded -1
+    '''
+    pad = -1
+    dim = x.dim()
+    assert x.dim() in (2, 3)
+    if x.dim() == 2:
+        x = x[None]
+    x = [xi.to_sparse_csr() for xi in x]
+    crows = torch.vstack([xi.crow_indices() for xi in x])
+    cols = [xi.col_indices() for xi in x]
+    max_cols = max(len(xi) for xi in cols)
+    cols = [torch.cat([xi, pad + xi.new_zeros(max_cols - xi.shape[0])]) for xi in cols]
+    cols = torch.vstack(cols)
+    if dim == 2:
+        crows = crows[0]
+        cols = cols[0]
+    return crows, cols
+def crow_col_to_dense(crows, cols, dtype=torch.float16):
+    dim = crows.dim()
+    if dim == 1:
+        crows = crows[None]
+        cols = cols[None]
+    device = crows.device
+    crows, cols = crows.cpu(), cols.cpu()  # faster in cpu
+    shape = (crows.shape[0], crows.shape[1] - 1, cols.max() + 1)
+    x = torch.zeros(shape, dtype=dtype)
+    for i in range(shape[0]):
+        for j in range(shape[1]):
+            x[i, j, cols[i, crows[i, j]:crows[i, j+1]]] = 1
+    if dim == 1:
+        x = x[0]
+    return x.to(device)
+def dense_to_ccol_row(x):
+    '''Similar, but to CSC format
+    '''
+    x = x.transpose(-2, -1)
+    return dense_to_crow_col(x)
+def ccol_row_to_dense(ccol, rows, dtype=torch.float16):
+    return crow_col_to_dense(ccol, rows, dtype).permute(0, 2, 1).contiguous()
+def _get_sparse_attn_mask_homo_head(q_len, N_CTX, dtype, device, BLOCK=128, local_blocks=4, vert_stride=4, return_dense=False):
+    '''
+    :return: a tuple of 3:
+        - tuple of crow_indices, col_indices representation of CSR format.
+        - block dense mask
+        - all token dense mask (be aware that it can be OOM if it is too big) if `return_dense==True`, otherwise, None
+    '''
+    with torch.no_grad():
+        N_BLOCK = triton.cdiv(N_CTX, BLOCK)
+        q_pos = torch.arange(N_BLOCK)[:, None]
+        k_pos = torch.arange(N_BLOCK)[None]
+        mask_vert_strided = (torch.arange(N_BLOCK) + 1) % vert_stride == 0
+        block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).to(device).to(dtype)
+        N_BLOCK_Q = triton.cdiv(q_len, BLOCK)
+        block_mask_dense_output = block_mask_dense[-N_BLOCK_Q:].contiguous().to_sparse_csr()
+    if return_dense:
+        mask_dense = torch.kron(block_mask_dense, block_mask_dense.new_ones((BLOCK, BLOCK)))
+        causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(mask_dense)[-q_len:]
+        mask_dense = mask_dense[-q_len:, :N_CTX] * causal_mask
+        return (block_mask_dense_output.crow_indices(), block_mask_dense_output.col_indices()), block_mask_dense, mask_dense
+    else:
+        return (block_mask_dense_output.crow_indices(), block_mask_dense_output.col_indices()), block_mask_dense, None
+def _get_sparse_attn_mask(n_heads, q_len, N_CTX, dtype, device, BLOCK=128, local_blocks=4, vert_stride=4, homo_head=True, return_dense=False):
+    '''
+    :return: a tuple of 3:
+        - tuple of crow_indices, col_indices representation of CSR format.
+        - block dense mask
+        - all token dense mask (be aware that it can be OOM if it is too big) if `return_dense==True`, otherwise, None
+    '''
+    if homo_head:
+        with torch.no_grad():
+            (crow, col), block_mask_dense, mask_dense = _get_sparse_attn_mask_homo_head(q_len, N_CTX, dtype, device, BLOCK, local_blocks, vert_stride, return_dense)
+            crow = crow[None].expand(n_heads, crow.shape[0])
+            col = col[None].expand(n_heads, col.shape[0])
+            if return_dense:
+                mask_dense = mask_dense[None].expand(n_heads, *mask_dense.shape)
+            return (crow, col), block_mask_dense, mask_dense
+    with torch.no_grad():
+        N_BLOCK = triton.cdiv(N_CTX, BLOCK)
+        q_pos = torch.arange(N_BLOCK)[None, :, None]
+        k_pos = torch.arange(N_BLOCK)[None, None]
+        head_sliding_step = max(1, int(vert_stride / n_heads))  # if vert_stride <= n_heads, rotating the heads
+        mask_vert_strided = [(torch.arange(N_BLOCK) + h * head_sliding_step + 1) % vert_stride == 0 for h in range(n_heads)]
+        mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1)
+        block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).to(device).to(dtype)
+        N_BLOCK_Q = triton.cdiv(q_len, BLOCK)
+        block_mask_dense_output = block_mask_dense[:, -N_BLOCK_Q:]
+    if return_dense:
+        mask_dense = torch.kron(block_mask_dense, block_mask_dense.new_ones((BLOCK, BLOCK)))
+        causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(mask_dense)[-q_len:]
+        mask_dense = mask_dense[..., -q_len:, :N_CTX] * causal_mask[None]
+        return dense_to_crow_col(block_mask_dense_output), block_mask_dense, mask_dense
+    else:
+        return dense_to_crow_col(block_mask_dense_output), block_mask_dense, None
+def get_sparse_attn_mask(q, N_CTX, *args, **kwargs):
+    return _get_sparse_attn_mask(q.size(1), q.size(2), N_CTX, q.dtype, q.device, *args, **kwargs)
+###########################################################
+###########################################################
+###########################################################
+###################### Training Kernels ###################
+###########################################################
+# TODO: only apply loading/saving mask on the last iteration for EVEN_N_BLOCK, useful for 1st iteration of inference.
+#    Experiment failed inside loop.
+#    Another idea: only on saving? load even out of boundary(will it causes illegal access error)?
+@triton.jit
+def _fwd_kernel(
+    Q, K, V, sm_scale,
+    layout_crow_ptr,
+    layout_col_ptr,
+    layout_crow_stride_h, layout_crow_stride_m,
+    layout_col_stride_h, layout_col_stride_m,
+    TMP, L, M,  # NOTE: TMP is a scratchpad buffer to workaround a compiler bug. TMP, L, M are assumed to have contiguous layouts
+    Out,
+    stride_qz, stride_qh, stride_qm, stride_qd,
+    stride_kz, stride_kh, stride_kn, stride_kd,
+    stride_vz, stride_vh, stride_vn, stride_vd,
+    stride_oz, stride_oh, stride_om, stride_od,
+    Z, H, N_CTX,
+    PAST_LEN,
+    Q_ROUNDED_LEN,
+    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    EVEN_M_BLOCK: tl.constexpr,
+    EVEN_N_BLOCK: tl.constexpr,
+    INFERENCE: tl.constexpr,
+    NUM_DBLOCKS: tl.constexpr,
+):
+    # Block-sparse causal attention forward pass with a running max (m_i) and running
+    # normalizer (l_i) per query row.
+    # Each program handles one BLOCK_M-row query block (program_id(0)) of one batch*head index
+    # (program_id(1)) and only visits the key/value blocks listed for that query block in the
+    # CSR layout (layout_crow_ptr / layout_col_ptr).
+    # When NUM_DBLOCKS >= 2 the head dimension is handled as two BLOCK_DMODEL-wide halves
+    # (q/q2 and acc/acc2). Z is not referenced in the body.
+    # number of query rows: key length minus the past length
+    Q_LEN = N_CTX - PAST_LEN
+    start_m = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    off_h = off_hz % H
+    off_z = off_hz // H
+    Q += off_z * stride_qz + off_h * stride_qh
+    K += off_z * stride_kz + off_h * stride_kh
+    V += off_z * stride_vz + off_h * stride_vh
+    # initialize offsets
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    off_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
+    # off_k = offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
+    # K offsets are laid out as [BLOCK_DMODEL, BLOCK_N], so tl.dot(q, k) below yields a [BLOCK_M, BLOCK_N] score tile
+    off_k = offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kd
+    off_v = offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
+    # Initialize pointers to Q, K, V
+    q_ptrs = Q + off_q
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+    # initialize pointer to m and l
+    # NOTE: t_ptrs is only needed by the commented-out store/load workaround inside the loop
+    t_ptrs = TMP + off_hz * Q_ROUNDED_LEN + offs_m
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    if NUM_DBLOCKS >= 2:
+        acc2 = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # load q: it will stay in SRAM throughout
+    if EVEN_M_BLOCK:
+        q = tl.load(q_ptrs)
+        if NUM_DBLOCKS >= 2:
+            q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd)
+    else:
+        q = tl.load(q_ptrs, mask=offs_m[:, None] < Q_LEN)
+        if NUM_DBLOCKS >= 2:
+            q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd, mask=offs_m[:, None] < Q_LEN)
+    # CSR row pointers of this query block: [start_l, end_l) indexes into the column list
+    layout_ptr = layout_crow_ptr + off_h * layout_crow_stride_h + start_m * layout_crow_stride_m
+    start_l = tl.load(layout_ptr).to(tl.int32)
+    end_l = tl.load(layout_ptr + layout_crow_stride_m).to(tl.int32)
+    # loop over k, v and update accumulator
+    for col_idx_idx in range(start_l, end_l):
+        # col_idx is the key/value block index; start_n its first key position
+        col_idx = tl.load(layout_col_ptr +  off_h * layout_col_stride_h + col_idx_idx * layout_col_stride_m).to(tl.int32)
+        start_n = col_idx * BLOCK_N
+        # -- compute qk ----
+        if EVEN_N_BLOCK:
+            k = tl.load(k_ptrs + start_n * stride_kn)
+        else:
+            k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_n[None, :] + start_n < N_CTX)
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        if NUM_DBLOCKS >= 2:
+            if EVEN_N_BLOCK:
+                k = tl.load(k_ptrs + start_n * stride_kn + BLOCK_DMODEL * stride_kd)
+            else:
+                k = tl.load(k_ptrs + start_n * stride_kn + BLOCK_DMODEL * stride_kd, mask=offs_n[None, :] + start_n < N_CTX)
+            qk += tl.dot(q2, k)
+        qk *= sm_scale
+        # causal mask: query row (shifted by PAST_LEN) may only see key positions <= its own
+        qk += tl.where(offs_m[:, None] + PAST_LEN >= (start_n + offs_n[None, :]), 0, float('-inf'))
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        p = tl.exp(qk - m_ij[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+        m_i_new = tl.maximum(m_i, m_ij)
+        alpha = tl.exp(m_i - m_i_new)
+        beta = tl.exp(m_ij - m_i_new)
+        l_i_new = alpha * l_i + beta * l_ij
+        # -- update output accumulator --
+        # acc is kept normalized by the running l_i after every iteration:
+        # the new tile is scaled by beta / l_i_new and the old accumulator by l_i / l_i_new * alpha
+        # scale p
+        p_scale = beta / l_i_new
+        p = p * p_scale[:, None]
+        # scale acc
+        acc_scale = l_i / l_i_new * alpha
+        # tl.store(t_ptrs, acc_scale)
+        # acc_scale = tl.load(t_ptrs)  # BUG: have to store and immediately load
+        acc = acc * acc_scale[:, None]
+        if NUM_DBLOCKS >= 2:
+            acc2 = acc2 * acc_scale[:, None]
+        # cast probabilities to the element type of Q before the second dot
+        p = p.to(Q.dtype.element_ty)
+        # update acc
+        if EVEN_N_BLOCK:
+            v = tl.load(v_ptrs + start_n * stride_vn)
+        else:
+            v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_n[:, None] + start_n < N_CTX)
+        acc += tl.dot(p, v)
+        if NUM_DBLOCKS >= 2:
+            if EVEN_N_BLOCK:
+                v = tl.load(v_ptrs + start_n * stride_vn + BLOCK_DMODEL * stride_vd)
+            else:
+                v = tl.load(v_ptrs + start_n * stride_vn + BLOCK_DMODEL * stride_vd, mask=offs_n[:, None] + start_n < N_CTX)
+            acc2 += tl.dot(p, v)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+    # rematerialize offsets to save registers
+    # start_m = tl.program_id(0)
+    # offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    # write back l and m
+    # (skipped when INFERENCE; rows are addressed with a per-(batch, head) stride of N_CTX)
+    if not INFERENCE:
+        l_ptrs = L + off_hz * N_CTX + offs_m
+        m_ptrs = M + off_hz * N_CTX + offs_m
+        if EVEN_M_BLOCK:
+            tl.store(l_ptrs, l_i)
+            tl.store(m_ptrs, m_i)
+        else:
+            tl.store(l_ptrs, l_i,  mask=offs_m < Q_LEN)
+            tl.store(m_ptrs, m_i,  mask=offs_m < Q_LEN)
+    # initialize pointers to output
+    # offs_n = tl.arange(0, BLOCK_DMODEL)
+    off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od
+    out_ptrs = Out + off_o
+    # the output store is always masked to the valid query rows
+    tl.store(out_ptrs, acc,  mask=offs_m[:, None] < Q_LEN)
+    if NUM_DBLOCKS >= 2:
+        tl.store(out_ptrs + BLOCK_DMODEL * stride_od, acc2,  mask=offs_m[:, None] < Q_LEN)
+## backward
+@triton.heuristics(
+    {
+        'EVEN_M_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_M'] == 0,
+    }
+)
+@triton.jit
+def _bwd_preprocess(
+    Out, DO, L, # assume contiguous for Out, DO, L, NewDO, Delta layout.
+    NewDO, Delta,
+    N_CTX,
+    BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr,
+    EVEN_M_BLOCK: tl.constexpr,
+):
+    # Backward pre-pass. For the BLOCK_M rows handled by this program it writes
+    #   NewDO = DO / L            (row-wise division by the value loaded from L)
+    #   Delta = sum(Out * NewDO)  (reduced over the head dimension)
+    # Rows are addressed with a flat index (row * D_HEAD + d), which relies on the contiguous
+    # layout stated in the signature comment.
+    # NOTE(review): the masked branches compare the flat row index with N_CTX, and the L load /
+    # Delta store are unmasked in both branches -- confirm this is intended when
+    # N_CTX % BLOCK_M != 0.
+    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_d = tl.arange(0, D_HEAD)
+    # load
+    if EVEN_M_BLOCK:
+        o = tl.load(Out + off_m[:, None] * D_HEAD + off_d[None, :]).to(tl.float32)
+        do = tl.load(DO + off_m[:, None] * D_HEAD + off_d[None, :]).to(tl.float32)
+    else:
+        o = tl.load(Out + off_m[:, None] * D_HEAD + off_d[None, :], mask=off_m[:, None] < N_CTX).to(tl.float32)
+        do = tl.load(DO + off_m[:, None] * D_HEAD + off_d[None, :], mask=off_m[:, None] < N_CTX).to(tl.float32)
+    denom = tl.load(L + off_m).to(tl.float32)
+    # compute
+    do = do / denom[:, None]
+    delta = tl.sum(o * do, axis=1)
+    # write-back
+    if EVEN_M_BLOCK:
+        tl.store(NewDO + off_m[:, None] * D_HEAD + off_d[None, :], do)
+    else:
+        tl.store(NewDO + off_m[:, None] * D_HEAD + off_d[None, :], do,  mask=off_m[:, None] < N_CTX)
+    tl.store(Delta + off_m, delta)
+# Does not support unequal seqlen(q) and seqlen(k)
+@triton.heuristics(
+    {
+        'EVEN_M_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_M'] == 0,
+        'EVEN_N_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_N'] == 0,
+    }
+)
+@triton.jit
+def _bwd_kernel(
+    Q, K, V, sm_scale,
+    layout_ccol_ptr,
+    layout_row_ptr,
+    layout_ccol_stride_h, layout_ccol_stride_m,
+    layout_row_stride_h, layout_row_stride_m,
+    Out, DO,  # assume contigous: Out, Do, DQ, DK, DV, L, M, D, seq(q) == seq(k), with stride_oz, stride_oh, stride_om, stride_od,
+    DQ, DK, DV,
+    L, M,
+    D,
+    stride_qz, stride_qh, stride_qm, stride_qd,
+    stride_kz, stride_kh, stride_kn, stride_kd,
+    stride_vz, stride_vh, stride_vn, stride_vd,
+    stride_oz, stride_oh, stride_om, stride_od,
+    # stride_dz, stride_dh, stride_dm, stride_dd,
+    Z, H, N_CTX,
+    num_block,
+    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    EVEN_M_BLOCK: tl.constexpr,
+    EVEN_N_BLOCK: tl.constexpr,
+    NUM_DBLOCKS: tl.constexpr,
+):
+    # Block-sparse attention backward pass.
+    # Each program handles one BLOCK_N-row key/value block (program_id(0)) of one batch*head
+    # index (program_id(1)). It loops over the query blocks listed for that key block in the
+    # CSC-style layout (layout_ccol_ptr / layout_row_ptr), accumulates dk/dv locally and adds
+    # its dq contribution with atomic adds.
+    # DO is expected to be already divided by the softmax normalizer (see the NOTE in the loop);
+    # D holds the per-row delta values.
+    # When NUM_DBLOCKS >= 2 the head dimension is handled as two BLOCK_DMODEL-wide halves.
+    # Out, L, Z and num_block are not referenced in the body.
+    start_n = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    off_h = off_hz % H
+    # offset pointers for batch/head
+    # (DO, DQ, DK and DV are all offset with the strides of the output tensor)
+    Q += off_z * stride_qz + off_h * stride_qh
+    K += off_z * stride_kz + off_h * stride_kh
+    V += off_z * stride_vz + off_h * stride_vh
+    DO += off_z * stride_oz + off_h * stride_oh
+    DQ += off_z * stride_oz + off_h * stride_oh
+    DK += off_z * stride_oz + off_h * stride_oh
+    DV += off_z * stride_oz + off_h * stride_oh
+    # Look like this loop can be parallelled
+    # for start_n in range(0, num_block):
+    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    # initialize pointers to value-like data
+    k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd)
+    v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd)
+    # pointer to row-wise quantities in value-like data
+    D_ptrs = D + off_hz * N_CTX
+    m_ptrs = M + off_hz * N_CTX
+    # initialize dv and dk
+    dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
+    # k and v stay in SRAM throughout
+    if EVEN_N_BLOCK:
+        k = tl.load(k_ptrs)
+        v = tl.load(v_ptrs)
+    else:
+        k = tl.load(k_ptrs, mask=offs_n[:, None] < N_CTX)
+        v = tl.load(v_ptrs, mask=offs_n[:, None] < N_CTX)
+    if NUM_DBLOCKS >= 2:
+        dv2 = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
+        dk2 = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
+        if EVEN_N_BLOCK:
+            k2 = tl.load(k_ptrs + BLOCK_DMODEL * stride_kd)
+            v2 = tl.load(v_ptrs + BLOCK_DMODEL * stride_vd)
+        else:
+            k2 = tl.load(k_ptrs + BLOCK_DMODEL * stride_kd, mask=offs_n[:, None] < N_CTX)
+            v2 = tl.load(v_ptrs + BLOCK_DMODEL * stride_vd, mask=offs_n[:, None] < N_CTX)
+    # loop over rows
+    # column pointers of this key block: [start_l, end_l) indexes into the row-index list
+    layout_ptr = layout_ccol_ptr + off_h * layout_ccol_stride_h + start_n * layout_ccol_stride_m
+    start_l = tl.load(layout_ptr).to(tl.int32)
+    end_l = tl.load(layout_ptr + layout_ccol_stride_m).to(tl.int32)
+    for row_idx_idx in range(start_l, end_l):
+        # row_idx is the query block index; start_m its first query position
+        row_idx = tl.load(layout_row_ptr + off_h * layout_row_stride_h + row_idx_idx * layout_row_stride_m).to(tl.int32)
+        start_m = row_idx * BLOCK_M
+        # offs_qm = start_m + tl.arange(0, BLOCK_M)
+        offs_m_curr = start_m + offs_m
+        q_ptrs =   Q + (offs_m_curr[:, None] * stride_qm + offs_d[None, :] * stride_qd)
+        do_ptrs = DO + (offs_m_curr[:, None] * stride_om + offs_d[None, :] * stride_od)
+        dq_ptrs = DQ + (offs_m_curr[:, None] * stride_om + offs_d[None, :] * stride_od)
+        # load q, k, v, do on-chip
+        if EVEN_M_BLOCK:
+            q = tl.load(q_ptrs)
+        else:
+            q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < N_CTX)
+        # re-compute p = softmax(qk, dim=-1).T
+        # NOTE: `do` is pre-divided by `l`; no normalization here
+        qk = tl.dot(q, tl.trans(k))
+        if NUM_DBLOCKS >= 2:
+            if EVEN_M_BLOCK:
+                q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd)
+            else:
+                q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd, mask=offs_m_curr[:, None] < N_CTX)
+            qk += tl.dot(q2, tl.trans(k2))
+        # causal mask: a query row only sees key positions <= its own
+        qk += tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), 0, float('-inf'))
+        if EVEN_M_BLOCK:
+            m = tl.load(m_ptrs + offs_m_curr)
+        else:
+            m = tl.load(m_ptrs + offs_m_curr, mask=offs_m_curr < N_CTX)
+        # exponentiate relative to the row value loaded from M
+        p = tl.exp(qk * sm_scale - m[:, None])
+        # compute dv
+        if EVEN_M_BLOCK:
+            do = tl.load(do_ptrs)
+        else:
+            do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < N_CTX)
+        if NUM_DBLOCKS >= 2:
+            if EVEN_M_BLOCK:
+                do2 = tl.load(do_ptrs + BLOCK_DMODEL * stride_od)
+            else:
+                do2 = tl.load(do_ptrs + BLOCK_DMODEL * stride_od, mask=offs_m_curr[:, None] < N_CTX)
+        dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do)
+        if NUM_DBLOCKS >= 2:
+            dv2 += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do2)
+        # compute dp = dot(v, do)
+        if EVEN_M_BLOCK:
+            Di = tl.load(D_ptrs + offs_m_curr)
+        else:
+            Di = tl.load(D_ptrs + offs_m_curr, mask=offs_m_curr < N_CTX)
+        # dp starts at -Di so that, after adding do @ v^T, it holds (dp - delta)
+        dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
+        dp += tl.dot(do, tl.trans(v))
+        if NUM_DBLOCKS >= 2:
+            dp += tl.dot(do2, tl.trans(v2))
+        # compute ds = p * (dp - delta[:, None])
+        ds = p * dp * sm_scale
+        # compute dk = dot(ds.T, q)
+        dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q)
+        if NUM_DBLOCKS >= 2:
+            dk2 += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q2)
+        # # compute dq
+        # dq is added atomically because programs for other key blocks write the same dq rows
+        dq = tl.dot(ds.to(Q.dtype.element_ty), k)
+        if EVEN_M_BLOCK:
+            tl.atomic_add(dq_ptrs, dq)
+        else:
+            tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < N_CTX)
+        if NUM_DBLOCKS >= 2:
+            dq2 = tl.dot(ds.to(Q.dtype.element_ty), k2)
+            dq_ptrs2 = dq_ptrs + BLOCK_DMODEL * stride_od
+            if EVEN_M_BLOCK:
+                tl.atomic_add(dq_ptrs2, dq2)
+            else:
+                tl.atomic_add(dq_ptrs2, dq2, mask=offs_m_curr[:, None] < N_CTX)
+    # write-back
+    # dv/dk of this key block are written once, after all listed query blocks were visited
+    dv_ptrs = DV + (offs_n[:, None] * stride_om + offs_d[None, :] * stride_od)
+    dk_ptrs = DK + (offs_n[:, None] * stride_om + offs_d[None, :] * stride_od)
+    if EVEN_N_BLOCK:
+        tl.store(dv_ptrs, dv)
+        tl.store(dk_ptrs, dk)
+    else:
+        tl.store(dv_ptrs, dv, mask=offs_n[:, None] < N_CTX)
+        tl.store(dk_ptrs, dk, mask=offs_n[:, None] < N_CTX)
+    if NUM_DBLOCKS >= 2:
+        dv_ptrs2 = dv_ptrs + BLOCK_DMODEL * stride_od
+        dk_ptrs2 = dk_ptrs + BLOCK_DMODEL * stride_od
+        if EVEN_N_BLOCK:
+            tl.store(dv_ptrs2, dv2)
+            tl.store(dk_ptrs2, dk2)
+        else:
+            tl.store(dv_ptrs2, dv2, mask=offs_n[:, None] < N_CTX)
+            tl.store(dk_ptrs2, dk2, mask=offs_n[:, None] < N_CTX)
+def _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N, num_warps=None, num_stages=1, inference=None, out=None):
+    '''
+    Launch `_fwd_kernel` and store on `ctx` what `_backward` reads (saved tensors, block sizes, grid,
+    sm_scale, num_warps, num_stages).
+    :param ctx: autograd context object; tensors are saved with `ctx.save_for_backward`.
+    :param q, k, v: [batch, n_heads, seq_len, model_dim]. len of q is allowed to be different than k/v.
+        model_dim must be 64 or 128.
+    :param layout_crow_indices, layout_col_indices: same as CSR.crow_indices, and CSR.col_indices used to represent a sparse tensor.
+        Each element represents a block, i.e, all elements in a block are attended, or not attended at all.
+        If the col indices are 1-D, both are expanded (shared) across heads.
+    :param sm_scale: scale multiplied into the q/k scores inside the kernel.
+    :param BLOCK_M, BLOCK_N: query / key block sizes passed to the kernel.
+    :param num_warps: if None, derived from min(BLOCK_M, BLOCK_N, 64); otherwise must be a power of 2.
+    :param num_stages: forwarded to the kernel launch.
+    :param inference: if None, set to True when none of q, k, v requires grad. When True the kernel does
+        not write L/m, and None is saved in their place.
+    :param out: optional preallocated output tensor; defaults to an empty tensor shaped like q.
+    :return: the attention output tensor.
+    '''
+    assert q.shape[-1] == k.shape[-1] == v.shape[-1]
+    assert k.shape[2] == v.shape[2]
+    o = out if out is not None else torch.empty_like(q).contiguous()
+    # one program per (query block, batch * head)
+    grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1])
+    q_rounded_len = grid[0] * BLOCK_M
+    # scratch buffer handed to the kernel as TMP
+    tmp = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
+    if inference is None:
+        inference = (not q.requires_grad) and (not k.requires_grad)  and (not v.requires_grad)
+    if inference:
+        L, m = tmp, tmp  # no need to use create new tensor
+    else:
+        L = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
+        m = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
+    # a 1-D layout describes a single head pattern: expand it over the head dimension
+    if layout_col_indices.dim() == 1:
+        layout_crow_indices = layout_crow_indices[None].expand(q.shape[1] , -1)
+        layout_col_indices = layout_col_indices[None].expand(q.shape[1] , -1)
+    # the head dim is processed in chunks of BLOCK_DMODEL = 64, i.e. NUM_DBLOCKS is 1 or 2
+    assert q.shape[-1] in [64, 128]
+    BLOCK_DMODEL = 64
+    if num_warps is None:
+        MIN_D = min(BLOCK_M, BLOCK_N, BLOCK_DMODEL)
+        num_warps = max(1, 2 ** int(math.log2(MIN_D / 16)))
+        # print(f'> {BLOCK_M=}, {BLOCK_N=}, {BLOCK_DMODEL=}, {num_warps=}, {num_stages=}')
+    else:
+        assert math.log2(num_warps) % 1 == 0, f'''"num_warps" should be power of 2, but got {num_warps}.'''
+    ## For debugging:
+    # print(f'>> {q.shape=}, {k.shape=}, {BLOCK_M=}, {BLOCK_N=}, {num_warps=}, {BLOCK_DMODEL=}, {q.stride()=}, {k.stride()=}')
+    # print(f'>> {layout_crow_indices=}\n{layout_col_indices=}\n {layout_crow_indices.stride()=}, {layout_crow_indices.stride()=}')
+    # print(f'> {q.shape=}, {k.shape=}, {layout_crow_indices.shape}, {layout_col_indices.shape}, {layout_crow_indices.stride()}, \
+    #   {layout_col_indices.stride()}, {layout_crow_indices=}, {layout_col_indices=}')
+    # Positional arguments below follow the `_fwd_kernel` signature:
+    # k.shape[2] is N_CTX and k.shape[2] - q.shape[2] is PAST_LEN.
+    _fwd_kernel[grid](
+        q, k, v, sm_scale,
+        layout_crow_indices,
+        layout_col_indices,
+        layout_crow_indices.stride(0), layout_crow_indices.stride(1),
+        layout_col_indices.stride(0), layout_col_indices.stride(1),
+        tmp, L, m,
+        o,
+        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
+        q.shape[0], q.shape[1], k.shape[2],
+        k.shape[2] - q.shape[2],
+        q_rounded_len,
+        BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
+        EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
+        INFERENCE=inference,
+        NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    # in inference mode L and m only aliased the scratch buffer, so nothing is kept for backward
+    if inference:
+        L, m = None, None
+    ctx.save_for_backward(q, k, v, o, L, m, layout_crow_indices,  layout_col_indices)
+    ctx.BLOCK_M = BLOCK_M
+    ctx.BLOCK_N = BLOCK_N
+    ctx.BLOCK_DMODEL = BLOCK_DMODEL
+    # ctx.BLOCK = BLOCK
+    ctx.grid = grid
+    ctx.sm_scale = sm_scale
+    ctx.num_warps = num_warps
+    ctx.num_stages = num_stages
+    return o
+def _backward(ctx, do, layout_ccol_indices, layout_row_indices, dq=None, dk=None, dv=None):
+    '''
+    Compute the gradients of q, k and v by launching `_bwd_preprocess` followed by `_bwd_kernel`.
+    :param ctx: context filled in by `_forward` (saved tensors, BLOCK_M/BLOCK_N/BLOCK_DMODEL, grid, sm_scale, num_warps).
+    :param do: gradient w.r.t. the attention output; made contiguous if it is not.
+    :param layout_ccol_indices, layout_row_indices: column-compressed layout (see `dense_to_ccol_row`).
+        1-D inputs are expanded (shared) across heads.
+    :param dq, dk, dv: optional preallocated gradient buffers. By default dq is a zero-filled float32 tensor
+        and dk/dv are uninitialized tensors shaped like k/v. The kernel accumulates into dq with atomic adds,
+        so a caller-provided dq presumably has to be zero-initialized -- NOTE(review): confirm.
+    :return: (dq, dk, dv, None, None, None)
+    :raises ValueError: if the saved output `o` is not contiguous.
+    '''
+    # q, k, v, o, l, m = ctx.saved_tensors
+    q, k, v, o, l, m, layout_crow_indices, layout_col_indices = ctx.saved_tensors
+    ## this following too slow to do online, so get it from inputs, which is cached.
+    # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(ctx.layout_crow_indices, ctx.layout_col_indices))
+    # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(layout_crow_indices, layout_col_indices))
+    if not do.is_contiguous():
+        do = do.contiguous()
+        ## for debugging
+        # print(f'----> do is not contiguous: {do.stride()=}')
+        # raise ValueError(f'>>>> output grad is not contiguous: {do.stride()=}')
+    if not o.is_contiguous():
+        # TODO: currently only work with contiguous q/k/v.
+        raise ValueError(f'--> output is not contiguous: {o.stride()=}. This is maybe caused by q/k/v not being contiguous.')
+    # a 1-D layout describes a single head pattern: expand it over the head dimension
+    if layout_ccol_indices.dim() == 1:
+        layout_ccol_indices = layout_ccol_indices[None].expand(q.shape[1], -1)
+        layout_row_indices = layout_row_indices[None].expand(q.shape[1], -1)
+    # do = do.contiguous()
+    dq = dq if dq is not None else torch.zeros_like(q, dtype=torch.float32)
+    dk = dk if dk is not None else torch.empty_like(k)
+    dv =dv if dv is not None else  torch.empty_like(v)
+    do_scaled = torch.empty_like(do)
+    delta = torch.empty_like(l)
+    # the kernel addresses DO, DQ, DK and DV with the strides of `o`, so they must all match
+    assert o.stride() == dq.stride() == dk.stride() == dv.stride() == do_scaled.stride()
+    # pre-pass: do_scaled = do / l and delta = rowsum(o * do_scaled); launched over a flat 1-D grid
+    _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )](
+        o, do, l,
+        do_scaled, delta,
+        k.shape[2],
+        BLOCK_M=ctx.BLOCK_M, D_HEAD=q.shape[-1],
+    )
+    # main pass: one program per (key/value block, batch * head)
+    grid = (triton.cdiv(q.shape[2], ctx.BLOCK_N), ctx.grid[1])
+    _bwd_kernel[grid](
+        q, k, v, ctx.sm_scale,
+        layout_ccol_indices,
+        layout_row_indices,
+        layout_ccol_indices.stride(0), layout_ccol_indices.stride(1),
+        layout_row_indices.stride(0), layout_row_indices.stride(1),
+        o, do_scaled,
+        dq, dk, dv,
+        l, m,
+        delta,
+        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
+        q.shape[0], q.shape[1], q.shape[2],
+        ctx.grid[0],
+        BLOCK_M=ctx.BLOCK_M,
+        BLOCK_N=ctx.BLOCK_N,
+        BLOCK_DMODEL=ctx.BLOCK_DMODEL,
+        NUM_DBLOCKS=q.shape[-1] // ctx.BLOCK_DMODEL,
+        num_warps=ctx.num_warps,
+        num_stages=1,
+    )
+    # the three trailing None values are returned for the non-tensor-gradient inputs
+    # (layout_crow_indices, layout_col_indices, sm_scale) of the autograd functions using this helper
+    return dq, dk, dv, None, None, None
+class _sparse_attention(torch.autograd.Function):
+    '''
+    Block-sparse attention autograd op using a fixed 128 tile for both BLOCK_M and BLOCK_N.
+
+    `forward` takes the layout as a (crow_indices, col_indices) pair; `backward` rebuilds the
+    (ccol_indices, row_indices) pair from the saved layout before delegating to `_backward`.
+    '''
+    @staticmethod
+    def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
+        tile = 128  # passed as both BLOCK_M and BLOCK_N
+        return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, tile, tile)
+    @staticmethod
+    def backward(ctx, do):
+        # Only the two trailing layout tensors are used here; all eight saved tensors are
+        # still unpacked so an unexpected count fails loudly.
+        _q, _k, _v, _o, _l, _m, crow_indices, col_indices = ctx.saved_tensors
+        # TODO: the following is very inefficient (the layout goes through a dense tensor on every backward).
+        dense_layout = crow_col_to_dense(crow_indices, col_indices)
+        ccol_indices, row_indices = dense_to_ccol_row(dense_layout)
+        return _backward(ctx, do, ccol_indices, row_indices)
+# suppressed
+class _sparse_attention_inference(_sparse_attention):
+    '''
+    Variant of `_sparse_attention` whose forward passes BLOCK_M=1 and BLOCK_N=128 to `_forward`.
+
+    TODO: does not work now; per the original note, the shape for tl.dot cannot be smaller than 16.
+    '''
+    @staticmethod
+    def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
+        block_m, block_n = 1, 128
+        return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, block_m, block_n)
+def sparse_attention_factory(BLOCK_M=128, BLOCK_N=128, **kwargs):
+    '''
+    Build a block-sparse attention callable with configurable tile sizes.
+
+    :param BLOCK_M: forwarded to `_forward` as BLOCK_M.
+    :param BLOCK_N: forwarded to `_forward` as BLOCK_N.
+    :param kwargs: extra keyword arguments forwarded unchanged to `_forward`.
+    :return: the `.apply` of an autograd Function called as
+        (q, k, v, layout_crow_indices, layout_col_indices, sm_scale). Backward is inherited from `_sparse_attention`.
+    '''
+    class _sparse_attention_config(_sparse_attention):
+        @staticmethod
+        def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
+            layout = (layout_crow_indices, layout_col_indices)
+            return _forward(ctx, q, k, v, *layout, sm_scale, BLOCK_M, BLOCK_N, **kwargs)
+    configured_op = _sparse_attention_config.apply
+    return configured_op
+@lru_cache(maxsize=8)
+def get_local_strided_sparse_attention_op(
+        n_heads: int,
+        max_seq_len:int,
+        sparse_block_size: int=128,
+        local_blocks: int=4,
+        vert_stride: int=4,
+        homo_head: bool=False,
+        dtype=torch.bfloat16,
+        device='cuda',
+        active_head_range=None,
+        verbose=True,
+        **kwargs):
+    '''
+    Construct a block-sparse attention op for a local + vertical-stride pattern.
+    Results are memoized (up to 8 distinct argument sets), so all arguments must be hashable.
+
+    :param n_heads: total number of attention heads (regardless of tensor/model parallel).
+    :param max_seq_len: max sequence length. Must be greater than or equal to the length of any input sequence.
+    :param sparse_block_size: sparse block size. Defaults to 128.
+    :param local_blocks: number of nearest blocks to attend to. Defaults to 4, i.e., attention to the previous 4 x block_size tokens.
+    :param vert_stride: Defaults to 4. Forwarded to `_get_sparse_attn_mask`; presumably the stride between
+                        block columns that are always attended - TODO confirm against that helper.
+    :param homo_head: whether all heads share the same pattern.
+    :param dtype: forwarded to `_get_sparse_attn_mask`.
+    :param device: forwarded to `_get_sparse_attn_mask`.
+    :param active_head_range: tuple of (start, end) head indices, e.g., (8, 16). Defaults to using all heads.
+                              Mainly for tensor/model parallelism where heads are split across GPUs.
+                              Ignored when `homo_head` is True.
+    :param verbose: print the configuration whenever a new op is constructed.
+    :param kwargs: forwarded to `get_sparse_attn_op`.
+    :return: whatever `get_sparse_attn_op` returns for the (optionally head-sliced) block pattern.
+    '''
+    if verbose:
+        config_summary = (f'> new block_sparse_attn op constructed with config: '
+            f'{n_heads=}, {max_seq_len=}, {sparse_block_size=}, {local_blocks=}, '
+            f'{vert_stride=}, {homo_head=}, {active_head_range=}, {kwargs=}')
+        print(config_summary)
+    # Only the middle element (the block-level pattern) of the returned triple is used.
+    _, block_sparse_pattern, _ = _get_sparse_attn_mask(n_heads, max_seq_len, max_seq_len, dtype, device,
+                                                       BLOCK=sparse_block_size, local_blocks=local_blocks,
+                                                       vert_stride=vert_stride, homo_head=homo_head,
+                                                       return_dense=False)
+    restrict_heads = (not homo_head) and (active_head_range is not None)
+    if restrict_heads:
+        assert isinstance(active_head_range, tuple)
+        assert len(active_head_range) == 2, '"active_head_range" should be a tuple of start/end index of the heads.'
+        first_head, last_head = active_head_range
+        # keep only the heads owned by this rank
+        block_sparse_pattern = block_sparse_pattern[first_head:last_head]
+    return get_sparse_attn_op(block_sparse_pattern, sparse_block_size, **kwargs)
+def get_sparse_attn_op(
+        sparse_pattern: torch.tensor,
+        sparse_block_size: int=128,
+        kernel_block_size=128,
+        qkv_format='q,k,v',
+          **kwargs):
+    '''
+    Create a block-sparse op with a fixed layout. This avoids the need to create the CSR layout and convert it to a CSC layout every time,
+        which is very inefficient (it uses python loops on CPU.  PyTorch 1.13 supports CSR->CSC, may help.)
+    :param sparse_pattern: sparse pattern of the blocks. Should be `num_blocks(q) x num_blocks(k)` or `n_heads x num_blocks x num_blocks`.
+        This tensor should have lower-triangular matrices on the last 2 dimensions for causal attention
+    :param sparse_block_size: sparse block size. Defaults to 128
+    :param kernel_block_size: the tile/block size to launch a triton instance. Defaults to 128; pass None to use `sparse_block_size`.
+        Must divide `sparse_block_size`, be a power of 2 and be at least 16.
+    :param qkv_format: Choices=['q,k,v', 'q, kv', 'qkv'], i.e., separated q,k,v, or kv packed, or qkv packed. Currently, only 'q,k,v' is supported.
+    :param kwargs: keyword arguments passed to `_forward`. The key 'inference' is also read here to size the backward-layout cache.
+    :return: a function called as `fn(q, k, v, sm_scale)`. The pattern and the precomputed layouts are attached to it as
+        attributes (`sparse_pattern`, `grand_layout_crow_indices`, `grand_layout_col_indices`,
+        `grand_layout_ccol_indices`, `grand_layout_row_indices`).
+    '''
+    # assert qkv_format in ('q,k,v', 'q, kv', 'qkv')  # to save from running `concat` at forward/backward
+    assert qkv_format == 'q,k,v'
+    if kernel_block_size is None:
+        kernel_block_size = sparse_block_size
+    else:
+        assert sparse_block_size % kernel_block_size == 0, f"The sparse block size must be a multiple of {kernel_block_size}."
+        assert kernel_block_size >=16 and math.log2(kernel_block_size) % 1 == 0, f"block_size must be power of 2 and at least 16, but {kernel_block_size} is given"
+        # print(f'>> {sparse_pattern.shape=}')
+        # print(f'{sparse_pattern=}')
+        if sparse_block_size // kernel_block_size > 1:
+            # Refine the pattern to kernel-block granularity: every sparse block becomes a _mul x _mul tile of kernel blocks.
+            _mul = sparse_block_size // kernel_block_size
+            # need to consider if block_m and block_n are different
+            sparse_pattern = torch.kron(sparse_pattern, sparse_pattern.new_ones(_mul, _mul))
+            num_sparse_blocks = sparse_pattern.size(-1)
+            # The expansion turns diagonal sparse blocks into full tiles; re-apply a block-level causal (lower-triangular) mask.
+            block_causal_mask = torch.arange(0, num_sparse_blocks)[:, None] >= torch.arange(0, num_sparse_blocks)[None]
+            sparse_pattern *= block_causal_mask.type_as(sparse_pattern)
+            # print(f'>> after: {sparse_pattern.shape=}')
+            # print(f'{sparse_pattern=}')
+    BLOCK_N = kernel_block_size
+    NUM_BLOCK =  sparse_pattern.size(-1)
+    # NOTE(review): MAX_SEQ_LEN is not used anywhere in this function.
+    MAX_SEQ_LEN = kernel_block_size * NUM_BLOCK
+    # row-compressed layout of the full pattern, used by forward
+    grand_layout_crow_indices, grand_layout_col_indices = dense_to_crow_col(sparse_pattern)
+    # sparse csc layout for backward
+    grand_layout_ccol_indices, grand_layout_row_indices = dense_to_ccol_row(sparse_pattern)
+    # cache GPU backward layout. limit the size to avoid OOM as time goes.
+    # For inference, one only needs to cache one block as sequence length always increases
+    # Therefore, this cache needs to be reconstructed per every `block_size`-steps.
+    # For training/finetune, set to 8 to increase cache hit.
+    # Given an input, the block_len will be the same for all layers, so cache is very helpful.
+    max_cache_size = 1 if kwargs.get('inference', False) else 8
+    @lru_cache(maxsize=max_cache_size)
+    def get_backward_layout_by_block_len(block_len):
+        # Column-compressed layout for the leading block_len x block_len corner of the pattern.
+        assert block_len <= NUM_BLOCK
+        if block_len == NUM_BLOCK:
+            # the full pattern was already converted above
+            return (grand_layout_ccol_indices, grand_layout_row_indices)
+        return dense_to_ccol_row(sparse_pattern[..., :block_len, :block_len])
+    # for debugging
+    # if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+    #     print(f'> {sparse_pattern.cpu().tolist()=}')
+    #     print('----')
+    #     print(f'> {grand_layout_crow_indices.cpu().tolist()=}\n{grand_layout_col_indices.cpu().tolist()=}')
+    # q, k, v separated
+    class _q_k_v_sparse_attention(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, q, k, v, sm_scale):
+            # assert q.shape[2] == 1 or q.shape[2] == k.shape[2]
+            # shape constraints
+            MIN_BLOCK_SIZE = 16
+            assert BLOCK_N >= MIN_BLOCK_SIZE
+            # short queries (e.g. decoding) use the minimum tile height of 16
+            BLOCK_M = 16 if q.shape[2] <= 16 else BLOCK_N  # BLOCK_M has to be power of 2
+            # this following code only works for causal attention
+            K_BLOCKS = triton.cdiv(k.shape[2],  kernel_block_size)
+            # Q_START_BLOCKS = K_BLOCKS - 1 if q.shape[2] == 1 else 0
+            # q is taken to cover the last tokens of k, so its first block row is offset from the end of k
+            Q_START_BLOCKS = K_BLOCKS - triton.cdiv(q.shape[2], BLOCK_N)
+            # print(Q_START_BLOCKS, K_BLOCKS)
+            # keep only the row pointers of the block rows that q covers; column indices are passed whole
+            layout_crow_indices = grand_layout_crow_indices[..., Q_START_BLOCKS:K_BLOCKS+1]
+            layout_col_indices = grand_layout_col_indices
+            # print(BLOCK_M, BLOCK_N, Q_START_BLOCKS, K_BLOCKS+1, layout_crow_indices, layout_col_indices)
+            return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N,
+                            **kwargs
+                        )
+        @staticmethod
+        def backward(ctx, do):
+            q, k = ctx.saved_tensors[:2]
+            assert q.shape[2] == k.shape[2], '> currently backward can only be done if q, k have same length. Contact @EricLin if you need it.'
+            # assume q, k have same length
+            block_len = triton.cdiv(do.shape[2], kernel_block_size)
+            backward_layout = get_backward_layout_by_block_len(block_len)
+            # forward has four inputs (q, k, v, sm_scale), so keep only the first four entries returned by `_backward`
+            return _backward(ctx, do, *backward_layout)[:4]
+    def _q_k_v_sparse_attention_fn(*args):
+        return _q_k_v_sparse_attention.apply(*args)
+    # expose the pattern and layouts on the returned function for inspection/reuse by callers
+    _q_k_v_sparse_attention_fn.sparse_pattern = sparse_pattern
+    _q_k_v_sparse_attention_fn.grand_layout_crow_indices = grand_layout_crow_indices
+    _q_k_v_sparse_attention_fn.grand_layout_col_indices = grand_layout_col_indices
+    _q_k_v_sparse_attention_fn.grand_layout_ccol_indices = grand_layout_ccol_indices
+    _q_k_v_sparse_attention_fn.grand_layout_row_indices = grand_layout_row_indices
+    return _q_k_v_sparse_attention_fn
+###########################################################
+###########################################################
+###########################################################
+################ Inference Kernels ########################
+###########################################################
+def blocksparse_flash_attn_padded_fwd(
+    q, k, v, # (batch, tokens, n_heads, head_size)
+    sm_scale,
+    sparse_layout,
+    *,
+    left_paddings = None,
+    seqlens = None,
+    block_size = 64,
+    max_seqlen = None
+):
+    '''
+    Block-sparse flash attention forward pass for padded, batched inputs (no backward).
+
+    q, k, v: (batch, tokens, n_heads/n_kv_heads, head_size). The number of q heads must be a
+        multiple of the number of k/v heads; k and v must have the same shape.
+        The q length must be 1 (decoding) or equal to the k length (prefilling).
+    sm_scale: scale applied to q @ k^T before the softmax.
+    sparse_layout: (layout_crow_indices, layout_col_indices) pair; each is expected to be 2-D,
+        since its two strides are passed to the kernel.
+    left_paddings: (batch, ), number of left paddings for each sample.
+    seqlens: can be used to specify right padding. No need to specify if left_paddings is used.
+    block_size: tile size used for both BLOCK_M and BLOCK_N of the kernel.
+    max_seqlen: if set, only used to assert that the k length does not exceed it.
+
+    Returns a tensor with the same shape as q; positions the kernel does not write (paddings) stay zero.
+    '''
+    # Check the rank first so a wrong-rank input fails on a clear assertion rather than on tuple unpacking.
+    assert q.dim() == k.dim() == v.dim() == 4
+    batches, q_len, n_heads, head_size = q.shape
+    k_len = k.size(1)
+    assert q.size(2) % k.size(2) == 0
+    assert q.size(0) == k.size(0) and q.size(3) == k.size(3)
+    assert k.shape == v.shape # TODO: allow diff head_size for k, v
+    assert q_len == 1 or q_len == k_len, \
+        'q length can only be 1 for decoding or the same as k length for prefilling.'
+    # number of query heads sharing one k/v head
+    q_k_ratio = q.size(2) // k.size(2)
+    if max_seqlen:
+        assert k.size(1) <= max_seqlen, f'k has seqlen {k.size(1)} while max sequence length is set to {max_seqlen}.'
+    # paddings always has zero output, a little slower than using empty
+    out = q.new_zeros(q.shape)
+    layout_crow_indices, layout_col_indices = sparse_layout
+    block_d = triton.next_power_of_2(head_size)
+    # per-sample [start, end) token range of k within the padded tensor
+    if left_paddings is not None:
+        assert left_paddings.shape == (batches,)
+        k_batch_starts = left_paddings.to(q.device, dtype=torch.int32).contiguous()
+    else:
+        k_batch_starts = torch.zeros((batches,), dtype=torch.int32, device=q.device)
+    if seqlens is not None:
+        k_batch_ends = k_batch_starts + seqlens.type_as(k_batch_starts)
+        assert k_batch_ends.max() <= k_len, 'seqlens (+left_paddings if any) exceeds seqlen.'
+    else:
+        k_batch_ends = torch.zeros_like(k_batch_starts) + k_len
+    if q_len == 1:
+        # decoding: the single query token sits at index 0 of q for every sample
+        q_batch_starts = torch.zeros_like(k_batch_starts)
+        q_batch_ends = q_batch_starts + 1
+    else:
+        # prefilling: q shares the padding layout of k
+        q_batch_starts = k_batch_starts
+        q_batch_ends = k_batch_ends
+    # switch to cpu to avoid too many kernel launches when iterating over the lengths
+    q_lens = (q_batch_ends - q_batch_starts).cpu()
+    n_blocks = (q_lens + block_size - 1) // block_size
+    # one grid entry per (sample, q block): the sample id and the q start row of that block
+    q_batch_ids = torch.tensor([i for i, n in enumerate(n_blocks) for _ in range(n)],
+                                dtype=q_batch_starts.dtype,
+                                device=q_batch_starts.device)
+    q_start_sids = torch.tensor([i * block_size for n in n_blocks for i in range(n)],
+                               dtype=q_batch_starts.dtype,
+                               device=q_batch_starts.device)
+    grid = (len(q_start_sids), n_heads)
+    _fwd_kernel_batch_inference[grid](
+    q, k, v, out,
+    sm_scale,
+    q_batch_starts,
+    q_batch_ends,
+    k_batch_starts,
+    k_batch_ends,
+    q_batch_ids,
+    q_start_sids,
+    *q.stride(),
+    *k.stride(),
+    *v.stride(),
+    *out.stride(),
+    layout_crow_indices,
+    layout_col_indices,
+    *layout_crow_indices.stride(),
+    *layout_col_indices.stride(),
+    q_k_ratio,
+    HAS_BATCH_DIM = True,
+    D_HEAD = head_size,
+    BLOCK_M = block_size,
+    BLOCK_N = block_size,
+    BLOCK_D = block_d,
+    BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
+    EVEN_D = block_d == head_size,
+    num_warps = 1 if q_len == 1 else 4,
+    num_stages = 3
+    )
+    return out
+def blocksparse_flash_attn_varlen_fwd(
+    q, k, v, # (#tokens, n_heads, head_size)
+    cu_seqlens_k,
+    cu_seqlens_q,
+    sm_scale,
+    sparse_layout,
+    *,
+    block_size=64,
+    max_seqlen = None
+):
+    '''
+    Block-sparse flash attention forward pass for variable-length sequences packed along the token dim (no backward).
+
+    q, k, v: (#tokens, n_heads, head_size). The number of q heads must be a multiple of the number of k/v heads;
+        k and v must have the same shape.
+    cu_seqlens_k: 1-D tensor of batch_size + 1 cumulative token offsets into k/v
+        (consecutive differences are the per-sequence k lengths).
+    cu_seqlens_q: same for q, or None. When None it is inferred: one token per sequence if q has
+        exactly batch_size tokens (decoding only), or equal to cu_seqlens_k if q and k have the same number of tokens.
+        Note that cu_seqlens_k comes BEFORE cu_seqlens_q in the positional order.
+    sm_scale: scale applied to q @ k^T before the softmax.
+    sparse_layout: (layout_crow_indices, layout_col_indices) pair; each is expected to be 2-D,
+        since its two strides are passed to the kernel.
+    block_size: tile size used for both BLOCK_M and BLOCK_N of the kernel.
+    max_seqlen: if set, only used to assert that no k sequence is longer than it.
+
+    Each sequence must have a q length of 1 (decoding) or equal to its k length (prefilling).
+    Returns a tensor with the same shape as q.
+    '''
+    # split q to blocks
+    _, n_heads, head_size = q.shape
+    batch_size = cu_seqlens_k.size(0) - 1
+    # print(f'> {q.shape=}, {k.shape=}')
+    assert q.dim() == k.dim() == v.dim() == 3
+    assert q.size(1) % k.size(1) == 0
+    assert q.size(2) == k.size(2)
+    assert k.shape == v.shape # TODO: allow diff head_size for k, v
+    assert cu_seqlens_k.dim() == 1
+    # number of query heads sharing one k/v head
+    q_k_ratio = q.size(1) // k.size(1)
+    if cu_seqlens_q is None:
+        if q.size(0) == batch_size: # decoding only
+            cu_seqlens_q = torch.arange(0, batch_size + 1,
+                                        dtype=cu_seqlens_k.dtype,
+                                        device=cu_seqlens_k.device)
+        elif q.size(0) == k.size(0):
+            cu_seqlens_q = cu_seqlens_k
+        else:
+            raise ValueError('cu_seqlens_q must be specified if it is mix of prefilling and decoding.')
+    else:
+        assert cu_seqlens_k.size(0) == cu_seqlens_q.size(0)
+    # switch to cpu to avoid too many kernel launches when iterating over the lengths
+    q_lens = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).cpu()
+    k_lens = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).cpu()
+    assert torch.logical_or(q_lens == 1, k_lens == q_lens).all(), \
+        'length of q should either be 1 (decoding) or same as k (prefilling).'
+    if max_seqlen:
+        assert k_lens.max() <= max_seqlen
+    n_blocks = (q_lens + block_size - 1) // block_size
+    # one grid entry per (sequence, q block): the sequence id and the q start row of that block
+    q_batch_ids = torch.tensor([i for i, n in enumerate(n_blocks) for _ in range(n)],
+                                dtype=cu_seqlens_q.dtype,
+                                device=cu_seqlens_q.device)
+    q_start_sids = torch.tensor([i * block_size for n in n_blocks for i in range(n)],
+                               dtype=cu_seqlens_q.dtype,
+                               device=cu_seqlens_q.device)
+    out = q.new_empty(q.shape)
+    cu_seqlens_q = cu_seqlens_q.contiguous()
+    cu_seqlens_k = cu_seqlens_k.contiguous()
+    layout_crow_indices, layout_col_indices = sparse_layout
+    block_d = triton.next_power_of_2(head_size)
+    # a tensor-valued flag; it is used in boolean context below
+    decoding_only =  (q_lens == 1).all()
+    grid = (len(q_start_sids), n_heads)
+    # The kernel signature has four strides per tensor (batch, token, head, dim). There is no batch dim here,
+    # so 0 is passed for the batch stride and HAS_BATCH_DIM is False.
+    # Starts/ends are the two shifted views of the cumulative offsets.
+    _fwd_kernel_batch_inference[grid](
+    q, k, v, out,
+    sm_scale,
+    cu_seqlens_q[:-1],
+    cu_seqlens_q[1:],
+    cu_seqlens_k[:-1],
+    cu_seqlens_k[1:],
+    q_batch_ids,
+    q_start_sids,
+    0, *q.stride(),
+    0, *k.stride(),
+    0, *v.stride(),
+    0, *out.stride(),
+    layout_crow_indices,
+    layout_col_indices,
+    *layout_crow_indices.stride(),
+    *layout_col_indices.stride(),
+    q_k_ratio,
+    HAS_BATCH_DIM = False,
+    D_HEAD = head_size,
+    BLOCK_M = block_size,
+    BLOCK_N = block_size,
+    BLOCK_D = block_d,
+    BLOCK_M_LOADING = 16 if decoding_only else block_size, # smaller for decoding
+    EVEN_D = block_d == head_size,
+    num_warps = 1 if decoding_only else 4,
+    num_stages = 3
+    )
+    return out
+@triton.jit
+def _fwd_kernel_inner(
+    acc, l_i, m_i,
+    q, Q,
+    k_block_col_idx,
+    layout_col_ptr,
+    layout_col_stride_h, layout_col_stride_m,
+    k_ptrs,
+    v_ptrs,
+    off_h, offs_m, offs_n, offs_d,
+    stride_kt, stride_vt,
+    sm_scale,
+    k_seqlen,
+    past_len,
+    LAST_K_BLOCK: tl.constexpr,
+    BLOCK_M_LOADING: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    D_HEAD: tl.constexpr,
+    EVEN_D: tl.constexpr,
+    M_LT_N: tl.constexpr
+):
+    '''
+    Process one K/V block for the current query tile and fold it into the running softmax state.
+
+    acc, l_i, m_i: running output accumulator, softmax normalizer and row-wise max. `acc` is kept
+        normalized by `l_i` after every call (see the rescaling below).
+    q: the loaded query tile. Q: the query base pointer, used here only for its element dtype.
+    k_block_col_idx: position in the layout's column-index array; the K block id is read from there.
+    k_ptrs, v_ptrs: pointer tiles for block 0 of K and V; they are shifted by `start_n` tokens here.
+        Judging by the masks below, k_ptrs is indexed [d, n] and v_ptrs is indexed [n, d].
+    k_seqlen: number of valid K tokens, used to mask the tail when LAST_K_BLOCK is set.
+    past_len: offset added to the query row indices before the causal comparison.
+    LAST_K_BLOCK: when set, K/V loads are bounds-masked against k_seqlen and the causal mask is applied.
+    EVEN_D: when not set, loads are additionally masked to `offs_d < D_HEAD`.
+    M_LT_N: when set, the causal mask is applied for every block, not just the last one.
+
+    Returns the updated (acc, l_i, m_i).
+    '''
+    # look up which K block this layout entry refers to, for the current head
+    k_block_id = tl.load(layout_col_ptr +  off_h * layout_col_stride_h + k_block_col_idx * layout_col_stride_m).to(tl.int32)
+    start_n = k_block_id * BLOCK_N
+    # -- compute qk ----
+    if LAST_K_BLOCK:
+        if EVEN_D:
+            k = tl.load(k_ptrs + start_n * stride_kt,
+                        mask=offs_n[None, :] + start_n < k_seqlen)
+        else:
+            # mask = mask & (offs_d[:, ])
+            k = tl.load(k_ptrs + start_n * stride_kt,
+                        mask=(offs_n[None, :] + start_n < k_seqlen) & (offs_d[:, None] < D_HEAD))
+    else:
+        if EVEN_D:
+            k = tl.load(k_ptrs + start_n * stride_kt)
+        else:
+            k = tl.load(k_ptrs + start_n * stride_kt,
+                        mask=offs_d[:, None] < D_HEAD)
+    qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32)
+    qk += tl.dot(q, k)
+    qk *= sm_scale
+    # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N
+    if LAST_K_BLOCK | M_LT_N:
+        # causal mask: a query at position offs_m + past_len may only see keys at positions <= its own
+        qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float('-inf'))
+    # -- compute m_ij, p, l_ij
+    m_ij = tl.max(qk, 1)
+    p = tl.exp(qk - m_ij[:, None])
+    l_ij = tl.sum(p, 1)
+    # -- update m_i and l_i
+    m_i_new = tl.maximum(m_i, m_ij)
+    alpha = tl.exp(m_i - m_i_new)
+    beta = tl.exp(m_ij - m_i_new)
+    l_i_new = alpha * l_i + beta * l_ij
+    # NOTE(review): a row whose scores are all -inf so far would make the terms above and the
+    # divisions below non-finite; presumably the layout guarantees this cannot happen - confirm.
+    # -- update output accumulator --
+    # scale p
+    p_scale = beta / l_i_new
+    p = p * p_scale[:, None]
+    # scale acc
+    acc_scale = l_i / l_i_new * alpha
+    acc = acc * acc_scale[:, None]
+    # cast the probabilities to the input dtype before the second matmul
+    p = p.to(Q.dtype.element_ty)
+    # update acc
+    if LAST_K_BLOCK:
+        if EVEN_D:
+            v = tl.load(v_ptrs + start_n * stride_vt,
+                        mask=offs_n[:, None] + start_n < k_seqlen)
+        else:
+            v = tl.load(v_ptrs + start_n * stride_vt,
+                        mask=(offs_n[:, None] + start_n < k_seqlen) & (offs_d[None, :] < D_HEAD))
+    else:
+        if EVEN_D:
+            v = tl.load(v_ptrs + start_n * stride_vt)
+        else:
+            v = tl.load(v_ptrs + start_n * stride_vt,
+                        mask=offs_d[None, :] < D_HEAD)
+    acc += tl.dot(p, v)
+    # update m_i and l_i
+    l_i = l_i_new
+    m_i = m_i_new
+    return acc, l_i, m_i
+@triton.heuristics(
+    {
+        'M_LT_N': lambda kwargs: kwargs['BLOCK_M'] < kwargs['BLOCK_N'],
+    }
+)
+@triton.jit
+def _fwd_kernel_batch_inference(
+    Q, K, V, Out,
+    sm_scale,
+    q_batch_starts,
+    q_batch_ends,
+    k_batch_starts,
+    k_batch_ends,
+    q_batch_ids,
+    q_start_sids,
+    stride_qb, stride_qt, stride_qh, stride_qd,
+    stride_kb, stride_kt, stride_kh, stride_kd,
+    stride_vb, stride_vt, stride_vh, stride_vd,
+    stride_ob, stride_ot, stride_oh, stride_od,
+    layout_crow_ptr,
+    layout_col_ptr,
+    layout_crow_stride_h, layout_crow_stride_m,
+    layout_col_stride_h, layout_col_stride_m,
+    q_k_ratio,
+    HAS_BATCH_DIM: tl.constexpr,
+    D_HEAD: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_M_LOADING: tl.constexpr,
+    EVEN_D: tl.constexpr,
+    M_LT_N: tl.constexpr
+):
+    '''
+    Block-sparse attention forward kernel for inference (padded-batch and varlen layouts).
+    Each program handles one query block (grid axis 0) of one head (grid axis 1).
+
+    NOTATION:
+    pid: position id
+    sid: storage id
+    sbid: storage block id
+    pbid: position block id
+    offs_m, offs_n: storage offsets of m-dim(q, row) and n-dim(k, col)
+    q and blocks in KV needs to be contiguous
+
+    Arguments:
+    Q, K, V, Out: base pointers; the *_b/_t/_h/_d strides are for the batch, token, head and head-dim axes.
+        The batch stride is only applied when HAS_BATCH_DIM is set.
+    q_batch_starts/q_batch_ends, k_batch_starts/k_batch_ends: per-sequence [start, end) token offsets,
+        indexed by the sequence id.
+    q_batch_ids: sequence id for each program along grid axis 0.
+    q_start_sids: start row (within its sequence) of the query block for each program along grid axis 0.
+    layout_crow_ptr, layout_col_ptr: block-sparse layout; for a head and a query position block, the row-pointer
+        entries at [pbid] and [pbid + 1] delimit the range of column-index entries (K block ids) to visit.
+    q_k_ratio: number of query heads per K/V head.
+    BLOCK_M_LOADING: number of query rows actually loaded and computed per program.
+    M_LT_N: set by the heuristic above to BLOCK_M < BLOCK_N.
+
+    TODO:
+    Optimize grouped-attn
+    CUDA graph support issue
+        1. grid is dynamic: vllm set up multiple cuda graph in decoding phase, with diff max token size (16, 32, ...)
+            since we mix prompt and decoding phase here, it can be more complex.
+            need to set up diff cuda-graph for diff (off_zm, off_z)
+            # indeed, q_batch_ids can be padded to maximum number of grid[0], i.e., assume all decoding
+            therefore, cu_seqlens_q, kv_seq_lens
+    '''
+    off_zm = tl.program_id(0)
+    off_h = tl.program_id(1)
+    # K/V head shared by this query head
+    off_h_for_kv = off_h // q_k_ratio
+    off_z = tl.load(q_batch_ids + off_zm).to(tl.int32)   # [0, 0, 0, 1]
+    q_start_sid = tl.load(q_start_sids + off_zm)
+    start_m = q_start_sid // BLOCK_M
+    if HAS_BATCH_DIM:
+        Q += off_z * stride_qb
+        K += off_z * stride_kb
+        V += off_z * stride_vb
+        Out += off_z * stride_ob
+    # query rows handled by this program, relative to the start of the sequence
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M_LOADING)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    q_cu_start = tl.load(q_batch_starts + off_z).to(tl.int32)
+    q_seqlen = tl.load(q_batch_ends + off_z).to(tl.int32) - q_cu_start
+    k_cu_start = tl.load(k_batch_starts + off_z).to(tl.int32)
+    k_seqlen = tl.load(k_batch_ends + off_z).to(tl.int32) - k_cu_start
+    # number of K tokens that precede the first query token
+    past_len = k_seqlen - q_seqlen
+    # move the base pointers to the first token of this sequence and to this head
+    Q += q_cu_start * stride_qt + off_h * stride_qh
+    K += k_cu_start * stride_kt + off_h_for_kv * stride_kh
+    V += k_cu_start * stride_vt + off_h_for_kv * stride_vh
+    Out += q_cu_start * stride_ot + off_h * stride_oh
+    # position block id of this query block, used as the row into the sparse layout
+    q_pbid = (past_len + q_start_sid) // BLOCK_M
+    # NOTE(review): the EVEN_D load gives no `other=` value, so rows at or beyond q_seqlen hold
+    # unspecified data; those rows are never stored because the store below uses the same row mask.
+    if EVEN_D:
+        q = tl.load(Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
+                    mask=offs_m[:, None] < q_seqlen)
+    else:
+        q = tl.load(Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
+                    mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD),
+                    other=0)
+    sparse_crow_ptr = layout_crow_ptr + off_h * layout_crow_stride_h + q_pbid * layout_crow_stride_m
+    # TODO: load at once, supported in new Triton
+    k_block_start = tl.load(sparse_crow_ptr).to(tl.int32)
+    k_block_end = tl.load(sparse_crow_ptr + 1).to(tl.int32)
+    # running softmax state: row max, normalizer and output accumulator
+    m_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - float('inf')
+    l_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M_LOADING, BLOCK_D], dtype=tl.float32)
+    # K is addressed as [d, n] and V as [n, d]
+    k_ptrs = K + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd
+    v_ptrs = V + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd
+    # all K blocks of this row except the last one: LAST_K_BLOCK=False, so no bounds masking against k_seqlen
+    for k_block_col_idx in range(k_block_start, k_block_end - 1):
+        acc, l_i, m_i = _fwd_kernel_inner(
+            acc, l_i, m_i,
+            q, Q,
+            k_block_col_idx,
+            layout_col_ptr,
+            layout_col_stride_h, layout_col_stride_m,
+            k_ptrs,
+            v_ptrs,
+            off_h, offs_m, offs_n, offs_d,
+            stride_kt, stride_vt,
+            sm_scale,
+            k_seqlen,
+            past_len,
+            False,
+            BLOCK_M_LOADING,
+            BLOCK_N,
+            D_HEAD,
+            EVEN_D,
+            M_LT_N
+            )
+    # last K block of this row: LAST_K_BLOCK=True enables bounds masking and the causal mask.
+    # NOTE(review): this call is unconditional, so it assumes every layout row has at least one
+    # entry (k_block_end > k_block_start) - confirm against how the layout is built.
+    acc, l_i, m_i = _fwd_kernel_inner(
+        acc, l_i, m_i,
+        q, Q,
+        k_block_end - 1,
+        layout_col_ptr,
+        layout_col_stride_h, layout_col_stride_m,
+        k_ptrs,
+        v_ptrs,
+        off_h, offs_m, offs_n, offs_d,
+        stride_kt, stride_vt,
+        sm_scale,
+        k_seqlen,
+        past_len,
+        True,
+        BLOCK_M_LOADING,
+        BLOCK_N,
+        D_HEAD,
+        EVEN_D,
+        M_LT_N
+        )
+    # write output
+    if EVEN_D:
+        tl.store(Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, acc,
+                mask=offs_m[:, None] < q_seqlen)
+    else:
+        tl.store(Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, acc,
+                mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD))
+###########################################################
+###########################################################
+###########################################################
+################## Testing Utilities ######################
+###########################################################
+def torch_attention(q, k, v, attn_mask=None, sm_scale=None, block_attn_mask=None, block_size=128, do=None):
+    '''
+    Reference (dense PyTorch) attention used to verify the block-sparse kernels.
+
+    q, k, v: shape=(batch, n_heads, seq, dim)
+    attn_mask: optional dense 0/1 mask broadcastable to (batch, n_heads, seq_q, seq_k); 1 = attend.
+        Mutually exclusive with `block_attn_mask`.
+    sm_scale: scale applied to q @ k^T before the softmax. Defaults to 1 / sqrt(dim).
+    block_attn_mask: optional block-level 0/1 mask whose last two dims are (q blocks, k blocks), optionally
+        with leading dims (e.g. n_heads) that broadcast against (batch, n_heads). Causal masking is applied
+        inside the diagonal block. Must be on the same device as q.
+    block_size: number of tokens per block of `block_attn_mask`.
+    do: optional output gradient; if given (dense path only) the implied dv is printed for debugging.
+
+    Returns the attention output with shape (batch, n_heads, seq_q, dim).
+    '''
+    # for verification
+    if sm_scale is None:
+        # standard attention scaling; the inverse is required (multiplying by sqrt(dim) would sharpen the softmax)
+        sm_scale = 1.0 / math.sqrt(float(q.size(-1)))
+    if block_attn_mask is not None:
+        assert attn_mask is None
+        seq_len = q.size(2)
+        outs = []
+        for s in range(0, seq_len, block_size):
+            e = min(s + block_size, seq_len)
+            row_block = s // block_size
+            q_block = q[:, :, s:e]
+            attn = torch.einsum('bhmd,bhnd->bhmn', q_block, k[:, :, :e]).float() * sm_scale
+            # Keep a singleton row dim so that kron expands every block entry into a
+            # block_size x block_size tile per head, instead of mixing the head dim with the rows.
+            mask = block_attn_mask[..., row_block, : row_block + 1].float()[..., None, :]
+            mask = torch.kron(mask, torch.ones(block_size, block_size, device=mask.device))
+            # crop to the valid rows/cols so a ragged last block still matches `attn`
+            mask = mask[..., : e - s, :e]
+            # causal masking inside the diagonal block: drop only strictly-future keys (keep the diagonal)
+            pos = torch.arange(0, e - s, device=mask.device)
+            mask[..., :, s:].masked_fill_(pos[:, None] < pos[None, :], 0)
+            attn = attn.masked_fill((1 - mask).bool(), float('-inf'))
+            attn = attn.softmax(-1)
+            out = torch.einsum('bhmn,bhnd->bhmd', attn.type_as(v), v[:, :, :e])
+            outs.append(out)
+        torch_output = torch.cat(outs, dim=2)
+    else:
+        attn = torch.einsum('bhmd,bhnd->bhmn', q, k).float() * sm_scale
+        if attn_mask is not None:
+            attn = attn.masked_fill((1 - attn_mask).bool(), float('-inf'))
+        attn = attn.softmax(-1)
+        if do is not None:
+            dv = torch.einsum('bhqk,bhqd->bhkd', attn.type_as(do), do)
+            print(f'> torch_attn computed dv: {dv=}')
+        torch_output = torch.einsum('bhmn,bhnd->bhmd', attn.type_as(v), v)
+    return torch_output
+###########################################################
+###########################################################
+###########################################################
+#################### Unit Tests ###########################
+###########################################################
+@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(2, 8, 2048, 128), (1, 4, 4096, 64)])
+def test_op(Z, H, N_CTX, D_HEAD, Q_LEN=None, dtype=torch.bfloat16, homo_head=True, kernel_block_size=None, sparse_block_size=128, backward=True,
+            sparse_attention_fn=None, local_blocks=4, vert_stride=4, sm_scale=None, max_length=None):
+    '''
+    Compare the Triton block-sparse attention op against the dense `torch_attention` reference
+    (forward output, and dq/dk/dv when `backward` is set) on random CUDA inputs.
+
+    Z, H, N_CTX, D_HEAD: batch size, number of heads, k/v sequence length and head dim.
+    Q_LEN: query length; defaults to N_CTX.
+    sparse_attention_fn: op under test; when None one is built with `get_local_strided_sparse_attention_op`.
+    NOTE(review): `max_length` is accepted but never used.
+    '''
+    Q_LEN = Q_LEN or N_CTX
+    torch.manual_seed(20)
+    q = torch.empty((Z, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
+    k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
+    v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
+    if sm_scale is None:
+        sm_scale = 1. / math.sqrt(D_HEAD)
+    # for debugging
+    # print(f'>> {q.shape=}, {k.shape=}, {v.shape=}, {homo_head=}, {kernel_block_size=}, {sparse_block_size=}, {local_blocks=}, {vert_stride=}')
+    # NOTE(review): this unconditionally overrides both the `sm_scale` argument and the default computed
+    # above, so the test always runs with 1/128. It looks like a debugging leftover, but the tolerances
+    # below may have been tuned for it - confirm before removing.
+    sm_scale = 0.0078125
+    if backward:
+        q.requires_grad_(), k.requires_grad_(), v.requires_grad_()
+    # qkv = torch.empty((Z, N_CTX, 3*H*D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5)
+    # q = qkv[..., :H*D_HEAD]
+    # k = qkv[..., H*D_HEAD:2*H*D_HEAD]
+    # v = qkv[..., 2*H*D_HEAD:]
+    # q = q.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
+    # k = k.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
+    # v = v.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
+    # if Q_LEN and Q_LEN < N_CTX:
+    #     q = q[:, :, -Q_LEN:] # .contiguous()
+    # q = q.requires_grad_()
+    # k = k.requires_grad_()
+    # v = v.requires_grad_()
+    # upstream gradient fed to both the reference and the Triton backward
+    dout = torch.randn_like(q).contiguous()
+    # dout = torch.eye(N_CTX)[:, :D_HEAD][None, None].expand_as(q).type_as(q).contiguous()
+    # print(dout)
+    # only the dense mask is used below (by the reference); NOTE(review): `mask_csr` is unused
+    mask_csr, _, mask_dense = get_sparse_attn_mask(q, N_CTX, BLOCK=sparse_block_size,
+                            local_blocks=local_blocks, vert_stride=vert_stride, homo_head=homo_head, return_dense=True)
+    if sparse_attention_fn is None:
+        sparse_attention_fn = get_local_strided_sparse_attention_op(H, N_CTX,
+                                                                    sparse_block_size=sparse_block_size,
+                                                                    local_blocks=local_blocks,
+                                                                    vert_stride=vert_stride,
+                                                                    homo_head=homo_head,
+                                                                    device=q.device,
+                                                                    dtype=q.dtype,
+                                                                    kernel_block_size=kernel_block_size)
+    # reference implementation
+    ref_out = torch_attention(q, k, v, mask_dense, sm_scale)
+    # lengths = torch.full((Z,), fill_value=N_CTX, device='cuda')
+    # cu_seqlens = torch.zeros((Z + 1,), device='cuda', dtype=torch.int32)
+    # cu_seqlens[1:] = lengths.cumsum(0)
+    # # qkv = torch.randn((Z * N_CTX, 3, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+    # qkv_list = list(map(lambda x: x.permute(0, 2, 1, 3).contiguous().view(Z * N_CTX, 1, H, D_HEAD), [q, k, v]))
+    # qkv = torch.cat(qkv_list, dim=1)
+    # ref_out0 = flash_attn_func(qkv, cu_seqlens, dropout_p=0, max_s=N_CTX, softmax_scale=sm_scale, causal=True)
+    # ref_out = ref_out0.view(Z, N_CTX, H, D_HEAD).permute(0, 2, 1, 3).contiguous()
+    if backward:
+        ref_out.backward(dout)
+        # take a copy of each reference gradient and reset .grad so the Triton backward starts clean
+        ref_dv, v.grad = v.grad.clone(), None
+        ref_dk, k.grad = k.grad.clone(), None
+        ref_dq, q.grad = q.grad.clone(), None
+    tri_out = sparse_attention_fn(q, k, v, sm_scale)
+    # NOTE(review): `decimal` is never used.
+    decimal = 1 if dtype == torch.bfloat16 else 2
+    assert torch.allclose(ref_out.cpu(), tri_out.cpu(), atol=1e-2, rtol=0), f'>> {ref_out[0, 0, :, 0].tolist()=}\n\n{tri_out[0, 0, :, 0].tolist()=}'
+    if backward:
+        tri_out.backward(dout)
+        tri_dv, v.grad = v.grad.clone(), None
+        tri_dk, k.grad = k.grad.clone(), None
+        tri_dq, q.grad = q.grad.clone(), None
+    if backward:
+        # dv is additionally allowed a relative tolerance; dk and dq are compared with absolute tolerance only
+        assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=1e-2)
+        assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=0)
+        assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=0)
+    print(f'> test passed: {Z=}, {H=}, {N_CTX=}, {D_HEAD=}, {Q_LEN=}, {dtype=}, {homo_head=}, {sparse_block_size=}')
+###########################################################
+if __name__ == '__main__':
+    GPU_TYPE = os.popen('nvidia-smi --query-gpu=name --format=csv | tail -n 1').read().strip()
+    # print(GPU_TYPE)
+    support_backward = True # 'A100' in GPU_TYPE. Wasn't supportted in consumer A1000.
+    ###############
+    # benchmarking
+    HAS_DENSE_TRITON_FLASH = False
+    # try:
+    #     from triton.ops.flash_attention import attention as triton_attention
+    #     HAS_DENSE_TRITON_FLASH = True
+    # except:
+    #     HAS_DENSE_TRITON_FLASH = False
+    #     print('> cannot import Trition flash attn')
+    try:
+        from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_unpadded_func
+        HAS_FLASH = True
+    except BaseException:
+        HAS_FLASH = False
+        print('> cannot import flash_attn')
+    # BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64
+    BATCH, N_HEADS, N_CTX, D_HEAD = 4, 32, 4096, 128  # 6.7B model, with 4k len
+    # BATCH, N_HEADS, N_CTX, D_HEAD = 4, 16, 4096, 128  # 204m model
+    BLOCK_SIZE = 64
+    LOCAl_BLOCKS = 8 # 4
+    VERT_STRIDE = 1 # 16 # 8
+    HOMO_HEAD = False
+    sparse_type = 'home' if HOMO_HEAD else 'hetero'
+    dtype = torch.bfloat16
+    modes = ['fwd', 'bwd'] if support_backward else ['fwd']
+    configs = [triton.testing.Benchmark(
+        x_names=['SEQ_LEN'],
+        x_vals=[2**i for i in range(8, 16)],
+        line_arg='provider',
+        line_vals=(['triton'] if HAS_DENSE_TRITON_FLASH else []) + (['flash'] if HAS_FLASH else []) + ['triton_sparse'],
+        line_names=(['Triton-Dense'] if HAS_DENSE_TRITON_FLASH else [])  + (['Flash-Dense'] if HAS_FLASH else []) + ['Triton-Sparse'],
+        styles=[('red', '-'), ('blue', '-'), ('green', '-')],
+        ylabel='ms',
+        plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-sparse-local{LOCAl_BLOCKS}-vert{VERT_STRIDE}-{sparse_type}-{dtype}-{mode}',
+        args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': dtype, 'mode': mode}
+    ) for mode in modes]
+    @triton.testing.perf_report(configs)
+    def bench_flash_attention(BATCH, H, SEQ_LEN, D_HEAD, mode, provider, dtype=torch.bfloat16, device='cuda', sparse_attention_fn=None):
+        assert mode in ['fwd', 'bwd']
+        warmup = 25
+        rep = 100
+        N_CTX = SEQ_LEN
+        if provider == 'triton':
+            q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+            sm_scale = 1.3
+            fn = lambda: triton_attention(q, k, v, sm_scale)
+            if mode == 'bwd':
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        if provider == 'triton_sparse':
+            q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+            sm_scale = 1.3
+            # q_pos = torch.arange(N_CTX // BLOCK, device='cuda')[:, None]
+            # k_pos = torch.arange(N_CTX // BLOCK, device='cuda')[None]
+            # local_blocks = 4 # num_block per attn, block_size is tied to BLOCK
+            # vert_stride =N_CTX + 1 # 4
+            # mask_vert_strided = torch.arange(N_CTX // BLOCK, device='cuda') % vert_stride == vert_stride - 1
+            # mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).type_as(q)
+            # mask = mask_dense.to_sparse_csr()
+            # mask_csr, _ = get_sparse_attn_mask(q, N_CTX, BLOCK=BLOCK, local_blocks=LOCAl_BLOCKS, vert_stride=VERT_STRIDE, homo_head=HOMO_HEAD)
+            if sparse_attention_fn is None:
+                # sparse_attention_fn = sparse_attention
+                sparse_attention_fn = get_local_strided_sparse_attention_op(H, SEQ_LEN,
+                                                                            local_blocks=LOCAl_BLOCKS,
+                                                                            vert_stride=VERT_STRIDE,
+                                                                            homo_head=HOMO_HEAD,
+                                                                            sparse_block_size=BLOCK_SIZE,
+                                                                            kernel_block_size=BLOCK_SIZE,
+                                                                            device=q.device)
+            # sparse_attention_fn = sparse_attention_factory(128, 128, num_warps=8)
+            # fn = lambda: sparse_attention_fn(q, k, v, mask_csr[0], mask_csr[1], sm_scale)
+            fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
+            if mode == 'bwd':
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        if provider == 'flash':
+            lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
+            cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
+            cu_seqlens[1:] = lengths.cumsum(0)
+            qkv = torch.randn((BATCH * N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True)
+            fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=True)
+            if mode == 'bwd':
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        # if provider == 'torch':
+        #     q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+        #     k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+        #     v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
+        #     sm_scale = 1.3
+        #     causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(q)
+        #     fn = lambda:  torch_attention(q, k, v, causal_mask, sm_scale)
+        #     ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep)
+        #     return ms
+    BATCH, N_HEADS, N_CTX, D_HEAD, Q_LEN = 4, 32, 4096, 128, 1  # 6.7B model, with 4k len
+    BLOCK_SIZE = 64
+    LOCAl_BLOCKS = 8 # 4
+    VERT_STRIDE = 16 # 8
+    HOMO_HEAD = False
+    sparse_type = 'home' if HOMO_HEAD else 'hetero'
+    dtype = torch.bfloat16
+    MAX_N_CTX = 8192
+    configs = [triton.testing.Benchmark(
+        x_names=['PAST_LEN'],
+        x_vals=[2**i - 1 for i in range(8, 14)],
+        line_arg='provider',
+        line_vals=['torch'] + (['flash'] if HAS_FLASH else []) + ['triton_sparse', 'triton_dense'],
+        line_names=['Torch']  + (['Flash-Dense'] if HAS_FLASH else []) + ['Triton-Sparse', 'Triton-Dense'],
+        styles=[('red', '-'), ('blue', '-'), ('green', '-'), ('cyan', '-')],
+        ylabel='ms',
+        plot_name=f'fused-attention-inference-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-sparse-local{LOCAl_BLOCKS}-vert{VERT_STRIDE}-{sparse_type}',
+        args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'Q_LEN': Q_LEN, 'dtype': torch.float16, 'mode': mode}
+    ) for mode in ['fwd']]
+    @triton.testing.perf_report(configs)
+    def bench_flash_attention_inference(BATCH, H, PAST_LEN, D_HEAD, Q_LEN, mode, provider, dtype=torch.bfloat16, device='cuda'):
+        assert mode in ['fwd']
+        warmup = 25
+        rep = 100
+        N_CTX = PAST_LEN + Q_LEN
+        if provider == 'torch':
+            q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            sm_scale = 1.3
+            mask_csr, _, mask_dense = get_sparse_attn_mask(q, N_CTX, BLOCK=BLOCK_SIZE,
+                                    local_blocks=LOCAl_BLOCKS, vert_stride=VERT_STRIDE, homo_head=VERT_STRIDE, return_dense=True)
+            fn = lambda: torch_attention(q, k, v, mask_dense, sm_scale=sm_scale, block_size=2048)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        if provider == 'triton_sparse':
+            q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            sm_scale = 1.3
+            sparse_attention_fn = get_local_strided_sparse_attention_op(H, MAX_N_CTX,
+                                                                        local_blocks=LOCAl_BLOCKS,
+                                                                        vert_stride=VERT_STRIDE,
+                                                                        homo_head=HOMO_HEAD,
+                                                                        sparse_block_size=BLOCK_SIZE,
+                                                                        kernel_block_size=BLOCK_SIZE,
+                                                                        device=q.device,
+                                                                        inference=True)
+            fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        if provider == 'triton_dense':
+            q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            sm_scale = 1.3
+            sparse_attention_fn = get_local_strided_sparse_attention_op(H, MAX_N_CTX,
+                                                                        local_blocks=1,
+                                                                        vert_stride=1,
+                                                                        homo_head=True,
+                                                                        sparse_block_size=BLOCK_SIZE,
+                                                                        kernel_block_size=BLOCK_SIZE,
+                                                                        device=q.device,
+                                                                        inference=True)
+            fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        if provider == 'flash':
+            assert Q_LEN == 1
+            lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
+            cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
+            cu_seqlens[1:] = lengths.cumsum(0)
+            cu_seqlens_q = torch.arange(BATCH + 1, device=device, dtype=torch.int32)
+            # (total_q, nheads, headdim),
+            q = torch.randn((BATCH, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            k = torch.randn((BATCH*N_CTX, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            v = torch.randn((BATCH*N_CTX, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
+            fn = lambda: flash_attn_unpadded_func(q, k, v, cu_seqlens_q, cu_seqlens, 1, N_CTX, dropout_p=0, softmax_scale=1.3, causal=False)
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+    test_op(1, 4, 512, 128, dtype=torch.float16, homo_head=False, backward=support_backward)
+    # bench_flash_attention.run(save_path='.', print_data=True)
+    bench_flash_attention_inference.run(save_path='.', print_data=True)
+    exit()
+    # head_dim=64
+    test_op(1, 2, 1024, 64, kernel_block_size=64, sparse_block_size=64,
+            dtype=torch.bfloat16, homo_head=False, backward=support_backward)
+    # uneven length, bf16
+    test_op(1, 16, 224, 128, dtype=torch.bfloat16, homo_head=False, backward=False, sparse_block_size=128,
+            kernel_block_size=64, local_blocks=8, vert_stride=8)
+    test_op(3, 2, 2047, 128, homo_head=False, backward=False)
+    # diff kernel/sparse block size
+    test_op(1, 16, 224, 128, dtype=torch.bfloat16, homo_head=False, backward=False, kernel_block_size=64)
+    # inference
+    # test_op(1, 4, 512 + 256, 128, Q_LEN=1,  dtype=torch.bfloat16, homo_head=False, backward=support_backward)
+    # dense flash attn
+    test_op(1, 2, 1024, 128, kernel_block_size=128, sparse_block_size=128, dtype=torch.bfloat16, homo_head=False,
+            backward=support_backward, local_blocks=1, vert_stride=1)
+    # fp16
+    test_op(1, 4, 512 + 256, 128, dtype=torch.float16, homo_head=False, backward=support_backward)
+    # longer sequence
+    test_op(2, 4, 8192, 64, homo_head=False, backward=support_backward)
+    test_op(2, 4, 8192, 128, dtype=torch.bfloat16, homo_head=False, backward=support_backward)
+    # homo head
+    test_op(3, 2, 2048, 64, homo_head=True, dtype=torch.bfloat16, backward=False)
+    test_op(3, 2, 2048, 64, homo_head=True, backward=support_backward)
+    # sparse_attention_fn = sparse_attention_factory(16, 128, num_warps=1, INFERENCE=True)
+    # test_op(8, 1, 2047, 128, 1, backward=False, sparse_attention_fn=None)
+    # test_op_inference(3, 2, 2048, 128, 2048)
+    # test_op_inference(3, 2, 2047, 64, 2047)
+    # test_op_inference(3, 2, 256, 64, 128)
+    # test_op_inference(3, 2, 2048, 64, 1)
+    bench_flash_attention.run(save_path='.', print_data=True)
+    # bench_flash_attention_inference.run(save_path='.', print_data=True)
+# ========================
+# Some Benchmark Results #
+# ========================
+# fused-attention-batch4-head48-d64-sparse-local4-vert4-hetero-fwd
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.057184     0.069646       0.052567
+# 1    512.0      0.131688     0.187658       0.110212
+# 2   1024.0      0.391844     0.524990       0.247875
+# 3   2048.0      1.305190     1.456685       0.596506
+# 4   4096.0      4.623019     4.968653       1.600277
+# 5   8192.0     17.513062    18.332262       4.802458
+# 6  16384.0     68.453377    70.337540      16.052908
+# 7  32768.0    270.655487   276.020233      57.938946
+# fused-attention-batch4-head48-d64-sparse-local4-vert4-hetero-bwd (num_warp=8):
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.190120     0.150313       0.181451
+# 1    512.0      0.406348     0.391767       0.391177
+# 2   1024.0      1.029704     1.182967       0.885741
+# 3   2048.0      2.985456     3.843399       2.040469
+# 4   4096.0      9.808897    13.073701       5.069609
+# 5   8192.0     34.995201    47.863808      13.948782
+# 6  16384.0    132.740097   182.579193      42.816513
+# 7  32768.0    542.223389   714.820618     147.053574
+# fused-attention-inference-batch4-head32-d128-sparse-local4-vert4-hetero:
+#    PAST_LEN  Torch-Dense  Flash-Dense  Triton-Sparse
+# 0     256.0     0.050949     0.032357       0.107513
+# 1     512.0     0.073624     0.050651       0.199086
+# 2    1024.0     0.107472     0.080379       0.245445
+# 3    2048.0     0.178423     0.129448       0.338259
+# 4    4096.0     0.327647     0.223106       0.517048
+# 5    8192.0     0.588423     0.411263       0.884606
+# 6   16384.0     1.098898     0.798941       1.611809
+# 7   32768.0     2.094537     1.594726       3.044160
+# 6.7B model (batch4-head32-d128)
+# fused-attention-batch4-head32-d128-sparse-local4-vert4-hetero-fwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.069208     0.082156       0.065097
+# 1    512.0      0.138271     0.201393       0.144467
+# 2   1024.0      0.391521     0.624614       0.322382
+# 3   2048.0      1.268443     2.406325       0.784367
+# 4   4096.0      4.455703     9.139097       2.100856
+# 5   8192.0     16.764315    35.289600       6.328320
+# 6  16384.0     65.221634   138.401794      21.069057
+# 7  32768.0    257.251343   548.085754      76.111870
+# fused-attention-batch4-head32-d128-sparse-local4-vert4-hetero-bwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.297118     0.266469       0.255255
+# 1    512.0      0.672826     0.613685       0.552954
+# 2   1024.0      1.718434     1.705066       1.251953
+# 3   2048.0      4.936755     5.403875       2.927895
+# 4   4096.0     15.911594    18.959362       7.436288
+# 5   8192.0     55.357441    70.808578      21.140224
+# 6  16384.0    208.188416   273.617920      68.018173
+# 7  32768.0    806.037476  1081.453613     218.720261
+# fused-attention-inference-batch4-head32-d128-sparse-local4-vert4-hetero:
+#    PAST_LEN  Torch-Dense  Flash-Dense  Triton-Sparse
+# 0     256.0     0.050151     0.032337       0.107593
+# 1     512.0     0.073409     0.051737       0.200200
+# 2    1024.0     0.107533     0.082099       0.247067
+# 3    2048.0     0.177259     0.128891       0.338510
+# 4    4096.0     0.325866     0.223621       0.524842
+# 5    8192.0     0.586926     0.408913       0.885490
+# 6   16384.0     1.100834     0.793277       1.612271
+# 7   32768.0     2.098851     1.595831       3.064544
+# fused-attention-batch4-head32-d128-sparse-local4-vert8-hetero-fwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.066673     0.082037       0.065085
+# 1    512.0      0.137379     0.201880       0.143473
+# 2   1024.0      0.390675     0.624234       0.312046
+# 3   2048.0      1.267739     2.406950       0.696045
+# 4   4096.0      4.445138     9.136333       1.665788
+# 5   8192.0     16.768614    35.265533       4.380486
+# 6  16384.0     65.235970   138.393600      12.997633
+# 7  32768.0    257.317902   550.442993      42.821121
+# fused-attention-batch4-head32-d128-sparse-local4-vert8-hetero-bwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.296461     0.266581       0.254022
+# 1    512.0      0.671427     0.613643       0.551283
+# 2   1024.0      1.719918     1.704295       1.229982
+# 3   2048.0      4.945305     5.403364       2.721906
+# 4   4096.0     15.934293    18.960999       6.259371
+# 5   8192.0     55.406593    70.832130      15.676929
+# 6  16384.0    208.750595   275.004425      44.837891
+# 7  32768.0    808.057861  1080.647705     141.856766
+# fused-attention-inference-batch4-head32-d128-sparse-local4-vert8-hetero:
+#    PAST_LEN  Torch-Dense  Flash-Dense  Triton-Sparse
+# 0     256.0     0.050739     0.032886       0.107837
+# 1     512.0     0.073507     0.051996       0.200293
+# 2    1024.0     0.106394     0.080679       0.240610
+# 3    2048.0     0.177659     0.127660       0.287625
+# 4    4096.0     0.326326     0.226971       0.377500
+# 5    8192.0     0.586339     0.407367       0.559266
+# 6   16384.0     1.102279     0.786221       0.920976
+# 7   32768.0     2.097370     1.545090       1.644288
+################
+##### fp16 #####
+################
+# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-fwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.032518     0.035472       0.029939
+# 1    512.0      0.054266     0.087841       0.054320
+# 2   1024.0      0.133447     0.263090       0.102045
+# 3   2048.0      0.384615     1.023293       0.201763
+# 4   4096.0      1.300890     4.023936       0.449555
+# 5   8192.0      4.774144    15.816704       1.150854
+# 6  16384.0     18.220032    62.771198       3.356001
+# 7  32768.0     71.405571   250.273788      10.976142
+# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-bwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.083342     0.069742       0.079496
+# 1    512.0      0.159894     0.170995       0.151705
+# 2   1024.0      0.386071     0.522407       0.331443
+# 3   2048.0      1.067715     1.737333       0.715248
+# 4   4096.0      3.382731     6.219520       1.597457
+# 5   8192.0     11.857793    23.560448       3.879035
+# 6  16384.0     44.422142    91.251709      10.626843
+# 7  32768.0    175.011841   359.473145      32.340992
+################
+##### bf16 #####
+################
+# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-fwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.037636     0.035902       0.031512
+# 1    512.0      0.058591     0.087229       0.058125
+# 2   1024.0      0.143337     0.263919       0.108443
+# 3   2048.0      0.414458     1.025985       0.214114
+# 4   4096.0      1.390841     4.020010       0.480550
+# 5   8192.0      5.067938    15.808171       1.230874
+# 6  16384.0     19.442280    62.765057       3.597274
+# 7  32768.0     75.501572   250.443771      11.768959
+# fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-bwd:
+#    SEQ_LEN  Triton-Dense  Flash-Dense  Triton-Sparse
+# 0    256.0      0.084404     0.070663       0.082613
+# 1    512.0      0.161510     0.172882       0.157661
+# 2   1024.0      0.388954     0.526047       0.339855
+# 3   2048.0      1.075814     1.736057       0.732420
+# 4   4096.0      3.401622     6.221376       1.636039
+# 5   8192.0     11.915136    23.483391       3.968725
+# 6  16384.0     44.660225    91.302910      10.857130
+# 7  32768.0    175.038467   359.048187      32.778240