ShivamMore committed on
Commit
2e6f087
·
1 Parent(s): 204a314

commit name

.gitignore ADDED
@@ -0,0 +1,9 @@
+ .venv
+ *.wav
+ *.mp3
+ *.m4a
+ !prompts/*.wav
+ !prompts/*.mp3
+ !prompts/*.m4a
+ __pycache__
+ *ckpt
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2024 Standard Intelligence PBC
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
inference.ipynb ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "import torch as T\n",
20
+ "import torch.nn as nn\n",
21
+ "import torch.nn.functional as F\n",
22
+ "import torchaudio\n",
23
+ "from utils import load_ckpt, print_colored\n",
24
+ "from tokenizer import make_tokenizer\n",
25
+ "from model import get_hertz_dev_config\n",
26
+ "import matplotlib.pyplot as plt\n",
27
+ "from IPython.display import Audio, display\n",
28
+ "\n",
29
+ "\n",
30
+ "# If you get an error like \"undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12\",\n",
31
+ "# you need to install PyTorch with the correct CUDA version. Run:\n",
32
+ "# `pip3 uninstall torch torchaudio && pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu121`\n",
33
+ "\n",
34
+ "device = 'cuda' if T.cuda.is_available() else 'cpu'\n",
35
+ "if device == 'cuda': T.cuda.set_device(0)\n",
36
+ "print_colored(f\"Using device: {device}\", \"grey\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "# This will automatically download the required checkpoints if they aren't found locally.\n",
46
+ "audio_tokenizer = make_tokenizer(device)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 7,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# We have different checkpoints for the single-speaker and two-speaker models\n",
56
+ "# Set to True to load and run inference with the two-speaker model\n",
57
+ "TWO_SPEAKER = False\n",
58
+ "USE_PURE_AUDIO_ABLATION = False # We trained a base model with no text initialization at all. Toggle this to enable it.\n",
59
+ "assert not (USE_PURE_AUDIO_ABLATION and TWO_SPEAKER) # We only have a single-speaker version of this model.\n"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "model_config = get_hertz_dev_config(is_split=TWO_SPEAKER, use_pure_audio_ablation=USE_PURE_AUDIO_ABLATION)\n",
69
+ "\n",
70
+ "generator = model_config()\n",
71
+ "generator = generator.eval().to(T.bfloat16).to(device)"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "def load_and_preprocess_audio(audio_path):\n",
81
+ " print_colored(\"Loading and preprocessing audio...\", \"blue\", bold=True)\n",
82
+ " # Load audio file\n",
83
+ " audio_tensor, sr = torchaudio.load(audio_path)\n",
84
+ " print_colored(f\"Loaded audio shape: {audio_tensor.shape}\", \"grey\")\n",
85
+ " \n",
86
+ " if TWO_SPEAKER:\n",
87
+ " if audio_tensor.shape[0] == 1:\n",
88
+ " print_colored(\"Converting mono to stereo...\", \"grey\")\n",
89
+ " audio_tensor = audio_tensor.repeat(2, 1)\n",
90
+ " print_colored(f\"Stereo audio shape: {audio_tensor.shape}\", \"grey\")\n",
91
+ " else:\n",
92
+ " if audio_tensor.shape[0] == 2:\n",
93
+ " print_colored(\"Converting stereo to mono...\", \"grey\")\n",
94
+ " audio_tensor = audio_tensor.mean(dim=0).unsqueeze(0)\n",
95
+ " print_colored(f\"Mono audio shape: {audio_tensor.shape}\", \"grey\")\n",
96
+ " \n",
97
+ " # Resample to 16kHz if needed\n",
98
+ " if sr != 16000:\n",
99
+ " print_colored(f\"Resampling from {sr}Hz to 16000Hz...\", \"grey\")\n",
100
+ " resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)\n",
101
+ " audio_tensor = resampler(audio_tensor)\n",
102
+ " \n",
103
+ " # Clip to 5 minutes if needed\n",
104
+ " max_samples = 16000 * 60 * 5\n",
105
+ " if audio_tensor.shape[1] > max_samples:\n",
106
+ " print_colored(\"Clipping audio to 5 minutes...\", \"grey\")\n",
107
+ " audio_tensor = audio_tensor[:, :max_samples]\n",
108
+ "\n",
109
+ " \n",
110
+ " print_colored(\"Audio preprocessing complete!\", \"green\")\n",
111
+ " return audio_tensor.unsqueeze(0)\n",
112
+ "\n",
113
+ "def display_audio(audio_tensor):\n",
114
+ " audio_tensor = audio_tensor.cpu().squeeze()\n",
115
+ " if audio_tensor.ndim == 1:\n",
116
+ " audio_tensor = audio_tensor.unsqueeze(0)\n",
117
+ " audio_tensor = audio_tensor.float()\n",
118
+ "\n",
119
+ " # Make a waveform plot\n",
120
+ " plt.figure(figsize=(4, 1))\n",
121
+ " plt.plot(audio_tensor.numpy()[0], linewidth=0.5)\n",
122
+ " plt.axis('off')\n",
123
+ " plt.show()\n",
124
+ "\n",
125
+ " # Make an audio player\n",
126
+ " display(Audio(audio_tensor.numpy(), rate=16000))\n",
127
+ " print_colored(f\"Audio ready for playback ↑\", \"green\", bold=True)\n",
128
+ " \n",
129
+ " \n",
130
+ "\n",
131
+ "# Our model is very prompt-sensitive, so we recommend experimenting with a diverse set of prompts.\n",
132
+ "prompt_audio = load_and_preprocess_audio('./prompts/toaskanymore.wav')\n",
133
+ "display_audio(prompt_audio)\n",
134
+ "prompt_len_seconds = 3\n",
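+ "# the audio tokenizer produces 8 latent frames per second (2000 samples each at 16 kHz)\n",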
135
+ "prompt_len = prompt_len_seconds * 8"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "print_colored(\"Encoding prompt...\", \"blue\")\n",
145
+ "with T.autocast(device_type='cuda', dtype=T.bfloat16):\n",
146
+ " if TWO_SPEAKER:\n",
147
+ " encoded_prompt_audio_ch1 = audio_tokenizer.latent_from_data(prompt_audio[:, 0:1].to(device))\n",
148
+ " encoded_prompt_audio_ch2 = audio_tokenizer.latent_from_data(prompt_audio[:, 1:2].to(device))\n",
149
+ " encoded_prompt_audio = T.cat([encoded_prompt_audio_ch1, encoded_prompt_audio_ch2], dim=-1)\n",
150
+ " else:\n",
151
+ " encoded_prompt_audio = audio_tokenizer.latent_from_data(prompt_audio.to(device))\n",
152
+ "print_colored(f\"Encoded prompt shape: {encoded_prompt_audio.shape}\", \"grey\")\n",
153
+ "print_colored(\"Prompt encoded successfully!\", \"green\")"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "def get_completion(encoded_prompt_audio, prompt_len, gen_len=None):\n",
163
+ " prompt_len_seconds = prompt_len / 8\n",
164
+ " print_colored(f\"Prompt length: {prompt_len_seconds:.2f}s\", \"grey\")\n",
165
+ " print_colored(\"Completing audio...\", \"blue\")\n",
166
+ " encoded_prompt_audio = encoded_prompt_audio[:, :prompt_len]\n",
167
+ " with T.autocast(device_type='cuda', dtype=T.bfloat16):\n",
168
+ " completed_audio_batch = generator.completion(\n",
169
+ " encoded_prompt_audio, \n",
170
+ " temps=(.8, (0.5, 0.1)), # (token_temp, (categorical_temp, gaussian_temp))\n",
171
+ " use_cache=True,\n",
172
+ " gen_len=gen_len)\n",
173
+ "\n",
174
+ " completed_audio = completed_audio_batch\n",
175
+ " print_colored(f\"Decoding completion...\", \"blue\")\n",
176
+ " if TWO_SPEAKER:\n",
177
+ " decoded_completion_ch1 = audio_tokenizer.data_from_latent(completed_audio[:, :, :32].bfloat16())\n",
178
+ " decoded_completion_ch2 = audio_tokenizer.data_from_latent(completed_audio[:, :, 32:].bfloat16())\n",
179
+ " decoded_completion = T.cat([decoded_completion_ch1, decoded_completion_ch2], dim=0)\n",
180
+ " else:\n",
181
+ " decoded_completion = audio_tokenizer.data_from_latent(completed_audio.bfloat16())\n",
182
+ " print_colored(f\"Decoded completion shape: {decoded_completion.shape}\", \"grey\")\n",
183
+ "\n",
184
+ " print_colored(\"Preparing audio for playback...\", \"blue\")\n",
185
+ "\n",
186
+ " audio_tensor = decoded_completion.cpu().squeeze()\n",
187
+ " if audio_tensor.ndim == 1:\n",
188
+ " audio_tensor = audio_tensor.unsqueeze(0)\n",
189
+ " audio_tensor = audio_tensor.float()\n",
190
+ "\n",
191
+ " if audio_tensor.abs().max() > 1:\n",
192
+ " audio_tensor = audio_tensor / audio_tensor.abs().max()\n",
193
+ "\n",
194
+ " return audio_tensor[:, max(prompt_len*2000 - 16000, 0):]\n",
195
+ "\n",
196
+ "num_completions = 10\n",
197
+ "print_colored(f\"Generating {num_completions} completions...\", \"blue\")\n",
198
+ "for _ in range(num_completions):\n",
199
+ " completion = get_completion(encoded_prompt_audio, prompt_len, gen_len=20*8) # 20 seconds of generation\n",
200
+ " display_audio(completion)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": []
216
+ }
217
+ ],
218
+ "metadata": {
219
+ "kernelspec": {
220
+ "display_name": ".venv",
221
+ "language": "python",
222
+ "name": "python3"
223
+ },
224
+ "language_info": {
225
+ "codemirror_mode": {
226
+ "name": "ipython",
227
+ "version": 3
228
+ },
229
+ "file_extension": ".py",
230
+ "mimetype": "text/x-python",
231
+ "name": "python",
232
+ "nbconvert_exporter": "python",
233
+ "pygments_lexer": "ipython3",
234
+ "version": "3.10.12"
235
+ }
236
+ },
237
+ "nbformat": 4,
238
+ "nbformat_minor": 2
239
+ }
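For convenience, a minimal sketch (not part of this commit; save_completion is a hypothetical helper) of how a completion produced by the notebook loop above could be written to disk with torchaudio:

import torchaudio

def save_completion(completion, path="completion.wav", sample_rate=16000):
    # `completion` is the (channels, samples) float tensor returned by get_completion()
    torchaudio.save(path, completion.cpu(), sample_rate)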
inference_client.py ADDED
@@ -0,0 +1,161 @@
1
+ # Websocket client for inference_server.py: captures microphone audio with sounddevice,
+ # streams it to the server, and plays back the returned audio.
4
+ import asyncio
5
+ import websockets
6
+ import sounddevice as sd
7
+ import numpy as np
8
+ import base64
9
+ import queue
10
+ import argparse
11
+ import requests
12
+ import time
13
+
14
+ class AudioClient:
15
+ def __init__(self, server_url="ws://localhost:8000", token_temp=None, categorical_temp=None, gaussian_temp=None):
16
+ # Convert ws:// to http:// for the base URL
17
+ self.base_url = server_url.replace("ws://", "http://")
18
+ self.server_url = f"{server_url}/audio"
19
+
20
+ # Set temperatures if provided
21
+ if any(t is not None for t in [token_temp, categorical_temp, gaussian_temp]):
22
+ self.set_temperature_and_echo(token_temp, categorical_temp, gaussian_temp)
23
+
24
+ # Initialize queues
25
+ self.audio_queue = queue.Queue()
26
+ self.output_queue = queue.Queue()
27
+
28
+ def set_temperature_and_echo(self, token_temp=None, categorical_temp=None, gaussian_temp=None, echo_testing = False):
29
+ """Send temperature settings to server"""
30
+ params = {}
31
+ if token_temp is not None:
32
+ params['token_temp'] = token_temp
33
+ if categorical_temp is not None:
34
+ params['categorical_temp'] = categorical_temp
35
+ if gaussian_temp is not None:
36
+ params['gaussian_temp'] = gaussian_temp
37
+
38
+ response = requests.post(f"{self.base_url}/set_temperature", params=params)
39
+ print(response.json()['message'])
40
+
41
+ def audio_callback(self, indata, frames, time, status):
42
+ """This is called for each audio block"""
43
+ if status:
44
+ print(status)
45
+ # if np.isclose(indata, 0).all():
46
+ # raise Exception('Audio input is not working - received all zeros')
47
+ # Convert float32 to int16 for efficient transmission
48
+ indata_int16 = (indata.copy() * 32767).astype(np.int16)
49
+ # indata_int16 = np.zeros_like(indata_int16)
50
+ self.audio_queue.put(indata_int16)
51
+
52
+ def output_stream_callback(self, outdata, frames, time, status):
53
+ """Callback for output stream to get audio data"""
54
+ if status:
55
+ print(status)
56
+
57
+ try:
58
+ data = self.output_queue.get_nowait()
59
+ data = data.astype(np.float32) / 32767.0
60
+ if len(data) < len(outdata):
61
+ outdata[:len(data)] = data
62
+ outdata[len(data):] = 0
63
+ else:
64
+ outdata[:] = data[:len(outdata)]
65
+ except queue.Empty:
66
+ outdata.fill(0)
67
+
68
+ async def process_audio(self):
69
+ async with websockets.connect(self.server_url) as ws:
70
+ while self.running:
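+ # Each queued block is 2000 samples of 16 kHz int16 audio (125 ms); one websocket message per block.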
71
+ if not self.audio_queue.empty():
72
+ # Get recorded audio
73
+ audio_data = self.audio_queue.get()
74
+ print(f'Data from microphone:{audio_data.shape, audio_data.dtype, audio_data.min(), audio_data.max()}')
75
+
76
+ # Convert to base64
77
+ audio_b64 = base64.b64encode(audio_data.tobytes()).decode('utf-8')
78
+
79
+ # Send to server
80
+ time_sent = time.time()
81
+ await ws.send(f"data:audio/raw;base64,{audio_b64}")
82
+
83
+ # Receive processed audio
84
+ response = await ws.recv()
85
+ response = response.split(",")[1]
86
+ time_received = time.time()
87
+ print(f"Data sent: {audio_b64[:10]}. Data received: {response[:10]}. Received in {(time_received - time_sent) * 1000:.2f} ms")
88
+ processed_audio = np.frombuffer(
89
+ base64.b64decode(response),
90
+ dtype=np.int16
91
+ ).reshape(-1, CHANNELS)
92
+ print(f'Data from model:{processed_audio.shape, processed_audio.dtype, processed_audio.min(), processed_audio.max()}')
93
+
94
+ self.output_queue.put(processed_audio)
95
+
96
+ def start(self):
97
+ self.running = True
98
+ # Print audio device information
99
+ devices = sd.query_devices()
100
+ default_input = sd.query_devices(kind='input')
101
+ default_output = sd.query_devices(kind='output')
102
+
103
+ print("\nAudio Device Configuration:")
104
+ print("-" * 50)
105
+ print(f"Default Input Device:\n{default_input}\n")
106
+ print(f"Default Output Device:\n{default_output}\n")
107
+ print("\nAll Available Devices:")
108
+ print("-" * 50)
109
+ for i, device in enumerate(devices):
110
+ print(f"Device {i}:")
111
+ print(f"Name: {device['name']}")
112
+ print(f"Channels (in/out): {device['max_input_channels']}/{device['max_output_channels']}")
113
+ print(f"Default Sample Rate: {device['default_samplerate']}")
114
+ print()
115
+ input_device = input("Enter the index of the input device or press enter for default: ")
116
+ output_device = input("Enter the index of the output device or press enter for default: ")
117
+ if input_device == "":
118
+ input_device = default_input['index']
119
+ if output_device == "":
120
+ output_device = default_output['index']
121
+ with sd.InputStream(callback=self.audio_callback,
122
+ channels=CHANNELS,
123
+ samplerate=SAMPLE_RATE,
124
+ device=int(input_device),
125
+ blocksize=2000), \
126
+ sd.OutputStream(callback=self.output_stream_callback,
127
+ channels=CHANNELS,
128
+ samplerate=SAMPLE_RATE,
129
+ blocksize=2000,
130
+ device=int(output_device)):
131
+
132
+ asyncio.run(self.process_audio())
133
+
134
+ def stop(self):
135
+ self.running = False
136
+
137
+ if __name__ == "__main__":
138
+ parser = argparse.ArgumentParser(description='Audio Client with Temperature Control')
139
+ parser.add_argument('--token_temp', '-t1', type=float, help='Token (LM) temperature parameter')
140
+ parser.add_argument('--categorical_temp', '-t2', type=float, help='Categorical (VAE) temperature parameter')
141
+ parser.add_argument('--gaussian_temp', '-t3', type=float, help='Gaussian (VAE) temperature parameter')
142
+ parser.add_argument('--server', '-s', default="ws://localhost:8000",
143
+ help='Server URL (default: ws://localhost:8000)')
144
+
145
+ args = parser.parse_args()
146
+
147
+ # Audio settings
148
+ SAMPLE_RATE = 16000
149
+ CHANNELS = 1
150
+
151
+ client = AudioClient(
152
+ server_url=args.server,
153
+ token_temp=args.token_temp,
154
+ categorical_temp=args.categorical_temp,
155
+ gaussian_temp=args.gaussian_temp
156
+ )
157
+
158
+ try:
159
+ client.start()
160
+ except KeyboardInterrupt:
161
+ client.stop()
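Both clients and the server frame audio the same way: 16 kHz int16 PCM, base64-encoded into a data:audio/raw;base64,... string per websocket message. A minimal sketch of that round trip (hypothetical helper names; standard library plus numpy only):

import base64
import numpy as np

def encode_chunk(samples: np.ndarray) -> str:
    # samples: int16 PCM at 16 kHz, exactly what audio_callback puts on the queue
    return "data:audio/raw;base64," + base64.b64encode(samples.tobytes()).decode("utf-8")

def decode_chunk(message: str) -> np.ndarray:
    # inverse of encode_chunk, mirroring the decode after ws.recv()
    return np.frombuffer(base64.b64decode(message.split(",")[1]), dtype=np.int16)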
inference_client_webrtc.py ADDED
@@ -0,0 +1,255 @@
1
+ # Streamlit + WebRTC client for inference_server.py: streams browser microphone audio
+ # to the server and plays back the generated audio in real time.
4
+ import asyncio
5
+ import websockets
6
+ import numpy as np
7
+ import base64
8
+ import argparse
9
+ import requests
10
+ import time
11
+ import torch
12
+ import torchaudio
13
+
14
+ import av
15
+ import streamlit as st
16
+ from typing import List
17
+ from streamlit_webrtc import WebRtcMode, webrtc_streamer
18
+
19
+ class AudioClient:
20
+ def __init__(self, server_url="ws://localhost:8000", token_temp=None, categorical_temp=None, gaussian_temp=None):
21
+ # Convert ws:// to http:// for the base URL
22
+ self.base_url = server_url.replace("ws://", "http://")
23
+ self.server_url = f"{server_url}/audio"
24
+ self.sound_check = False
25
+
26
+ # Set temperatures if provided
27
+ if any(t is not None for t in [token_temp, categorical_temp, gaussian_temp]):
28
+ response_message = self.set_temperature_and_echo(token_temp, categorical_temp, gaussian_temp)
29
+ print(response_message)
30
+
31
+ self.downsampler = torchaudio.transforms.Resample(STREAMING_SAMPLE_RATE, SAMPLE_RATE)
32
+ self.upsampler = torchaudio.transforms.Resample(SAMPLE_RATE, STREAMING_SAMPLE_RATE)
33
+ self.ws = None
34
+ self.in_buffer = None
35
+ self.out_buffer = None
36
+
37
+ def set_temperature_and_echo(self, token_temp=None, categorical_temp=None, gaussian_temp=None, echo_testing = False):
38
+ """Send temperature settings to server"""
39
+ params = {}
40
+ if token_temp is not None:
41
+ params['token_temp'] = token_temp
42
+ if categorical_temp is not None:
43
+ params['categorical_temp'] = categorical_temp
44
+ if gaussian_temp is not None:
45
+ params['gaussian_temp'] = gaussian_temp
46
+
47
+ response = requests.post(f"{self.base_url}/set_temperature", params=params)
48
+ response_message = response.json()['message']
49
+ return response_message
50
+
51
+ def _resample(self, audio_data: np.ndarray, resampler: torchaudio.transforms.Resample) -> np.ndarray:
52
+ audio_data = audio_data.astype(np.float32) / 32767.0
53
+ audio_data = resampler(torch.tensor(audio_data)).numpy()
54
+ audio_data = (audio_data * 32767.0).astype(np.int16)
55
+ return audio_data
56
+
57
+ def upsample(self, audio_data: np.ndarray) -> np.ndarray:
58
+ return self._resample(audio_data, self.upsampler)
59
+
60
+ def downsample(self, audio_data: np.ndarray) -> np.ndarray:
61
+ return self._resample(audio_data, self.downsampler)
62
+
63
+ def from_s16_format(self, audio_data: np.ndarray, channels: int) -> np.ndarray:
64
+ if channels == 2:
65
+ audio_data = audio_data.reshape(-1, 2).T
66
+ else:
67
+ audio_data = audio_data.reshape(-1)
68
+ return audio_data
69
+
70
+ def to_s16_format(self, audio_data: np.ndarray):
71
+ if len(audio_data.shape) == 2 and audio_data.shape[0] == 2:
72
+ audio_data = audio_data.T.reshape(1, -1)
73
+ elif len(audio_data.shape) == 1:
74
+ audio_data = audio_data.reshape(1, -1)
75
+ return audio_data
76
+
77
+ def to_channels(self, audio_data: np.ndarray, channels: int) -> np.ndarray:
78
+ current_channels = audio_data.shape[0] if len(audio_data.shape) == 2 else 1
79
+ if current_channels == channels:
80
+ return audio_data
81
+ elif current_channels == 1 and channels == 2:
82
+ audio_data = np.tile(audio_data, 2).reshape(2, -1)
83
+ elif current_channels == 2 and channels == 1:
84
+ audio_data = audio_data.astype(np.float32) / 32767.0
85
+ audio_data = audio_data.mean(axis=0)
86
+ audio_data = (audio_data * 32767.0).astype(np.int16)
87
+ return audio_data
88
+
89
+ async def process_audio(self, audio_data: np.ndarray) -> np.ndarray:
90
+ if self.ws is None:
91
+ self.ws = await websockets.connect(self.server_url)
92
+
93
+ audio_data = audio_data.reshape(-1, CHANNELS)
94
+ print(f'Data from microphone:{audio_data.shape, audio_data.dtype, audio_data.min(), audio_data.max()}')
95
+
96
+ # Convert to base64
97
+ audio_b64 = base64.b64encode(audio_data.tobytes()).decode('utf-8')
98
+
99
+ # Send to server
100
+ time_sent = time.time()
101
+ await self.ws.send(f"data:audio/raw;base64,{audio_b64}")
102
+
103
+ # Receive processed audio
104
+ response = await self.ws.recv()
105
+ response = response.split(",")[1]
106
+ time_received = time.time()
107
+ print(f"Data sent: {audio_b64[:10]}. Data received: {response[:10]}. Received in {(time_received - time_sent) * 1000:.2f} ms")
108
+ processed_audio = np.frombuffer(
109
+ base64.b64decode(response),
110
+ dtype=np.int16
111
+ ).reshape(-1, CHANNELS)
112
+ print(f'Data from model:{processed_audio.shape, processed_audio.dtype, processed_audio.min(), processed_audio.max()}')
113
+
114
+ if CHANNELS == 1:
115
+ processed_audio = processed_audio.reshape(-1)
116
+ return processed_audio
117
+
118
+ async def queued_audio_frames_callback(self, frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
119
+ out_frames = []
120
+ for frame in frames:
121
+ # Read in audio
122
+ audio_data = frame.to_ndarray()
123
+
124
+ # Convert input audio from s16 format, convert to `CHANNELS` number of channels, and downsample
125
+ audio_data = self.from_s16_format(audio_data, len(frame.layout.channels))
126
+ audio_data = self.to_channels(audio_data, CHANNELS)
127
+ audio_data = self.downsample(audio_data)
128
+
129
+ # Add audio to input buffer
130
+ if self.in_buffer is None:
131
+ self.in_buffer = audio_data
132
+ else:
133
+ self.in_buffer = np.concatenate((self.in_buffer, audio_data), axis=-1)
134
+
135
+ # Take BLOCK_SIZE samples from input buffer if available for processing
136
+ if self.in_buffer.shape[0] >= BLOCK_SIZE:
137
+ audio_data = self.in_buffer[:BLOCK_SIZE]
138
+ self.in_buffer = self.in_buffer[BLOCK_SIZE:]
139
+ else:
140
+ audio_data = None
141
+
142
+ # Process audio if available and add resulting audio to output buffer
143
+ if audio_data is not None:
144
+ if not self.sound_check:
145
+ audio_data = await self.process_audio(audio_data)
146
+ if self.out_buffer is None:
147
+ self.out_buffer = audio_data
148
+ else:
149
+ self.out_buffer = np.concatenate((self.out_buffer, audio_data), axis=-1)
150
+
151
+ # Take `out_samples` samples from output buffer if available for output
152
+ out_samples = int(frame.samples * SAMPLE_RATE / STREAMING_SAMPLE_RATE)
153
+ if self.out_buffer is not None and self.out_buffer.shape[0] >= out_samples:
154
+ audio_data = self.out_buffer[:out_samples]
155
+ self.out_buffer = self.out_buffer[out_samples:]
156
+ else:
157
+ audio_data = None
158
+
159
+ # Output silence if no audio data available
160
+ if audio_data is None:
161
+ # output silence
162
+ audio_data = np.zeros(out_samples, dtype=np.int16)
163
+
164
+ # Upsample output audio, convert to original number of channels, and convert to s16 format
165
+ audio_data = self.upsample(audio_data)
166
+ audio_data = self.to_channels(audio_data, len(frame.layout.channels))
167
+ audio_data = self.to_s16_format(audio_data)
168
+
169
+ # return audio data as AudioFrame
170
+ new_frame = av.AudioFrame.from_ndarray(audio_data, format=frame.format.name, layout=frame.layout.name)
171
+ new_frame.sample_rate = frame.sample_rate
172
+ out_frames.append(new_frame)
173
+
174
+ return out_frames
175
+
176
+ def stop(self):
177
+ if self.ws is not None:
178
+ # TODO: this hangs. Figure out why.
179
+ #asyncio.get_event_loop().run_until_complete(self.ws.close())
180
+ print("Websocket closed")
181
+ self.ws = None
182
+ self.in_buffer = None
183
+ self.out_buffer = None
184
+
185
+ if __name__ == "__main__":
186
+ parser = argparse.ArgumentParser(description='Audio Client with Temperature Control')
187
+ parser.add_argument('--token_temp', '-t1', type=float, help='Token (LM) temperature parameter')
188
+ parser.add_argument('--categorical_temp', '-t2', type=float, help='Categorical (VAE) temperature parameter')
189
+ parser.add_argument('--gaussian_temp', '-t3', type=float, help='Gaussian (VAE) temperature parameter')
190
+ parser.add_argument('--server', '-s', default="ws://localhost:8000",
191
+ help='Server URL (default: ws://localhost:8000)')
192
+ parser.add_argument("--use_ice_servers", action="store_true", help="Use public STUN servers")
193
+
194
+ args = parser.parse_args()
195
+
196
+ # Audio settings
197
+ STREAMING_SAMPLE_RATE = 48000
198
+ SAMPLE_RATE = 16000
199
+ BLOCK_SIZE = 2000
200
+ CHANNELS = 1
201
+
202
+ st.title("hertz-dev webrtc demo!")
203
+ st.markdown("""
204
+ Welcome to the audio processing interface! Here you can talk live with hertz.
205
+ - Process audio in real-time through your microphone
206
+ - Adjust various temperature parameters for inference
207
+ - Test your microphone with sound check mode
208
+ - Enable/disable echo cancellation and noise suppression
209
+
210
+ To begin, click the START button below and allow microphone access.
211
+ """)
212
+
213
+ audio_client = st.session_state.get("audio_client")
214
+ if audio_client is None:
215
+ audio_client = AudioClient(
216
+ server_url=args.server,
217
+ token_temp=args.token_temp,
218
+ categorical_temp=args.categorical_temp,
219
+ gaussian_temp=args.gaussian_temp
220
+ )
221
+ st.session_state.audio_client = audio_client
222
+
223
+ with st.sidebar:
224
+ st.markdown("## Inference Settings")
225
+ token_temp_default = args.token_temp if args.token_temp is not None else 0.8
226
+ token_temp = st.slider("Token Temperature", 0.05, 2.0, token_temp_default, step=0.05)
227
+ categorical_temp_default = args.categorical_temp if args.categorical_temp is not None else 0.4
228
+ categorical_temp = st.slider("Categorical Temperature", 0.01, 1.0, categorical_temp_default, step=0.01)
229
+ gaussian_temp_default = args.gaussian_temp if args.gaussian_temp is not None else 0.1
230
+ gaussian_temp = st.slider("Gaussian Temperature", 0.01, 1.0, gaussian_temp_default, step=0.01)
231
+ if st.button("Set Temperatures"):
232
+ response_message = audio_client.set_temperature_and_echo(token_temp, categorical_temp, gaussian_temp)
233
+ st.write(response_message)
234
+
235
+ st.markdown("## Microphone Settings")
236
+ audio_client.sound_check = st.toggle("Sound Check (Echo)", value=False)
237
+ echo_cancellation = st.toggle("Echo Cancellation*‡", value=False)
238
+ noise_suppression = st.toggle("Noise Suppression*", value=False)
239
+ st.markdown(r"\* *Restart stream to take effect*")
240
+ st.markdown("‡ *May cause audio to cut out*")
241
+
242
+ # Use a free STUN server from Google if --use_ice_servers is given
243
+ # (found in get_ice_servers() at https://github.com/whitphx/streamlit-webrtc/blob/main/sample_utils/turn.py)
244
+ rtc_configuration = {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]} if args.use_ice_servers else None
245
+ audio_config = {"echoCancellation": echo_cancellation, "noiseSuppression": noise_suppression}
246
+ webrtc_streamer(
247
+ key="streamer",
248
+ mode=WebRtcMode.SENDRECV,
249
+ rtc_configuration=rtc_configuration,
250
+ media_stream_constraints={"audio": audio_config, "video": False},
251
+ queued_audio_frames_callback=audio_client.queued_audio_frames_callback,
252
+ on_audio_ended=audio_client.stop,
253
+ async_processing=True,
254
+ )
255
+
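The buffering above exists because browser audio arrives at 48 kHz in small frames while the model pipeline works in fixed 2000-sample chunks at 16 kHz (BLOCK_SIZE). A rough sketch of the bookkeeping, assuming 20 ms WebRTC frames (960 samples at 48 kHz; the frame size is an assumption, not something this file fixes):

STREAMING_SAMPLE_RATE = 48000   # browser/WebRTC rate
SAMPLE_RATE = 16000             # model rate
BLOCK_SIZE = 2000               # samples sent to the server per step (125 ms)

frame_samples = 960             # assumed 20 ms frame at 48 kHz
per_frame = frame_samples * SAMPLE_RATE // STREAMING_SAMPLE_RATE   # 320 samples after downsampling
frames_to_buffer = -(-BLOCK_SIZE // per_frame)                     # ceil(2000 / 320) = 7 frames (~140 ms)
print(per_frame, frames_to_buffer)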
inference_server.py ADDED
@@ -0,0 +1,172 @@
1
+ import time
2
+ import numpy as np
3
+ from fastapi import FastAPI, WebSocket
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ import base64
6
+ import uvicorn
7
+ import traceback
8
9
+ import argparse
10
+
11
+ import torch as T
12
+ import torch.nn.functional as F
13
+ import torchaudio
14
+
15
+ import os
16
+ from typing import Optional
17
+
18
+ from utils import print_colored
19
+ from model import get_hertz_dev_config
20
+
21
+
22
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--prompt_path', type=str, default='./prompts/bob_mono.wav', help="""
+ We highly recommend making your own prompt based on a conversation between you and another person.
+ bob_mono.wav seems to work better for two-channel than bob_stereo.wav.
+ """)
+ args = parser.parse_args()
29
+
30
+
31
+ device = 'cuda' if T.cuda.is_available() else 'cpu'
32
+ print_colored(f"Using device: {device}", "grey")
33
+
34
+ model_config = get_hertz_dev_config(is_split=True)
35
+
36
+ model = model_config()
37
+ model = model.eval().bfloat16().to(device)
38
+
39
+ app = FastAPI()
40
+
41
+ app.add_middleware(
42
+ CORSMiddleware,
43
+ allow_origins=["*"],
44
+ allow_credentials=True,
45
+ allow_methods=["*"],
46
+ allow_headers=["*"],
47
+ )
48
+
49
+
50
+ # Hyperparams or something.
51
+ SAMPLE_RATE = 16000 # Don't change this
52
+ TEMPS = (0.8, (0.4, 0.1)) # You can change this, but there's also an endpoint for it.
53
+ REPLAY_SECONDS = 3 # What the user hears as context.
54
+
55
+ class AudioProcessor:
56
+ def __init__(self, model, prompt_path):
57
+ self.model = model
58
+ self.prompt_path = prompt_path
59
+ self.initialize_state(prompt_path)
60
+
61
+ def initialize_state(self, prompt_path):
62
+ loaded_audio, sr = torchaudio.load(prompt_path)
63
+ self.replay_seconds = REPLAY_SECONDS
64
+
65
+ if sr != SAMPLE_RATE:
66
+ resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
67
+ loaded_audio = resampler(loaded_audio)
68
+
69
+ if loaded_audio.shape[0] == 1:
70
+ loaded_audio = loaded_audio.repeat(2, 1)
71
+
72
+ audio_length = loaded_audio.shape[-1]
73
+ num_chunks = audio_length // 2000
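+ # each 2000-sample chunk is 125 ms at 16 kHz, i.e. 8 chunks per second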
74
+ loaded_audio = loaded_audio[..., :num_chunks * 2000]
75
+
76
+ self.loaded_audio = loaded_audio.to(device)
77
+
78
+ with T.autocast(device_type=device, dtype=T.bfloat16), T.inference_mode():
79
+ self.model.init_cache(bsize=1, device=device, dtype=T.bfloat16, length=1024)
80
+ self.next_model_audio = self.model.next_audio_from_audio(self.loaded_audio.unsqueeze(0), temps=TEMPS)
81
+ self.prompt_buffer = None
82
+ self.prompt_position = 0
83
+ self.chunks_until_live = int(self.replay_seconds * 8)
84
+ self.initialize_prompt_buffer()
85
+ print_colored("AudioProcessor state initialized", "green")
86
+
87
+ def initialize_prompt_buffer(self):
88
+ self.recorded_audio = self.loaded_audio
89
+ prompt_audio = self.loaded_audio.reshape(1, 2, -1)
90
+ prompt_audio = prompt_audio[:, :, -(16000*self.replay_seconds):].cpu().numpy()
91
+ prompt_audio_mono = prompt_audio.mean(axis=1)
92
+ self.prompt_buffer = np.array_split(prompt_audio_mono[0], int(self.replay_seconds * 8))
93
+ print_colored(f"Initialized prompt buffer with {len(self.prompt_buffer)} chunks", "grey")
94
+
95
+ async def process_audio(self, audio_data):
96
+ if self.chunks_until_live > 0:
97
+ print_colored(f"Serving from prompt buffer, {self.chunks_until_live} chunks left", "grey")
98
+ chunk = self.prompt_buffer[int(self.replay_seconds * 8) - self.chunks_until_live]
99
+ self.chunks_until_live -= 1
100
+
101
+ if self.chunks_until_live == 0:
102
+ print_colored("Switching to live processing mode", "green")
103
+
104
+ time.sleep(0.05)
105
+ return chunk
106
+
107
+ audio_tensor = T.from_numpy(audio_data).to(device)
108
+ audio_tensor = audio_tensor.reshape(1, 1, -1)
109
+ audio_tensor = T.cat([audio_tensor, self.next_model_audio], dim=1)
110
+
111
+ with T.autocast(device_type=device, dtype=T.bfloat16), T.inference_mode():
112
+ curr_model_audio = self.model.next_audio_from_audio(
113
+ audio_tensor,
114
+ temps=TEMPS
115
+ )
116
+ print(f"Recorded audio shape {self.recorded_audio.shape}, audio tensor shape {audio_tensor.shape}")
117
+ self.recorded_audio = T.cat([self.recorded_audio.cpu(), audio_tensor.squeeze(0).cpu()], dim=-1)
118
+
119
+ self.next_model_audio = curr_model_audio
120
+
121
+ return curr_model_audio.float().cpu().numpy()
122
+
123
+ def cleanup(self):
124
+ print_colored("Cleaning up audio processor...", "blue")
125
+ os.makedirs('audio_recordings', exist_ok=True)
126
+ torchaudio.save(f'audio_recordings/{time.strftime("%d-%H-%M")}.wav', self.recorded_audio.cpu(), SAMPLE_RATE)
127
+ self.model.deinit_cache()
128
+ self.initialize_state(self.prompt_path)
129
+ print_colored("Audio processor cleanup complete", "green")
130
+
131
+ @app.post("/set_temperature")
132
+ async def set_temperature(token_temp: Optional[float] = None, categorical_temp: Optional[float] = None, gaussian_temp: Optional[float] = None):
133
+ try:
134
+ global TEMPS
135
+ TEMPS = (token_temp, (categorical_temp, gaussian_temp))
136
+
137
+ print_colored(f"Temperature updated to: {TEMPS}", "green")
138
+ return {"message": f"Temperature updated to: {TEMPS}", "status": "success"}
139
+ except Exception as e:
140
+ print_colored(f"Error setting temperature: {str(e)}", "red")
141
+ return {"message": f"Error setting temperature: {str(e)}", "status": "error"}
142
+
143
+ @app.websocket("/audio")
144
+ async def websocket_endpoint(websocket: WebSocket):
145
+ await websocket.accept()
146
+ try:
147
+ while True:
148
+ data = await websocket.receive_text()
149
+ audio_data = np.frombuffer(
150
+ base64.b64decode(data.split(",")[1]),
151
+ dtype=np.int16
152
+ )
153
+ audio_data = audio_data.astype(np.float32) / 32767.0
154
+ processed_audio = await audio_processor.process_audio(audio_data)
155
+ processed_audio = (processed_audio * 32767).astype(np.int16)
156
+
157
+ processed_data = base64.b64encode(processed_audio.tobytes()).decode('utf-8')
158
+ await websocket.send_text(f"data:audio/raw;base64,{processed_data}")
159
+
160
+ except Exception as e:
161
+ print_colored(f"WebSocket error: {e}", "red")
162
+ print_colored(f"Full traceback:\n{traceback.format_exc()}", "red")
163
+ finally:
164
+ audio_processor.cleanup()
165
+ await websocket.close()
166
+
167
+
168
+ audio_processor = AudioProcessor(model=model, prompt_path=args.prompt_path)
169
+
170
+ if __name__ == "__main__":
171
+ uvicorn.run(app, host="0.0.0.0", port=8000)
172
+ print("Server started")
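Once the server is running, the sampling temperatures can be changed without restarting; this is the same call the clients make (a sketch, assuming the default host and port):

import requests

resp = requests.post(
    "http://localhost:8000/set_temperature",
    params={"token_temp": 0.8, "categorical_temp": 0.4, "gaussian_temp": 0.1},
)
print(resp.json()["message"])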
ioblocks.py ADDED
@@ -0,0 +1,333 @@
1
+ from __future__ import annotations
2
+ from functools import partial
3
+ from contextlib import nullcontext
4
+ from typing import List, Tuple
5
+ from math import ceil
6
+
7
+ import torch as T
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.distributed as dist
11
+ from torch import Tensor, int32
12
+ from torch.amp import autocast
13
+
14
+ from einops import rearrange, pack, unpack
15
+
16
+
17
+ from utils import si_module, exists, default, maybe
18
+
19
+
20
+ @si_module
21
+ class GaussianMixtureIOLayer(nn.Module):
22
+ class Config:
23
+ latent_dim: int
24
+ dim: int
25
+ num_components: int
26
+
27
+ def __init__(self, c: Config):
28
+ super().__init__()
29
+ self.latent_dim = c.latent_dim
30
+ self.num_components = c.num_components
31
+ self.input_projection = nn.Linear(c.latent_dim, c.dim)
32
+
33
+ self.fc_loc = nn.Linear(c.dim, c.num_components * c.latent_dim)
34
+ self.fc_scale = nn.Linear(c.dim, c.num_components * c.latent_dim)
35
+ self.fc_weight = nn.Linear(c.dim, c.num_components)
36
+
37
+ def _square_plus(self, x):
38
+ return (x + T.sqrt(T.square(x) + 4)) / 2
39
+
40
+ def input(self, sampled_latents: T.Tensor) -> T.Tensor:
41
+ """Pre-sampled latents T.Tensor (B, L, Z) -> float tensor (B, L, D)"""
42
+ hidden = self.input_projection(sampled_latents)
43
+ return hidden
44
+
45
+ def output(self, h: T.Tensor) -> Tuple[T.Tensor, T.Tensor, T.Tensor]:
46
+ """float tensor (B, L, D) -> Tuple of locs, scales, and weights"""
47
+ batch_size, seq_len, _ = h.shape
48
+
49
+ locs = self.fc_loc(h).view(batch_size, seq_len, self.num_components, self.latent_dim)
50
+ scales = T.clamp(self._square_plus(self.fc_scale(h)), min=1e-6).view(batch_size, seq_len, self.num_components, self.latent_dim)
51
+ weights = self.fc_weight(h).view(batch_size, seq_len, self.num_components)
52
+
53
+ return (locs, scales, weights)
54
+
55
+ def loss(self, data, dataHat):
56
+ locs, scales, weights = dataHat
57
+ log_probs = -0.5 * T.sum(
58
+ (data.unsqueeze(-2) - locs).pow(2) / scales.pow(2) +
59
+ 2 * T.log(scales) +
60
+ T.log(T.tensor(2 * T.pi)),
61
+ dim=-1
62
+ )
63
+ log_weights = F.log_softmax(weights, dim=-1)
64
+ return -T.logsumexp(log_weights + log_probs, dim=-1)
65
+
66
+
67
+ def temp_sample(self, orig_pdist, temp):
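+ # temp is None (sample the mixture directly), a single float, or a (categorical_temp, gaussian_temp) tuple as passed by the inference scripts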
68
+ locs, scales, weights = orig_pdist
69
+ if temp is None:
70
+ component_samples = locs + scales * T.randn_like(scales)
71
+ mixture_samples = F.gumbel_softmax(weights, hard=True)
72
+ sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
73
+ elif isinstance(temp, tuple):
74
+ assert len(temp) == 2
75
+ categorical_temp, gaussian_temp = temp
76
+ component_samples = locs + scales * gaussian_temp * T.randn_like(scales)
77
+ mixture_samples = F.gumbel_softmax(weights / categorical_temp, hard=True)
78
+ sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
79
+ else:
80
+ component_samples = locs + scales * temp * T.randn_like(scales)
81
+ mixture_samples = F.gumbel_softmax(weights / temp, hard=True)
82
+ sampled = (component_samples * mixture_samples.unsqueeze(-1)).sum(dim=-2)
83
+ return sampled
84
+
85
+
86
+ class GPTOutput(nn.Module):
87
+ def __init__(self, dim, vocab_size):
88
+ super().__init__()
89
+ self.output = nn.Linear(dim, vocab_size, bias=False)
90
+
91
+ def forward(self, x):
92
+ return self.output(x)
93
+
94
+
95
+ # helper functions
96
+
97
+ def pack_one(t, pattern):
98
+ return pack([t], pattern)
99
+
100
+ def unpack_one(t, ps, pattern):
101
+ return unpack(t, ps, pattern)[0]
102
+
103
+ def first(l):
104
+ return l[0]
105
+
106
+ def round_up_multiple(num, mult):
107
+ return ceil(num / mult) * mult
108
+
109
+ def get_code_utilization(codes, codebook_size, get_global=False):
110
+ if get_global and dist.is_initialized():
111
+ world_size = dist.get_world_size()
112
+ else:
113
+ world_size = 1
114
+
115
+ if world_size > 1:
116
+ gathered_tokens = [T.zeros_like(codes) for _ in range(world_size)]
117
+ dist.all_gather(gathered_tokens, codes)
118
+ gathered_tokens = T.cat(gathered_tokens, dim=0)
119
+ else:
120
+ gathered_tokens = codes
121
+ unique_tokens = len(T.unique(gathered_tokens))
122
+ code_utilization = unique_tokens / min(gathered_tokens.numel(), codebook_size)
123
+ return code_utilization
124
+
125
+ # tensor helpers
126
+
127
+ def round_ste(z: Tensor) -> Tensor:
128
+ """Round with straight through gradients."""
129
+ zhat = z.round()
130
+ return z + (zhat - z).detach()
131
+
132
+ # main class
133
+ # lucidrains fsq
134
+ @si_module
135
+ class FSQ(nn.Module):
136
+ @property
137
+ def needs_float32_params(self):
138
+ return True
139
+
140
+ class Config:
141
+ levels: List[int]
142
+ dim: int | None = None
143
+ num_codebooks: int = 1
144
+ keep_num_codebooks_dim: bool | None = None
145
+ scale: float | None = None
146
+ allowed_dtypes: Tuple[str, ...] = ('float32', 'float64')
147
+ channel_first: bool = False
148
+ projection_has_bias: bool = True
149
+ return_indices: bool = True
150
+ force_quantization_f32: bool = True
151
+ use_rms: bool = False
152
+
153
+ def __init__(self, c: Config):
154
+ super().__init__()
155
+ _levels = T.tensor(c.levels, dtype=int32)
156
+ self.register_buffer("_levels", _levels, persistent = False)
157
+
158
+ _basis = T.cumprod(T.tensor([1] + c.levels[:-1]), dim=0, dtype=int32)
159
+ self.register_buffer("_basis", _basis, persistent = False)
160
+
161
+ self.scale = c.scale
162
+
163
+ codebook_dim = len(c.levels)
164
+ self.codebook_dim = codebook_dim
165
+
166
+ effective_codebook_dim = codebook_dim * c.num_codebooks
167
+ self.num_codebooks = c.num_codebooks
168
+
169
+ self.allowed_dtypes = []
170
+ for dtype_str in c.allowed_dtypes:
171
+ if hasattr(T, dtype_str):
172
+ self.allowed_dtypes.append(getattr(T, dtype_str))
173
+ else:
174
+ raise ValueError(f"Invalid dtype string: {dtype_str}")
175
+
176
+ self.effective_codebook_dim = effective_codebook_dim
177
+
178
+ keep_num_codebooks_dim = default(c.keep_num_codebooks_dim, c.num_codebooks > 1)
179
+ assert not (c.num_codebooks > 1 and not keep_num_codebooks_dim)
180
+ self.keep_num_codebooks_dim = keep_num_codebooks_dim
181
+
182
+ self.dim = default(c.dim, len(_levels) * c.num_codebooks)
183
+
184
+ self.channel_first = c.channel_first
185
+
186
+ has_projections = self.dim != effective_codebook_dim
187
+ self.project_in = nn.Linear(self.dim, effective_codebook_dim, bias = c.projection_has_bias) if has_projections else nn.Identity()
188
+ self.project_out = nn.Linear(effective_codebook_dim, self.dim, bias = c.projection_has_bias) if has_projections else nn.Identity()
189
+
190
+ self.has_projections = has_projections
191
+
192
+ self.return_indices = c.return_indices
193
+ if c.return_indices:
194
+ self.codebook_size = self._levels.prod().item()
195
+ implicit_codebook = self._indices_to_codes(T.arange(self.codebook_size))
196
+ self.register_buffer("implicit_codebook", implicit_codebook, persistent = False)
197
+
198
+ # (self.allowed_dtypes was already converted to torch dtypes above)
199
+ self.force_quantization_f32 = c.force_quantization_f32
200
+
201
+ self.latent_loss = None
202
+
203
+ def latent_metric(self, codes, get_global=False):
204
+ return {'code_util_estimate': get_code_utilization(codes, self.codebook_size, get_global)}
205
+
206
+ def repr_from_latent(self, latent):
207
+ return self.indices_to_codes(latent)
208
+
209
+ def bound(self, z, eps: float = 1e-3):
210
+ """ Bound `z`, an array of shape (..., d). """
211
+ half_l = (self._levels - 1) * (1 + eps) / 2
212
+ offset = T.where(self._levels % 2 == 0, 0.5, 0.0)
213
+ shift = (offset / half_l).atanh()
214
+ return (z + shift).tanh() * half_l - offset
215
+
216
+ def quantize(self, z):
217
+ """ Quantizes z, returns quantized zhat, same shape as z. """
218
+ quantized = round_ste(self.bound(z))
219
+ half_width = self._levels // 2 # Renormalize to [-1, 1].
220
+ return quantized / half_width
221
+
222
+ def _scale_and_shift(self, zhat_normalized):
223
+ half_width = self._levels // 2
224
+ return (zhat_normalized * half_width) + half_width
225
+
226
+ def _scale_and_shift_inverse(self, zhat):
227
+ half_width = self._levels // 2
228
+ return (zhat - half_width) / half_width
229
+
230
+ def _indices_to_codes(self, indices):
231
+ level_indices = self.indices_to_level_indices(indices)
232
+ codes = self._scale_and_shift_inverse(level_indices)
233
+ return codes
234
+
235
+ def codes_to_indices(self, zhat):
236
+ """ Converts a `code` to an index in the codebook. """
237
+ assert zhat.shape[-1] == self.codebook_dim
238
+ zhat = self._scale_and_shift(zhat)
239
+ return (zhat * self._basis).sum(dim=-1).to(int32)
240
+
241
+ def indices_to_level_indices(self, indices):
242
+ """ Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
243
+ indices = rearrange(indices, '... -> ... 1')
244
+ codes_non_centered = (indices // self._basis) % self._levels
245
+ return codes_non_centered
246
+
247
+ def indices_to_codes(self, indices):
248
+ """ Inverse of `codes_to_indices`. """
249
+ assert exists(indices)
250
+
251
+ is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
252
+
253
+ codes = self._indices_to_codes(indices)
254
+
255
+ if self.keep_num_codebooks_dim:
256
+ codes = rearrange(codes, '... c d -> ... (c d)')
257
+
258
+ codes = self.project_out(codes)
259
+
260
+ if is_img_or_video or self.channel_first:
261
+ codes = rearrange(codes, 'b ... d -> b d ...')
262
+
263
+ return codes
264
+
265
+ # @autocast(device_type='cuda', enabled = False)
266
+ def forward(self, z, return_codes=False):
267
+ """
268
+ einstein notation
269
+ b - batch
270
+ n - sequence (or flattened spatial dimensions)
271
+ d - feature dimension
272
+ c - number of codebooks
273
+ """
274
+
275
+ is_img_or_video = z.ndim >= 4
276
+ need_move_channel_last = is_img_or_video or self.channel_first
277
+
278
+ # standardize image or video into (batch, seq, dimension)
279
+
280
+ if need_move_channel_last:
281
+ z = rearrange(z, 'b d ... -> b ... d')
282
+ z, ps = pack_one(z, 'b * d')
283
+
284
+ assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'
285
+
286
+ z = self.project_in(z)
287
+
288
+ z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
289
+
290
+ # whether to force quantization step to be full precision or not
291
+
292
+ force_f32 = self.force_quantization_f32
293
+ quantization_context = partial(autocast, device_type='cuda', enabled = False) if force_f32 else nullcontext
294
+
295
+ with quantization_context():
296
+ orig_dtype = z.dtype
297
+
298
+ if force_f32 and orig_dtype not in self.allowed_dtypes:
299
+ z = z.float()
300
+
301
+ codes = self.quantize(z)
302
+
303
+ # returning indices could be optional
304
+
305
+ indices = None
306
+
307
+ if self.return_indices:
308
+ indices = self.codes_to_indices(codes)
309
+
310
+ codes = rearrange(codes, 'b n c d -> b n (c d)')
311
+
312
+ codes = codes.type(orig_dtype)
313
+
314
+ # project out
315
+ if return_codes:
316
+ return codes, indices
317
+
318
+ out = self.project_out(codes)
319
+
320
+ # reconstitute image or video dimensions
321
+
322
+ if need_move_channel_last:
323
+ out = unpack_one(out, ps, 'b * d')
324
+ out = rearrange(out, 'b ... d -> b d ...')
325
+
326
+ indices = maybe(unpack_one)(indices, ps, 'b * c')
327
+
328
+ if not self.keep_num_codebooks_dim and self.return_indices:
329
+ indices = maybe(rearrange)(indices, '... 1 -> ...')
330
+
331
+ # return quantized output and indices
332
+
333
+ return out, indices
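For orientation, a minimal round-trip sketch of the FSQ block above; the config values are the ones get_hertz_dev_config passes in model.py below, and the (batch, seq, dim) shapes follow the convention assumed by forward:

import torch as T
from ioblocks import FSQ

fsq = FSQ.Config(
    levels=[8, 8, 8, 8, 8], dim=2048, num_codebooks=1,
    keep_num_codebooks_dim=None, scale=None,
    allowed_dtypes=['float32', 'float64', 'bfloat16'],
    channel_first=False, projection_has_bias=True,
    return_indices=True, force_quantization_f32=True, use_rms=False,
)()  # si_module configs are callable and return the constructed module

z = T.randn(2, 10, 2048)                 # (batch, seq, dim)
out, indices = fsq(z)                    # quantized output + flat codebook indices
codes = fsq.indices_to_codes(indices)    # inverse mapping, back to the projected space
assert out.shape == z.shape and indices.max() < 8 ** 5   # codebook size is prod(levels)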
model.py ADDED
@@ -0,0 +1,443 @@
1
+ from typing import Optional, Tuple
2
+
3
+ import torch as T
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from ioblocks import GaussianMixtureIOLayer, FSQ
8
+
9
+ from transformer import Stack, ShapeRotator, Block as PerfBlock, GPTOutput, CACHE_FILL_VALUE, FFNN, Norm
10
+ from tokenizer import make_tokenizer
11
+
12
+
13
+ from utils import si_module, exists, isnt, tqdm0, print0, default, print0_colored
14
+ from utils import load_ckpt
15
+
16
+
17
+ @si_module
18
+ class LatentQuantizer(nn.Module):
19
+ class Config:
20
+ compressor_config: Optional[FSQ.Config] = None
21
+
22
+ dim: Optional[int] = None
23
+ ff_dim: Optional[int] = None
24
+ input_dim: int = None
25
+
26
+ from_pretrained: Optional[Tuple[str, str]] = None
27
+
28
+ def __init__(self, c: Config):
29
+ super().__init__()
30
+
31
+ if exists(c.from_pretrained):
32
+ checkpoint = load_ckpt(*c.from_pretrained)
33
+ else:
34
+ assert exists(c.compressor_config), f'compressor_config is required when from_pretrained is not set: {c}'
35
+
36
+ self.compressor = c.compressor_config()
37
+ self.ffnn = FFNN(c.dim, c.ff_dim)
38
+ self.input = nn.Linear(c.input_dim, c.dim) if exists(c.input_dim) else nn.Identity()
39
+
40
+ if exists(c.from_pretrained):
41
+ self.load_state_dict(checkpoint)
42
+
43
+ @T.no_grad()
44
+ def forward(self, x, return_latent=False, known_latent=None):
45
+ """
46
+ x: (B, S, D)
47
+ """
48
+ if exists(known_latent):
49
+ return self.compressor.indices_to_codes(known_latent)
50
+
51
+ x = self.input(x)
52
+ x = self.ffnn(x)
53
+ x, tokens = self.compressor(x)
54
+
55
+ if return_latent:
56
+ return x, tokens
57
+ return x
58
+
59
+
60
+ @si_module
61
+ class TransformerVAE(nn.Module):
62
+ class Config:
63
+ io_config: Optional[GaussianMixtureIOLayer.Config] = None
64
+ stack_config: Optional[Stack.Config] = None
65
+ quantizer_config: Optional[LatentQuantizer.Config] = None
66
+
67
+ plex_layer: int = None
68
+ plex_roll: int = 1
69
+ split: bool = True
70
+
71
+ from_pretrained: Optional[Tuple[str, str]] = None
72
+
73
+ def __init__(self, c: Config):
74
+ super().__init__()
75
+
76
+ if exists(c.from_pretrained):
77
+ checkpoint = load_ckpt(*c.from_pretrained)
78
+ else:
79
+ assert (exists(c.io_config) and exists(c.stack_config) and exists(c.quantizer_config)), f'io_config, stack_config and quantizer_config are required when from_pretrained is not set: {c}'
80
+
81
+ self.io = c.io_config()
82
+ self.stack = c.stack_config()
83
+
84
+ self.plex_layer = c.stack_config.layers//2
85
+ self.plex_roll = c.plex_roll
86
+ self.plex_dim = c.quantizer_config.dim
87
+
88
+ assert self.plex_dim is not None and c.stack_config.dim is not None, f'One of the following are None: self.plex_dim: {self.plex_dim}, c.stack_config.dim: {c.stack_config.dim}'
89
+ self.plex_projection = nn.Linear(self.plex_dim, c.stack_config.dim)
90
+ self.out_norm = Norm(c.stack_config.dim)
91
+
92
+ if c.split:
93
+ self.io2 = c.io_config()
94
+ self.plex_projection2 = nn.Linear(self.plex_dim, c.stack_config.dim)
95
+
96
+ self.io2.fc_loc = None
97
+ self.io2.fc_scale = None
98
+ self.io2.fc_weight = None
99
+
100
+ kv_heads = c.stack_config.kv_heads or c.stack_config.n_head
101
+ head_dim = c.stack_config.dim // c.stack_config.n_head
102
+ self.cache_num_layers = c.stack_config.layers + ((c.stack_config.layers - self.plex_layer) if c.split else 0)
103
+ cache_shape = [self.cache_num_layers, c.stack_config.seq_len, 2, kv_heads, head_dim]
104
+ self.cache_shape = cache_shape
105
+ self.cache = [None] * self.cache_num_layers
106
+
107
+ if exists(c.from_pretrained):
108
+ result = self.load_state_dict(checkpoint, strict=False)
109
+ print0_colored(result, 'yellow')
110
+
111
+ self.quantizer = c.quantizer_config().eval()
112
+ self.quantizer.requires_grad_(False)
113
+
114
+ @T.no_grad()
115
+ def quantize(self, x):
116
+ if self.c.split:
117
+ x1, x2 = x.chunk(2, dim=-1)
118
+ with T.autocast(device_type='cuda', dtype=T.bfloat16):
119
+ quantized1 = self.quantizer(x1)
120
+ quantized2 = self.quantizer(x2)
121
+ return quantized1, quantized2
122
+ else:
123
+ with T.autocast(device_type='cuda', dtype=T.bfloat16):
124
+ return self.quantizer(x)
125
+
126
+ @T.no_grad()
127
+ def untokenize(self, token_data):
128
+ return self.quantizer(None, known_latent=token_data)
129
+
130
+ def init_cache(self, bsize, device, dtype, length:int=None):
131
+ cache_shape = self.cache_shape.copy()
132
+ cache_shape[1] = length or cache_shape[1]
133
+ self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
134
+
135
+ def deinit_cache(self):
136
+ self.cache = [None] * self.cache_num_layers
137
+
138
+ @T.no_grad()
139
+ def forward(self, data, next_tokens: Optional[Tuple[T.Tensor, T.Tensor]] = None, temps: Optional[Tuple[float, Tuple[float, float]]] = None):
140
+ if self.c.split:
141
+ x1, x2 = data.chunk(2, dim=-1)
142
+ x = self.io.input(x1) + self.io2.input(x2)
143
+ else:
144
+ x = self.io.input(data)
145
+
146
+ cache_idx = 0
147
+ for l, layer in enumerate(self.stack.layers):
148
+ if l == self.plex_layer:
149
+ if self.c.split:
150
+ plex1, plex2 = self.quantize(data)
151
+ plex1 = T.roll(plex1, -self.c.plex_roll, dims=1)
152
+ plex2 = T.roll(plex2, -self.c.plex_roll, dims=1)
153
+ if exists(next_tokens):
154
+ plex1[:, -1:] = self.untokenize(next_tokens[0])
155
+ plex2[:, -1:] = self.untokenize(next_tokens[1])
156
+ x1 = x + self.plex_projection(plex1)
157
+ x2 = x + self.plex_projection2(plex2)
158
+ else:
159
+ plex = self.quantize(data)
160
+ plex = T.roll(plex, -self.c.plex_roll, dims=1)
161
+ if exists(next_tokens):
162
+ plex[:, -1:] = self.untokenize(next_tokens)
163
+ x = x + self.plex_projection(plex)
164
+
165
+ if l < self.plex_layer:
166
+ x = layer(x, kv=self.cache[l])
167
+ else:
168
+ if self.c.split:
169
+ x1 = layer(x1, kv=self.cache[self.plex_layer + cache_idx])
170
+ cache_idx += 1
171
+ x2 = layer(x2, kv=self.cache[self.plex_layer + cache_idx])
172
+ cache_idx += 1
173
+ else:
174
+ x = layer(x, kv=self.cache[l])
175
+
176
+ with T.autocast(device_type='cuda', dtype=T.bfloat16):
177
+ if self.c.split:
178
+ x1, x2 = self.out_norm(x1), self.out_norm(x2)
179
+ out1, out2 = self.io.output(x1), self.io.output(x2)
180
+ else:
181
+ x = self.out_norm(x)
182
+ out = self.io.output(x)
183
+
184
+ if isnt(temps):
185
+ if self.c.split:
186
+ return out1, out2
187
+ else:
188
+ return out
189
+ else:
190
+ if self.c.split:
191
+ next_data1 = self.io.temp_sample(out1, temps)[:, -1:, :]
192
+ next_data2 = self.io2.temp_sample(out2, temps)[:, -1:, :]
193
+ next_data = T.cat([next_data1, next_data2], dim=-1)
194
+ return next_data
195
+ else:
196
+ next_data = self.io.temp_sample(out, temps)[:, -1:, :]
197
+ return next_data
198
+
199
+ @si_module
200
+ class HertzDevModel(nn.Module):
201
+ class Config:
202
+ dim: int
203
+ vocab_size: int
204
+ stack_config: Optional[Stack.Config] = None
205
+ latent_size: int = 32
206
+
207
+ split: bool = True
208
+
209
+ quantizer_config: Optional[LatentQuantizer.Config] = None
210
+ resynthesizer_config: Optional[TransformerVAE.Config] = None
211
+
212
+ from_pretrained: Optional[Tuple[str, str]] = None
213
+
214
+ def __init__(self, c: Config):
215
+ super().__init__()
216
+
217
+ if exists(c.from_pretrained):
218
+ checkpoint = load_ckpt(*c.from_pretrained)
219
+ else:
220
+ assert exists(c.stack_config), f'stack_config is required when from_pretrained is not set: {c}'
221
+
222
+ self.input = nn.Linear(c.latent_size, c.dim)
223
+ if self.c.split:
224
+ self.input2 = nn.Linear(c.latent_size, c.dim)
225
+
226
+ self.shape_rotator = ShapeRotator(c.stack_config.dim//c.stack_config.n_head, c.stack_config.seq_len, theta=c.stack_config.theta)
227
+
228
+ self.layers = nn.ModuleList([
229
+ PerfBlock(
230
+ dim=c.stack_config.dim,
231
+ layer_id=l,
232
+ n_head=c.stack_config.n_head,
233
+ kv_heads=c.stack_config.kv_heads,
234
+ ff_dim=c.stack_config.ff_dim,
235
+ eps=c.stack_config.eps,
236
+ shape_rotator=self.shape_rotator,
237
+ ) for l in range(c.stack_config.layers)
238
+ ])
239
+
240
+ self.output = GPTOutput(c.dim, c.vocab_size)
241
+ if self.c.split:
242
+ self.output2 = GPTOutput(c.dim, c.vocab_size)
243
+
244
+ self.cache = [None] * c.stack_config.layers
245
+ self.kv_heads = c.stack_config.kv_heads or c.stack_config.n_head
246
+ self.head_dim = c.stack_config.dim // c.stack_config.n_head
247
+
248
+ if exists(c.from_pretrained):
249
+ result = self.load_state_dict(checkpoint, strict=False)
250
+ print0_colored(result, 'yellow')
251
+
252
+ self.resynthesizer = c.resynthesizer_config().eval()
253
+ self.resynthesizer.requires_grad_(False)
254
+
255
+ self.audio_tokenizer = make_tokenizer(device='cpu')
256
+ self.audio_cache = None
257
+ self.audio_latent_cache = None
258
+ self.use_audio_cache = False
259
+
260
+ @T.no_grad()
261
+ def tokenize(self, audio_data):
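+ # audio_data: (B, C, T) waveform at 16 kHz; the audio tokenizer emits one latent per 2000 samples,
+ # and the 6*16_000 cache below keeps roughly 6 s of audio context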
262
+ orig_audio_shape = audio_data.shape
263
+ if exists(self.audio_cache):
264
+ audio_data = T.cat([self.audio_cache, audio_data], dim=-1)
265
+ self.audio_cache = audio_data[..., -(6*16_000):]
266
+ elif self.use_audio_cache:
267
+ self.audio_cache = audio_data[..., -(6*16_000):]
268
+
269
+ if audio_data.shape[1] == 2:
270
+ enc_ch1 = self.audio_tokenizer.latent_from_data(audio_data[:, 0:1])
271
+ enc_ch2 = self.audio_tokenizer.latent_from_data(audio_data[:, 1:2])
272
+ return T.cat([enc_ch1, enc_ch2], dim=-1)[:, -(orig_audio_shape[-1]//2000):]
273
+ else:
274
+ return self.audio_tokenizer.latent_from_data(audio_data)[:, -(orig_audio_shape[-1]//2000):]
275
+
276
+ @T.no_grad()
277
+ def untokenize(self, token_data):
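+ # token_data: (B, S, latent) latents; at 8 latents per second, the 6*8 cache below
+ # mirrors the ~6 s audio cache kept by tokenize()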
278
+ if exists(self.audio_latent_cache):
279
+ token_data = T.cat([self.audio_latent_cache, token_data], dim=1)
280
+ self.audio_latent_cache = token_data[:, -(6*8):]
281
+ elif self.use_audio_cache:
282
+ self.audio_latent_cache = token_data[:, -(6*8):]
283
+
284
+ if token_data.shape[-1] == 2*self.c.latent_size:
285
+ dec_ch1 = self.audio_tokenizer.data_from_latent(token_data[:, :self.c.latent_size])
286
+ dec_ch2 = self.audio_tokenizer.data_from_latent(token_data[:, self.c.latent_size:])
287
+ return T.cat([dec_ch1, dec_ch2], dim=1)[..., -(token_data.shape[1]*2000):]
288
+ else:
289
+ return self.audio_tokenizer.data_from_latent(token_data)[..., -(token_data.shape[1]*2000):]
290
+
291
+ def init_cache(self, bsize, device, dtype, length:int=None):
292
+ cache_shape = [self.c.stack_config.layers, length or self.c.stack_config.seq_len, 2, self.kv_heads, self.head_dim]
293
+ self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
294
+ self.resynthesizer.init_cache(bsize, device, dtype, length)
295
+ self.use_audio_cache = True
296
+
297
+ def deinit_cache(self):
298
+ self.cache = [None] * len(self.layers)
299
+ self.resynthesizer.deinit_cache()
300
+ self.audio_cache = None
301
+ self.audio_latent_cache = None
302
+ self.use_audio_cache = False
303
+
304
+ @T.no_grad()
305
+ def forward(self, data):
306
+ if self.c.split:
307
+ x1, x2 = data.chunk(2, dim=-1)
308
+ x = self.input(x1) + self.input2(x2)
309
+ else:
310
+ x = self.input(data)
311
+
312
+ for l, layer in enumerate(self.layers):
313
+ x = layer(x, kv=self.cache[l])
314
+
315
+ if self.c.split:
316
+ return self.output(x), self.output2(x)
317
+ else:
318
+ return self.output(x)
319
+
320
+ @T.no_grad()
321
+ def next_audio_from_audio(self, audio_data: T.Tensor, temps=(0.8, (0.5, 0.1))):
322
+ latents_in = self.tokenize(audio_data)
323
+ next_latents = self.next_latent(latents_in, temps)
324
+ next_model_latent = next_latents[..., self.c.latent_size:]
325
+ audio_decoded = self.untokenize(next_model_latent)[..., -2000:]
326
+ return audio_decoded
327
+
328
+
329
+ @T.no_grad()
330
+ def next_latent(self, model_input: T.Tensor, temps=(0.8, (0.5, 0.1))):
331
+
332
+ if self.c.split:
333
+ logits1, logits2 = self.forward(model_input)
334
+ next_logits1 = logits1[:, -1]
335
+ next_logits2 = logits2[:, -1]
336
+ next_token1 = F.softmax(next_logits1 / temps[0], dim=-1).multinomial(1)
337
+ next_token2 = F.softmax(next_logits2 / temps[0], dim=-1).multinomial(1)
338
+
339
+ next_input = self.resynthesizer(model_input, next_tokens=(next_token1, next_token2), temps=temps[1])
340
+ else:
341
+ logits = self.forward(model_input)
342
+ next_logits = logits[:, -1]
343
+ next_token = F.softmax(next_logits / temps[0], dim=-1).multinomial(1)
344
+
345
+ next_input = self.resynthesizer(model_input, next_tokens=next_token, temps=temps[1])
346
+
347
+ return next_input
348
+
349
+
350
+ @T.no_grad()
351
+ def completion(self, data: T.Tensor, temps=(0.8, (0.5, 0.1)), gen_len=None, use_cache=True) -> T.Tensor:
352
+ """
353
+ only accepts latent-space data.
354
+ """
355
+ if use_cache:
356
+ self.init_cache(data.shape[0], data.device, T.bfloat16)
357
+
358
+ next_input = generated = data
359
+
360
+ target_len = min(data.shape[1] + default(gen_len, data.shape[1]), self.c.stack_config.seq_len)
361
+
362
+ for _ in tqdm0(range(data.shape[1], target_len)):
363
+ model_input = next_input if use_cache else generated
364
+
365
+ next_input = self.next_latent(model_input, temps)
366
+
367
+ generated = T.cat([generated, next_input], dim=1)
368
+
369
+ if use_cache:
370
+ self.deinit_cache()
371
+ return generated
372
+
373
+
374
+
375
+ def get_hertz_dev_config(is_split=True, use_pure_audio_ablation=False):
376
+ if is_split:
377
+ checkpoints = [('inference_care_50000', 'e4ff4fe5c7e9f066410d2a5673b7a935'), ('inference_scion_54000', 'cb8bc484423922747b277ebc2933af5d')]
378
+ elif not use_pure_audio_ablation:
379
+ checkpoints = [('inference_whip_72000', '5e7cee7316900737d55fc5d44cc7a8f7'), ('inference_caraway_112000', 'fcb8368ef8ebf7712f3e31e6856da580')]
380
+ else:
381
+ checkpoints = [('inference_whip_72000', '5e7cee7316900737d55fc5d44cc7a8f7'), ('inference_syrup_110000', '353c48f553f1706824c11f3bb6a049e9')]
382
+
383
+ quantizer_config=LatentQuantizer.Config(
384
+ from_pretrained=('inference_volcano_3', 'd42bf674022c5f84b051d5d7794f6169'),
385
+ compressor_config=FSQ.Config(
386
+ levels=[8,8,8,8,8],
387
+ dim=2048,
388
+ num_codebooks=1,
389
+ keep_num_codebooks_dim=None,
390
+ scale=None,
391
+ allowed_dtypes=['float32', 'float64', 'bfloat16'],
392
+ channel_first=False,
393
+ projection_has_bias=True,
394
+ return_indices=True,
395
+ force_quantization_f32=True,
396
+ use_rms=False
397
+ ),
398
+ dim=2048,
399
+ ff_dim=8192,
400
+ input_dim=32
401
+ )
402
+
403
+ resynthesizer_config=TransformerVAE.Config(
404
+ io_config=GaussianMixtureIOLayer.Config(
405
+ latent_dim=32,
406
+ dim=4096,
407
+ num_components=8,
408
+ ),
409
+ stack_config=Stack.Config(
410
+ layers=8,
411
+ dim=4096,
412
+ seq_len=8192,
413
+ n_head=16,
414
+ ff_dim=11008,
415
+ kv_heads=16,
416
+ eps=1e-5,
417
+ theta=10_000
418
+ ),
419
+ quantizer_config=quantizer_config,
420
+ plex_layer=None,
421
+ plex_roll=1,
422
+ split=is_split,
423
+ from_pretrained=checkpoints[0],
424
+ )
425
+
426
+ return HertzDevModel.Config(
427
+ dim=4096,
428
+ vocab_size=32_768,
429
+ stack_config=Stack.Config(
430
+ layers=32,
431
+ dim=4096,
432
+ seq_len=2048,
433
+ n_head=32,
434
+ ff_dim=None,
435
+ kv_heads=None,
436
+ eps=1e-5,
437
+ theta=10_000,
438
+ ),
439
+ quantizer_config=quantizer_config,
440
+ resynthesizer_config=resynthesizer_config,
441
+ split=is_split,
442
+ from_pretrained=checkpoints[1],
443
+ )
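A minimal sketch of how these configs are intended to be used; Config objects are callable (see utils/blocks.py), and calling one downloads the listed checkpoints from the si-pbc/hertz-dev repo:

from model import get_hertz_dev_config

config = get_hertz_dev_config(is_split=True)   # builds the config tree only
model = config().eval()                        # downloads and loads the pretrained weights

print(model.c.vocab_size)                        # 32768 tokens per channel (8**5 FSQ codes)
print(model.audio_tokenizer.tokens_per_second)   # 16000 / 2000 = 8.0 latents per second

# Generation then goes through completion() on latent-space data, or
# next_audio_from_audio() for one 2000-sample (1/8 s) chunk at a time.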
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ torch==2.5.1
2
+ torchaudio==2.5.1
3
+ einops==0.8.0
4
+ tqdm==4.66.6
5
+ ipython==8.18.1
6
+ numpy==1.26.3
7
+ soundfile==0.12.1
8
+ websockets==13.1
9
+ requests==2.32.3
10
+ sounddevice==0.5.1
11
+ matplotlib==3.9.2
12
+ fastapi==0.115.4
13
+ uvicorn==0.32.0
14
+ huggingface-hub[hf_transfer]==0.26.2
15
+ IProgress==0.4
requirements_webrtc.txt ADDED
@@ -0,0 +1,2 @@
1
+ streamlit==1.33.0
2
+ streamlit-webrtc==0.47.9
tokenizer.py ADDED
@@ -0,0 +1,581 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Union, Tuple, Literal
4
+
5
+ import torch as T
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+
9
+ from utils import load_ckpt
10
+ from utils.interp import print_colored
11
+ from utils import si_module, get_activation
12
+
13
+
14
+
15
+ # Adapted from https://github.com/facebookresearch/AudioDec
16
+
17
+ def Conv1d1x1(in_channels, out_channels, bias=True):
18
+ return nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias)
19
+
20
+
21
+ class NonCausalConv1d(nn.Module):
22
+ """1D noncausal convolution w/ 2-sides padding."""
23
+
24
+ def __init__(
25
+ self,
26
+ in_channels,
27
+ out_channels,
28
+ kernel_size,
29
+ stride=1,
30
+ padding=-1,
31
+ dilation=1,
32
+ groups=1,
33
+ bias=True):
34
+ super().__init__()
35
+ self.in_channels = in_channels
36
+ self.out_channels = out_channels
37
+ self.kernel_size = kernel_size
38
+ if padding < 0:
39
+ padding = (kernel_size - 1) // 2 * dilation
40
+ self.dilation = dilation
41
+ self.conv = nn.Conv1d(
42
+ in_channels=in_channels,
43
+ out_channels=out_channels,
44
+ kernel_size=kernel_size,
45
+ stride=stride,
46
+ padding=padding,
47
+ dilation=dilation,
48
+ groups=groups,
49
+ bias=bias,
50
+ )
51
+
52
+ def forward(self, x):
53
+ """
54
+ Args:
55
+ x (Tensor): Float tensor variable with the shape (B, C, T).
56
+ Returns:
57
+ Tensor: Float tensor variable with the shape (B, C, T).
58
+ """
59
+ x = self.conv(x)
60
+ return x
61
+
62
+
63
+ class NonCausalConvTranspose1d(nn.Module):
64
+ """1D noncausal transpose convolution."""
65
+
66
+ def __init__(
67
+ self,
68
+ in_channels,
69
+ out_channels,
70
+ kernel_size,
71
+ stride,
72
+ padding=-1,
73
+ output_padding=-1,
74
+ groups=1,
75
+ bias=True,
76
+ ):
77
+ super().__init__()
78
+ if padding < 0:
79
+ padding = (stride+1) // 2
80
+ if output_padding < 0:
81
+ output_padding = 1 if stride % 2 else 0
82
+ self.deconv = nn.ConvTranspose1d(
83
+ in_channels=in_channels,
84
+ out_channels=out_channels,
85
+ kernel_size=kernel_size,
86
+ stride=stride,
87
+ padding=padding,
88
+ output_padding=output_padding,
89
+ groups=groups,
90
+ bias=bias,
91
+ )
92
+
93
+ def forward(self, x):
94
+ """
95
+ Args:
96
+ x (Tensor): Float tensor variable with the shape (B, C, T).
97
+ Returns:
98
+ Tensor: Float tensor variable with the shape (B, C', T').
99
+ """
100
+ x = self.deconv(x)
101
+ return x
102
+
103
+
104
+ class CausalConv1d(NonCausalConv1d):
105
+ def __init__(
106
+ self,
107
+ in_channels,
108
+ out_channels,
109
+ kernel_size,
110
+ stride=1,
111
+ dilation=1,
112
+ groups=1,
113
+ bias=True
114
+ ):
115
+ super(CausalConv1d, self).__init__(
116
+ in_channels=in_channels,
117
+ out_channels=out_channels,
118
+ kernel_size=kernel_size,
119
+ stride=stride,
120
+ padding=0,
121
+ dilation=dilation,
122
+ groups=groups,
123
+ bias=bias,
124
+ )
125
+ self.stride = stride
126
+ self.pad_length = (kernel_size - 1) * dilation
127
+ def forward(self, x):
128
+ pad = nn.ConstantPad1d((self.pad_length, 0), 0.0)
129
+ x = pad(x)
130
+ return self.conv(x)
131
+
132
+
133
+ class CausalConvTranspose1d(NonCausalConvTranspose1d):
134
+ def __init__(
135
+ self,
136
+ in_channels,
137
+ out_channels,
138
+ kernel_size,
139
+ stride,
140
+ bias=True,
141
+ pad_buffer=None,
142
+ ):
143
+ super(CausalConvTranspose1d, self).__init__(
144
+ in_channels=in_channels,
145
+ out_channels=out_channels,
146
+ kernel_size=kernel_size,
147
+ stride=stride,
148
+ padding=0,
149
+ output_padding=0,
150
+ bias=bias,
151
+ )
152
+ self.stride = stride
153
+ self.pad_length = (math.ceil(kernel_size/stride) - 1)
154
+ if pad_buffer is None:
155
+ pad_buffer = T.zeros(1, in_channels, self.pad_length)
156
+ self.register_buffer("pad_buffer", pad_buffer)
157
+
158
+ def forward(self, x):
159
+ pad = nn.ReplicationPad1d((self.pad_length, 0))
160
+ x = pad(x)
161
+ return self.deconv(x)[:, :, self.stride : -self.stride]
162
+
163
+ def inference(self, x):
164
+ x = T.cat((self.pad_buffer, x), -1)
165
+ self.pad_buffer = x[:, :, -self.pad_length:]
166
+ return self.deconv(x)[:, :, self.stride : -self.stride]
167
+
168
+ def reset_buffer(self):
169
+ self.pad_buffer.zero_()
170
+
171
+
172
+ class NonCausalResUnit(nn.Module):
173
+ def __init__(
174
+ self,
175
+ in_channels,
176
+ out_channels,
177
+ kernel_size=7,
178
+ dilation=1,
179
+ bias=False,
180
+ ):
181
+ super().__init__()
182
+ self.activation = nn.ELU()
183
+ self.conv1 = NonCausalConv1d(
184
+ in_channels=in_channels,
185
+ out_channels=out_channels,
186
+ kernel_size=kernel_size,
187
+ stride=1,
188
+ dilation=dilation,
189
+ bias=bias,
190
+ )
191
+ self.conv2 = Conv1d1x1(out_channels, out_channels, bias)
192
+
193
+ def forward(self, x):
194
+ y = self.conv1(self.activation(x))
195
+ y = self.conv2(self.activation(y))
196
+ return x + y
197
+
198
+
199
+ class CausalResUnit(NonCausalResUnit):
200
+ def __init__(
201
+ self,
202
+ in_channels,
203
+ out_channels,
204
+ kernel_size=7,
205
+ dilation=1,
206
+ bias=False,
207
+ ):
208
+ super(CausalResUnit, self).__init__(
209
+ in_channels=in_channels,
210
+ out_channels=out_channels,
211
+ kernel_size=kernel_size,
212
+ dilation=dilation,
213
+ bias=bias,
214
+ )
215
+ self.conv1 = CausalConv1d(
216
+ in_channels=in_channels,
217
+ out_channels=out_channels,
218
+ kernel_size=kernel_size,
219
+ stride=1,
220
+ dilation=dilation,
221
+ bias=bias,
222
+ )
223
+
224
+ def inference(self, x):
225
+ y = self.conv1.inference(self.activation(x))
226
+ y = self.conv2(self.activation(y))
227
+ return x + y
228
+
229
+
230
+ class ResNetBlock(nn.Module):
231
+ def __init__(self,
232
+ in_channels,
233
+ out_channels,
234
+ stride,
235
+ kernel_size=7,
236
+ dilations=(1, 3, 9),
237
+ bias=True,
238
+ mode='encoder',
239
+ ):
240
+ super().__init__()
241
+ assert mode in ('encoder', 'decoder'), f"Mode ({mode}) is not supported!"
242
+
243
+ self.mode = mode
244
+ self.stride = stride
245
+
246
+ ConvUnit = CausalConv1d if mode == 'encoder' else CausalConvTranspose1d
247
+
248
+ res_channels = in_channels if mode == 'encoder' else out_channels
249
+
250
+ res_units = [CausalResUnit(
251
+ res_channels,
252
+ res_channels,
253
+ kernel_size=kernel_size,
254
+ dilation=dilation,
255
+ ) for dilation in dilations]
256
+
257
+ if in_channels == out_channels:
258
+ if mode == 'encoder':
259
+ self.pool = nn.AvgPool1d(kernel_size=stride, stride=stride)
260
+ if mode == 'decoder':
261
+ self.upsample = nn.Upsample(scale_factor=stride, mode='nearest')
262
+ conv_unit = nn.Conv1d(
263
+ in_channels=in_channels,
264
+ out_channels=out_channels,
265
+ kernel_size=1,
266
+ bias=bias,
267
+ ) if in_channels != out_channels else nn.Identity()
268
+ else:
269
+ conv_unit = ConvUnit(
270
+ in_channels=in_channels,
271
+ out_channels=out_channels,
272
+ kernel_size=(2 * stride),
273
+ stride=stride,
274
+ bias=bias,
275
+ )
276
+
277
+ if mode == 'encoder':
278
+ if in_channels == out_channels:
279
+ self.res_block = nn.Sequential(*res_units, self.pool, conv_unit)
280
+ else:
281
+ self.res_block = nn.Sequential(*res_units, conv_unit)
282
+ elif mode == 'decoder':
283
+ if in_channels == out_channels:
284
+ self.res_block = nn.Sequential(self.upsample, conv_unit, *res_units)
285
+ else:
286
+ self.res_block = nn.Sequential(conv_unit, *res_units)
287
+
288
+ def forward(self, x):
289
+ out = x
290
+ for unit in self.res_block:
291
+ out = unit(out)
292
+ return out
293
+
294
+ def inference(self, x):
295
+ for unit in self.res_block:
296
+ x = unit.inference(x)
297
+ return x
298
+
299
+
300
+
301
+
302
+ @si_module
303
+ class ResNetStack(nn.Module):
304
+ """
305
+ ResNet encoder or decoder stack. Channel ratios
306
+ and strides are given in their default order, from
307
+ the data/io layer toward the middle of the model.
308
+ """
309
+ class Config:
310
+ input_channels: int = 1
311
+ output_channels: int = 1
312
+ encode_channels: int = 32
313
+ decode_channel_multiplier: int = 1
314
+ latent_dim: int = None
315
+ kernel_size: int = 7
316
+ bias: bool = True
317
+ channel_ratios: Tuple[int, ...] = (2, 4, 8, 16)
318
+ strides: Tuple[int, ...] = (3, 4, 5, 5)
319
+ mode: Literal['encoder', 'decoder'] = 'encoder'
320
+
321
+ def __init__(self, c: Config):
322
+ super().__init__()
323
+ assert c.mode in ('encoder', 'decoder'), f"Mode ({c.mode}) is not supported!"
324
+
325
+ self.mode = c.mode
326
+
327
+ assert len(c.channel_ratios) == len(c.strides)
328
+ channel_ratios = (1,) + c.channel_ratios
329
+ strides = c.strides
330
+ self.middle_channels = c.encode_channels * channel_ratios[-1]
331
+ if c.mode == 'decoder':
332
+ channel_ratios = tuple(reversed(channel_ratios))
333
+ strides = tuple(reversed(strides))
334
+
335
+ self.multiplier = c.decode_channel_multiplier if c.mode == 'decoder' else 1
336
+ res_blocks = [ResNetBlock(
337
+ c.encode_channels * channel_ratios[s_idx] * self.multiplier,
338
+ c.encode_channels * channel_ratios[s_idx+1] * self.multiplier,
339
+ stride,
340
+ kernel_size=c.kernel_size,
341
+ bias=c.bias,
342
+ mode=c.mode,
343
+ ) for s_idx, stride in enumerate(strides)]
344
+
345
+ data_conv = CausalConv1d(
346
+ in_channels=c.input_channels if c.mode == 'encoder' else c.encode_channels * self.multiplier,
347
+ out_channels=c.encode_channels if c.mode == 'encoder' else c.output_channels,
348
+ kernel_size=c.kernel_size,
349
+ stride=1,
350
+ bias=False,
351
+ )
352
+
353
+ if c.mode == 'encoder':
354
+ self.res_stack = nn.Sequential(data_conv, *res_blocks)
355
+ elif c.mode == 'decoder':
356
+ self.res_stack = nn.Sequential(*res_blocks, data_conv)
357
+
358
+ if c.latent_dim is not None:
359
+ self.latent_proj = Conv1d1x1(self.middle_channels, c.latent_dim, bias=c.bias) if c.mode == 'encoder' else Conv1d1x1(c.latent_dim, self.middle_channels, bias=c.bias)
360
+ if self.multiplier != 1:
361
+ self.multiplier_proj = Conv1d1x1(self.middle_channels, self.middle_channels * self.multiplier, bias=c.bias)
362
+
363
+ def forward(self, x, return_feats=False):
364
+ if self.c.latent_dim is not None and self.mode == 'decoder':
365
+ x = self.latent_proj(x)
366
+ if self.multiplier != 1:
367
+ x = self.multiplier_proj(x)
368
+
369
+ feats = []
370
+ for block in self.res_stack:
371
+ x = block(x)
372
+ if return_feats:
373
+ feats.append(x)
374
+ if self.c.latent_dim is not None and self.mode == 'encoder':
375
+ x = self.latent_proj(x)
376
+ if return_feats:
377
+ feats.append(x)
378
+ if return_feats:
379
+ return feats
380
+ return x
381
+
382
+ def inference(self, x):
383
+ for block in self.res_stack:
384
+ x = block.inference(x)
385
+ return x
386
+
387
+ def reset_buffer(self):
388
+ def _reset_buffer(m):
389
+ if isinstance(m, CausalConv1d) or isinstance(m, CausalConvTranspose1d):
390
+ m.reset_buffer()
391
+ self.apply(_reset_buffer)
392
+
393
+ def reset_parameters(self):
394
+ def _reset_parameters(m):
395
+ if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
396
+ m.weight.data.normal_(0.0, 0.01)
397
+
398
+ self.apply(_reset_parameters)
399
+
400
+
401
+ def apply_weight_norm(self):
402
+ def _apply_weight_norm(m):
403
+ if isinstance(m, nn.Conv1d) or isinstance(
404
+ m, nn.ConvTranspose1d
405
+ ):
406
+ nn.utils.parametrizations.weight_norm(m)
407
+
408
+ self.apply(_apply_weight_norm)
409
+
410
+
411
+ def remove_weight_norm(self):
412
+ def _remove_weight_norm(m):
413
+ try:
414
+ print(m)
415
+ nn.utils.parametrize.remove_parametrizations(m, 'weight')
416
+ except ValueError: # this module didn't have weight norm
417
+ return
418
+
419
+ self.apply(_remove_weight_norm)
420
+
421
+
422
+
423
+ @si_module
424
+ class GaussianZ(nn.Module):
425
+ class Config:
426
+ dim: int
427
+ latent_dim: int
428
+ bias: bool = False
429
+ use_weight_norm: bool = False
430
+
431
+ def __init__(self, c: Config):
432
+ super().__init__()
433
+
434
+ self.proj_in = nn.Linear(c.dim, c.latent_dim * 2, bias=c.bias)
435
+ self.proj_out = nn.Linear(c.latent_dim, c.dim, bias=c.bias)
436
+
437
+ if c.use_weight_norm:
438
+ self.proj_in = weight_norm(self.proj_in)
439
+ self.proj_out = weight_norm(self.proj_out)
440
+
441
+ def reparam(self, mu, logvar):
442
+ std = T.exp(logvar / 2)
443
+ eps = T.randn_like(std)
444
+ return mu + eps * std
445
+
446
+ def kl_divergence(self, mu, logvar):
447
+ return T.mean(-0.5 * T.sum(
448
+ 1 + logvar - mu.pow(2) - logvar.exp(),
449
+ dim=(1, 2))
450
+ )
451
+
452
+ def repr_from_latent(self, latent: Union[dict, T.Tensor]):
453
+ if isinstance(latent, T.Tensor):
454
+ z = latent
455
+ else:
456
+ z = self.reparam(latent['mu'], latent['logvar'])
457
+ l = self.proj_out(z)
458
+ return l
459
+
460
+ def forward(self, x: T.Tensor) -> Tuple[T.Tensor, dict]:
461
+ mu, logvar = self.proj_in(x).chunk(2, dim=-1)
462
+ kl_div = self.kl_divergence(mu, logvar)
463
+ z = self.reparam(mu, logvar)
464
+ xhat = self.proj_out(z)
465
+ latent = {'mu': mu, 'logvar': logvar, 'z': z, 'kl_divergence': kl_div}
466
+ return xhat, latent
467
+
468
+
469
+
470
+ @si_module
471
+ class WaveCodec(nn.Module):
472
+ class Config:
473
+ resnet_config: ResNetStack.Config = None
474
+ sample_rate: int = 16_000
475
+ use_weight_norm: bool = False
476
+
477
+ compressor_config: dataclass = None
478
+
479
+ norm_stddev: float = 1.0
480
+
481
+ def __init__(self, c: Config):
482
+ super().__init__()
483
+ self.norm_stddev = c.norm_stddev
484
+ self.encoder = c.resnet_config(mode='encoder')
485
+ self.sample_rate = c.sample_rate
486
+
487
+ self.total_stride = 1
488
+ for stride in c.resnet_config.strides:
489
+ self.total_stride *= stride
490
+ self.tokens_per_second = self.sample_rate / self.total_stride
491
+
492
+ self.compressor = c.compressor_config(dim=self.encoder.middle_channels)
493
+
494
+ self.decoder = c.resnet_config(mode='decoder')
495
+
496
+ if c.use_weight_norm:
497
+ self.encoder.apply_weight_norm()
498
+ self.decoder.apply_weight_norm()
499
+ self.encoder.reset_parameters()
500
+ self.decoder.reset_parameters()
501
+
502
+ def encode(self, data):
503
+ return self.encoder(data/self.norm_stddev)
504
+
505
+ def decode(self, latent):
506
+ return self.decoder(latent.transpose(1, 2))*self.norm_stddev
507
+
508
+ @T.no_grad()
509
+ def latent_from_data(self, data, get_parameters=False):
510
+ x = self.encode(data)
511
+ l_in = x.transpose(1, 2)
512
+ l, latent = self.compressor(l_in)
513
+ return latent['z'] if not get_parameters else {
514
+ 'mu': latent['mu'],
515
+ 'logvar': latent['logvar'],
516
+ 'z': latent['z'],
517
+ }
518
+
519
+ @T.no_grad()
520
+ def data_from_latent(self, latent):
521
+ l = self.compressor.repr_from_latent(latent)
522
+ x = self.decode(l)
523
+ return x
524
+
525
+ def process(self, x):
526
+ return self.latent_from_data(x)
527
+
528
+ def unprocess(self, latent):
529
+ return self.data_from_latent(latent)
530
+
531
+ def forward(self, audio_input):
532
+ x = self.encode(audio_input)
533
+
534
+ l_in = x.transpose(1, 2)
535
+ l, latent = self.compressor(l_in)
536
+
537
+ xhat = self.decode(l)
538
+ return xhat, latent
539
+
540
+
541
+
542
+ def make_tokenizer(device='cuda'):
543
+ generator_config = WaveCodec.Config(
544
+ resnet_config=ResNetStack.Config(
545
+ input_channels=1,
546
+ output_channels=1,
547
+ encode_channels=16,
548
+ decode_channel_multiplier=4,
549
+ kernel_size=7,
550
+ bias=True,
551
+ channel_ratios=(4, 8, 16, 16, 16, 16),
552
+ strides=(2, 2, 4, 5, 5, 5),
553
+ mode=None,
554
+ ),
555
+ use_weight_norm=True,
556
+
557
+ compressor_config=GaussianZ.Config(
558
+ dim=None,
559
+ latent_dim=32,
560
+
561
+ bias=True,
562
+ use_weight_norm=True
563
+ ),
564
+
565
+ norm_stddev=0.05,
566
+ )
567
+ checkpoint = load_ckpt("inference_apatosaurus_95000", expected_hash="ba876edb97b988e9196e449dd176ca97")
568
+
569
+ tokenizer = generator_config()
570
+
571
+ load_result = tokenizer.load_state_dict(checkpoint, strict=False)
572
+ print_colored(f"Loaded tokenizer state dict: {load_result}", "grey")
573
+
574
+ tokenizer = tokenizer.eval()
575
+ # Only convert to bfloat16 if using CUDA
576
+ if device == 'cuda':
577
+ tokenizer = tokenizer.bfloat16()
578
+ tokenizer = tokenizer.to(device)
579
+ tokenizer.requires_grad_(False)
580
+ return tokenizer
581
+
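A minimal encode/decode sketch for the codec above; the checkpoint is fetched from the Hub on first use, and the shapes assume mono 16 kHz input:

import torch as T
from tokenizer import make_tokenizer

tok = make_tokenizer(device='cpu')       # stays float32 on CPU; bfloat16 only on CUDA

audio = T.randn(1, 1, 16_000)            # (batch, channels, samples): 1 s of mono audio
latent = tok.latent_from_data(audio)     # -> (1, 8, 32): 8 latents per second, 32-dim each
recon = tok.data_from_latent(latent)     # -> waveform again, roughly (1, 1, 16_000)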
transformer.py ADDED
@@ -0,0 +1,381 @@
1
+ from typing import Optional, Tuple, MutableMapping
2
+ from typing import Union
3
+ import math
4
+ from contextlib import nullcontext
5
+
6
+ import torch
7
+ import torch as T
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch import Tensor
11
+ from torch.nn.attention import SDPBackend
12
+
13
+ from einops import rearrange
14
+
15
+ from utils import si_module, default, exists, load_ckpt
16
+
17
+ CACHE_FILL_VALUE = -1
18
+
19
+ def get_cache_len(cache: Optional[Tensor]) -> int:
20
+ """
21
+ cache: (batch, seq_len, 2, kv_heads, head_dim)
22
+ """
23
+ if cache is None:
24
+ return 0
25
+ nonzeros = T.any(cache.flatten(2) != CACHE_FILL_VALUE, dim=-1)
26
+ length = nonzeros.sum(dim=-1).int()
27
+ assert T.all(length == length[0])
28
+ return length[0]
29
+
30
+
31
+ def rotate_half(x):
32
+ x1, x2 = x.chunk(2, dim=-1)
33
+ return torch.cat((-x2, x1), dim=-1)
34
+
35
+
36
+ def apply_rotary_pos_emb(x, cos, sin, offset: int = 0):
37
+ assert (
38
+ cos.shape[1] >= offset + x.shape[1]
39
+ ), f"Offset and/or input sequence is too large,\
40
+ \n offset: {offset}, seq_len: {x.shape[1]}, max: {cos.shape[1]}"
41
+
42
+ cos_out = cos[:, offset : offset + x.shape[1], :, :]
43
+ sin_out = sin[:, offset : offset + x.shape[1], :, :]
44
+
45
+ return (x * cos_out) + (rotate_half(x) * sin_out)
46
+
47
+
48
+ # Adapted from https://github.com/foundation-model-stack/foundation-model-stack
49
+ class ShapeRotator:
50
+ def __init__(
51
+ self,
52
+ dim: int,
53
+ end: int,
54
+ theta: float = 10_000,
55
+ ):
56
+ super().__init__()
57
+ self.dim = dim
58
+ self.ratio = theta
59
+ self.cached_freqs: MutableMapping[int, MutableMapping[int, torch.Tensor]] = {}
60
+ self.max_seq_len_cached: MutableMapping[int, int] = {}
61
+ self.ntk_scaling = False
62
+ self.max_seq_len = end
63
+
64
+ def compute_freqs_cis(self, device, max_seq_len=None):
65
+ alpha = 1
66
+ dev_idx = device.index
67
+ max_seq_len = default(max_seq_len, self.max_seq_len)
68
+
69
+ if dev_idx not in self.cached_freqs:
70
+ self.cached_freqs[dev_idx] = {}
71
+ if dev_idx not in self.max_seq_len_cached:
72
+ self.max_seq_len_cached[dev_idx] = 0
73
+
74
+
75
+ if self.max_seq_len_cached[dev_idx] > 0:
76
+ return 1
77
+ max_seq_len = max(max_seq_len, self.max_seq_len)
78
+
79
+ if (
80
+ 1 in self.cached_freqs[dev_idx]
81
+ and max_seq_len <= self.max_seq_len_cached[dev_idx]
82
+ ):
83
+ return 1
84
+
85
+ ratio = self.ratio
86
+ dim = self.dim
87
+
88
+ freqs = 1.0 / (ratio ** (torch.arange(0, dim, 2, device=device).float() / dim))
89
+
90
+ t = torch.arange(max_seq_len, device=device, dtype=freqs.dtype)
91
+ freqs = torch.einsum("i,j->ij", t, freqs)
92
+ emb = torch.cat((freqs, freqs), dim=-1).to(device)
93
+
94
+ cos_to_cache = emb.cos()[None, :, None, :]
95
+ sin_to_cache = emb.sin()[None, :, None, :]
96
+
97
+ self.max_seq_len_cached[dev_idx] = max_seq_len
98
+
99
+ self.cached_freqs[dev_idx][alpha] = torch.stack(
100
+ [
101
+ cos_to_cache,
102
+ sin_to_cache,
103
+ ],
104
+ dim=-1,
105
+ )
106
+
107
+ return alpha
108
+
109
+ def rotate(
110
+ self,
111
+ q: Tensor,
112
+ k: Tensor,
113
+ offset: int = 0,
114
+ ) -> Tuple[Tensor, Tensor]:
115
+ """
116
+ Args
117
+ ----
118
+ q : torch.Tensor
119
+ Embedded query tensor, expected size is B x S x H x Eh
120
+ k : torch.Tensor
121
+ Embedded key tensor, expected size is B x S x H x Eh
122
+ """
123
+ assert len(q.size()) == 4
124
+ assert len(k.size()) == 4
125
+
126
+ seq_len = self.max_seq_len
127
+ alpha = self.compute_freqs_cis(q.device, seq_len)
128
+ freqs = self.cached_freqs[q.device.index][alpha]
129
+
130
+ freqs = freqs.float() # 1 L D/2 2 2
131
+ q_out = apply_rotary_pos_emb(q, freqs[..., 0], freqs[..., 1], offset=offset).type_as(q)
132
+ k_out = apply_rotary_pos_emb(k, freqs[..., 0], freqs[..., 1], offset=offset).type_as(k)
133
+
134
+ return q_out.view_as(q), k_out.view_as(k)
135
+
136
+ class Linear(nn.Linear):
137
+ def __init__(self, *args, **kwargs):
138
+ super().__init__(*args, **kwargs, bias=False)
139
+
140
+ class Norm(nn.Module):
141
+ def __init__(self,
142
+ dim: int,
143
+ eps: float = 1e-5,) -> None:
144
+ super().__init__()
145
+ self.eps = eps
146
+ self.weight = nn.Parameter(T.ones((dim,)))
147
+
148
+ def forward(self, input: Tensor) -> Tensor:
149
+ return F.layer_norm(input, (self.weight.shape[0],), weight=self.weight, bias=None, eps=self.eps)
150
+
151
+
152
+ class FFNN(nn.Module):
153
+ def __init__(self,
154
+ dim: int,
155
+ expand_dim: int = None,):
156
+ super().__init__()
157
+ expand_dim = default(expand_dim, 256 * ((int(2 * 4 * dim / 3) + 256 - 1) // 256))
158
+ self.dim = dim
159
+ self.expand_dim = expand_dim
160
+
161
+ self.gateup_proj = Linear(dim, 2*expand_dim)
162
+ self.down_proj = Linear(expand_dim, dim)
163
+
164
+ def forward(self, x):
165
+ gate, up = self.gateup_proj(x).chunk(2, dim=-1)
166
+ return self.down_proj(up * F.silu(gate))
167
+
168
+ class GQA(nn.Module):
169
+ def __init__(self,
170
+ dim: int,
171
+ n_head: int,
172
+ shape_rotator: ShapeRotator,
173
+ kv_heads: Optional[int] = None,
174
+ eps: float = 1e-5,
175
+ causal: bool = True,):
176
+ super().__init__()
177
+ self.n_heads = n_head
178
+ self.kv_heads = default(kv_heads, n_head)
179
+ self.head_dim = dim // n_head
180
+ self.causal = causal
181
+
182
+ self.proj_qkv = Linear(dim, self.head_dim*(n_head+2*self.kv_heads))
183
+
184
+ self.norm_q = Norm(self.head_dim*n_head, eps=eps)
185
+ self.norm_k = Norm(self.head_dim*self.kv_heads, eps=eps)
186
+
187
+ self.attn_out = Linear(dim, dim)
188
+
189
+ self.shape_rotator = shape_rotator
190
+
191
+ def _sdpa(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
192
+ k = k.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
193
+ v = v.repeat_interleave(self.n_heads // self.kv_heads, dim=2)
194
+ x = F.scaled_dot_product_attention(
195
+ q.transpose(1, 2),
196
+ k.transpose(1, 2),
197
+ v.transpose(1, 2),
198
+ is_causal=False if (q.size(1) != k.size(1)) else self.causal,
199
+ )
200
+ x = x.transpose(1, 2).contiguous()
201
+ return x
202
+
203
+ def _attend(self, q: Tensor, k: Tensor, v: Tensor, kv_cache: Optional[Tensor] = None,):
204
+ cache_len = get_cache_len(kv_cache)
205
+ q, k = self.shape_rotator.rotate(q, k, offset=cache_len)
206
+ if exists(kv_cache):
207
+ k = T.cat([kv_cache[:, :cache_len, 0], k], dim=1)
208
+ v = T.cat([kv_cache[:, :cache_len, 1], v], dim=1)
209
+ kv_cache[:, :k.size(1), 0] = k
210
+ kv_cache[:, :v.size(1), 1] = v
211
+ x = self._sdpa(q, k, v)
212
+ return self.attn_out(rearrange(x, 'b s h d -> b s (h d)'))
213
+
214
+ def _project(self, x):
215
+ # split the fused projection by exact sizes so kv_heads < n_head also works
+ full_q, full_k, full_v = self.proj_qkv(x).split(
+ [self.head_dim * self.n_heads, self.head_dim * self.kv_heads, self.head_dim * self.kv_heads], dim=-1)
216
+ normed_full_q = self.norm_q(full_q).to(full_q.dtype)
217
+ normed_full_k = self.norm_k(full_k).to(full_k.dtype)
218
+
219
+ q = rearrange(normed_full_q, 'b s (h d) -> b s h d', h=self.n_heads)
220
+ k = rearrange(normed_full_k, 'b s (h d) -> b s h d', h=self.kv_heads)
221
+ v = rearrange(full_v, 'b s (h d) -> b s h d', h=self.kv_heads)
222
+ return q, k, v
223
+
224
+ def forward(self,
225
+ x: Tensor,
226
+ kv: Optional[Tensor] = None,):
227
+ """
228
+ x: (B, S, D)
229
+ kv: (B, S, H, D)
230
+ """
231
+ q, k, v = self._project(x)
232
+ return self._attend(q, k, v, kv_cache=kv)
233
+
234
+
235
+ class PreNormAttn(nn.Module):
236
+ def __init__(self,
237
+ dim: int,
238
+ n_head: int,
239
+ shape_rotator: ShapeRotator,
240
+ kv_heads: Optional[int] = None,
241
+ eps: float = 1e-5,
242
+ causal: bool = True,):
243
+ super().__init__()
244
+ self.attn_norm = Norm(dim, eps=eps)
245
+ self.attn = GQA(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
246
+
247
+ def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
248
+ """
249
+ x: (B, S, D)
250
+ kv: (B, S, H, D)
251
+ """
252
+ return x + self.attn(self.attn_norm(x), kv)
253
+
254
+ class PreNormFFNN(nn.Module):
255
+ def __init__(self,
256
+ dim: int,
257
+ ff_dim: int,
258
+ eps: float = 1e-5,):
259
+ super().__init__()
260
+ self.ffnn_norm = Norm(dim, eps=eps)
261
+ self.ffnn = FFNN(dim, ff_dim)
262
+
263
+ def forward(self, x: Tensor) -> Tensor:
264
+ return x + self.ffnn(self.ffnn_norm(x))
265
+
266
+ class Block(nn.Module):
267
+ def __init__(self,
268
+ dim: int,
269
+ layer_id: int = 0,
270
+ n_head: int = 16,
271
+ kv_heads: Optional[int] = None,
272
+ ff_dim: Optional[int] = None,
273
+ eps: float = 1e-5,
274
+ causal: bool = True,
275
+ shape_rotator: ShapeRotator = None):
276
+ super().__init__()
277
+ self.attn = PreNormAttn(dim, n_head, shape_rotator, kv_heads, eps=eps, causal=causal)
278
+ self.ffnn = PreNormFFNN(dim, ff_dim, eps=eps)
279
+ self.dim = dim
280
+ self.layer_id = layer_id
281
+ self.head_dim = dim // n_head
282
+ self.expand_dim = self.ffnn.ffnn.expand_dim
283
+
284
+ self.reset_parameters()
285
+
286
+ def reset_parameters(self):
287
+ std = 1.0 / math.sqrt(self.dim)
288
+ nn.init.trunc_normal_(self.ffnn.ffnn.gateup_proj.weight, std=std, a=-3 * std, b=3 * std)
289
+ nn.init.trunc_normal_(self.attn.attn.proj_qkv.weight, std=std, a=-3 * std, b=3 * std)
290
+ nn.init.trunc_normal_(self.attn.attn.attn_out.weight, std=std, a=-3 * std, b=3 * std)
291
+
292
+ xstd = 1.0 / math.sqrt(self.expand_dim)
293
+ nn.init.trunc_normal_(self.ffnn.ffnn.down_proj.weight, std=xstd, a=-3 * xstd, b=3 * xstd)
294
+
295
+ def forward(self, x: Tensor, kv: Optional[Tensor] = None) -> Tensor:
296
+ """
297
+ x: (B, S, D)
298
+ kv: (B, S, H, D)
299
+ """
300
+ h = self.attn(x, kv)
301
+ out = self.ffnn(h)
302
+ return out
303
+
304
+
305
+
306
+ class GPTOutput(nn.Module):
307
+ def __init__(self, dim, vocab_size):
308
+ super().__init__()
309
+ self.dim = dim
310
+ self.norm = Norm(dim)
311
+ self.output = Linear(dim, vocab_size)
312
+
313
+ self.reset_parameters()
314
+
315
+ def reset_parameters(self):
316
+ std = 1.0 / math.sqrt(self.dim**2)
317
+ nn.init.trunc_normal_(self.output.weight, std=std, a=-3 * std, b=3 * std)
318
+
319
+ def forward(self, x):
320
+ return self.output(self.norm(x))
321
+
322
+ @si_module
323
+ class Stack(nn.Module):
324
+ class Config:
325
+ layers: int
326
+ dim: int
327
+ seq_len: int
328
+ n_head: int = 32
329
+ ff_dim: int = None
330
+ kv_heads: int = None
331
+ eps: float = 1e-5
332
+ theta: Union[int, float] = 10_000
333
+ causal: bool = True
334
+
335
+ from_pretrained: Optional[Tuple[str, str]] = None
336
+
337
+ def __init__(self, c: Config):
338
+ super().__init__()
339
+
340
+ from_pretrained = c.from_pretrained
341
+ if exists(from_pretrained):
342
+ checkpoint = load_ckpt(*c.from_pretrained)
343
+
344
+ self.shape_rotator = ShapeRotator(c.dim//c.n_head, c.seq_len, theta=c.theta)
345
+
346
+ self.layers = nn.ModuleList([
347
+ Block(
348
+ dim=c.dim,
349
+ layer_id=l,
350
+ n_head=c.n_head,
351
+ kv_heads=c.kv_heads,
352
+ ff_dim=c.ff_dim,
353
+ eps=c.eps,
354
+ causal=c.causal,
355
+ shape_rotator=self.shape_rotator,
356
+ ) for l in range(c.layers)
357
+ ])
358
+
359
+ kv_heads = c.kv_heads or c.n_head
360
+ head_dim = c.dim // c.n_head
361
+ cache_shape = [c.layers, c.seq_len, 2, kv_heads, head_dim]
362
+ self.cache_shape = cache_shape
363
+ self.cache = [None] * c.layers
364
+
365
+ if exists(from_pretrained):
366
+ self.load_state_dict(checkpoint)
367
+
368
+ def init_cache(self, bsize, device, dtype, length:int=None):
369
+ if self.cache_shape is None:
370
+ return
371
+ cache_shape = self.cache_shape.copy()
372
+ cache_shape[1] = length or cache_shape[1]
373
+ self.cache = T.full((bsize, *cache_shape), CACHE_FILL_VALUE, device=device, dtype=dtype).transpose(0, 1)
374
+
375
+ def deinit_cache(self):
376
+ self.cache = [None] * len(self.cache)
377
+
378
+ def forward(self, x: Tensor) -> Tensor:
379
+ for l, layer in enumerate(self.layers):
380
+ x = layer(x, kv=self.cache[l])
381
+ return x
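A toy-sized sketch of the Stack API with random weights on CPU, including the in-place KV cache that model.py relies on during incremental decoding:

import torch as T
from transformer import Stack

stack = Stack.Config(layers=2, dim=64, seq_len=128, n_head=4, ff_dim=128)()

x = T.randn(1, 16, 64)          # (batch, seq, dim)
y = stack(x)                    # pre-norm GQA + gated FFNN blocks, same shape out
assert y.shape == x.shape

stack.init_cache(bsize=1, device='cpu', dtype=T.float32, length=128)
step = stack(x[:, :1])          # caches are filled in place inside GQA._attend
stack.deinit_cache()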
utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .blocks import *
2
+ from .dist import *
3
+ from .interp import *
utils/blocks.py ADDED
@@ -0,0 +1,92 @@
1
+ from dataclasses import dataclass
2
+ from typing import TypeVar, Generic, Type, Optional
3
+ from functools import wraps
4
+ import time
5
+ import random
6
+
7
+ import torch as T
8
+ import torch.nn as nn
9
+
10
+ # @TODO: remove si_module from codebase
11
+ # we use this in our research codebase to make modules from callable configs
12
+ si_module_TpV = TypeVar('si_module_TpV')
13
+ def si_module(cls: Type[si_module_TpV]) -> Type[si_module_TpV]:
14
+ if not hasattr(cls, 'Config') or not isinstance(cls.Config, type):
15
+ class Config:
16
+ pass
17
+ cls.Config = Config
18
+
19
+ cls.Config = dataclass(cls.Config)
20
+
21
+ class ConfigWrapper(cls.Config, Generic[si_module_TpV]):
22
+ def __call__(self, *args, **kwargs) -> si_module_TpV:
23
+ if len(kwargs) > 0:
24
+ config_dict = {field.name: getattr(self, field.name) for field in self.__dataclass_fields__.values()}
25
+ config_dict.update(kwargs)
26
+ new_config = type(self)(**config_dict)
27
+ return cls(new_config)
28
+ else:
29
+ return cls(self, *args)
30
+
31
+ ConfigWrapper.__module__ = cls.__module__
32
+ ConfigWrapper.__name__ = f"{cls.__name__}Config"
33
+ ConfigWrapper.__qualname__ = f"{cls.__qualname__}.Config"
34
+
35
+ cls.Config = ConfigWrapper
36
+
37
+ original_init = cls.__init__
38
+ def new_init(self, *args, **kwargs):
39
+ self.c = next((arg for arg in args if isinstance(arg, cls.Config)), None) or next((arg for arg in kwargs.values() if isinstance(arg, cls.Config)), None)
40
+ original_init(self, *args, **kwargs)
41
+ self.register_buffer('_device_tracker', T.Tensor(), persistent=False)
42
+
43
+ cls.__init__ = new_init
44
+
45
+ @property
46
+ def device(self):
47
+ return self._device_tracker.device
48
+
49
+ @property
50
+ def dtype(self):
51
+ return self._device_tracker.dtype
52
+
53
+ cls.device = device
54
+ cls.dtype = dtype
55
+
56
+ return cls
57
+
58
+
59
+ def get_activation(nonlinear_activation, nonlinear_activation_params={}):
60
+ if hasattr(nn, nonlinear_activation):
61
+ return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
62
+ else:
63
+ raise NotImplementedError(f"Activation {nonlinear_activation} not found in torch.nn")
64
+
65
+
66
+ def exists(v):
67
+ return v is not None
68
+
69
+ def isnt(v):
70
+ return not exists(v)
71
+
72
+ def truthyexists(v):
73
+ return exists(v) and v is not False
74
+
75
+ def truthyattr(obj, attr):
76
+ return hasattr(obj, attr) and truthyexists(getattr(obj, attr))
77
+
78
+ defaultT = TypeVar('defaultT')
79
+
80
+ def default(*args: Optional[defaultT]) -> Optional[defaultT]:
81
+ for arg in args:
82
+ if exists(arg):
83
+ return arg
84
+ return None
85
+
86
+ def maybe(fn):
87
+ @wraps(fn)
88
+ def inner(x, *args, **kwargs):
89
+ if not exists(x):
90
+ return x
91
+ return fn(x, *args, **kwargs)
92
+ return inner
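The si_module decorator above is what makes every Module.Config in this repo callable; a tiny illustration with a hypothetical Toy module (not part of the repo):

import torch.nn as nn
from utils import si_module

@si_module
class Toy(nn.Module):
    class Config:
        dim: int
        hidden: int = 16

    def __init__(self, c: Config):
        super().__init__()
        self.proj = nn.Linear(c.dim, c.hidden)

cfg = Toy.Config(dim=8)     # a dataclass holding the hyperparameters
toy = cfg()                 # calling the config constructs the module
wider = cfg(hidden=32)      # kwargs build a modified copy first, then construct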
utils/dist.py ADDED
@@ -0,0 +1,98 @@
1
+ import os
2
+ import torch as T
3
+ import re
4
+ from tqdm import tqdm
5
+ from datetime import timedelta
6
+
7
+ import requests
8
+ import hashlib
9
+
10
+ from io import BytesIO
11
+ from huggingface_hub import hf_hub_download
12
+
13
+ def rank0():
14
+ rank = os.environ.get('RANK')
15
+ if rank is None or rank == '0':
16
+ return True
17
+ else:
18
+ return False
19
+
20
+ def local0():
21
+ local_rank = os.environ.get('LOCAL_RANK')
22
+ if local_rank is None or local_rank == '0':
23
+ return True
24
+ else:
25
+ return False
26
+ class tqdm0(tqdm):
27
+ def __init__(self, *args, **kwargs):
28
+ total = kwargs.get('total', None)
29
+ if total is None and len(args) > 0:
30
+ try:
31
+ total = len(args[0])
32
+ except TypeError:
33
+ pass
34
+ if total is not None:
35
+ kwargs['miniters'] = max(1, total // 20)
36
+ super().__init__(*args, **kwargs, disable=not rank0(), bar_format='{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]')
37
+
38
+ def print0(*args, **kwargs):
39
+ if rank0():
40
+ print(*args, **kwargs)
41
+
42
+ _PRINTED_IDS = set()
43
+
44
+ def printonce(*args, id=None, **kwargs):
45
+ if id is None:
46
+ id = ' '.join(map(str, args))
47
+
48
+ if id not in _PRINTED_IDS:
49
+ print(*args, **kwargs)
50
+ _PRINTED_IDS.add(id)
51
+
52
+ def print0once(*args, **kwargs):
53
+ if rank0():
54
+ printonce(*args, **kwargs)
55
+
56
+ def init_dist():
57
+ if T.distributed.is_initialized():
58
+ print0('Distributed already initialized')
59
+ rank = T.distributed.get_rank()
60
+ local_rank = int(os.environ.get('LOCAL_RANK', 0))
61
+ world_size = T.distributed.get_world_size()
62
+ else:
63
+ try:
64
+ rank = int(os.environ['RANK'])
65
+ local_rank = int(os.environ['LOCAL_RANK'])
66
+ world_size = int(os.environ['WORLD_SIZE'])
67
+ device = f'cuda:{local_rank}'
68
+ T.cuda.set_device(device)
69
+ T.distributed.init_process_group(backend='nccl', timeout=timedelta(minutes=30), rank=rank, world_size=world_size, device_id=T.device(device))
70
+ print(f'Rank {rank} of {world_size}.')
71
+ except Exception as e:
72
+ print0once(f'Not initializing distributed env: {e}')
73
+ rank = 0
74
+ local_rank = 0
75
+ world_size = 1
76
+ return rank, local_rank, world_size
77
+
78
+ def load_ckpt(load_from_location, expected_hash=None):
79
+ os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1' # set to '0' for clearer errors when debugging downloads from the hub
80
+ if local0():
81
+ repo_id = "si-pbc/hertz-dev"
82
+ print0(f'Loading checkpoint from repo_id {repo_id} and filename {load_from_location}.pt. This may take a while...')
83
+ save_path = hf_hub_download(repo_id=repo_id, filename=f"{load_from_location}.pt")
84
+ print0(f'Downloaded checkpoint to {save_path}')
85
+ if expected_hash is not None:
86
+ with open(save_path, 'rb') as f:
87
+ file_hash = hashlib.md5(f.read()).hexdigest()
88
+ if file_hash != expected_hash:
89
+ print(f'Hash mismatch for {save_path}. Expected {expected_hash} but got {file_hash}. Deleting checkpoint and trying again.')
90
+ os.remove(save_path)
91
+ return load_ckpt(load_from_location, expected_hash)
92
+ if T.distributed.is_initialized():
93
+ save_path = [save_path]
94
+ T.distributed.broadcast_object_list(save_path, src=0)
95
+ save_path = save_path[0]
96
+ loaded = T.load(save_path, weights_only=False, map_location='cpu')
97
+ print0(f'Loaded checkpoint from {save_path}')
98
+ return loaded
utils/interp.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch as T
2
+ import os
3
+
4
+ def rank0():
5
+ rank = os.environ.get('RANK')
6
+ if rank is None or rank == '0':
7
+ return True
8
+ else:
9
+ return False
10
+
11
+ def print_colored(message, color='reset', bold=False, **kwargs):
12
+ color_dict = {
13
+ 'bold': '\033[1m',
14
+ 'green': '\033[92m',
15
+ 'yellow': '\033[93m',
16
+ 'red': '\033[91m',
17
+ 'blue': '\033[94m',
18
+ 'grey': '\033[90m',
19
+ 'white': '\033[97m',
20
+ 'reset': '\033[0m'
21
+ }
22
+
23
+ color_code = color_dict.get(color.lower(), color_dict['reset'])
24
+ prefix = color_dict['bold'] if bold else ''
25
+ print(f"{prefix}{color_code}{message}{color_dict['reset']}", **kwargs)
26
+
27
+ def print0_colored(*args, **kwargs):
28
+ if rank0():
29
+ print_colored(*args, **kwargs)
30
+
31
+ def param_count(module):
32
+ def count_parameters(model):
33
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
34
+
35
+ total_params = count_parameters(module)
36
+ output = [f'Total model parameters: {total_params:,}', '---------------------------']
37
+
38
+ for name, child in module.named_children():
39
+ params = count_parameters(child)
40
+ output.append(f'{name} parameters: {params:,}')
41
+
42
+ return '\n'.join(output)
43
+
44
+ def model_size_estimation(module):
45
+ def estimate_size(model):
46
+ param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
47
+ buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
48
+ return param_size + buffer_size
49
+
50
+ total_size = estimate_size(module)
51
+ output = [f'Total model size: {total_size / 1024**2:.2f} MB', '---------------------------']
52
+
53
+ for name, child in module.named_children():
54
+ child_size = estimate_size(child)
55
+ output.append(f'{name} size: {child_size / 1024**2:.2f} MB')
56
+
57
+ return '\n'.join(output)
58
+
59
+ def layer_param_distribution(module):
60
+ def count_parameters(model):
61
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
62
+
63
+ def get_layer_types(model):
64
+ layer_types = {}
65
+ for name, module in model.named_modules():
66
+ layer_type = module.__class__.__name__
67
+ params = sum(p.numel() for p in module.parameters(recurse=False) if p.requires_grad)
68
+ if params > 0:
69
+ if layer_type not in layer_types:
70
+ layer_types[layer_type] = 0
71
+ layer_types[layer_type] += params
72
+ return layer_types
73
+
74
+ total_params = count_parameters(module)
75
+ layer_types = get_layer_types(module)
76
+
77
+ output = [f'Total trainable parameters: {total_params:,}', '---------------------------']
78
+
79
+ for layer_type, count in sorted(layer_types.items(), key=lambda x: x[1], reverse=True):
80
+ percentage = (count / total_params) * 100
81
+ output.append(f'{layer_type}: {count:,} ({percentage:.2f}%)')
82
+
83
+ return '\n'.join(output)
84
+