/** @module index */
import { AudioPipeVisualizer } from "./visualizer.js";
import { GrowingSentenceChunker } from "./sentence.js";
import { sendAlert } from "./alert.js";
import { hexToRgb, replaceQuotes } from "./helpers.js";
// Global configuration
const documentStyle = window.getComputedStyle(document.body);
const primaryColor = documentStyle.getPropertyValue("--color-primary").trim(); // custom property values can carry leading whitespace
const [pR, pG, pB] = hexToRgb(primaryColor);
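// Derive a darkened variant of the primary color for the waveform's dim backdrop stroke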
const [dpR, dpG, dpB] = [
Math.max(0, pR - 96),
Math.max(0, pG - 96),
Math.max(0, pB - 96),
];
const pollingInterval = 150;
const transcriptionParameters = {}; // no extra transcription options by default; spread into the request below
const languageParameters = {
role: "anachrovox",
stream: true,
use_tools: true,
max_tokens: 1024,
return_tool_metadata: true,
};
const speechParameters = {
enhance: true,
output_format: "float"
};
const waveformParameters = {
waveformNoiseLevel: 0.025,
fftSize: 512,
fillStyle: "rgba(8,16,14,0.3)",
strokeStyle: [
`rgba(${dpR},${dpG},${dpB},0.1)`,
`rgba(${pR},${pG},${pB},0.75)`,
"rgba(255,255,255,0.6)",
],
lineWidth: [6, 3, 1],
};
const speakerHoleRings = [ // [radius, number of holes]
[18, 6],
[36, 10],
[52, 14],
[70, 18],
[88, 22],
];
const maxTypingSpeed = 200; // characters per second
const minTypingSpeed = 50;
const maxDelay = 0.5; // maximum time (seconds) to wait after a completion for speech generation to start
let overseerAddress;
if (window.location.port === "3000") {
// Development (e.g. npm start)
overseerAddress = "ws://localhost:32189";
} else {
// Docker or production
overseerAddress = "overseer";
}
const sharedModelRoot = "https://huggingface.co/benjamin-paine/hey-buddy/resolve/main/pretrained";
const wakeWordModelRoot = "https://huggingface.co/benjamin-paine/anachrovox/resolve/main";
const wakeWordPrefixes = [
"hello", "hey", "hi", "so","well",
"yo", "okay", "thanks", "thank-you",
];
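// Each prefix is combined with the base wake word into its own model,
// e.g. "hey-vox.onnx" detects "hey vox"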
const heyBuddyConfiguration = {
record: true,
modelPath: [
`${wakeWordModelRoot}/vox.onnx`,
`${wakeWordModelRoot}/anachrovox.onnx`
].concat(
wakeWordPrefixes.map(
(prefix) => `${wakeWordModelRoot}/${prefix}-vox.onnx`
)
),
vadModelPath: `${sharedModelRoot}/silero-vad.onnx`,
embeddingModelPath: `${sharedModelRoot}/speech-embedding.onnx`,
spectrogramModelPath: `${sharedModelRoot}/mel-spectrogram.onnx`,
wakeWordThreshold: 0.8,
};
// Get elements from the page
const transcriptionSection = document.querySelector("#transcription #content #history");
const waveformCanvas = document.querySelector("#waveform canvas");
const promptInput = document.getElementById("prompt");
const temperature = document.getElementById("temperature");
const topP = document.getElementById("top-p");
const minP = document.getElementById("min-p");
const topK = document.getElementById("top-k");
const topKDisplay = document.getElementById("top-k-display");
const voiceId = document.getElementById("voice-id");
const voiceIdWheel = document.getElementById("voice-id-wheel");
const speed = document.getElementById("speed");
const volume = document.getElementById("volume");
const speaker = document.getElementById("speaker");
const listening = document.getElementById("listening");
const recording = document.getElementById("recording");
const powerSwitch = document.getElementById("power-switch-input");
const listenButton = document.getElementById("listen");
const powerIndicator = document.getElementById("power");
// Build speaker hole (just cosmetic)
for (let [radius, holes] of speakerHoleRings) {
for (let i = 0; i < holes; i++) {
// Calculate hole position based on radius and angle
const hole = document.createElement("div");
const angle = i * 2 * Math.PI / holes;
const x = Math.cos(angle) * radius;
const y = Math.sin(angle) * radius;
hole.style.left = `${x}px`;
hole.style.top = `${y}px`;
hole.classList.add("hole");
speaker.appendChild(hole);
}
}
// Global objects
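// Taproot (and HeyBuddy, checked below) are expected to be provided as
// globals by scripts included on the page rather than imported here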
const client = new Taproot(overseerAddress);
const audio = new AudioPipeVisualizer({...waveformParameters, canvas: waveformCanvas});
const chunker = new GrowingSentenceChunker();
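// Alternating user/assistant turns, sent back to the model as `history`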
const conversationHistory = [];
// Scroll to the bottom of the transcription section, but only when the user
// is already near the bottom (within 80px), so manual scrollback isn't hijacked
const scrollToBottom = () => {
const container = transcriptionSection.parentElement;
const distanceFromBottom = container.scrollHeight - container.scrollTop - container.offsetHeight;
if (distanceFromBottom < 80) {
container.scrollTop = container.scrollHeight;
}
};
// Helper methods for updating the page
const pushText = (text, className) => {
text = replaceQuotes(text);
const element = document.createElement("p");
element.classList.add(className);
element.textContent = text;
transcriptionSection.appendChild(element);
scrollToBottom();
return element;
};
// Bind voice ID wheel to change voice ID
// This is the list of voices from Kokoro
const voiceMap = {
"Adam": "male.en.us.adam",
"Bella": "female.en.us.bella",
"Emma": "female.en.gb.emma",
"George": "male.en.gb.george",
"Isabel": "female.en.gb.isabella",
"Lewis": "male.en.gb.lewis",
"Michael": "male.en.us.michael",
"Nicole": "female.en.us.nicole",
"Sarah": "female.en.us.sarah",
"Skye": "female.en.us.sky",
};
const voiceNames = Object.keys(voiceMap);
const voiceIds = Object.values(voiceMap);
let voiceIndex = -1;
const setVoiceIndex = (newIndex) => {
if (newIndex !== voiceIndex) {
voiceId.value = voiceNames[newIndex];
voiceId.dispatchEvent(new Event("change"));
voiceIndex = newIndex;
}
};
setVoiceIndex(Math.floor(Math.random() * voiceIds.length)); // floor keeps the index in bounds
voiceIdWheel.addEventListener("click", () => {
const delta = parseInt(voiceIdWheel.value);
// Wrap around in both directions
setVoiceIndex((voiceIndex + delta + voiceIds.length) % voiceIds.length);
});
// Bind volume to update the audio volume
volume.addEventListener("change", () => {
audio.volume = parseFloat(volume.value);
});
// Bind top-k to update the display
topK.addEventListener("change", () => {
topKDisplay.value = Math.floor(topK.value);
topKDisplay.dispatchEvent(new Event("change"));
});
// Getter functions for parameters
const getLanguageParameters = (overrides = {}) => {
return {
...languageParameters,
history: conversationHistory,
top_k: parseInt(topK.value),
top_p: parseFloat(topP.value),
min_p: parseFloat(minP.value),
temperature: parseFloat(temperature.value),
...overrides,
};
};
const getSpeechParameters = (overrides = {}) => {
return {
...speechParameters,
speed: parseFloat(speed.value),
voice: voiceMap[voiceId.value],
...overrides,
};
};
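// Typewriter state: typingAudioTiming maps an audio start timestamp
// (performance.now(), in ms) to [textLength, durationMs] for each queued
// speech chunk, so typing can be paced against playback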
let typingElement,
typingStart,
typingCharactersPerSecond = minTypingSpeed,
typingTarget = "",
typingAudioTiming = {},
unsetWhenComplete = false,
requestNumber = 0,
interrupt = false;
// The loop for typing out the text
const typingLoop = () => {
if (typingElement !== null && typingElement !== undefined) {
const now = performance.now();
const typingIndex = Math.floor((now - typingStart) * typingCharactersPerSecond / 1000);
const targetTextLength = typingTarget.length;
const hasAudio = Object.getOwnPropertyNames(typingAudioTiming).length > 0;
let typingAudioIndex = 0;
for (let [audioTime, [audioTextLength, audioDuration]] of Object.entries(typingAudioTiming)) {
audioTime = parseFloat(audioTime);
if (now >= audioTime + audioDuration) {
// Audio has finished playing
typingAudioIndex += audioTextLength;
} else if (now >= audioTime) {
// Currently playing audio
typingAudioIndex += Math.floor((now - audioTime) * audioTextLength / audioDuration);
}
}
const stillTyping = typingIndex < targetTextLength;
const stillSpeaking = (audio.volume > 0 || hasAudio) && typingAudioIndex < targetTextLength;
if (!interrupt && (stillTyping || stillSpeaking)) {
// The original markup distinguishing spoken from pending text was lost;
// the span class names below are placeholder styling hooks
let innerHTML = "";
if (typingAudioIndex > 0) {
innerHTML += `<span class="spoken">${typingTarget.substring(0, typingAudioIndex + 1)}</span>`;
innerHTML += `<span class="unspoken">${typingTarget.substring(typingAudioIndex + 1, typingIndex)}</span>`;
} else {
innerHTML += `<span class="unspoken">${typingTarget.substring(0, typingIndex)}</span>`;
}
if (typingIndex < targetTextLength) {
innerHTML += "|"; // typing cursor
}
if (typingElement.innerHTML != innerHTML) {
typingElement.innerHTML = innerHTML;
scrollToBottom();
}
} else if (interrupt || unsetWhenComplete) {
// Finished or interrupted: render the full text and reset typing state
typingElement.innerHTML = hasAudio
? `<span class="spoken">${typingTarget}</span>`
: `<span class="unspoken">${typingTarget}</span>`;
unsetWhenComplete = false;
interrupt = false;
typingElement = null;
typingTarget = "";
typingAudioTiming = {};
}
}
requestAnimationFrame(typingLoop);
};
requestAnimationFrame(typingLoop);
// Callback for when a sentence is completed
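// When audio is enabled, each sentence is synthesized and typing slows to
// minTypingSpeed so playback sets the pace; with the volume muted, text is
// typed at maxTypingSpeed instead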
chunker.onChunk(async (chunk) => {
let isFirst = false;
let requestNumberAtStart = requestNumber;
if (typingElement !== null && typingElement !== undefined) {
typingTarget += replaceQuotes(chunk).replaceAll(/\n\W*/g, "\n");
} else {
isFirst = true;
typingElement = pushText("", "completion");
typingTarget = replaceQuotes(chunk).replaceAll(/\n\W*/g, "\n");
typingStart = performance.now();
typingAudioTiming = {};
}
if (audio.volume > 0 && !interrupt) {
typingCharactersPerSecond = minTypingSpeed;
let audioResult = await client.invoke({
task: "speech-synthesis",
parameters: getSpeechParameters({text: chunk}),
});
if (interrupt || requestNumberAtStart !== requestNumber) {
return;
}
if (audio.playing) {
audio.pushSilence(0.15);
}
let audioReady = performance.now();
let audioNode = audio.push(audioResult.data);
let audioDuration = audioNode.buffer.duration * 1000;
if (isFirst) {
typingAudioTiming[audioReady] = [chunk.length, audioDuration];
} else {
// Queue this chunk after the previous one finishes, plus 150ms to
// match the pushed silence, or start now if that time has passed
let lastAudioStartTime = Math.max(...Object.keys(typingAudioTiming).map(parseFloat));
let [, lastAudioDuration] = typingAudioTiming[lastAudioStartTime];
let thisAudioTiming = Math.max(lastAudioStartTime + lastAudioDuration + 150, audioReady);
typingAudioTiming[thisAudioTiming] = [chunk.length, audioDuration];
}
} else {
typingCharactersPerSecond = maxTypingSpeed;
}
});
// Callback when transcription and completion are done
const finalizeResult = (prompt, result) => {
interrupt = false;
unsetWhenComplete = true;
chunker.push(result.result);
chunker.flush();
conversationHistory.push(prompt);
conversationHistory.push(result.result);
if (result.function) {
let usedToolContainer = document.createElement("p");
usedToolContainer.classList.add("tool");
usedToolContainer.innerText = "Used tool: ";
let usedToolFunction = document.createElement("span");
usedToolFunction.innerText = result.function.name;
usedToolFunction.title = typeof result.function.arguments === "string"
? result.function.arguments
: JSON.stringify(result.function.arguments);
usedToolContainer.appendChild(usedToolFunction);
transcriptionSection.appendChild(usedToolContainer);
if (result.citations) {
for (let i = 0; i < result.citations.length; i++) {
let citation = result.citations[i];
let citationContainer = document.createElement("p");
citationContainer.classList.add("citation");
let citationLabel = citation.title || citation.source || "";
citationContainer.innerText = citationLabel ? `${citationLabel} ` : "Source ";
let citationLink = document.createElement("a");
citationLink.href = citation.url;
citationLink.innerText = `[${i + 1}]`;
citationLink.title = citation.url;
citationLink.target = "_blank";
citationLink.rel = "noopener";
citationContainer.appendChild(citationLink);
transcriptionSection.appendChild(citationContainer);
}
}
scrollToBottom();
}
};
// Create a function to invoke the appropriate workflow based on the current state
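// Transcription is chained into text-generation via `continuation`: the
// interim result is the transcribed prompt, and intermediate results are
// streamed completion text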
const invokeFromMicrophone = async (samples) => {
requestNumber++;
interrupt = true;
let prompt;
try {
const textResult = await client.invoke(
{
task: "audio-transcription",
parameters: {...transcriptionParameters, audio: samples},
continuation: {
task: "text-generation",
parameters: getLanguageParameters(),
result_parameters: "prompt",
}
},
{
fetchIntermediates: true,
pollingInterval: pollingInterval,
onInterimResult: (result) => {
prompt = result;
pushText(result, "transcription");
},
onIntermediateResult: (result) => {
interrupt = false;
chunker.push(result);
}
}
);
finalizeResult(prompt, textResult);
} catch (error) {
console.error(error);
sendAlert(error);
}
};
const invokeFromPrompt = async (text) => {
requestNumber++;
interrupt = true;
pushText(text, "transcription");
try {
const inferenceResult = await client.invoke(
{
task: "text-generation",
parameters: getLanguageParameters({prompt: text}),
},
{
fetchIntermediates: true,
pollingInterval: pollingInterval,
onIntermediateResult: (result) => {
interrupt = false;
chunker.push(result);
}
}
);
finalizeResult(text, inferenceResult);
} catch (error) {
console.error(error);
sendAlert(error);
}
};
// Configure power button to disable everything
powerSwitch.addEventListener("change", (event) => {
if (powerSwitch.checked) {
powerIndicator.classList.add("active");
} else {
powerIndicator.classList.remove("active");
listening.classList.remove("active");
recording.classList.remove("active");
}
});
powerSwitch.dispatchEvent(new Event("change")); // sync the indicator with the initial switch state
// Configure HeyBuddy for audio recording and invocation
if (!window.HeyBuddy) {
console.error("HeyBuddy not found. Please include HeyBuddy.js in your project.");
} else {
const heyBuddy = new window.HeyBuddy(heyBuddyConfiguration);
// Track the most probable wake word on every processed frame (currently
// informational only; the value isn't used elsewhere)
heyBuddy.onProcessed((result) => {
let highestWakeWord = null, highestProbability = 0;
for (let wakeWordName in result.wakeWords) {
let probability = result.wakeWords[wakeWordName].probability;
if (probability > highestProbability) {
highestWakeWord = wakeWordName;
highestProbability = probability;
}
}
});
heyBuddy.onRecording(async (samples) => {
if (powerSwitch.checked) {
await invokeFromMicrophone(samples);
}
});
heyBuddy.onProcessed((result) => {
if (powerSwitch.checked) {
if (result.recording) {
recording.classList.add("active");
} else {
recording.classList.remove("active");
}
if (result.listening) {
listening.classList.add("active");
} else {
listening.classList.remove("active");
}
}
});
const startEvents = ["mousedown", "touchstart"];
const stopEvents = ["mouseup", "touchend", "mouseleave"];
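// While the listen button is held, repeatedly force HeyBuddy into
// listening/recording mode so it can't time out mid-utterance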
const startListening = () => {
const interval = setInterval(() => {
heyBuddy.negatives = 0;
heyBuddy.listening = true;
heyBuddy.recording = true;
}, 10);
const onStop = () => {
clearInterval(interval);
for (let event of stopEvents) {
window.removeEventListener(event, onStop);
}
};
for (let event of stopEvents) {
window.addEventListener(event, onStop);
}
};
for (let event of startEvents) {
listenButton.addEventListener(event, startListening);
}
}
// Bind the prompt input to the workflow
promptInput.addEventListener("keypress", async (event) => {
if (event.key === "Enter") {
event.preventDefault();
const text = promptInput.value;
// Clear the input before invoking so the next prompt can be typed
// while this one is in flight
promptInput.value = "";
await invokeFromPrompt(text);
}
});