import { AudioPipeVisualizer } from "./visualizer.js"; |
import { GrowingSentenceChunker } from "./sentence.js"; |
import { sendAlert } from "./alert.js"; |
import { hexToRgb, replaceQuotes } from "./helpers.js"; |
// Read the page's primary accent colour from the `--color-primary` CSS custom
// property and split it into RGB channels with `hexToRgb`.
const documentStyle = window.getComputedStyle(document.body);
const primaryColor = documentStyle.getPropertyValue("--color-primary");
const [pR, pG, pB] = hexToRgb(primaryColor);
// Darkened variant of the primary colour: each channel is lowered by 96 and
// clamped at 0 so it never goes negative.
const [dpR, dpG, dpB] = [pR, pG, pB].map((channel) => Math.max(0, channel - 96));
// Passed as the `pollingInterval` option of `client.invoke`
// (presumably milliseconds — confirm against the Taproot client).
const pollingInterval = 150;
// Not referenced anywhere else in the visible source.
const transcriptionParameters = {};
// Base parameters for every text-generation request; sampling settings and
// history are merged on top of these by `getLanguageParameters`.
const languageParameters = {
    role: "anachrovox",
    stream: true,
    use_tools: true,
    max_tokens: 1024,
    return_tool_metadata: true,
};
// Base parameters for every speech-synthesis request; speed and voice are
// merged on top of these by `getSpeechParameters`.
const speechParameters = {
    enhance: true,
    output_format: "float",
};
// Options handed to `AudioPipeVisualizer`. `strokeStyle` and `lineWidth` are
// parallel three-element lists: darkened primary, primary, then white.
const waveformParameters = {
    waveformNoiseLevel: 0.025,
    fftSize: 512,
    fillStyle: "rgba(8,16,14,0.3)",
    strokeStyle: [
        `rgba(${[dpR, dpG, dpB, 0.1].join(",")})`,
        `rgba(${[pR, pG, pB, 0.75].join(",")})`,
        "rgba(255,255,255,0.6)",
    ],
    lineWidth: [6, 3, 1],
};
// Speaker grille layout as [radius in px, number of holes] pairs, one per ring.
const speakerHoleRings = [
    [18, 6],
    [36, 10],
    [52, 14],
    [70, 18],
    [88, 22],
];
// Typing animation speeds in characters per second: the slow rate is used
// while speech audio is being synthesised, the fast rate otherwise.
const maxTypingSpeed = 200;
const minTypingSpeed = 50;
// Not referenced anywhere else in the visible source.
const maxDelay = 0.5;
// Address handed to the Taproot client. When the page is served from port
// 3000 a fixed local WebSocket endpoint is used; otherwise the bare name
// "overseer" is passed through (how that name is resolved is up to the client).
const overseerAddress = window.location.port === "3000"
    ? "ws://localhost:32189"
    : "overseer";
// Base URLs the ONNX model files are downloaded from.
const sharedModelRoot = "https://huggingface.co/benjamin-paine/hey-buddy/resolve/main/pretrained";
const wakeWordModelRoot = "https://huggingface.co/benjamin-paine/anachrovox/resolve/main";
// Each prefix has its own wake-word model named `<prefix>-vox.onnx`.
const wakeWordPrefixes = [
    "hello", "hey", "hi", "so", "well",
    "yo", "okay", "thanks", "thank-you",
];
// Configuration object passed to the `HeyBuddy` constructor: the two bare
// wake-word models come first, followed by one model per prefix.
const heyBuddyConfiguration = {
    record: true,
    modelPath: [
        `${wakeWordModelRoot}/vox.onnx`,
        `${wakeWordModelRoot}/anachrovox.onnx`,
        ...wakeWordPrefixes.map((prefix) => `${wakeWordModelRoot}/${prefix}-vox.onnx`),
    ],
    vadModelPath: `${sharedModelRoot}/silero-vad.onnx`,
    embeddingModelPath: `${sharedModelRoot}/speech-embedding.onnx`,
    spectrogramModelPath: `${sharedModelRoot}/mel-spectrogram.onnx`,
    wakeWordThreshold: 0.8,
};
// Handles to the page elements this script drives. Each lookup yields `null`
// when the element is missing from the page.
const byId = (id) => document.getElementById(id);
// Conversation transcript and waveform canvas.
const transcriptionSection = document.querySelector("#transcription #content #history");
const waveformCanvas = document.querySelector("#waveform canvas");
// Typed prompt input.
const promptInput = byId("prompt");
// Sampling controls read by `getLanguageParameters`.
const temperature = byId("temperature");
const topP = byId("top-p");
const minP = byId("min-p");
const topK = byId("top-k");
const topKDisplay = byId("top-k-display");
// Voice, speed and volume controls.
const voiceId = byId("voice-id");
const voiceIdWheel = byId("voice-id-wheel");
const speed = byId("speed");
const volume = byId("volume");
// Speaker grille, status indicators, power switch and listen button.
const speaker = byId("speaker");
const listening = byId("listening");
const recording = byId("recording");
const powerSwitch = byId("power-switch-input");
const listenButton = byId("listen");
const powerIndicator = byId("power");
// Build the speaker grille: for every ring, place `holes` evenly spaced
// `.hole` elements on a circle of the given radius (offsets in px).
for (const [radius, holes] of speakerHoleRings) {
    for (let holeIndex = 0; holeIndex < holes; holeIndex++) {
        const angle = holeIndex * 2 * Math.PI / holes;
        const hole = document.createElement("div");
        hole.classList.add("hole");
        hole.style.left = `${Math.cos(angle) * radius}px`;
        hole.style.top = `${Math.sin(angle) * radius}px`;
        speaker.appendChild(hole);
    }
}
// Long-lived objects shared by the rest of the script.
// NOTE(review): `Taproot` is not imported in this file — presumably a global
// provided by a separately loaded script; confirm.
const client = new Taproot(overseerAddress);
// Audio output and waveform drawing on the waveform canvas.
const audio = new AudioPipeVisualizer({
    ...waveformParameters,
    canvas: waveformCanvas,
});
// Receives streamed model text; its `onChunk` callback is registered below.
const chunker = new GrowingSentenceChunker();
// Flat list of prompts and replies, sent as `history` with each request.
const conversationHistory = [];
/**
 * Scrolls the transcript container to its bottom, but only when the view is
 * already within 80px of the bottom, so a user who scrolled up to read
 * earlier messages is not yanked back down.
 */
const scrollToBottom = () => {
    const container = transcriptionSection.parentElement;
    const distanceFromBottom = container.scrollHeight - container.scrollTop - container.offsetHeight;
    if (distanceFromBottom < 80) {
        container.scrollTop = container.scrollHeight;
    }
};
/**
 * Appends a paragraph to the transcript.
 *
 * @param {string} text - Text to show; passed through `replaceQuotes` first
 *     and assigned as `textContent`, so it is never parsed as HTML.
 * @param {string} className - CSS class added to the new paragraph.
 * @returns {HTMLParagraphElement} The paragraph that was appended.
 */
const pushText = (text, className) => {
    const paragraph = document.createElement("p");
    paragraph.classList.add(className);
    paragraph.textContent = replaceQuotes(text);
    transcriptionSection.appendChild(paragraph);
    scrollToBottom();
    return paragraph;
};
// Display name shown in the voice control -> voice identifier sent with
// speech-synthesis requests. Insertion order defines the wheel order.
const voiceMap = Object.fromEntries([
    ["Adam", "male.en.us.adam"],
    ["Bella", "female.en.us.bella"],
    ["Emma", "female.en.gb.emma"],
    ["George", "male.en.gb.george"],
    ["Isabel", "female.en.gb.isabella"],
    ["Lewis", "male.en.gb.lewis"],
    ["Michael", "male.en.us.michael"],
    ["Nicole", "female.en.us.nicole"],
    ["Sarah", "female.en.us.sarah"],
    ["Skye", "female.en.us.sky"],
]);
// Parallel lists, indexable by voice position.
const voiceNames = Object.keys(voiceMap);
const voiceIds = Object.values(voiceMap);
// Index of the currently selected voice in `voiceNames`; -1 until one is chosen.
let voiceIndex = -1;
/**
 * Selects the voice at `newIndex`: writes its display name into the voice
 * control and fires a "change" event on it.
 *
 * Does nothing when the index is already selected, or when it is not an
 * integer inside `voiceNames` (otherwise `undefined` would be written into
 * the control and later sent as the voice).
 *
 * @param {number} newIndex - Position in `voiceNames`.
 */
const setVoiceIndex = (newIndex) => {
    const isValid = Number.isInteger(newIndex) && newIndex >= 0 && newIndex < voiceNames.length;
    if (!isValid || newIndex === voiceIndex) {
        return;
    }
    voiceId.value = voiceNames[newIndex];
    voiceId.dispatchEvent(new Event("change"));
    voiceIndex = newIndex;
};
// Start on a random voice. Math.floor keeps the index inside
// [0, voiceIds.length - 1]; Math.round could produce voiceIds.length itself,
// which is past the end of the list.
setVoiceIndex(Math.floor(Math.random() * voiceIds.length));
// The wheel's value is the step to move by; the selection wraps around in
// both directions.
voiceIdWheel.addEventListener("click", () => {
    const step = Number.parseInt(voiceIdWheel.value, 10);
    if (Number.isNaN(step)) {
        return;
    }
    const voiceCount = voiceIds.length;
    // Double modulo keeps the result non-negative for negative steps.
    const wrappedIndex = (((voiceIndex + step) % voiceCount) + voiceCount) % voiceCount;
    setVoiceIndex(wrappedIndex);
});
// Keep the audio output volume in step with the volume control.
volume.addEventListener("change", () => {
    audio.volume = volume.value;
});
// Mirror the top-k control into its display as a whole number, and notify
// anything listening for changes on the display element.
topK.addEventListener("change", () => {
    topKDisplay.value = Math.floor(topK.value);
    topKDisplay.dispatchEvent(new Event("change"));
});
/**
 * Builds the parameters for a text-generation request.
 *
 * Starts from `languageParameters`, adds the conversation history and the
 * current values of the sampling controls, then applies `overrides` last so
 * callers can replace any field.
 *
 * @param {object} [overrides={}] - Extra or replacement parameters.
 * @returns {object} A new parameters object.
 */
const getLanguageParameters = (overrides = {}) => {
    const samplingParameters = {
        // Explicit radix so the control's text is always read as decimal.
        top_k: Number.parseInt(topK.value, 10),
        top_p: Number.parseFloat(topP.value),
        min_p: Number.parseFloat(minP.value),
        temperature: Number.parseFloat(temperature.value),
    };
    return {
        ...languageParameters,
        history: conversationHistory,
        ...samplingParameters,
        ...overrides,
    };
};
/**
 * Builds the parameters for a speech-synthesis request.
 *
 * Starts from `speechParameters`, adds the current speed and the voice
 * identifier mapped from the selected voice name, then applies `overrides`
 * last so callers can replace any field.
 *
 * @param {object} [overrides={}] - Extra or replacement parameters.
 * @returns {object} A new parameters object.
 */
const getSpeechParameters = (overrides = {}) => {
    const voiceSettings = {
        speed: Number.parseFloat(speed.value),
        voice: voiceMap[voiceId.value],
    };
    return Object.assign({}, speechParameters, voiceSettings, overrides);
};
// Mutable state shared by the typing loop, the chunk handler and the
// request functions.
// Paragraph currently being typed into; null/undefined when idle.
let typingElement;
// `performance.now()` timestamp at which typing of the current reply began.
let typingStart;
// Current typing rate, in characters per second.
let typingCharactersPerSecond = minTypingSpeed;
// Full text received so far for the reply being typed.
let typingTarget = "";
// Maps an audio start time (ms) to [text length, audio duration in ms].
let typingAudioTiming = {};
// Set once the reply is complete so the loop can finish and reset.
let unsetWhenComplete = false;
// Incremented for every new request; lets stale async work detect it.
let requestNumber = 0;
// Set when a new request should cut off the reply currently being typed.
let interrupt = false;
/**
 * Escapes text for interpolation into the typing element's `innerHTML`.
 *
 * The reply text comes from the model, so markup characters must be shown
 * literally rather than parsed. The no-break space is escaped as well so
 * the string built here can be compared against `innerHTML` read back from
 * the element.
 *
 * @param {string} text - Plain text.
 * @returns {string} HTML-safe text.
 */
const escapeTypingText = (text) => text
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll("\u00a0", "&nbsp;");

/**
 * One frame of the typewriter animation; re-schedules itself every frame.
 *
 * While a reply is active it renders `typingTarget` up to the number of
 * characters that should be visible by now, split into a "spoken" span
 * (covered by audio that has played) and an "unspoken" span, plus a cursor
 * while text is still being typed. Once typing and audio have both caught
 * up and the reply was marked complete, or when interrupted, it renders the
 * full text and resets the typing state.
 */
const typingLoop = () => {
    if (typingElement !== null && typingElement !== undefined) {
        const now = performance.now();
        const typingIndex = Math.floor((now - typingStart) * typingCharactersPerSecond / 1000);
        const targetTextLength = typingTarget.length;
        const audioSegments = Object.entries(typingAudioTiming);
        const hasAudio = audioSegments.length > 0;
        // Number of characters covered by audio that has played so far:
        // finished clips count in full, the clip in progress proportionally.
        let typingAudioIndex = 0;
        for (const [audioStart, [audioTextLength, audioDuration]] of audioSegments) {
            const audioTime = Number.parseFloat(audioStart);
            if (now >= audioTime + audioDuration) {
                typingAudioIndex += audioTextLength;
            } else if (now >= audioTime) {
                typingAudioIndex += Math.floor((now - audioTime) * audioTextLength / audioDuration);
            }
        }
        if (!interrupt && (typingIndex < targetTextLength || ((audio.volume > 0 || hasAudio) && typingAudioIndex < targetTextLength))) {
            let innerHTML = "";
            if (typingAudioIndex > 0) {
                const spokenEnd = typingAudioIndex + 1;
                // substring() swaps its arguments when start > end, which
                // would repeat already-spoken text whenever audio is ahead
                // of typing; clamp so the unspoken part is empty instead.
                const unspokenEnd = Math.max(spokenEnd, typingIndex);
                innerHTML += `<span class="spoken">${escapeTypingText(typingTarget.substring(0, spokenEnd))}</span>`;
                innerHTML += `<span class="unspoken">${escapeTypingText(typingTarget.substring(spokenEnd, unspokenEnd))}</span>`;
            } else {
                innerHTML += `<span class="unspoken">${escapeTypingText(typingTarget.substring(0, typingIndex))}</span>`;
            }
            if (typingIndex < targetTextLength) {
                innerHTML += `<span class="cursor">|</span>`;
            }
            // Only touch the DOM (and scroll) when the rendering changed.
            if (typingElement.innerHTML !== innerHTML) {
                typingElement.innerHTML = innerHTML;
                scrollToBottom();
            }
        } else if (interrupt || unsetWhenComplete) {
            const finalClass = hasAudio ? "spoken" : "unspoken";
            typingElement.innerHTML = `<span class="${finalClass}">${escapeTypingText(typingTarget)}</span>`;
            unsetWhenComplete = false;
            interrupt = false;
            typingElement = null;
            typingTarget = "";
            typingAudioTiming = {};
        }
    }
    requestAnimationFrame(typingLoop);
};
// Start the typing animation loop.
requestAnimationFrame(typingLoop);
/**
 * Handles each chunk of reply text emitted by the chunker.
 *
 * The chunk is appended to the text being typed (starting a new transcript
 * paragraph when nothing is being typed). When the volume is above zero and
 * no interrupt is pending, speech is synthesised for the chunk, queued for
 * playback, and its start time and duration are recorded in
 * `typingAudioTiming` so the typing loop can mark text as spoken.
 */
chunker.onChunk(async (chunk) => {
    // Gap between consecutive clips, as passed to `audio.pushSilence`
    // (presumably seconds — confirm against AudioPipeVisualizer).
    const silenceSeconds = 0.15;
    const requestNumberAtStart = requestNumber;
    const displayText = replaceQuotes(chunk).replaceAll(/\n\W*/g, "\n");
    let isFirst = false;
    if (typingElement !== null && typingElement !== undefined) {
        typingTarget += displayText;
    } else {
        isFirst = true;
        typingElement = pushText("", "completion");
        typingTarget = displayText;
        typingStart = performance.now();
        typingAudioTiming = {};
    }
    if (!(audio.volume > 0 && !interrupt)) {
        // No speech for this chunk: type at full speed.
        typingCharactersPerSecond = maxTypingSpeed;
        return;
    }
    typingCharactersPerSecond = minTypingSpeed;
    try {
        const audioResult = await client.invoke({
            task: "speech-synthesis",
            parameters: getSpeechParameters({text: chunk}),
        });
        // Drop the audio if a newer request started while synthesising.
        if (interrupt || requestNumberAtStart !== requestNumber) {
            return;
        }
        if (audio.playing) {
            audio.pushSilence(silenceSeconds);
        }
        const audioReady = performance.now();
        const audioNode = audio.push(audioResult.data);
        // Buffer duration is multiplied by 1000 to match the millisecond
        // timestamps from performance.now().
        const audioDuration = audioNode.buffer.duration * 1000;
        const previousStarts = Object.keys(typingAudioTiming).map(Number);
        let audioStart = audioReady;
        // Later chunks start after the previous clip plus the silence gap.
        // The emptiness check covers the case where no earlier clip was
        // recorded (e.g. earlier chunks were typed without speech).
        if (!isFirst && previousStarts.length > 0) {
            const lastAudioStart = Math.max(...previousStarts);
            const [, lastAudioDuration] = typingAudioTiming[lastAudioStart];
            // The gap is converted to milliseconds like the other terms.
            audioStart = Math.max(lastAudioStart + lastAudioDuration + silenceSeconds * 1000, audioReady);
        }
        // Record the length of the text as displayed, since the typing loop
        // compares these lengths against `typingTarget`.
        typingAudioTiming[audioStart] = [displayText.length, audioDuration];
    } catch (error) {
        console.error(error);
        sendAlert(error);
    }
});
/**
 * Turns tool-call arguments into text for a tooltip.
 * Strings pass through; anything else is JSON-encoded so objects do not
 * show up as "[object Object]".
 */
const describeToolArguments = (toolArguments) => {
    if (typeof toolArguments === "string") {
        return toolArguments;
    }
    try {
        return JSON.stringify(toolArguments) ?? "";
    } catch (error) {
        return String(toolArguments);
    }
};

/**
 * Returns true only for http(s) URLs, so a citation coming back from the
 * model or a tool cannot inject a `javascript:` (or similar) link.
 */
const isWebUrl = (url) => {
    if (typeof url !== "string") {
        return false;
    }
    try {
        const { protocol } = new URL(url, document.baseURI);
        return protocol === "http:" || protocol === "https:";
    } catch (error) {
        return false;
    }
};

/**
 * Completes a request once the final result has arrived.
 *
 * Pushes the final text through the chunker and flushes it, records the
 * prompt and reply in the conversation history, and — when a tool was used —
 * appends a "Used tool" line plus one line per citation to the transcript.
 *
 * @param {string} prompt - The user's prompt for this request.
 * @param {object} result - Final result; reads `result`, `function`
 *     (`name`, `arguments`) and `citations` (`title`, `source`, `url`).
 */
const finalizeResult = (prompt, result) => {
    interrupt = false;
    unsetWhenComplete = true;
    chunker.push(result.result);
    chunker.flush();
    conversationHistory.push(prompt);
    conversationHistory.push(result.result);
    if (!result.function) {
        return;
    }
    const usedToolContainer = document.createElement("p");
    usedToolContainer.classList.add("tool");
    usedToolContainer.innerText = "Used tool: ";
    const usedToolFunction = document.createElement("span");
    usedToolFunction.innerText = result.function.name;
    usedToolFunction.title = describeToolArguments(result.function.arguments);
    usedToolContainer.appendChild(usedToolFunction);
    transcriptionSection.appendChild(usedToolContainer);
    if (!result.citations) {
        return;
    }
    for (let i = 0; i < result.citations.length; i++) {
        const citation = result.citations[i];
        const citationContainer = document.createElement("p");
        citationContainer.classList.add("citation");
        // Prefer the title, then the source, then a generic label.
        const citationLabel = citation.title || citation.source || "";
        citationContainer.innerText = citationLabel ? `${citationLabel} ` : "Source ";
        const citationLink = document.createElement("a");
        citationLink.innerText = `[${i + 1}]`;
        // Without a safe URL the numbered label is shown but not linked.
        if (isWebUrl(citation.url)) {
            citationLink.href = citation.url;
            citationLink.title = citation.url;
            citationLink.target = "_blank";
            // Keep the opened page from reaching back via window.opener.
            citationLink.rel = "noopener noreferrer";
        }
        citationContainer.appendChild(citationLink);
        transcriptionSection.appendChild(citationContainer);
    }
};
/**
 * Runs a spoken request: transcribes the recorded samples, then feeds the
 * transcription into text generation as the prompt.
 *
 * Starting a request bumps `requestNumber` and raises `interrupt` so any
 * reply still being typed or spoken is cut off. The transcription is shown
 * in the transcript as soon as it arrives, streamed reply text is pushed
 * into the chunker, and the final result goes to `finalizeResult`. Errors
 * are logged and surfaced through `sendAlert`.
 *
 * @param {*} samples - Recorded audio, passed through as the `audio` parameter.
 */
const invokeFromMicrophone = async (samples) => {
    requestNumber += 1;
    interrupt = true;
    let transcribedPrompt;
    try {
        const request = {
            task: "audio-transcription",
            parameters: {audio: samples},
            continuation: {
                task: "text-generation",
                parameters: getLanguageParameters(),
                result_parameters: "prompt",
            },
        };
        const options = {
            fetchIntermediates: true,
            pollingInterval,
            // Result of the transcription step: remember it as the prompt.
            onInterimResult: (transcription) => {
                transcribedPrompt = transcription;
                pushText(transcription, "transcription");
            },
            // Streamed reply text: the new reply has begun, so lift the
            // interrupt before handing the text to the chunker.
            onIntermediateResult: (partialText) => {
                interrupt = false;
                chunker.push(partialText);
            },
        };
        const textResult = await client.invoke(request, options);
        finalizeResult(transcribedPrompt, textResult);
    } catch (error) {
        console.error(error);
        sendAlert(error);
    }
};
/**
 * Runs a typed request: shows the prompt in the transcript and sends it to
 * text generation.
 *
 * Starting a request bumps `requestNumber` and raises `interrupt` so any
 * reply still being typed or spoken is cut off. Streamed reply text is
 * pushed into the chunker and the final result goes to `finalizeResult`.
 * Errors are logged and surfaced through `sendAlert`.
 *
 * @param {string} text - The prompt text.
 */
const invokeFromPrompt = async (text) => {
    requestNumber += 1;
    interrupt = true;
    pushText(text, "transcription");
    try {
        const request = {
            task: "text-generation",
            parameters: getLanguageParameters({prompt: text}),
        };
        const options = {
            fetchIntermediates: true,
            pollingInterval,
            // Streamed reply text: the new reply has begun, so lift the
            // interrupt before handing the text to the chunker.
            onIntermediateResult: (partialText) => {
                interrupt = false;
                chunker.push(partialText);
            },
        };
        const inferenceResult = await client.invoke(request, options);
        finalizeResult(text, inferenceResult);
    } catch (error) {
        console.error(error);
        sendAlert(error);
    }
};
// Reflect the power switch on the power indicator. Switching off also
// clears the listening and recording indicators.
powerSwitch.addEventListener("change", () => {
    const isPoweredOn = powerSwitch.checked;
    powerIndicator.classList.toggle("active", isPoweredOn);
    if (!isPoweredOn) {
        listening.classList.remove("active");
        recording.classList.remove("active");
    }
});
// Fire once at load so the indicator matches the switch's initial state.
powerSwitch.dispatchEvent(new Event("change"));
// Wake-word / microphone wiring. Requires the HeyBuddy library to have been
// loaded as `window.HeyBuddy`; without it only typed prompts work.
if (!window.HeyBuddy) {
    console.error("HeyBuddy not found. Please include HeyBuddy.js in your project.");
} else {
    const heyBuddy = new window.HeyBuddy(heyBuddyConfiguration);
    // A finished recording becomes a spoken request, but only while the
    // power switch is on.
    heyBuddy.onRecording(async (samples) => {
        if (powerSwitch.checked) {
            await invokeFromMicrophone(samples);
        }
    });
    // Mirror the detector's recording/listening state on the indicators
    // while the power switch is on.
    heyBuddy.onProcessed((result) => {
        if (!powerSwitch.checked) {
            return;
        }
        recording.classList.toggle("active", Boolean(result.recording));
        listening.classList.toggle("active", Boolean(result.listening));
    });
    // Push-to-talk: while the listen button is held, the detector is forced
    // into the listening + recording state every 10 ms.
    const startEvents = ["mousedown", "touchstart"];
    // "touchcancel" is included so a touch that is cancelled rather than
    // ended still stops the interval.
    const stopEvents = ["mouseup", "touchend", "touchcancel", "mouseleave"];
    const startListening = () => {
        const interval = setInterval(() => {
            heyBuddy.negatives = 0;
            heyBuddy.listening = true;
            heyBuddy.recording = true;
        }, 10);
        // Stop forcing the state and unregister from every stop event.
        const onStop = () => {
            clearInterval(interval);
            for (const eventName of stopEvents) {
                window.removeEventListener(eventName, onStop);
            }
        };
        for (const eventName of stopEvents) {
            window.addEventListener(eventName, onStop);
        }
    };
    for (const eventName of startEvents) {
        listenButton.addEventListener(eventName, startListening);
    }
}
// Submit the typed prompt when Enter is pressed. "keydown" replaces the
// deprecated "keypress" event.
promptInput.addEventListener("keydown", async (event) => {
    // Ignore other keys, and Enter presses that only confirm an IME
    // composition.
    if (event.key !== "Enter" || event.isComposing) {
        return;
    }
    event.preventDefault();
    const text = promptInput.value;
    promptInput.value = "";
    // Nothing to send: don't add an empty line to the transcript or start
    // a request with an empty prompt.
    if (text.trim() === "") {
        return;
    }
    await invokeFromPrompt(text);
});