// Gradio JS client used for the Florence-2 webcam captioning below.
// Assumed setup: this script is loaded as an ES module and "@gradio/client"
// (or an equivalent CDN bundle) resolves; adjust the specifier to match how the page loads scripts.
import { client, handle_file } from "@gradio/client";

// Constants and Configuration
const USER_SPEECH_INTERRUPT_DELAY = 500;
const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
const CHUNK_SIZE = 300;
const MAX_PREFETCH_REQUESTS = 10;
const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour

// DOM Elements
const startStopButton = document.getElementById('startStopButton');
const voiceSelectionDropdown = document.getElementById('voiceSelect');
const modelSelectionDropdown = document.getElementById('modelSelect');
const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
const responseTimeDisplay = document.getElementById('responseTime');
const userActivityIndicator = document.getElementById('userIndicator');
const aiActivityIndicator = document.getElementById('aiIndicator');
const transcriptDiv = document.getElementById('transcript');
const webcamToggleButton = document.getElementById('webcamToggle');

// Speech Recognition
let speechRecognizer;
let isSpeechRecognitionActive = false;

// AI Interaction State
let activeQuery = null;
let queryStartTime = 0;
let isRequestInProgress = false;
let isUserSpeaking = false;
let requestAbortController = null;
let firstResponseTextTimestamp = null;

// Audio Management
let currentAudio = null;
let audioPlaybackQueue = [];

// Prefetching and Caching
const prefetchCache = new Map();
const pendingPrefetchRequests = new Map();
const prefetchQueue = [];
let prefetchTextQuery = "";

// Conversation History
let conversationHistory = [];

// Audio Caching
const audioCache = new Map();

// Webcam
let isWebcamActive = false;
let app;
let lastCaption = ""; 

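// Pool of public Florence-2 Spaces; processWithGradio picks one at random per frame to spread load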
const clients = [
    "multimodalart/Florence-2-l4",
    "gokaygokay/Florence-2",
    "multimodalart/Florence-2-l4-2",
    "gokaygokay/Florence-2",
];

// Utility Functions

// Normalize query text
const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');

// Generate a cache key that ties cached audio to the query, voice, conversation history, and model
const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
    `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;

// Update activity indicators
const updateActivityIndicators = (state = null) => {
    userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
    if (isRequestInProgress && !currentAudio) {
        aiActivityIndicator.textContent = "AI: Processing...";
    } else if (currentAudio && !isUserSpeaking) {
        aiActivityIndicator.textContent = state || "AI: Speaking";
    } else if (isUserSpeaking) {
        aiActivityIndicator.textContent = "AI: Listening";
    } else {
        aiActivityIndicator.textContent = "AI: Idle";
    }
};

// Update latency display
const updateLatency = () => {
    if (firstResponseTextTimestamp) {
        const latency = firstResponseTextTimestamp - queryStartTime;
        responseTimeDisplay.textContent = `Latency: ${latency}ms`;
    } else {
        responseTimeDisplay.textContent = "Latency: 0ms";
    }
};

// Add to conversation history
const addToConversationHistory = (role, content) => {
    // Drop a trailing empty assistant placeholder before appending the real message
    if (conversationHistory.length > 0 &&
        conversationHistory[conversationHistory.length - 1].role === 'assistant' &&
        conversationHistory[conversationHistory.length - 1].content === "") {
        conversationHistory.pop();
    }

    conversationHistory.push({ role, content });

    // Keep at most the last three user/assistant exchanges (six messages)
    if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
};

// Audio Management Functions

// Play audio from the queue
const playNextAudio = async () => {
    if (audioPlaybackQueue.length > 0) {
        const audioData = audioPlaybackQueue.shift();
        const audio = new Audio(audioData.url);
        updateActivityIndicators();

        // Resolve whether playback ends normally or errors, so the queue never stalls
        const audioPromise = new Promise(resolve => {
            audio.onended = resolve;
            audio.onerror = resolve;
        });

        // Stop whatever is currently playing before starting the next clip
        if (currentAudio) {
            currentAudio.pause();
            currentAudio.currentTime = 0;
        }

        currentAudio = audio;
        try {
            await audio.play();
            await audioPromise;
        } catch (error) {
            console.error("Audio playback failed:", error);
        }
        playNextAudio();
    } else {
        // Queue drained: clear the current clip so the indicators fall back to idle
        currentAudio = null;
        updateActivityIndicators();
    }
};

// Prefetching and Caching Functions

// Prefetch and cache the first TTS audio chunk
const prefetchFirstAudioChunk = (query, voice) => {
    const normalizedQuery = normalizeQueryText(query);
    const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

    if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

    prefetchQueue.push({ query: query.trim(), voice, cacheKey });
    processPrefetchQueue();
};
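
// NOTE: processPrefetchQueue is called above but is not defined in this file as
// shown. The sketch below is one possible implementation, assuming the
// StreamElements endpoint accepts `voice` and `text` query parameters and that
// cache entries are { url, timestamp } objects whose age is checked against
// PREFETCH_CACHE_EXPIRATION when they are read. Treat it as a placeholder,
// not the original implementation.
const processPrefetchQueue = async () => {
    while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
        const { query, voice, cacheKey } = prefetchQueue.shift();

        // Only the first chunk of the response text is prefetched
        const firstChunk = query.substring(0, CHUNK_SIZE);
        const url = `${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${encodeURIComponent(voice)}&text=${encodeURIComponent(firstChunk)}`;

        const controller = new AbortController();
        pendingPrefetchRequests.set(cacheKey, controller);

        try {
            const response = await fetch(url, { signal: controller.signal });
            if (!response.ok) throw new Error(`TTS request failed: ${response.status}`);
            const blob = await response.blob();
            prefetchCache.set(cacheKey, {
                url: URL.createObjectURL(blob),
                timestamp: Date.now(),
            });
        } catch (error) {
            console.error("Prefetch failed:", error);
        } finally {
            pendingPrefetchRequests.delete(cacheKey);
        }
    }
};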

// Webcam Integration Functions
let webcamCaptureIntervalId = null;

const startWebcam = async () => {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        document.getElementById('webcam').srcObject = stream;
        // Capture a frame for captioning every 5 seconds; guard against stacking intervals
        if (webcamCaptureIntervalId === null) {
            webcamCaptureIntervalId = setInterval(captureAndProcessImage, 5000);
        }
    } catch (error) {
        console.error("Error accessing webcam: ", error);
    }
};

const stopWebcam = () => {
    const video = document.getElementById('webcam');
    const stream = video.srcObject;
    if (stream) {
        stream.getTracks().forEach(track => track.stop());
        video.srcObject = null;
    }
    // Stop the periodic capture started in startWebcam
    if (webcamCaptureIntervalId !== null) {
        clearInterval(webcamCaptureIntervalId);
        webcamCaptureIntervalId = null;
    }
};

const captureAndProcessImage = async () => {
    if (!isWebcamActive) return;

    const canvas = document.createElement('canvas');
    const video = document.getElementById('webcam');
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    const context = canvas.getContext('2d');
    context.drawImage(video, 0, 0, canvas.width, canvas.height);

    const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
    await processWithGradio(blob);
};

const processWithGradio = async (imageBlob) => {
    try {
        const randomClient = clients[Math.floor(Math.random() * clients.length)];
        app = await client(randomClient);
        const handledFile = await handle_file(imageBlob);

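        // Ask the Space's /process_image endpoint for a "Detailed Caption" of the captured frame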
        const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);

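        // Keep the previous caption if the Space returns an empty result for this frame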
        const dataString = result.data[0];
        lastCaption = dataString || lastCaption;
    } catch (error) {
        console.error("Error processing with Gradio:", error);
    }
};

// Event Listeners
startStopButton.addEventListener('click', () => {
    if (!speechRecognizer) return; // no-op when speech recognition is unsupported
    isSpeechRecognitionActive = !isSpeechRecognitionActive;
    if (isSpeechRecognitionActive) {
        speechRecognizer.start();
    } else {
        speechRecognizer.stop();
    }
});

webcamToggleButton.addEventListener('click', () => {
    isWebcamActive = !isWebcamActive;
    if (isWebcamActive) {
        startWebcam();
    } else {
        stopWebcam();
    }
});

// Speech Recognition Initialization
if ('webkitSpeechRecognition' in window) {
    speechRecognizer = new webkitSpeechRecognition();
    speechRecognizer.continuous = true;
    speechRecognizer.interimResults = true;

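    // NOTE: processSpeechTranscript (used for final results below) is not defined in
    // this file as shown; it is assumed to forward the finished transcript to the
    // selected model and start the response/TTS pipeline.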
    speechRecognizer.onresult = (event) => {
        let interimTranscript = '';
        for (let i = event.resultIndex; i < event.results.length; i++) {
            const transcript = event.results[i][0].transcript;
            if (event.results[i].isFinal) {
                processSpeechTranscript(transcript);
                isUserSpeaking = false;
                updateActivityIndicators();
                queryStartTime = Date.now();
            } else {
                interimTranscript += transcript;
                isUserSpeaking = true;
                updateActivityIndicators();
            }
        }
    };
} else {
    console.error("webkitSpeechRecognition is not supported in this browser; voice input is unavailable.");
}

setInterval(updateLatency, 100);

// Start the webcam preview on page load; frame captioning stays off until the
// toggle button sets isWebcamActive.
window.onload = () => {
    startWebcam();
};