import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import gradio as gr
import cv2
import mediapipe as mp
import numpy as np
import spaces

# Define the ASLClassifier model
class ASLClassifier(nn.Module):
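    # Fully connected classifier over 63 flattened hand-landmark features
    # (21 MediaPipe landmarks x (x, y, z)). The 28 output classes presumably
    # cover A-Z plus a couple of extra tokens (e.g. "space"/"nothing") from the training set.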
    def __init__(self, input_size=63, hidden_size=256, num_classes=28):
        super(ASLClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, hidden_size * 2)
        self.bn2 = nn.BatchNorm1d(hidden_size * 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(hidden_size * 2, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(hidden_size, hidden_size // 2)
        self.bn4 = nn.BatchNorm1d(hidden_size // 2)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(0.3)
        self.fc5 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.dropout4(x)
        x = self.fc5(x)
        return x

# Load the model and label encoder (CPU initially, GPU handled by decorator)
device = torch.device('cpu')  # Default to CPU; GPU inference handled by @spaces.GPU
model = ASLClassifier().to(device)
model.load_state_dict(torch.load('data/asl_classifier.pth', map_location=device))
model.eval()

df = pd.read_csv('data/asl_landmarks_final.csv')
label_encoder = LabelEncoder()
label_encoder.fit(df['label'].values)
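# Note: re-fitting the LabelEncoder on the training CSV reproduces the label-to-index mapping used at training time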

# Initialize MediaPipe (runs on CPU)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Prediction function with GPU offloading
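# (@spaces.GPU requests a GPU for the duration of the call on Hugging Face ZeroGPU Spaces;
# outside Spaces the decorator should be a no-op.)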
@spaces.GPU
def predict_letter(landmarks, model, label_encoder):
    with torch.no_grad():
        # Run inference on the GPU when one is available (allocated by @spaces.GPU); otherwise stay on CPU
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        landmarks = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0).to(device)
        model = model.to(device)
        output = model(landmarks)
        _, predicted_idx = torch.max(output, 1)
        letter = label_encoder.inverse_transform([predicted_idx.item()])[0]
        # Move the model back to CPU to free GPU memory between calls
        model.to('cpu')
    return letter

# Video processing function (CPU for video processing, GPU for prediction)
def process_video(video_path):
    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None, "Error: Could not open video."

    # Variables to store output
    text_output = ""
    last_letter = None  # last predicted label, used to suppress immediate repeats
    out_frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process frame with MediaPipe (CPU)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw landmarks
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Extract landmarks and predict (GPU via decorator)
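                # (21 landmarks x 3 coordinates -> the 63-dim feature vector the classifier expects)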
                landmarks = []
                for lm in hand_landmarks.landmark:
                    landmarks.extend([lm.x, lm.y, lm.z])
                landmarks = np.array(landmarks, dtype=np.float32)
                predicted_letter = predict_letter(landmarks, model, label_encoder)

                # Append to the transcript, skipping immediate repeats of the same prediction
                if predicted_letter != last_letter:
                    text_output += predicted_letter
                    last_letter = predicted_letter

                # Overlay predicted letter on frame
                cv2.putText(frame, f"Letter: {predicted_letter}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Store processed frame
        out_frames.append(frame)

    # Preserve the source frame rate for the output video (fall back to 20 FPS if it can't be read)
    fps = cap.get(cv2.CAP_PROP_FPS) or 20.0
    cap.release()

    if not out_frames:
        return None, "Error: No frames could be read from the video."

    # Write processed video to a temporary file
    out_path = "processed_video.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(out_path, fourcc, fps, (out_frames[0].shape[1], out_frames[0].shape[0]))
    for frame in out_frames:
        out.write(frame)
    out.release()
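    # Note: mp4v-encoded output may not play in all browsers; re-encoding to H.264 could be needed for the Gradio player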

    return out_path, text_output

# Create Gradio interface with sample input
with gr.Blocks(title="Sign Language Translation") as demo:
    gr.Markdown("## Sign Language Translation")
    video_input = gr.Video(label="Input Video", sources=["upload", "webcam"])
    video_output = gr.Video(label="Processed Video with Landmarks")
    text_output = gr.Textbox(label="Predicted Text", interactive=False)

    # Button to process video
    btn = gr.Button("Translate")
    btn.click(
        fn=process_video,
        inputs=video_input,
        outputs=[video_output, text_output]
    )

    # Add sample input video
    gr.Examples(
        examples=[["data/letters_seq.mp4"]],
        inputs=[video_input],
        outputs=[video_output, text_output],
        fn=process_video,
        cache_examples=True  # Cache the output for faster loading
    )

# Launch the app
demo.launch()