import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import gradio as gr
import cv2
import mediapipe as mp
import numpy as np
import spaces


# Define the ASLClassifier model
class ASLClassifier(nn.Module):
    def __init__(self, input_size=63, hidden_size=256, num_classes=28):
        super(ASLClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, hidden_size * 2)
        self.bn2 = nn.BatchNorm1d(hidden_size * 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(hidden_size * 2, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(hidden_size, hidden_size // 2)
        self.bn4 = nn.BatchNorm1d(hidden_size // 2)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(0.3)
        self.fc5 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.dropout4(x)
        x = self.fc5(x)
        return x


# Load the model and label encoder (CPU initially, GPU handled by decorator)
device = torch.device('cpu')  # Default to CPU; GPU inference handled by @spaces.GPU
model = ASLClassifier().to(device)
model.load_state_dict(torch.load('data/asl_classifier.pth', map_location=device))
model.eval()

df = pd.read_csv('data/asl_landmarks_final.csv')
label_encoder = LabelEncoder()
label_encoder.fit(df['label'].values)

# Initialize MediaPipe (runs on CPU)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils


# Prediction function with GPU offloading
@spaces.GPU
def predict_letter(landmarks, model, label_encoder):
    # Use CUDA when the decorator has allocated a GPU; fall back to CPU otherwise
    # (e.g. when running locally without a GPU)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    with torch.no_grad():
        # Move input and model to the inference device
        landmarks = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0).to(device)
        model = model.to(device)
        output = model(landmarks)
        _, predicted_idx = torch.max(output, 1)
        letter = label_encoder.inverse_transform([predicted_idx.item()])[0]
    # Move model back to CPU to free GPU memory
    model = model.to('cpu')
    return letter


# Video processing function (CPU for video processing, GPU for prediction)
def process_video(video_path):
    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None, "Error: Could not open video."
    # Variables to store output
    text_output = ""
    out_frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process frame with MediaPipe (CPU)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw landmarks
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Extract landmarks and predict (GPU via decorator)
                landmarks = []
                for lm in hand_landmarks.landmark:
                    landmarks.extend([lm.x, lm.y, lm.z])
                landmarks = np.array(landmarks, dtype=np.float32)
                predicted_letter = predict_letter(landmarks, model, label_encoder)

                # Add letter to text (avoid duplicates if same as last)
                if not text_output or predicted_letter != text_output[-1]:
                    text_output += predicted_letter

                # Overlay predicted letter on frame
                cv2.putText(frame, f"Letter: {predicted_letter}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Store processed frame
        out_frames.append(frame)

    # Preserve the source frame rate; fall back to 20 fps if it cannot be read
    fps = cap.get(cv2.CAP_PROP_FPS) or 20.0
    cap.release()

    # Guard against videos where no frames could be decoded
    if not out_frames:
        return None, "Error: No frames could be read from the video."

    # Write processed video to a temporary file
    out_path = "processed_video.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(out_path, fourcc, fps, (out_frames[0].shape[1], out_frames[0].shape[0]))
    for frame in out_frames:
        out.write(frame)
    out.release()

    return out_path, text_output


# Create Gradio interface with sample input
with gr.Blocks(title="Sign Language Translation") as demo:
    gr.Markdown("## Sign Language Translation")
    video_input = gr.Video(label="Input Video", sources=["upload", "webcam"])
    video_output = gr.Video(label="Processed Video with Landmarks")
    text_output = gr.Textbox(label="Predicted Text", interactive=False)

    # Button to process video
    btn = gr.Button("Translate")
    btn.click(
        fn=process_video,
        inputs=video_input,
        outputs=[video_output, text_output]
    )

    # Add sample input video
    gr.Examples(
        examples=[["data/letters_seq.mp4"]],
        inputs=[video_input],
        outputs=[video_output, text_output],
        fn=process_video,
        cache_examples=True  # Cache the output for faster loading
    )

# Launch the app
demo.launch()