import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
from datetime import datetime
import os

# Initialise the AI models once, at import time.

# Speech-to-text: Korean wav2vec2 checkpoint.
speech_recognizer = pipeline(
    task="automatic-speech-recognition",
    model="kresnik/wav2vec2-large-xlsr-korean",
)

# Audio classifier whose top label is shown to the user as the "voice emotion".
# NOTE(review): the checkpoint name suggests a speech-commands (keyword
# spotting) model rather than an emotion model — confirm this is intended.
emotion_classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
)

# Text sentiment classifier applied to reflections and transcriptions.
text_analyzer = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

def create_interface():
    """Build the Gradio application and return it without launching it.

    The UI consists of a header plus four tabs: entrance (name input),
    listening/reflection notes, voice analysis, and the visualisation result.
    Data shared between handlers is kept in a ``gr.State`` dict.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is expected to call
        ``.launch()`` on it.
    """
    # Background track for the listening tab. The path is relative to the
    # working directory, so only pass it on when the file is actually there.
    music_path = "assets/main_music.mp3"

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # Shared state
        state = gr.State({
            "user_name": "",
            "reflections": [],
            "voice_analysis": None,
            "final_prompt": "",
            "generated_images": [],  # stores generated images
        })

        # Header
        gr.Markdown("# 디지털 굿판")
        user_display = gr.Markdown("")

        # Every tab gets an explicit id: selecting a tab programmatically
        # (see start_journey) is done by id, not by label.
        with gr.Tabs() as tabs:
            # Entrance
            with gr.Tab("입장", id="입장"):
                gr.Markdown("""# 디지털 굿판에 오신 것을 환영합니다""")
                name_input = gr.Textbox(label="이름을 알려주세요")
                start_btn = gr.Button("여정 시작하기")

            # Listening and written reflections
            with gr.Tab("청신", id="청신"):
                with gr.Row():
                    audio = gr.Audio(
                        value=music_path if os.path.isfile(music_path) else None,
                        type="filepath",
                        label="온천천의 소리"
                    )
                    with gr.Column():
                        reflection_input = gr.Textbox(
                            label="현재 순간의 감상을 적어주세요",
                            lines=3
                        )
                        save_btn = gr.Button("감상 저장하기")
                        reflections_display = gr.Dataframe(
                            headers=["시간", "감상", "감정 분석"],
                            label="기록된 감상들"
                        )

            # Voice input and analysis
            with gr.Tab("기원", id="기원"):
                gr.Markdown("## 기원 - 목소리로 전하기")
                with gr.Row():
                    # Voice input
                    voice_input = gr.Audio(
                        label="나누고 싶은 이야기를 들려주세요",
                        sources=["microphone"],
                        type="filepath"
                    )

                    # Analysis results
                    with gr.Column():
                        transcribed_text = gr.Textbox(
                            label="인식된 텍스트",
                            interactive=False
                        )
                        voice_emotion = gr.Textbox(
                            label="음성 감정 분석",
                            interactive=False
                        )
                        text_emotion = gr.Textbox(
                            label="텍스트 감정 분석",
                            interactive=False
                        )
                        analysis_details = gr.JSON(
                            label="상세 분석 결과"
                        )

            # Visualisation result
            with gr.Tab("송신", id="송신"):
                gr.Markdown("## 송신 - 시각화 결과")
                with gr.Column():
                    final_prompt = gr.Textbox(
                        label="생성된 프롬프트",
                        interactive=False
                    )
                    gallery = gr.Gallery(
                        label="시각화 결과",
                        columns=2
                    )
                    share_btn = gr.Button("결과 공유하기")

        def analyze_voice_comprehensive(audio_path, state):
            """Transcribe a recording, analyse it and build the art prompt.

            Args:
                audio_path: Path of the recorded audio file, or ``None`` when
                    the audio component has no recording.
                state: The shared state dict; ``final_prompt`` is updated.

            Returns:
                A 6-tuple matching the outputs of the ``voice_input.change``
                event: state, transcription (or an error message), voice
                emotion text, text sentiment text, detailed analysis dict and
                the prompt to display.
            """
            if audio_path is None:
                return state, "음성 입력이 필요합니다.", "", "", {}, state["final_prompt"]

            try:
                # Load the audio resampled to 16 kHz
                y, sr = librosa.load(audio_path, sr=16000)

                # 1. Speech-to-text
                spoken_text = speech_recognizer(y)["text"]

                # 2. Acoustic features
                # piptrack returns (pitches, magnitudes); the pitch estimate
                # must come from the first array, and bins without a detected
                # pitch are 0, so they are excluded from the average.
                pitches, _ = librosa.piptrack(y=y, sr=sr)
                voiced = pitches[pitches > 0]
                features = {
                    "energy": float(np.mean(librosa.feature.rms(y=y))),
                    "pitch": float(np.mean(voiced)) if voiced.size else 0.0,
                    # Keyword arguments: `y` is keyword-only in recent librosa,
                    # and `sr` must match the rate the audio was loaded at.
                    "tempo": float(librosa.beat.tempo(y=y, sr=sr)[0]),
                    "zero_crossing_rate": float(np.mean(librosa.feature.zero_crossing_rate(y)))
                }

                # 3. Audio classification (top result only)
                primary_emotion = emotion_classifier(y)[0]

                # 4. Sentiment of the transcribed text
                text_sentiment = text_analyzer(spoken_text)[0]

                # Combine the results
                analysis_result = {
                    "acoustic_features": features,
                    "voice_emotion": primary_emotion,
                    "text_sentiment": text_sentiment
                }

                # Build the prompt and remember it in the state
                prompt = generate_art_prompt(spoken_text, analysis_result, state["reflections"])
                state["final_prompt"] = prompt

                return (
                    state,
                    spoken_text,
                    f"음성 감정: {primary_emotion['label']} ({primary_emotion['score']:.2f})",
                    f"텍스트 감정: {text_sentiment['label']} ({text_sentiment['score']:.2f})",
                    analysis_result,
                    prompt
                )

            except Exception as e:
                # UI boundary: report the failure in the transcription box
                # instead of crashing the event handler.
                return state, f"오류 발생: {e}", "", "", {}, state["final_prompt"]

        def generate_art_prompt(text, analysis, reflections):
            """Compose the image-generation prompt from the analysis result.

            Args:
                text: The transcribed speech (currently not used in the prompt).
                analysis: Dict with ``voice_emotion``, ``text_sentiment`` and
                    ``acoustic_features`` entries as built by
                    ``analyze_voice_comprehensive``.
                reflections: Previously saved reflections; when non-empty an
                    extra sentence is appended to the prompt.

            Returns:
                The prompt string.
            """
            voice_emotion = analysis["voice_emotion"]["label"]
            text_sentiment = analysis["text_sentiment"]["label"]
            energy = analysis["acoustic_features"]["energy"]

            # Colour palette per emotion label
            emotion_colors = {
                "happy": "따뜻한 노란색과 주황색",
                "sad": "깊은 파랑색과 보라색",
                "angry": "강렬한 빨강색과 검정색",
                "neutral": "부드러운 회색과 베이지색"
            }
            # Case-insensitive lookup so differently-cased labels still match
            colors = emotion_colors.get(voice_emotion.lower(), '자연스러운 색상')

            parts = [
                f"한국 전통 민화 스타일의 추상화, {colors} 사용. ",
                f"음성의 감정({voice_emotion})과 텍스트의 감정({text_sentiment})이 조화를 이루며, ",
                f"에너지 레벨({energy:.2f})을 통해 화면의 동적인 느낌을 표현. ",
            ]

            # Reflect earlier reflections, if any
            if reflections:
                parts.append("이전 감상들의 정서를 배경에 은은하게 담아내기. ")

            return "".join(parts)

        def save_reflection(text, state):
            """Store a written reflection together with its sentiment.

            Args:
                text: The reflection text; ``None`` or blank text is ignored.
                state: The shared state dict; a ``[time, text, sentiment]``
                    row is appended to ``state["reflections"]``.

            Returns:
                ``(state, rows)`` where ``rows`` is the full reflection list
                for the dataframe.
            """
            if not text or not text.strip():
                return state, state["reflections"]

            current_time = datetime.now().strftime("%H:%M:%S")
            sentiment = text_analyzer(text)[0]
            state["reflections"].append(
                [current_time, text, f"{sentiment['label']} ({sentiment['score']:.2f})"]
            )
            return state, state["reflections"]

        def start_journey(name, state):
            """Record the visitor's name, greet them and open the next tab.

            Args:
                name: Content of the name textbox (may be ``None`` or blank).
                state: The shared state dict; ``user_name`` is updated.

            Returns:
                ``(state, welcome_markdown, tabs_update)``.
            """
            user_name = (name or "").strip()
            state["user_name"] = user_name
            welcome_text = f"# 환영합니다, {user_name}님의 디지털 굿판"
            return state, welcome_text, gr.update(selected="청신")

        # Event wiring
        start_btn.click(
            fn=start_journey,
            inputs=[name_input, state],
            outputs=[state, user_display, tabs]
        )

        save_btn.click(
            fn=save_reflection,
            inputs=[reflection_input, state],
            outputs=[state, reflections_display]
        )

        voice_input.change(
            fn=analyze_voice_comprehensive,
            inputs=[voice_input, state],
            outputs=[
                state,
                transcribed_text,
                voice_emotion,
                text_emotion,
                analysis_details,
                final_prompt
            ]
        )

    return app

# Run the app when executed as a script
if __name__ == "__main__":
    create_interface().launch()