Spaces:

Mubin1917
/

Chat_With_Youtube_Videos

Sleeping

File size: 18,477 Bytes

857dbaf
c0e82c0
857dbaf
c0e82c0
 
 
 
857dbaf
 
dd41bc7
 
c0e82c0
 
 
 
 
 
dd41bc7
c0e82c0
 
857dbaf
 
 
 
dd41bc7
c0e82c0
857dbaf
 
 
 
 
 
c0e82c0
857dbaf
 
 
 
 
 
6a369e2
 
857dbaf
c0e82c0
 
857dbaf
c0e82c0
 
 
 
 
 
857dbaf
dd41bc7
c0e82c0
dd41bc7
c0e82c0
dd41bc7
c0e82c0
dd41bc7
 
857dbaf
c0e82c0
dd41bc7
c0e82c0
857dbaf
dd41bc7
 
857dbaf
dd41bc7
857dbaf
dd41bc7
 
857dbaf
 
dd41bc7
 
 
c0e82c0
dd41bc7
 
 
857dbaf
 
 
dd41bc7
857dbaf
dd41bc7
857dbaf
 
 
 
 
dd41bc7
857dbaf
 
dd41bc7
 
 
 
 
 
 
 
 
857dbaf
c0e82c0
857dbaf
 
dd41bc7
857dbaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
857dbaf
 
c0e82c0
 
dd41bc7
857dbaf
 
c0e82c0
857dbaf
 
 
c0e82c0
857dbaf
c0e82c0
dd41bc7
 
c0e82c0
 
dd41bc7
 
 
 
 
 
 
 
857dbaf
c0e82c0
 
 
 
 
 
 
 
 
 
dd41bc7
857dbaf
 
 
dd41bc7
857dbaf
c0e82c0
 
 
857dbaf
 
 
 
 
dd41bc7
857dbaf
dd41bc7
857dbaf
dd41bc7
857dbaf
 
dd41bc7
 
 
857dbaf
 
dd41bc7
857dbaf
 
 
 
 
 
 
 
dd41bc7
 
 
c0e82c0
dd41bc7
 
 
 
c0e82c0
dd41bc7
857dbaf
 
c0e82c0
 
 
 
 
dd41bc7
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
ec3110b
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
 
c0e82c0
dd41bc7
 
c0e82c0
dd41bc7
 
 
c0e82c0
dd41bc7
 
 
 
 
c0e82c0
dd41bc7
 
c0e82c0
dd41bc7
 
 
 
 
 
 
 
 
 
 
 
857dbaf
 
 
 
 
 
 
 
 
 
c0e82c0
857dbaf
dd41bc7
 
857dbaf
c0e82c0
ecd85b6
 
dd41bc7
 
 
 
 
 
ecd85b6
c0e82c0
857dbaf
c0e82c0
 
 
857dbaf
 
 
 
 
 
c0e82c0
857dbaf
 
 
c0e82c0
857dbaf
 
c0e82c0
857dbaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0e82c0

"""
YouTube Video Analysis and Interaction Module

This module provides a comprehensive set of tools for analyzing YouTube videos,
extracting information, and answering questions based on video content. It leverages
the LangChain library for natural language processing tasks and the YouTube Transcript
API for fetching video transcripts.

Classes:
    YouTubeTranscriptPointsExtractor: 
        Extracts and formats comments with clickable timestamps from a YouTube video transcript.
    QuestionAnswerExtractor: 
        Processes user questions and extracts answers from video transcripts.
    YouTubeAgent: 
        Manages the overall agent setup for interacting with YouTube videos and processing user queries.

Key Features:
    - Main points formatted as a YouTube comment with clickable timestamps
    - Question answering based on video content
    - Flexible AI agent for handling various YouTube video-related tasks
"""

import os
import openai
import json
from typing import List, Dict, Any, Union, Type
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.agents import tool, AgentExecutor
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser, JsonOutputFunctionsParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.memory import ConversationBufferWindowMemory

# Optional local setup, currently disabled (load_dotenv/find_dotenv are not part of
# the visible import block):
# _ = load_dotenv(find_dotenv()) # read local .env file
# os.getenv yields None when OPENAI_API_KEY is unset, whereas the os.environ[...]
# alternative noted at the end of the line would raise KeyError instead.
openai.api_key = os.getenv('OPENAI_API_KEY') #os.environ['OPENAI_API_KEY']

# Sampling temperature handed to every ChatOpenAI model this module creates.
_temperature = 0  # Default value


def get_temperature():
    """Return the sampling temperature currently configured for new models.

    Returns:
        The value last passed to `set_temperature`, or 0 if it was never called.
    """
    return _temperature


def set_temperature(new_temperature):
    """Set the sampling temperature used by models created afterwards.

    The value is kept in module state rather than captured by a replacement
    `get_temperature` function, so references to `get_temperature` obtained
    earlier (for example through `from <module> import get_temperature`)
    observe the update as well.

    Args:
        new_temperature: Temperature later read back through `get_temperature`.
    """
    global _temperature
    _temperature = new_temperature

# NOTE(review): this model is nested in PointsCollection_1, which _process_transcript
# passes to convert_to_openai_function. The class docstring and the Field descriptions
# below presumably end up in the function schema sent to the chat model, so treat them
# as prompt text - confirm before rewording or reordering fields.
class TimestampedPoint_1(BaseModel):
    """Pydantic model for representing extracted points from Youtube-Transcript"""
    # Interpreted as seconds by _format_for_youtube_comment when it renders HH:MM:SS.
    timestamp: float = Field(description="The timestamp (in floating-point number) of when main points are discussed in the video.")
    # Rendered after the timestamp in both the detailed and the brief comment format.
    main_point: str = Field(description="A title for Main point.")
    # Only rendered in the detailed comment format.
    summary: str = Field(description="A summary of main points discussed at that timestamp.")
    # Only rendered in the detailed comment format.
    emoji: str = Field(description="An emoji that matches the summary.")
    
# NOTE(review): this model is nested in PointsCollection_2, which _process_transcript
# passes to convert_to_openai_function. The class docstring and the Field descriptions
# below presumably end up in the function schema sent to the chat model, so treat them
# as prompt text - confirm before rewording or reordering fields.
# It carries the same four field names as TimestampedPoint_1, declared in a different
# order and with different descriptions.
class TimestampedPoint_2(BaseModel):
    """Pydantic model for representing extracted points."""
    # Rendered after the timestamp in both the detailed and the brief comment format.
    main_point: str = Field(description="The main topic, theme, or subject extracted from the subtitle.")
    # Interpreted as seconds by _format_for_youtube_comment when it renders HH:MM:SS.
    timestamp: float = Field(description="The timestamp (in floating-point number) from the video where the main point is mentioned.")
    # Only rendered in the detailed comment format.
    summary: str = Field(description="The context or brief explanation of the main point.")
    # Only rendered in the detailed comment format.
    emoji: str = Field(description="An emoji that represents or summarizes the main point.")
    
class YouTubeTranscriptPointsExtractor:
    """
    A tool for extracting and formatting main points with clickable timestamps from YouTube video transcripts.

    The public entry point is the `extract_clickable_points` tool. It fetches a
    timestamp-annotated transcript, runs one extraction pass per point schema
    (`PointsCollection_1` and `PointsCollection_2`), and renders each result in a
    detailed and a brief comment format.
    """

    # The docstrings of the two collection models are kept verbatim: the models are
    # handed to convert_to_openai_function, which presumably forwards the docstring
    # as the function description - confirm before rewording.
    class PointsCollection_1(BaseModel):
        """Pydantic model for representing a collection of timestamped points."""
        points: List[TimestampedPoint_1]

    class PointsCollection_2(BaseModel):
        """Pydantic model for representing a collection of timestamped points."""
        points: List[TimestampedPoint_2]

    # NOTE(review): the docstring of a @tool function presumably serves as the tool
    # description shown to the agent, so it is kept verbatim.
    @staticmethod
    @tool(return_direct=True)
    def extract_clickable_points(youtube_video_id: str) -> str:
        """
        Extracts and formats comments with clickable timestamps from a YouTube video transcript.

        Args:
            youtube_video_id (str): The ID of the YouTube video.

        Returns:
            str: Formatted string of main points with clickable timestamps, ready for use in YouTube comments.
        """
        extractor = YouTubeTranscriptPointsExtractor
        transcript = extractor._fetch_transcript(youtube_video_id)

        # One extraction pass per schema; errors propagate to the caller unchanged.
        points_1 = extractor._process_transcript(transcript, extractor.PointsCollection_1)
        points_2 = extractor._process_transcript(transcript, extractor.PointsCollection_2)

        sections = [
            ("Output_style_1", extractor._format_for_youtube_comment(points_1, True)),
            ("Output_Style_1a", extractor._format_for_youtube_comment(points_1, False)),
            # This section used to be labelled "Output_Style_2a" as well, which made
            # the two schema-2 outputs indistinguishable in the report.
            ("Output_Style_2", extractor._format_for_youtube_comment(points_2, True)),
            ("Output_Style_2a", extractor._format_for_youtube_comment(points_2, False)),
        ]

        report = [f"Main points extracted from YouTube video (ID: {youtube_video_id})"]
        for label, body in sections:
            # Each section is a label line followed by the listing in a fenced block.
            report.extend([f"{label}:", "```", body, "```"])
        report.append("Choose the style that best suits your needs for presenting the main points of the video.")
        return "\n".join(report)

    @staticmethod
    def _fetch_transcript(youtube_video_id: str) -> str:
        """
        Fetches the transcript for a YouTube video, annotated with start times.

        Every transcript entry is rendered as "<start with two decimals>: <text> "
        and the entries are concatenated in order.

        Args:
            youtube_video_id (str): The ID of the YouTube video.

        Returns:
            str: The full transcript of the video with a start time in front of each entry.

        Raises:
            Exception: Whatever YouTubeTranscriptApi.get_transcript raises is propagated unchanged.
        """
        transcript_json = YouTubeTranscriptApi.get_transcript(youtube_video_id)
        return "".join(f"{entry['start']:.2f}: {entry['text']} " for entry in transcript_json)

    @staticmethod
    def _process_transcript(transcript: str, info_model: Union[Type[PointsCollection_1], Type[PointsCollection_2]]) -> List[Dict[str, Any]]:
        """
        Extracts main points from the transcript with a function-calling chat model.

        The transcript is split into chunks, the extraction chain runs once per
        chunk, and the per-chunk point lists are flattened into a single list.
        Each chunk is processed on its own; nothing is carried over between chunks
        or between calls.

        Args:
            transcript (str): The full transcript of the video.
            info_model: The collection model (PointsCollection_1 or PointsCollection_2)
                whose schema the chat model is forced to fill in.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing extracted main points.
        """
        main_points_extraction_function = [convert_to_openai_function(info_model)]

        model = ChatOpenAI(temperature=get_temperature())

        # function_call pins the model to the extraction function rather than free text.
        extraction_model = model.bind(functions=main_points_extraction_function, function_call={"name": info_model.__name__})

        # Plain string: the prompt has no placeholders of its own, only the human
        # message below carries the {input} template variable.
        system_message = """
        You are an AI assistant that extracts essential info from video transcripts.
        You have the authority to make improvements as you see fit.
        
        Rules To Follow:
        - Refining the summaries for clarity and conciseness.
        - Adjusting emoji choices to better represent the content.
        - Removing redundant information.
        - Grouping two points into a single point if the timestamps are close enough.
        
        Your goal is to produce a refined and accurate representation of the main points from the video transcript. Use your judgment to balance adherence to the specific rules with overall improvement of the extracted information.
        """

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("human", "{input}")
        ])

        # Keep only the "points" entry of the function-call arguments.
        extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="points")

        # Chunks are cut only where a space is followed by a digit 1-9, presumably so
        # that a chunk starts at a timestamp written by _fetch_transcript; note that
        # this can also match digits inside the spoken text.
        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=16000, separators=[f" {char}" for char in "123456789"])

        prep = RunnableLambda(lambda x: [{"input": doc} for doc in text_splitter.split_text(x)])

        chain = prep | extraction_chain.map() | YouTubeTranscriptPointsExtractor._flatten

        return chain.invoke(transcript)

    @staticmethod
    def _flatten(matrix):
        """Flattens a 2D list into a 1D list."""
        return [item for row in matrix for item in row]

    @staticmethod
    def _format_for_youtube_comment(points: List[Dict[str, Any]], detailed: bool = True) -> str:
        """
        Formats extracted main points into a YouTube-style comment with clickable timestamps.

        Args:
            points (List[Dict[str, Any]]): List of dictionaries containing main points with timestamps.
            detailed (bool): If True, returns a detailed format with emojis and summaries.
                             If False, returns a simpler format with just timestamps and main points.

        Returns:
            str: One line per point, joined with newlines and stripped of surrounding
                whitespace; an empty string when `points` is empty.

        Raises:
            KeyError: If a point lacks a key required by the selected format.
        """
        def _format_timestamp(seconds):
            # Render a number of seconds as zero-padded HH:MM:SS (fractions are truncated).
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02}:{minutes:02}:{secs:02}"

        lines = []
        for point in points:
            timestamp = _format_timestamp(point['timestamp'])
            # Drop trailing periods so the title does not end in ".:" or a stray ".".
            main_point = point['main_point'].rstrip('.')

            if detailed:
                emoji = point['emoji']
                summary = point['summary']
                lines.append(f"{timestamp} {emoji} {main_point}: {summary}")
            else:
                lines.append(f"{timestamp} {main_point}")

        return "\n".join(lines).strip()

# NOTE(review): this model is nested in QuestionAnswerExtractor.Info, which
# _extract_answer passes to convert_to_openai_function. The class docstring and the
# Field descriptions below presumably end up in the function schema sent to the chat
# model, so treat them as prompt text - confirm before rewording.
class Answer(BaseModel):
    """Pydantic model for representing an answer to a question."""
    # One partial answer; _extract_answer runs the extraction once per transcript chunk.
    answer: str = Field(description="The answer to the user's question based on the video transcript.")
    # _extract_answer discards partial answers whose confidence is not above 0.4.
    confidence: float = Field(description="A confidence score between 0 and 1 indicating how certain the model is about the answer.")
    
class QuestionAnswerExtractor:
    """
    A tool for answering questions about YouTube videos based on their transcripts.

    The public entry point is the `get_answer` tool. It fetches the plain-text
    transcript, collects partial answers per transcript chunk, filters them by
    confidence and asks the chat model to consolidate what remains.
    """

    # Docstring kept verbatim: the model is handed to convert_to_openai_function,
    # which presumably forwards it as the function description - confirm before rewording.
    class Info(BaseModel):
        """Pydantic model for representing a collection of answers."""
        answers: List[Answer]

    # NOTE(review): the docstring of a @tool function presumably serves as the tool
    # description shown to the agent, so it is kept verbatim.
    @staticmethod
    @tool(return_direct=False)
    def get_answer(youtube_video_id: str, question: str) -> str:
        """
        Answers a question about a YouTube video based on its transcript.
        
        Args:
            youtube_video_id (str): The ID of the YouTube video.
            question (str): The user's question about the video.

        Returns:
            str: Formatted string containing the answer to the user's question.
        """
        try:
            transcript = QuestionAnswerExtractor._get_youtube_video_transcript(youtube_video_id)
            return QuestionAnswerExtractor._extract_answer(transcript, question)
        except Exception as e:
            # Deliberate best-effort: failures are reported as text so the agent can
            # relay them instead of aborting the run.
            return f"Error answering question: {str(e)}"

    @staticmethod
    def _get_youtube_video_transcript(youtube_video_id: str) -> str:
        """
        Fetches the transcript for a YouTube video as plain text.

        Args:
            youtube_video_id (str): The ID of the YouTube video.

        Returns:
            str: The text of all transcript entries joined with single spaces (no timestamps).

        Raises:
            Exception: Whatever YouTubeTranscriptApi.get_transcript raises is propagated unchanged.
        """
        transcript_json = YouTubeTranscriptApi.get_transcript(youtube_video_id)
        return " ".join(entry['text'] for entry in transcript_json)

    @staticmethod
    def _extract_answer(transcript: str, question: str) -> str:
        """
        Extracts an answer to the user's question from the YouTube video transcript.

        The transcript is split into chunks and the extraction chain produces
        partial answers for each chunk. Partial answers with a confidence of 0.4 or
        less are discarded; the remaining ones are consolidated into one answer by
        a second model call.

        Args:
            transcript (str): The full transcript of the video.
            question (str): The user's question about the video.

        Returns:
            str: The consolidated answer text, or a fixed "no reliable answer"
                message when every partial answer was filtered out.
        """
        answer_extraction_function = [convert_to_openai_function(QuestionAnswerExtractor.Info)]

        model = ChatOpenAI(temperature=get_temperature())
        # function_call pins the model to the Info function rather than free text.
        extraction_model = model.bind(functions=answer_extraction_function, function_call={"name": "Info"})

        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an AI assistant tasked with answering questions about a video based on its transcript."),
            ("human", "Transcript: {transcript}\n\nQuestion: {question}\n\nProvide an answer to the question based on the transcript, along with a confidence score.")
        ])

        # Keep only the "answers" entry of the function-call arguments.
        extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="answers")

        # Chunks are cut only where a space is followed by an upper-case ASCII letter.
        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=192, chunk_size=8000, separators=[f" {char}" for char in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"])

        def prepare_input(x):
            # Pair every transcript chunk with the same question.
            chunks = text_splitter.split_text(x['transcript'])
            return [{"transcript": chunk, "question": x['question']} for chunk in chunks]

        prep = RunnableLambda(prepare_input)

        chain = prep | extraction_chain.map() | QuestionAnswerExtractor._flatten

        # Get partial answers
        partial_answers = chain.invoke({"transcript": transcript, "question": question})

        # Filter out low-confidence answers.
        # NOTE(review): the cut-off here is 0.4 while the consolidation prompt below
        # speaks of "0.5 or below"; answers in (0.4, 0.5] pass this filter but are
        # then declared ignorable - confirm which threshold is intended.
        filtered_answers = [answer for answer in partial_answers if answer['confidence'] > 0.4]

        # If all answers were filtered out, return a fixed "no answer" response
        if not filtered_answers:
            return "I couldn't find a reliable answer to your question based on the video transcript."

        # Consolidate filtered partial answers
        consolidation_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an AI assistant tasked with consolidating multiple partial answers into a comprehensive final answer."),
            ("human", "Question: {question}\n\nPartial Answers: {partial_answers}\n\nPlease provide a consolidated, comprehensive answer to the question based on these partial answers. Ignore any information from answers with low confidence (0.5 or below).")
        ])

        consolidation_model = ChatOpenAI(temperature=get_temperature())
        consolidation_chain = consolidation_prompt | consolidation_model

        final_answer = consolidation_chain.invoke({
            "question": question,
            "partial_answers": json.dumps(filtered_answers, indent=2)
        })

        return final_answer.content

    @staticmethod
    def _flatten(matrix):
        """Flattens a 2D list into a 1D list."""
        return [item for row in matrix for item in row]
    
class YouTubeAgent:
    """
    Conversational agent that answers user queries about YouTube videos.

    Construction wires together the two video tools, a chat model bound to their
    function schemas, a prompt that includes conversation history, and an
    executor backed by a windowed conversation memory.
    """

    def __init__(self):
        """Builds the tools, model, prompt, agent chain, memory and executor."""

        self.tools = [
            QuestionAnswerExtractor.get_answer,
            YouTubeTranscriptPointsExtractor.extract_clickable_points,
        ]

        self.sys_message = """You are a helpful assistant.
        
        Important instructions:
        1. Only use the 'extract_clickable_points' tool when the user explicitly asks for clickable points or timestamps from a video.
        2. For all other queries, including general questions about video content, use the 'get_answer' tool.
        3. If the user's query is unclear, ask for clarification before using any tools.
        4. Always provide concise and relevant responses based on the tool outputs.

        Remember to interpret the user's intent carefully and use the appropriate tools."""

        # Function schemas for the tools, in the same order as self.tools.
        self.functions = list(map(convert_to_openai_function, self.tools))

        chat_model = ChatOpenAI(temperature=get_temperature())
        self.model = chat_model.bind(functions=self.functions)

        # Message order: system instructions, remembered history, the new user
        # input, then the scratchpad built from intermediate steps.
        prompt_messages = [
            ("system", self.sys_message),
            MessagesPlaceholder(variable_name="history"),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(prompt_messages)

        def _scratchpad(inputs):
            # Convert the executor's intermediate steps into scratchpad messages.
            return format_to_openai_functions(inputs["intermediate_steps"])

        with_scratchpad = RunnablePassthrough.assign(agent_scratchpad=_scratchpad)
        self.agent_chain = (
            with_scratchpad
            | self.prompt
            | self.model
            | OpenAIFunctionsAgentOutputParser()
        )

        # memory_key matches the "history" placeholder in the prompt.
        self.memory = ConversationBufferWindowMemory(memory_key="history", k=3, return_messages=True)
        self.agent_executor = AgentExecutor(
            agent=self.agent_chain,
            tools=self.tools,
            memory=self.memory,
        )

    def invoke(self, input_text: str) -> str:
        """
        Runs the agent on one user query.

        Args:
            input_text (str): The user's input query.

        Returns:
            str: The executor's 'output' value, or an "An error occurred: ..."
                message when anything raised while producing it.
        """
        try:
            outcome = self.agent_executor.invoke({"input": input_text})
            reply = outcome['output']
        except Exception as exc:
            return f"An error occurred: {exc!s}"
        return reply

# youtube_agent = YouTubeAgent()
# video_link = "https://www.youtube.com/watch?v=-OSxeoIAs2w"
# main_points = youtube_agent.invoke(f"The race involves which challenges in the following video {video_link}")