File size: 18,477 Bytes
857dbaf
c0e82c0
857dbaf
c0e82c0
 
 
 
857dbaf
 
dd41bc7
 
c0e82c0
 
 
 
 
 
dd41bc7
c0e82c0
 
857dbaf
 
 
 
dd41bc7
c0e82c0
857dbaf
 
 
 
 
 
c0e82c0
857dbaf
 
 
 
 
 
6a369e2
 
857dbaf
c0e82c0
 
857dbaf
c0e82c0
 
 
 
 
 
857dbaf
dd41bc7
c0e82c0
dd41bc7
c0e82c0
dd41bc7
c0e82c0
dd41bc7
 
857dbaf
c0e82c0
dd41bc7
c0e82c0
857dbaf
dd41bc7
 
857dbaf
dd41bc7
857dbaf
dd41bc7
 
857dbaf
 
dd41bc7
 
 
c0e82c0
dd41bc7
 
 
857dbaf
 
 
dd41bc7
857dbaf
dd41bc7
857dbaf
 
 
 
 
dd41bc7
857dbaf
 
dd41bc7
 
 
 
 
 
 
 
 
857dbaf
c0e82c0
857dbaf
 
dd41bc7
857dbaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
857dbaf
 
c0e82c0
 
dd41bc7
857dbaf
 
c0e82c0
857dbaf
 
 
c0e82c0
857dbaf
c0e82c0
dd41bc7
 
c0e82c0
 
dd41bc7
 
 
 
 
 
 
 
857dbaf
c0e82c0
 
 
 
 
 
 
 
 
 
dd41bc7
857dbaf
 
 
dd41bc7
857dbaf
c0e82c0
 
 
857dbaf
 
 
 
 
dd41bc7
857dbaf
dd41bc7
857dbaf
dd41bc7
857dbaf
 
dd41bc7
 
 
857dbaf
 
dd41bc7
857dbaf
 
 
 
 
 
 
 
dd41bc7
 
 
c0e82c0
dd41bc7
 
 
 
c0e82c0
dd41bc7
857dbaf
 
c0e82c0
 
 
 
 
dd41bc7
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
ec3110b
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
c0e82c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd41bc7
 
c0e82c0
dd41bc7
 
c0e82c0
dd41bc7
 
 
c0e82c0
dd41bc7
 
 
 
 
c0e82c0
dd41bc7
 
c0e82c0
dd41bc7
 
 
 
 
 
 
 
 
 
 
 
857dbaf
 
 
 
 
 
 
 
 
 
c0e82c0
857dbaf
dd41bc7
 
857dbaf
c0e82c0
ecd85b6
 
dd41bc7
 
 
 
 
 
ecd85b6
c0e82c0
857dbaf
c0e82c0
 
 
857dbaf
 
 
 
 
 
c0e82c0
857dbaf
 
 
c0e82c0
857dbaf
 
c0e82c0
857dbaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0e82c0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
"""
YouTube Video Analysis and Interaction Module

This module provides a comprehensive set of tools for analyzing YouTube videos,
extracting information, and answering questions based on video content. It leverages
the LangChain library for natural language processing tasks and the YouTube Transcript
API for fetching video transcripts.

Classes:
    YouTubeTranscriptPointsExtractor: 
        Extracts and formats comments with clickable timestamps from a YouTube video transcript.
    QuestionAnswerExtractor: 
        Processes user questions and extracts answers from video transcripts.
    YouTubeAgent: 
        Manages the overall agent setup for interacting with YouTube videos and processing user queries.

Key Features:
    - Main points formatted as a YouTube comment with clickable timestamps
    - Question answering based on video content
    - Flexible AI agent for handling various YouTube video-related tasks
"""

import os
import openai
import json
from typing import List, Dict, Any, Union, Type
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.agents import tool, AgentExecutor
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser, JsonOutputFunctionsParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.memory import ConversationBufferWindowMemory

# The OpenAI key is taken from the process environment; it is None when the
# variable is unset. Loading a local .env file is left disabled:
# _ = load_dotenv(find_dotenv())
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Sampling temperature handed to every ChatOpenAI model built in this module.
_temperature = 0  # Default value


def get_temperature():
    """Return the sampling temperature currently configured for this module.

    Returns:
        The value most recently passed to ``set_temperature``, or ``0`` when
        ``set_temperature`` has never been called.
    """
    return _temperature


def set_temperature(new_temperature):
    """Set the sampling temperature returned by ``get_temperature``.

    The value is stored in a module-level variable instead of rebinding the
    ``get_temperature`` name, so references to ``get_temperature`` obtained
    before this call (for example through ``from <module> import
    get_temperature``) observe the new value as well.

    Args:
        new_temperature: The temperature subsequent ``get_temperature`` calls
            should return.
    """
    global _temperature
    _temperature = new_temperature

# Schema for one extracted point; element type of
# YouTubeTranscriptPointsExtractor.PointsCollection_1, which is passed through
# convert_to_openai_function in _process_transcript.
# NOTE(review): the Field descriptions (and presumably the docstring) end up in
# the function schema sent to the model, so treat them as prompt text -- confirm
# before rewording.
class TimestampedPoint_1(BaseModel):
    """Pydantic model for representing extracted points from Youtube-Transcript"""
    # Position in the video; _format_for_youtube_comment treats it as seconds
    # and renders it as HH:MM:SS.
    timestamp: float = Field(description="The timestamp (in floating-point number) of when main points are discussed in the video.")
    # Short title; a trailing '.' is stripped when the comment is formatted.
    main_point: str = Field(description="A title for Main point.")
    # Only shown in the detailed comment format.
    summary: str = Field(description="A summary of main points discussed at that timestamp.")
    # Only shown in the detailed comment format.
    emoji: str = Field(description="An emoji that matches the summary.")
    
# Alternative schema for one extracted point: same four fields as
# TimestampedPoint_1 but with differently worded descriptions and main_point
# declared first. Element type of
# YouTubeTranscriptPointsExtractor.PointsCollection_2.
# NOTE(review): the Field descriptions (and presumably the docstring) end up in
# the function schema sent to the model, so treat them as prompt text -- confirm
# before rewording.
class TimestampedPoint_2(BaseModel):
    """Pydantic model for representing extracted points."""
    # Short title; a trailing '.' is stripped when the comment is formatted.
    main_point: str = Field(description="The main topic, theme, or subject extracted from the subtitle.")
    # Position in the video; _format_for_youtube_comment treats it as seconds
    # and renders it as HH:MM:SS.
    timestamp: float = Field(description="The timestamp (in floating-point number) from the video where the main point is mentioned.")
    # Only shown in the detailed comment format.
    summary: str = Field(description="The context or brief explanation of the main point.")
    # Only shown in the detailed comment format.
    emoji: str = Field(description="An emoji that represents or summarizes the main point.")
    
class YouTubeTranscriptPointsExtractor:
    """
    A tool for extracting and formatting main points with clickable timestamps from YouTube video transcripts.

    This class provides methods to process transcripts, identify key points,
    and format them for use in YouTube comments with clickable timestamps.
    Every method is static, so the class does not need to be instantiated.
    """

    class PointsCollection_1(BaseModel):
        """Pydantic model for representing a collection of timestamped points."""
        points: List[TimestampedPoint_1]

    class PointsCollection_2(BaseModel):
        """Pydantic model for representing a collection of timestamped points."""
        points: List[TimestampedPoint_2]

    @staticmethod
    @tool(return_direct=True)
    def extract_clickable_points(youtube_video_id: str) -> str:
        """
        Extracts and formats comments with clickable timestamps from a YouTube video transcript.

        Args:
            youtube_video_id (str): The ID of the YouTube video.

        Returns:
            str: Formatted string of main points with clickable timestamps, ready for use in YouTube comments.
        """
        extractor = YouTubeTranscriptPointsExtractor
        transcript = extractor._fetch_transcript(youtube_video_id)

        # The transcript is processed once per schema; each result is rendered
        # both in the detailed and in the short format, giving four styles.
        points_1 = extractor._process_transcript(transcript, extractor.PointsCollection_1)
        detailed_1 = extractor._format_for_youtube_comment(points_1, True)
        compact_1 = extractor._format_for_youtube_comment(points_1, False)

        points_2 = extractor._process_transcript(transcript, extractor.PointsCollection_2)
        detailed_2 = extractor._format_for_youtube_comment(points_2, True)
        compact_2 = extractor._format_for_youtube_comment(points_2, False)

        # Errors are not caught here: they propagate to the caller unchanged.
        return (
            f"Main points extracted from YouTube video (ID: {youtube_video_id})\n"
            f"Output_style_1:\n```\n{detailed_1}\n```\n"
            f"Output_Style_1a:\n```\n{compact_1}\n```\n"
            f"Output_Style_2:\n```\n{detailed_2}\n```\n"
            f"Output_Style_2a:\n```\n{compact_2}\n```\n"
            "Choose the style that best suits your needs for presenting the main points of the video."
        )

    @staticmethod
    def _fetch_transcript(youtube_video_id: str) -> str:
        """
        Fetches the transcript for a YouTube video.

        Each transcript entry is rendered as "<start with two decimals>: <text> "
        and the entries are concatenated, so start times stay inline with the text.

        Args:
            youtube_video_id (str): The ID of the YouTube video.

        Returns:
            str: The full transcript of the video with inline start times.

        Raises:
            Exception: Whatever YouTubeTranscriptApi.get_transcript raises is propagated unchanged.
        """
        transcript_json = YouTubeTranscriptApi.get_transcript(youtube_video_id)
        return "".join(f"{entry['start']:.2f}: {entry['text']} " for entry in transcript_json)

    @staticmethod
    def _process_transcript(transcript: str, info_model: Union[Type[PointsCollection_1], Type[PointsCollection_2]]) -> List[Dict[str, Any]]:
        """
        Extracts main points from the transcript with a function-calling chat model.

        The transcript is split into chunks, every chunk is sent to the model
        independently (no state is shared between chunks or between calls), and
        the per-chunk "points" lists are concatenated in chunk order.

        Args:
            transcript (str): The full transcript of the video.
            info_model: The points-collection model whose schema the chat model is forced to fill in.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing extracted main points.
        """
        main_points_extraction_function = [convert_to_openai_function(info_model)]

        model = ChatOpenAI(temperature=get_temperature())

        # Forcing function_call makes the model answer with the schema's arguments.
        extraction_model = model.bind(functions=main_points_extraction_function, function_call={"name": info_model.__name__})

        system_message = """
        You are an AI assistant that extracts essential info from video transcripts.
        You have the authority to make improvements as you see fit.
        
        Rules To Follow:
        - Refining the summaries for clarity and conciseness.
        - Adjusting emoji choices to better represent the content.
        - Removing redundant information.
        - Grouping two points into a single point if the timestamps are close enough.
        
        Your goal is to produce a refined and accurate representation of the main points from the video transcript. Use your judgment to balance adherence to the specific rules with overall improvement of the extracted information.
        """

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("human", "{input}")
        ])

        # Keep only the "points" key of the function-call arguments.
        extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="points")

        # Separators are a space followed by a digit 1-9, i.e. chunks break
        # right before a token that starts with one of those digits.
        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=16000, separators=[f" {char}" for char in "123456789"])

        prep = RunnableLambda(lambda x: [{"input": doc} for doc in text_splitter.split_text(x)])

        chain = prep | extraction_chain.map() | YouTubeTranscriptPointsExtractor._flatten

        return chain.invoke(transcript)

    @staticmethod
    def _flatten(matrix):
        """Flattens a 2D list into a 1D list."""
        return [item for row in matrix for item in row]

    @staticmethod
    def _format_for_youtube_comment(points: List[Dict[str, Any]], detailed: bool = True) -> str:
        """
        Formats extracted main points into a YouTube-style comment with clickable timestamps.

        Args:
            points (List[Dict[str, Any]]): List of dictionaries containing main points with timestamps.
            detailed (bool): If True, returns a detailed format with emojis and summaries. 
                             If False, returns a simpler format with just timestamps and main points.

        Returns:
            str: Formatted string representing the main points as a YouTube comment with clickable timestamps.
                 One line per point; an empty string when there are no points.
        """
        def _format_timestamp(seconds):
            # The value originates from model-generated JSON, so accept a
            # numeric string as well as a number.
            seconds = float(seconds)
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02}:{minutes:02}:{secs:02}"

        lines = []
        for point in points:
            timestamp = _format_timestamp(point['timestamp'])
            main_point = point['main_point'].rstrip('.')

            if detailed:
                lines.append(f"{timestamp} {point['emoji']} {main_point}: {point['summary']}")
            else:
                lines.append(f"{timestamp} {main_point}")

        return "\n".join(lines).strip()

# Schema for one partial answer; element type of QuestionAnswerExtractor.Info,
# which is passed through convert_to_openai_function in _extract_answer.
# NOTE(review): the Field descriptions (and presumably the docstring) end up in
# the function schema sent to the model, so treat them as prompt text -- confirm
# before rewording.
class Answer(BaseModel):
    """Pydantic model for representing an answer to a question."""
    # Answer text produced for one transcript chunk.
    answer: str = Field(description="The answer to the user's question based on the video transcript.")
    # QuestionAnswerExtractor._extract_answer keeps only answers whose
    # confidence is greater than 0.4.
    confidence: float = Field(description="A confidence score between 0 and 1 indicating how certain the model is about the answer.")
    
class QuestionAnswerExtractor:
    """
    A tool for answering questions about YouTube videos based on their transcripts.

    This class provides methods to process transcripts and generate answers to user questions
    using natural language processing techniques.
    """

    # Partial answers must score strictly above this confidence to be kept.
    # The same value is substituted into the consolidation prompt so the
    # code-side filter and the instruction given to the model agree.
    _MIN_CONFIDENCE = 0.4

    class Info(BaseModel):
        """Pydantic model for representing a collection of answers."""
        answers: List[Answer]

    @staticmethod
    @tool(return_direct=False)
    def get_answer(youtube_video_id: str, question: str) -> str:
        """
        Answers a question about a YouTube video based on its transcript.
        
        Args:
            youtube_video_id (str): The ID of the YouTube video.
            question (str): The user's question about the video.

        Returns:
            str: Formatted string containing the answer to the user's question.
        """
        try:
            transcript = QuestionAnswerExtractor._get_youtube_video_transcript(youtube_video_id)
            return QuestionAnswerExtractor._extract_answer(transcript, question)
        except Exception as e:
            # Deliberately broad: the failure is reported back as the tool's
            # result text instead of being raised.
            return f"Error answering question: {str(e)}"

    @staticmethod
    def _get_youtube_video_transcript(youtube_video_id: str) -> str:
        """
        Fetches the transcript for a YouTube video.

        Only the text of each transcript entry is kept; entries are joined with single spaces.

        Args:
            youtube_video_id (str): The ID of the YouTube video.

        Returns:
            str: The full transcript of the video.

        Raises:
            Exception: Whatever YouTubeTranscriptApi.get_transcript raises is propagated unchanged.
        """
        transcript_json = YouTubeTranscriptApi.get_transcript(youtube_video_id)
        return " ".join(entry['text'] for entry in transcript_json)

    @staticmethod
    def _extract_answer(transcript: str, question: str) -> str:
        """
        Extracts an answer to the user's question from the YouTube video transcript.

        The transcript is split into chunks, the question is asked against every
        chunk, partial answers at or below ``_MIN_CONFIDENCE`` are dropped, and the
        remaining ones are merged into one answer by a second model call.

        Args:
            transcript (str): The full transcript of the video.
            question (str): The user's question about the video.

        Returns:
            str: The consolidated answer text, or a fixed "couldn't find a reliable
            answer" message when no partial answer passes the confidence filter.
        """
        answer_extraction_function = [convert_to_openai_function(QuestionAnswerExtractor.Info)]

        model = ChatOpenAI(temperature=get_temperature())
        extraction_model = model.bind(functions=answer_extraction_function, function_call={"name": "Info"})

        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an AI assistant tasked with answering questions about a video based on its transcript."),
            ("human", "Transcript: {transcript}\n\nQuestion: {question}\n\nProvide an answer to the question based on the transcript, along with a confidence score.")
        ])

        # Keep only the "answers" key of the function-call arguments.
        extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="answers")

        # Separators are a space followed by an uppercase letter, i.e. chunks
        # break right before a capitalised token.
        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=192, chunk_size=8000, separators=[f" {char}" for char in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"])

        def prepare_input(x):
            chunks = text_splitter.split_text(x['transcript'])
            return [{"transcript": chunk, "question": x['question']} for chunk in chunks]

        prep = RunnableLambda(prepare_input)

        chain = prep | extraction_chain.map() | QuestionAnswerExtractor._flatten

        # Get partial answers
        partial_answers = chain.invoke({"transcript": transcript, "question": question})

        min_confidence = QuestionAnswerExtractor._MIN_CONFIDENCE

        def _is_reliable(candidate):
            # The dict comes from model-generated JSON: a missing or
            # non-numeric confidence counts as unreliable instead of raising.
            score = candidate.get('confidence')
            return isinstance(score, (int, float)) and score > min_confidence

        # Filter out low-confidence answers
        filtered_answers = [answer for answer in partial_answers if _is_reliable(answer)]

        # If all answers were filtered out, return a "no answer" response
        if not filtered_answers:
            return "I couldn't find a reliable answer to your question based on the video transcript."

        # Consolidate filtered partial answers
        consolidation_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an AI assistant tasked with consolidating multiple partial answers into a comprehensive final answer."),
            ("human", "Question: {question}\n\nPartial Answers: {partial_answers}\n\nPlease provide a consolidated, comprehensive answer to the question based on these partial answers. Ignore any information from answers with low confidence ({min_confidence} or below).")
        ])

        consolidation_model = ChatOpenAI(temperature=get_temperature())
        consolidation_chain = consolidation_prompt | consolidation_model

        final_answer = consolidation_chain.invoke({
            "question": question,
            "partial_answers": json.dumps(filtered_answers, indent=2),
            "min_confidence": min_confidence,
        })

        return final_answer.content

    @staticmethod
    def _flatten(matrix):
        """Flattens a 2D list into a 1D list."""
        return [item for row in matrix for item in row]
    
class YouTubeAgent:
    """
    Conversational agent that answers user queries about YouTube videos.

    Construction wires together the two video tools, a function-calling chat
    model, a prompt with history and scratchpad slots, and a windowed
    conversation memory; ``invoke`` runs one user turn through the executor.
    """

    def __init__(self):
        """Builds the tools, model, prompt, agent chain, memory and executor."""

        # Tools the model may call; the same list is given to the executor.
        self.tools = [
            QuestionAnswerExtractor.get_answer,
            YouTubeTranscriptPointsExtractor.extract_clickable_points,
        ]

        self.sys_message = """You are a helpful assistant.
        
        Important instructions:
        1. Only use the 'extract_clickable_points' tool when the user explicitly asks for clickable points or timestamps from a video.
        2. For all other queries, including general questions about video content, use the 'get_answer' tool.
        3. If the user's query is unclear, ask for clarification before using any tools.
        4. Always provide concise and relevant responses based on the tool outputs.

        Remember to interpret the user's intent carefully and use the appropriate tools."""

        # Function schemas derived from the tools, bound to the chat model.
        self.functions = list(map(convert_to_openai_function, self.tools))

        chat_model = ChatOpenAI(temperature=get_temperature())
        self.model = chat_model.bind(functions=self.functions)

        prompt_messages = [
            ("system", self.sys_message),
            MessagesPlaceholder(variable_name="history"),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(prompt_messages)

        def _scratchpad(state):
            # Turn the executor's intermediate steps into scratchpad messages.
            return format_to_openai_functions(state["intermediate_steps"])

        with_scratchpad = RunnablePassthrough.assign(agent_scratchpad=_scratchpad)
        self.agent_chain = with_scratchpad | self.prompt | self.model | OpenAIFunctionsAgentOutputParser()

        # Windowed memory (k=3) exposed to the prompt under the "history" key.
        self.memory = ConversationBufferWindowMemory(memory_key="history", return_messages=True, k=3)
        self.agent_executor = AgentExecutor(memory=self.memory, tools=self.tools, agent=self.agent_chain)

    def invoke(self, input_text: str) -> str:
        """
        Runs one user query through the agent executor.

        Args:
            input_text (str): The user's input query.

        Returns:
            str: The executor's 'output' value, or an "An error occurred: ..."
            message when running the query or reading the output raises.
        """
        try:
            response = self.agent_executor.invoke({"input": input_text})
            output = response['output']
        except Exception as error:
            return f"An error occurred: {str(error)}"
        return output

# youtube_agent = YouTubeAgent()
# video_link = "https://www.youtube.com/watch?v=-OSxeoIAs2w"
# main_points = youtube_agent.invoke(f"The race involves which challenges in the following video {video_link}")