SmolVLM2-XSPFGenerator

Running on A100

App Files Community

mfarre (HF staff) committed 9 days ago

Commit

d7863e4

1 Parent(s): c587eed

prompts update + transformers update

Browse files

Files changed (1) hide show

app.py +12 -8

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import subprocess
 import logging
 import xml.etree.ElementTree as ET
 from xml.dom import minidom
-from transformers import AutoProcessor, AutoModelForVision2Seq
 logging.basicConfig(level=logging.INFO)
@@ -49,7 +49,7 @@ class VideoHighlightDetector:
         # Initialize model and processor
         self.processor = AutoProcessor.from_pretrained(model_path)
-        self.model = AutoModelForVision2Seq.from_pretrained(
             model_path,
             torch_dtype=torch.bfloat16
         ).to(device)
@@ -86,13 +86,13 @@ class VideoHighlightDetector:
         messages = [
             {
                 "role": "system",
-                "content": [{"type": "text", "text": "Describe what is happening in this specific video segment in a brief, concise way."}]
             },
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
-                    {"type": "text", "text": "What is happening in this segment? Provide a very brief and concise description."}
                 ]
             }
         ]
@@ -109,14 +109,15 @@ class VideoHighlightDetector:
         return self.processor.decode(outputs[0], skip_special_tokens=True).split("Assistant: ")[1]
     def determine_highlights(self, video_description: str) -> str:
         messages = [
             {
                 "role": "system",
-                "content": [{"type": "text", "text": "You are a professional video editor specializing in creating viral highlight reels."}]
             },
             {
                 "role": "user",
-                "content": [{"type": "text", "text": f"Based on this description, list which segments should be included in highlights: {video_description}"}]
             }
         ]
@@ -133,12 +134,15 @@ class VideoHighlightDetector:
     def process_segment(self, video_path: str, highlight_types: str) -> bool:
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
-                    {"type": "text", "text": f"Do you see any of these elements in the video: {highlight_types}? Answer yes or no."}
-                ]
             }
         ]

 import logging
 import xml.etree.ElementTree as ET
 from xml.dom import minidom
+from transformers import AutoProcessor, AutoModelForImageTextToText
 logging.basicConfig(level=logging.INFO)
         # Initialize model and processor
         self.processor = AutoProcessor.from_pretrained(model_path)
+        self.model = AutoModelForImageTextToText.from_pretrained(
             model_path,
             torch_dtype=torch.bfloat16
         ).to(device)
         messages = [
             {
                 "role": "system",
+                "content": [{"type": "text", "text": "Focus only on describing the key dramatic action or notable event occurring in this video segment. Skip general context or scene-setting details unless they are crucial to understanding the main action."}]
             },
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
+                    {"type": "text", "text": "WWhat is the main action or notable event happening in this segment? Describe it in one brief sentence."}
                 ]
             }
         ]
         return self.processor.decode(outputs[0], skip_special_tokens=True).split("Assistant: ")[1]
     def determine_highlights(self, video_description: str) -> str:
+        """Determine what constitutes highlights based on video description."""
         messages = [
             {
                 "role": "system",
+                "content": [{"type": "text", "text": "You are a highlight editor. List archetypal dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in any video of this type."}]
             },
             {
                 "role": "user",
+                "content": [{"type": "text", "text": f"""Here is a description of a video:\n\n{video_description}\n\nList potential highlight moments to look for in this video:"""}]
             }
         ]
     def process_segment(self, video_path: str, highlight_types: str) -> bool:
         messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a video highlight analyzer. Your role is to identify moments that have high dramatic value, focusing on displays of skill, emotion, personality, or tension. Compare video segments against provided example highlights to find moments with similar emotional impact and visual interest, even if the specific actions differ."}]
+            },
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
+                    {"type": "text", "text": f"""Given these highlight examples:\n{highlight_types}\n\nDoes this video contain a moment that matches the core action of one of the highlights? Answer with:\n'yes' or 'no'\nIf yes, justify it"""}]
             }
         ]