Spaces:

becteur92
/

smollvm

Paused

App Files Files Community

youssef committed 3 days ago

Commit

24c2f62

1 Parent(s): d200533

more logs

Browse files

Files changed (1) hide show

src/video_processor/processor.py +11 -12

src/video_processor/processor.py CHANGED Viewed

@@ -23,20 +23,17 @@ class VideoAnalyzer:
         logger.info("Initializing VideoAnalyzer")
         self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-        logger.info(f"Loading model from {self.model_path}")
         # Load processor and model
-        self.processor = AutoProcessor.from_pretrained(
-            self.model_path,
-            torch_dtype=torch.bfloat16
-        )
         self.model = AutoModelForImageTextToText.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
             # _attn_implementation="flash_attention_2"
         ).to(DEVICE)
-        logger.info(f"Model loaded on device: {self.model.device} using attention implementation: flash_attention_2")
     def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
         logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
@@ -60,6 +57,8 @@ class VideoAnalyzer:
                     ]
                 }
             ]
             # Process video using chat template
             inputs = self.processor.apply_chat_template(
@@ -68,13 +67,9 @@ class VideoAnalyzer:
                 tokenize=True,
                 return_dict=True,
                 return_tensors="pt"
-            ).to(self.model.device)
-            # Convert inputs to bfloat16 before moving to GPU
-            #for key in inputs:
-             #   if torch.is_tensor(inputs[key]):
-              #      inputs[key] = inputs[key].to(dtype=torch.bfloat16, device=self.model.device)
             # Generate description with increased token limit
             generated_ids = self.model.generate(
                 **inputs,
@@ -82,10 +77,14 @@ class VideoAnalyzer:
                 temperature=0.7,
                 max_new_tokens=512  # Increased from 100 to get more detailed descriptions
             )
             description = self.processor.batch_decode(
                 generated_ids,
                 skip_special_tokens=True
             )[0]
             return [{
                 "description": description.split("Assistant: ")[-1]  # Remove assistant prefix if present

         logger.info("Initializing VideoAnalyzer")
         self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+        logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
         # Load processor and model
+        self.processor = AutoProcessor.from_pretrained(self.model_path)
         self.model = AutoModelForImageTextToText.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
             # _attn_implementation="flash_attention_2"
         ).to(DEVICE)
+        logger.info(f"Model loaded on device: {self.model.device}")
     def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
         logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
                     ]
                 }
             ]
+            logger.info(f"Applying chat template - message: {messages}")
             # Process video using chat template
             inputs = self.processor.apply_chat_template(
                 tokenize=True,
                 return_dict=True,
                 return_tensors="pt"
+            ).to(DEVICE)
+            logger.info(f"Generating IDs")
             # Generate description with increased token limit
             generated_ids = self.model.generate(
                 **inputs,
                 temperature=0.7,
                 max_new_tokens=512  # Increased from 100 to get more detailed descriptions
             )
+            logger.info(f"batch decoding...")
             description = self.processor.batch_decode(
                 generated_ids,
                 skip_special_tokens=True
             )[0]
             return [{
                 "description": description.split("Assistant: ")[-1]  # Remove assistant prefix if present