youssef committed on
Commit
24c2f62
·
1 Parent(s): d200533
Files changed (1) hide show
  1. src/video_processor/processor.py +11 -12
src/video_processor/processor.py CHANGED
@@ -23,20 +23,17 @@ class VideoAnalyzer:
23
 
24
  logger.info("Initializing VideoAnalyzer")
25
  self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
26
- logger.info(f"Loading model from {self.model_path}")
27
 
28
  # Load processor and model
29
- self.processor = AutoProcessor.from_pretrained(
30
- self.model_path,
31
- torch_dtype=torch.bfloat16
32
- )
33
 
34
  self.model = AutoModelForImageTextToText.from_pretrained(
35
  self.model_path,
36
  torch_dtype=torch.bfloat16,
37
  # _attn_implementation="flash_attention_2"
38
  ).to(DEVICE)
39
- logger.info(f"Model loaded on device: {self.model.device} using attention implementation: flash_attention_2")
40
 
41
  def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
42
  logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
@@ -60,6 +57,8 @@ class VideoAnalyzer:
60
  ]
61
  }
62
  ]
 
 
63
 
64
  # Process video using chat template
65
  inputs = self.processor.apply_chat_template(
@@ -68,13 +67,9 @@ class VideoAnalyzer:
68
  tokenize=True,
69
  return_dict=True,
70
  return_tensors="pt"
71
- ).to(self.model.device)
72
-
73
- # Convert inputs to bfloat16 before moving to GPU
74
- #for key in inputs:
75
- # if torch.is_tensor(inputs[key]):
76
- # inputs[key] = inputs[key].to(dtype=torch.bfloat16, device=self.model.device)
77
 
 
78
  # Generate description with increased token limit
79
  generated_ids = self.model.generate(
80
  **inputs,
@@ -82,10 +77,14 @@ class VideoAnalyzer:
82
  temperature=0.7,
83
  max_new_tokens=512 # Increased from 100 to get more detailed descriptions
84
  )
 
 
85
  description = self.processor.batch_decode(
86
  generated_ids,
87
  skip_special_tokens=True
88
  )[0]
 
 
89
 
90
  return [{
91
  "description": description.split("Assistant: ")[-1] # Remove assistant prefix if present
 
23
 
24
  logger.info("Initializing VideoAnalyzer")
25
  self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
26
+ logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
27
 
28
  # Load processor and model
29
+ self.processor = AutoProcessor.from_pretrained(self.model_path)
 
 
 
30
 
31
  self.model = AutoModelForImageTextToText.from_pretrained(
32
  self.model_path,
33
  torch_dtype=torch.bfloat16,
34
  # _attn_implementation="flash_attention_2"
35
  ).to(DEVICE)
36
+ logger.info(f"Model loaded on device: {self.model.device}")
37
 
38
  def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
39
  logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
 
57
  ]
58
  }
59
  ]
60
+
61
+ logger.info(f"Applying chat template - message: {messages}")
62
 
63
  # Process video using chat template
64
  inputs = self.processor.apply_chat_template(
 
67
  tokenize=True,
68
  return_dict=True,
69
  return_tensors="pt"
70
+ ).to(DEVICE)
 
 
 
 
 
71
 
72
+ logger.info(f"Generating IDs")
73
  # Generate description with increased token limit
74
  generated_ids = self.model.generate(
75
  **inputs,
 
77
  temperature=0.7,
78
  max_new_tokens=512 # Increased from 100 to get more detailed descriptions
79
  )
80
+
81
+ logger.info(f"batch decoding...")
82
  description = self.processor.batch_decode(
83
  generated_ids,
84
  skip_special_tokens=True
85
  )[0]
86
+
87
+
88
 
89
  return [{
90
  "description": description.split("Assistant: ")[-1] # Remove assistant prefix if present