youssef committed
Commit 24c2f62 · 1 Parent: d200533
more logs

Changed files: src/video_processor/processor.py (+11 -12)

src/video_processor/processor.py (CHANGED)
@@ -23,20 +23,17 @@ class VideoAnalyzer:
 
         logger.info("Initializing VideoAnalyzer")
         self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-        logger.info(f"Loading model from {self.model_path}")
+        logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
 
         # Load processor and model
-        self.processor = AutoProcessor.from_pretrained(
-            self.model_path,
-            torch_dtype=torch.bfloat16
-        )
+        self.processor = AutoProcessor.from_pretrained(self.model_path)
 
         self.model = AutoModelForImageTextToText.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
             # _attn_implementation="flash_attention_2"
         ).to(DEVICE)
-        logger.info(f"Model loaded on device: {self.model.device}
+        logger.info(f"Model loaded on device: {self.model.device}")
 
     def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
         logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
@@ -60,6 +57,8 @@ class VideoAnalyzer:
                 ]
             }
         ]
+
+        logger.info(f"Applying chat template - message: {messages}")
 
         # Process video using chat template
         inputs = self.processor.apply_chat_template(
@@ -68,13 +67,9 @@ class VideoAnalyzer:
             tokenize=True,
             return_dict=True,
             return_tensors="pt"
-        ).to(
-
-        # Convert inputs to bfloat16 before moving to GPU
-        #for key in inputs:
-        #    if torch.is_tensor(inputs[key]):
-        #        inputs[key] = inputs[key].to(dtype=torch.bfloat16, device=self.model.device)
+        ).to(DEVICE)
 
+        logger.info(f"Generating IDs")
         # Generate description with increased token limit
         generated_ids = self.model.generate(
             **inputs,
@@ -82,10 +77,14 @@ class VideoAnalyzer:
             temperature=0.7,
             max_new_tokens=512 # Increased from 100 to get more detailed descriptions
         )
+
+        logger.info(f"batch decoding...")
        description = self.processor.batch_decode(
             generated_ids,
             skip_special_tokens=True
         )[0]
+
+
 
         return [{
             "description": description.split("Assistant: ")[-1] # Remove assistant prefix if present