youssef committed · Commit 5f42812 · Parent(s): 97c5040

Initial setup for HF Space
Browse files
- .github/workflows/sync-to-hub.yml +17 -0
- requirements.txt +8 -0
- src/app.py +31 -0
- src/video_processor/processor.py +80 -0
.github/workflows/sync-to-hub.yml
ADDED
@@ -0,0 +1,17 @@
+name: Sync to Hugging Face Hub
+on:
+  push:
+    branches: [main]
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0  # full history; the Hub rejects pushes from shallow clones
+      - name: Push to HF Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git push https://bnkd:[email protected]/spaces/bnkd/smolvm-demo main
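This follows the usual GitHub-to-Hub sync recipe: every push to main is forwarded to the Space repo, authenticated with HF_TOKEN, a write-scoped Hub token stored as a GitHub Actions secret. For a one-off manual sync without CI, the same push can be scripted; a minimal sketch, assuming HF_TOKEN is exported in the local environment:

    # Manual equivalent of the CI job: push the local main branch to the Space.
    import os
    import subprocess

    token = os.environ["HF_TOKEN"]  # write-scoped Hub token, never hard-coded
    remote = f"https://bnkd:{token}@huggingface.co/spaces/bnkd/smolvm-demo"
    subprocess.run(["git", "push", remote, "main"], check=True)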
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch==2.1.2
+torchvision==0.16.2
+transformers @ git+https://github.com/huggingface/[email protected]
+num2words==0.5.13
+gradio==4.19.2
+av==10.0.0
+numpy==1.24.3
+Pillow==10.0.0
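The transformers pin points at the v4.49.0-SmolVLM-2 release branch rather than a PyPI release; at the time of this commit, SmolVLM2 support had not yet landed in a stable transformers version. av handles video decoding behind the processor's chat template, and num2words appears to be needed by the SmolVLM2 processor when rendering frame timestamps as text.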
src/app.py
ADDED
@@ -0,0 +1,31 @@
+import gradio as gr
+from video_processor.processor import VideoAnalyzer
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+analyzer = VideoAnalyzer()
+
+def process_video(video_path):
+    """Process video and return description"""
+    try:
+        logger.info(f"Processing video: {video_path}")
+        results = analyzer.process_video(video_path)
+        return results[0]["description"]
+    except Exception as e:
+        logger.error(f"Error processing video: {e}")
+        return str(e)
+
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_video,
+    inputs=gr.Video(label="Upload Video"),
+    outputs=gr.Textbox(label="Video Description"),
+    title="SmolVLM Video Analyzer",
+    description="Upload a video to get a detailed description of its contents."
+)
+
+if __name__ == "__main__":
+    demo.launch()
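The module-level VideoAnalyzer() means the model loads once at import time, before the interface starts serving. Once the Space is live, the endpoint can also be called programmatically; a minimal sketch, assuming a gradio_client version contemporary with gradio 4.19 (it is not listed in requirements.txt above, and "sample.mp4" is a placeholder path):

    # Query the deployed Space remotely; newer gradio_client releases
    # want handle_file() wrapped around the path instead of a bare string.
    from gradio_client import Client

    client = Client("bnkd/smolvm-demo")
    description = client.predict("sample.mp4", api_name="/predict")
    print(description)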
src/video_processor/processor.py
ADDED
@@ -0,0 +1,80 @@
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from typing import List, Dict
+import logging
+
+logger = logging.getLogger(__name__)
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {DEVICE}")
+
+class VideoAnalyzer:
+    def __init__(self):
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA is required but not available!")
+
+        logger.info("Initializing VideoAnalyzer")
+        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+        logger.info(f"Loading model from {self.model_path}")
+
+        cache_dir = "/models"
+        logger.info(f"Using cache directory: {cache_dir}")
+
+        # Load processor and model
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            cache_dir=cache_dir
+        )
+
+        # Load model directly to CUDA
+        device_map = {"": 0}  # Force model to GPU 0
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_path,
+            torch_dtype=torch.bfloat16,
+            device_map=device_map,
+            _attn_implementation="flash_attention_2",
+            cache_dir=cache_dir
+        )
+        logger.info(f"Model loaded on device: {self.model.device}")
+
+    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
+        # frame_interval is currently unused; frame sampling is left to the processor
+        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
+        try:
+            # Create message for model
+            messages = [{
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": video_path},
+                    {"type": "text", "text": "Describe this video in detail - with all the timestamps and the actions happening in the video. I should be able to understand the video by reading the description, and search for it later."}
+                ]
+            }]
+
+            # Process video using chat template
+            inputs = self.processor.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            ).to(self.model.device)
+
+            # Generate description
+            generated_ids = self.model.generate(
+                **inputs,
+                do_sample=False,
+                max_new_tokens=100
+            )
+            # Note: the decoded text includes the prompt as well as the generated answer
+            description = self.processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+
+            return [{
+                "description": description
+            }]
+
+        except Exception as e:
+            logger.error(f"Error processing video: {str(e)}", exc_info=True)
+            raise
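A minimal standalone check of the pipeline, run from src/ so the video_processor package resolves; a sketch, since it needs a CUDA GPU per the constructor's guard, and "clip.mp4" is a placeholder rather than a file in the repo:

    from video_processor.processor import VideoAnalyzer

    analyzer = VideoAnalyzer()                    # loads SmolVLM2 onto GPU 0
    results = analyzer.process_video("clip.mp4")  # returns a one-element list
    print(results[0]["description"])              # prompt plus generated description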