youssef committed on
Commit 5f42812 · 1 Parent(s): 97c5040

Initial setup for HF Space

.github/workflows/sync-to-hub.yml ADDED
@@ -0,0 +1,15 @@
+ name: Sync to Hugging Face Hub
+ on:
+   push:
+     branches: [main]
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+       - name: Push to HF Hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           git push https://bnkd:[email protected]/spaces/bnkd/smolvm-demo main
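The workflow embeds the token in the push URL; an equivalent, hedged alternative is syncing with the huggingface_hub client. A minimal sketch assuming the same bnkd/smolvm-demo Space and the same HF_TOKEN environment variable (an alternative to the git push step, not what this commit does):

    import os
    from huggingface_hub import HfApi

    # Sketch: upload the checked-out working tree to the Space without
    # putting the token in a git URL. Assumes HF_TOKEN is set by the workflow.
    api = HfApi(token=os.environ["HF_TOKEN"])
    api.upload_folder(
        folder_path=".",             # repository root from actions/checkout
        repo_id="bnkd/smolvm-demo",  # Space targeted by the git push above
        repo_type="space",
    )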
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch==2.1.2
+ torchvision==0.16.2
+ transformers @ git+https://github.com/huggingface/[email protected]
+ num2words==0.5.13
+ gradio==4.19.2
+ av==10.0.0
+ numpy==1.24.3
+ Pillow==10.0.0
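Since transformers is pinned to a tag rather than a PyPI release, version drift is easy to introduce. A small hedged sketch (not part of the commit) that logs installed versions at startup so mismatches with these pins surface in the Space build logs:

    # Sketch: print installed versions of the packages pinned above.
    import av, gradio, numpy, PIL, torch, torchvision, transformers

    for mod in (torch, torchvision, transformers, gradio, av, numpy, PIL):
        print(f"{mod.__name__}=={getattr(mod, '__version__', 'unknown')}")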
src/app.py ADDED
@@ -0,0 +1,31 @@
+ import gradio as gr
+ from video_processor.processor import VideoAnalyzer
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ analyzer = VideoAnalyzer()
+
+ def process_video(video_path):
+     """Process video and return description"""
+     try:
+         logger.info(f"Processing video: {video_path}")
+         results = analyzer.process_video(video_path)
+         return results[0]["description"]
+     except Exception as e:
+         logger.error(f"Error processing video: {e}")
+         return str(e)
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=process_video,
+     inputs=gr.Video(label="Upload Video"),
+     outputs=gr.Textbox(label="Video Description"),
+     title="SmolVLM Video Analyzer",
+     description="Upload a video to get a detailed description of its contents."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
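demo.launch() with no arguments relies on Gradio's defaults; in a custom Docker Space (which the /models cache path in processor.py suggests), the server must listen on all interfaces on port 7860. A hedged variant of the launch line, not part of the commit:

    # Sketch: explicit host/port for running the app inside Docker; Spaces
    # also honors the GRADIO_SERVER_NAME / GRADIO_SERVER_PORT env vars.
    demo.launch(server_name="0.0.0.0", server_port=7860)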
src/video_processor/processor.py ADDED
@@ -0,0 +1,79 @@
+ import torch
+ from transformers import AutoProcessor, AutoModelForImageTextToText
+ from typing import List, Dict
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ logger.info(f"Using device: {DEVICE}")
+
+ class VideoAnalyzer:
+     def __init__(self):
+         if not torch.cuda.is_available():
+             raise RuntimeError("CUDA is required but not available!")
+
+         logger.info("Initializing VideoAnalyzer")
+         self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+         logger.info(f"Loading model from {self.model_path}")
+
+         cache_dir = "/models"
+         logger.info(f"Using cache directory: {cache_dir}")
+
+         # Load processor and model
+         self.processor = AutoProcessor.from_pretrained(
+             self.model_path,
+             cache_dir=cache_dir,
+             torch_dtype=torch.bfloat16
+         )
+
+         # Load model directly to CUDA
+         device_map = {"": 0}  # Force model to GPU 0
+         self.model = AutoModelForImageTextToText.from_pretrained(
+             self.model_path,
+             torch_dtype=torch.bfloat16,
+             device_map=device_map,
+             _attn_implementation="flash_attention_2",
+             cache_dir=cache_dir
+         )
+         logger.info(f"Model loaded on device: {self.model.device}")
+
+     def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
+         logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
+         try:
+             # Create message for model
+             messages = [{
+                 "role": "user",
+                 "content": [
+                     {"type": "video", "path": video_path},
+                     {"type": "text", "text": "Describe this video in detail - with all the timestamps and the actions happening in the video. I should be able to understand the video by reading the description, and search for it later."}
+                 ]
+             }]
+
+             # Process video using chat template
+             inputs = self.processor.apply_chat_template(
+                 messages,
+                 add_generation_prompt=True,
+                 tokenize=True,
+                 return_dict=True,
+                 return_tensors="pt"
+             ).to(self.model.device)
+
+             # Generate description
+             generated_ids = self.model.generate(
+                 **inputs,
+                 do_sample=False,
+                 max_new_tokens=100
+             )
+             description = self.processor.batch_decode(
+                 generated_ids,
+                 skip_special_tokens=True
+             )[0]
+
+             return [{
+                 "description": description
+             }]
+
+         except Exception as e:
+             logger.error(f"Error processing video: {str(e)}", exc_info=True)
+             raise
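For completeness, a hypothetical smoke test of the class above; it assumes a CUDA machine (the constructor raises otherwise) and a local clip named sample.mp4, and should be run from inside src/ so the package import resolves:

    # Hypothetical smoke test for VideoAnalyzer; sample.mp4 is a stand-in name.
    from video_processor.processor import VideoAnalyzer

    analyzer = VideoAnalyzer()
    results = analyzer.process_video("sample.mp4")
    print(results[0]["description"])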