kenken999 committed on
Commit
9ef5396
·
1 Parent(s): 9ec91e9
Files changed (1) hide show
  1. controllers/gra_06_video/video.py +68 -65
controllers/gra_06_video/video.py CHANGED
@@ -1,11 +1,9 @@
1
  #!/usr/bin/env python
2
 
3
  from __future__ import annotations
4
-
5
  import os
6
  import random
7
  import tempfile
8
-
9
  import gradio as gr
10
  import imageio
11
  import numpy as np
@@ -15,21 +13,26 @@ from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
15
  DESCRIPTION = '# [ModelScope Text to Video Synthesis](https://modelscope.cn/models/damo/text-to-video-synthesis/summary)'
16
  DESCRIPTION += '\n<p>For Colab usage, you can view <a href="https://colab.research.google.com/drive/1uW1ZqswkQ9Z9bp5Nbo5z59cAn7I0hE6R?usp=sharing" style="text-decoration: underline;" target="_blank">this webpage</a>.(the latest update on 2023.03.21)</p>'
17
  DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
 
18
  if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
19
  DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
20
 
21
  MAX_NUM_FRAMES = int(os.getenv('MAX_NUM_FRAMES', '200'))
22
- DEFAULT_NUM_FRAMES = min(MAX_NUM_FRAMES,
23
- int(os.getenv('DEFAULT_NUM_FRAMES', '16')))
 
 
 
 
 
 
 
 
24
 
25
- pipe = DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b',
26
- torch_dtype=torch.float16,
27
- variant='fp16')
28
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
29
  pipe.enable_model_cpu_offload()
30
  pipe.enable_vae_slicing()
31
 
32
-
33
  def to_video(frames: list[np.ndarray], fps: int) -> str:
34
  out_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
35
  writer = imageio.get_writer(out_file.name, format='FFMPEG', fps=fps)
@@ -38,19 +41,18 @@ def to_video(frames: list[np.ndarray], fps: int) -> str:
38
  writer.close()
39
  return out_file.name
40
 
41
-
42
- def generate(prompt: str, seed: int, num_frames: int,
43
- num_inference_steps: int) -> str:
44
  if seed == -1:
45
  seed = random.randint(0, 1000000)
46
- generator = torch.Generator().manual_seed(seed)
47
- frames = pipe(prompt,
48
- num_inference_steps=num_inference_steps,
49
- num_frames=num_frames,
50
- generator=generator).frames
 
 
51
  return to_video(frames, 8)
52
 
53
-
54
  examples = [
55
  ['An astronaut riding a horse.', 0, 16, 25],
56
  ['A panda eating bamboo on a rock.', 0, 16, 25],
@@ -60,16 +62,19 @@ examples = [
60
  with gr.Blocks() as gradio_interface:
61
  gr.Markdown(DESCRIPTION)
62
  with gr.Group():
63
- with gr.Column(): # `gr.Box()` → `gr.Column()` に変更
64
- with gr.Row(elem_id='prompt-container'): # `.style()` を削除
65
- prompt = gr.Text(
66
  label='Prompt',
67
  show_label=False,
68
  max_lines=1,
69
  placeholder='Enter your prompt',
70
- elem_id='prompt-text-input')
71
- run_button = gr.Button('Generate video')
 
 
72
  result = gr.Video(label='Result', show_label=False, elem_id='gallery')
 
73
  with gr.Accordion('Advanced options', open=False):
74
  seed = gr.Slider(
75
  label='Seed',
@@ -77,61 +82,59 @@ with gr.Blocks() as gradio_interface:
77
  maximum=1000000,
78
  step=1,
79
  value=-1,
80
- info='If set to -1, a different seed will be used each time.')
 
81
  num_frames = gr.Slider(
82
  label='Number of frames',
83
  minimum=16,
84
  maximum=MAX_NUM_FRAMES,
85
  step=1,
86
  value=16,
87
- info=
88
- 'Note that the content of the video also changes when you change the number of frames.'
 
 
 
 
 
 
89
  )
90
- num_inference_steps = gr.Slider(label='Number of inference steps',
91
- minimum=10,
92
- maximum=50,
93
- step=1,
94
- value=25)
95
 
 
96
 
97
- inputs = [
98
- prompt,
99
- seed,
100
- num_frames,
101
- num_inference_steps,
102
- ]
103
- gr.Examples(examples=examples,
104
- inputs=inputs,
105
- outputs=result,
106
- fn=generate,
107
- cache_examples=os.getenv('SYSTEM') == 'spaces')
108
 
109
  prompt.submit(fn=generate, inputs=inputs, outputs=result)
110
  run_button.click(fn=generate, inputs=inputs, outputs=result)
111
 
 
 
 
 
 
 
 
 
 
112
 
113
- with gr.Accordion(label='We are hiring(Based in Beijing / Hangzhou, China.)', open=False):
114
- gr.HTML("""<div class="acknowledgments">
115
- <p>
116
- If you're looking for an exciting challenge and the opportunity to work with cutting-edge technologies in AIGC and large-scale pretraining, then we are the place for you. We are looking for talented, motivated and creative individuals to join our team. If you are interested, please send your CV to us.
117
- </p>
118
- <p>
119
- <b>EMAIL: [email protected]</b>.
120
- </p>
121
- </div>
122
- """)
123
-
124
  with gr.Accordion(label='Biases and content acknowledgment', open=False):
125
- gr.HTML("""<div class="acknowledgments">
126
- <h4>Biases and content acknowledgment</h4>
127
- <p>
128
- Despite how impressive being able to turn text into video is, beware to the fact that this model may output content that reinforces or exacerbates societal biases. The training data includes LAION5B, ImageNet, Webvid and other public datasets. The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities.
129
- </p>
130
- <p>
131
- It is not intended to generate content that is demeaning or harmful to people or their environment, culture, religion, etc. Similarly, it is not allowed to generate pornographic, violent and bloody content generation. <b>The model is meant for research purposes</b>.
132
- </p>
133
- <p>
134
- To learn more about the model, head to its <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.
135
- </p>
136
- </div>
137
- """)
 
 
1
  #!/usr/bin/env python
2
 
3
  from __future__ import annotations
 
4
  import os
5
  import random
6
  import tempfile
 
7
  import gradio as gr
8
  import imageio
9
  import numpy as np
 
13
  DESCRIPTION = '# [ModelScope Text to Video Synthesis](https://modelscope.cn/models/damo/text-to-video-synthesis/summary)'
14
  DESCRIPTION += '\n<p>For Colab usage, you can view <a href="https://colab.research.google.com/drive/1uW1ZqswkQ9Z9bp5Nbo5z59cAn7I0hE6R?usp=sharing" style="text-decoration: underline;" target="_blank">this webpage</a>.(the latest update on 2023.03.21)</p>'
15
  DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
16
+
17
  if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
18
  DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
19
 
20
  MAX_NUM_FRAMES = int(os.getenv('MAX_NUM_FRAMES', '200'))
21
+ DEFAULT_NUM_FRAMES = min(MAX_NUM_FRAMES, int(os.getenv('DEFAULT_NUM_FRAMES', '16')))
22
+
23
+ # ✅ CUDA 対応修正
24
+ device = "cuda" if torch.cuda.is_available() else "cpu"
25
+
26
+ pipe = DiffusionPipeline.from_pretrained(
27
+ 'damo-vilab/text-to-video-ms-1.7b',
28
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
29
+ variant='fp16' if torch.cuda.is_available() else None
30
+ ).to(device)
31
 
 
 
 
32
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
33
  pipe.enable_model_cpu_offload()
34
  pipe.enable_vae_slicing()
35
 
 
36
  def to_video(frames: list[np.ndarray], fps: int) -> str:
37
  out_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
38
  writer = imageio.get_writer(out_file.name, format='FFMPEG', fps=fps)
 
41
  writer.close()
42
  return out_file.name
43
 
44
+ def generate(prompt: str, seed: int, num_frames: int, num_inference_steps: int) -> str:
 
 
45
  if seed == -1:
46
  seed = random.randint(0, 1000000)
47
+ generator = torch.Generator(device).manual_seed(seed)
48
+ frames = pipe(
49
+ prompt,
50
+ num_inference_steps=num_inference_steps,
51
+ num_frames=num_frames,
52
+ generator=generator
53
+ ).frames
54
  return to_video(frames, 8)
55
 
 
56
  examples = [
57
  ['An astronaut riding a horse.', 0, 16, 25],
58
  ['A panda eating bamboo on a rock.', 0, 16, 25],
 
62
  with gr.Blocks() as gradio_interface:
63
  gr.Markdown(DESCRIPTION)
64
  with gr.Group():
65
+ with gr.Column(): # `gr.Box()` → `gr.Column()` に変更
66
+ with gr.Row(elem_id='prompt-container'):
67
+ prompt = gr.Textbox( # ✅ `gr.Text()` → `gr.Textbox()` に変更
68
  label='Prompt',
69
  show_label=False,
70
  max_lines=1,
71
  placeholder='Enter your prompt',
72
+ elem_id='prompt-text-input'
73
+ )
74
+ run_button = gr.Button('Generate video') # ✅ `.style()` 削除
75
+
76
  result = gr.Video(label='Result', show_label=False, elem_id='gallery')
77
+
78
  with gr.Accordion('Advanced options', open=False):
79
  seed = gr.Slider(
80
  label='Seed',
 
82
  maximum=1000000,
83
  step=1,
84
  value=-1,
85
+ info='If set to -1, a different seed will be used each time.'
86
+ )
87
  num_frames = gr.Slider(
88
  label='Number of frames',
89
  minimum=16,
90
  maximum=MAX_NUM_FRAMES,
91
  step=1,
92
  value=16,
93
+ info='Note that the content of the video also changes when you change the number of frames.'
94
+ )
95
+ num_inference_steps = gr.Slider(
96
+ label='Number of inference steps',
97
+ minimum=10,
98
+ maximum=50,
99
+ step=1,
100
+ value=25
101
  )
 
 
 
 
 
102
 
103
+ inputs = [prompt, seed, num_frames, num_inference_steps]
104
 
105
+ gr.Examples(
106
+ examples=examples,
107
+ inputs=inputs,
108
+ outputs=result,
109
+ fn=generate,
110
+ cache_examples=os.getenv('SYSTEM') == 'spaces'
111
+ )
 
 
 
 
112
 
113
  prompt.submit(fn=generate, inputs=inputs, outputs=result)
114
  run_button.click(fn=generate, inputs=inputs, outputs=result)
115
 
116
+ with gr.Accordion(label='We are hiring (Based in Beijing / Hangzhou, China.)', open=False):
117
+ gr.HTML("""
118
+ <div class="acknowledgments">
119
+ <p>
120
+ If you're looking for an exciting challenge and the opportunity to work with cutting-edge technologies in AIGC and large-scale pretraining, then we are the place for you. We are looking for talented, motivated and creative individuals to join our team. If you are interested, please send your CV to us.
121
+ </p>
122
+ <p><b>EMAIL: [email protected]</b></p>
123
+ </div>
124
+ """)
125
 
 
 
 
 
 
 
 
 
 
 
 
126
  with gr.Accordion(label='Biases and content acknowledgment', open=False):
127
+ gr.HTML("""
128
+ <div class="acknowledgments">
129
+ <h4>Biases and content acknowledgment</h4>
130
+ <p>
131
+ Despite how impressive being able to turn text into video is, beware to the fact that this model may output content that reinforces or exacerbates societal biases. The training data includes LAION5B, ImageNet, Webvid and other public datasets. The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities.
132
+ </p>
133
+ <p>
134
+ It is not intended to generate content that is demeaning or harmful to people or their environment, culture, religion, etc. Similarly, it is not allowed to generate pornographic, violent and bloody content generation. <b>The model is meant for research purposes</b>.
135
+ </p>
136
+ <p>
137
+ To learn more about the model, head to its <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" target="_blank">model card</a>.
138
+ </p>
139
+ </div>
140
+ """)