Multi-GPUs
Xu Xuenan committed · Commit 5152717 · Parent(s): 6331da0
Files changed:
- app.py +137 -177
- configs/mm_story_agent.yaml +4 -4
- mm_story_agent/__init__.py +9 -3
- mm_story_agent/modality_agents/image_agent.py +5 -3
- mm_story_agent/modality_agents/music_agent.py +4 -3
- mm_story_agent/modality_agents/sound_agent.py +4 -3
- mm_story_agent/modality_agents/speech_agent.py +1 -1
- mm_story_agent/prompts_en.py +5 -4
- nls-1.0.0-py3-none-any.whl +0 -0
app.py
CHANGED
@@ -1,8 +1,8 @@
-import spaces
-
 from pathlib import Path
-import
+from copy import deepcopy
 import shutil
+import os
+from datetime import datetime
 import time
 import uuid
 import subprocess
@@ -22,7 +22,6 @@ except FileNotFoundError:
 imagemagick_installed = False
 
 if not imagemagick_installed:
-    import os
     os.system("apt update -y")
     os.system("apt install -y imagemagick")
     os.system("cp policy.xml /etc/ImageMagick-6/")
@@ -41,7 +40,7 @@ default_music_config = config["music_generation"]
 
 
 def set_generating_progress_text(text):
-    return gr.update(visible=True, value=f"<h3>{text}
+    return gr.update(visible=True, value=f"<h3>{text}</h3>")
 
 def set_text_invisible():
     return gr.update(visible=False)
@@ -67,8 +66,19 @@ def update_page(direction, page, story_data):
 def write_story_fn(story_topic, main_role, scene,
                    num_outline, temperature,
                    current_page,
+                   config,
                    progress=gr.Progress(track_tqdm=True)):
     config["story_dir"] = f"generated_stories/{time.strftime('%Y%m%d-%H%M%S') + '-' + str(uuid.uuid1().hex)}"
+    current_date = datetime.now()
+
+    if Path("generated_stories").exists():
+        for story_dir in Path("generated_stories").iterdir():
+            story_date = story_dir.name[:8]
+            story_date = datetime.strptime(story_date, '%Y%m%d')
+            date_difference = current_date - story_date
+            if date_difference.days >= 2:
+                shutil.rmtree(story_dir)
+
     deep_update(config, {
         "story_setting": {
             "story_topic": story_topic,
@@ -85,12 +95,11 @@ def write_story_fn(story_topic, main_role, scene,
     # story_data, story_accordion, story_content
     return pages, gr.update(visible=True), pages[current_page], gr.update()
 
-@spaces.GPU()
 def modality_assets_generation_fn(
         height, width, image_seed, sound_guidance_scale, sound_seed,
         n_candidate_per_text, music_duration,
-
-
+        config,
+        story_data):
     deep_update(config, {
         "image_generation": {
             "obj_cfg": {
@@ -119,60 +128,10 @@ def modality_assets_generation_fn(
     # image gallery
     return gr.update(visible=True, value=images, columns=[len(images)], rows=[1], height="auto")
 
-def speech_generation_fn(story_data):
-    story_gen_agent = MMStoryAgent()
-    story_gen_agent.generate_speech(config, story_data)
-
-@spaces.GPU(duration=60)
-def sound_generation_fn(sound_guidance_scale, sound_seed, n_candidate_per_text,
-                        story_data, progress=gr.Progress(track_tqdm=True)):
-    deep_update(config, {
-        "sound_generation": {
-            "call_cfg": {
-                "guidance_scale": sound_guidance_scale,
-                "seed": sound_seed,
-                "n_candidate_per_text": n_candidate_per_text
-            }
-        }
-    })
-    story_gen_agent = MMStoryAgent()
-    story_gen_agent.generate_sound(config, story_data)
-
-@spaces.GPU(duration=120)
-def music_generation_fn(music_duration,
-                        story_data, progress=gr.Progress(track_tqdm=True)):
-    deep_update(config, {
-        "music_generation": {
-            "call_cfg": {
-                "duration": music_duration
-            }
-        }
-    })
-    story_gen_agent = MMStoryAgent()
-    story_gen_agent.generate_music(config, story_data)
-
-@spaces.GPU(duration=120)
-def image_generation_fn(height, width, image_seed,
-                        story_data, progress=gr.Progress(track_tqdm=True)):
-    deep_update(config, {
-        "image_generation": {
-            "obj_cfg": {
-                "height": height,
-                "width": width,
-            },
-            "call_cfg": {
-                "seed": image_seed
-            }
-        },
-    })
-    story_gen_agent = MMStoryAgent()
-    result = story_gen_agent.generate_image(config, story_data)
-    images = result["images"]
-    return gr.update(visible=True, value=images, columns=[len(images)], rows=[1], height="auto")
-
 def compose_storytelling_video_fn(
         fade_duration, slide_duration, zoom_speed, move_ratio,
-        sound_volume, music_volume, bg_speech_ratio, fps,
+        sound_volume, music_volume, bg_speech_ratio, fps,
+        config,
         story_data,
         progress=gr.Progress(track_tqdm=True)):
     deep_update(config, {
@@ -194,121 +153,122 @@ def compose_storytelling_video_fn(
     return Path(config["story_dir"]) / "output.mp4"
 
 
-
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+
+    gr.HTML("""
+    <h1 style="text-align: center;">MM-StoryAgent</h1>
+    <p style="font-size: 16px;">This is a demo for generating attractive storytelling videos based on the given story setting.</p>
+    """)
+
+    config = gr.State(deepcopy(config))
+
+    with gr.Row():
+        with gr.Column():
+            story_topic = gr.Textbox(label="Story Topic", value=default_story_setting["story_topic"])
+            main_role = gr.Textbox(label="Main Role", value=default_story_setting["main_role"])
+            scene = gr.Textbox(label="Scene", value=default_story_setting["scene"])
+            chapter_num = gr.Number(label="Chapter Number", value=default_story_gen_config["num_outline"])
+            temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Temperature", value=default_story_gen_config["temperature"])
+
+            with gr.Accordion("Detailed Image Configuration (Optional)", open=False):
+                height = gr.Slider(label="Height", minimum=256, maximum=1024, step=32, value=default_image_config["obj_cfg"]['height'])
+                width = gr.Slider(label="Width", minimum=256, maximum=1024, step=32, value=default_image_config["obj_cfg"]['width'])
+                image_seed = gr.Number(label="Image Seed", value=default_image_config["call_cfg"]['seed'])
+
+            with gr.Accordion("Detailed Sound Configuration (Optional)", open=False):
+                sound_guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=7.0, step=0.5, value=default_sound_config["call_cfg"]['guidance_scale'])
+                sound_seed = gr.Number(label="Sound Seed", value=default_sound_config["call_cfg"]['seed'])
+                n_candidate_per_text = gr.Slider(label="Number of Candidates per Text", minimum=0, maximum=5, step=1, value=default_sound_config["call_cfg"]['n_candidate_per_text'])
+
+            with gr.Accordion("Detailed Music Configuration (Optional)", open=False):
+                music_duration = gr.Number(label="Music Duration", min_width=30.0, maximum=120.0, value=default_music_config["call_cfg"]["duration"])
+
+            with gr.Accordion("Detailed Slideshow Effect (Optional)", open=False):
+                fade_duration = gr.Slider(label="Fade Duration", minimum=0.1, maximum=1.5, step=0.1, value=default_slideshow_effect['fade_duration'])
+                slide_duration = gr.Slider(label="Slide Duration", minimum=0.1, maximum=1.0, step=0.1, value=default_slideshow_effect['slide_duration'])
+                zoom_speed = gr.Slider(label="Zoom Speed", minimum=0.1, maximum=2.0, step=0.1, value=default_slideshow_effect['zoom_speed'])
+                move_ratio = gr.Slider(label="Move Ratio", minimum=0.8, maximum=1.0, step=0.05, value=default_slideshow_effect['move_ratio'])
+                sound_volume = gr.Slider(label="Sound Volume", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['sound_volume'])
+                music_volume = gr.Slider(label="Music Volume", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['music_volume'])
+                bg_speech_ratio = gr.Slider(label="Background / Speech Ratio", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['bg_speech_ratio'])
+                fps = gr.Slider(label="FPS", minimum=1, maximum=30, step=1, value=default_slideshow_effect['fps'])
+
+
+        with gr.Column():
+            story_data = gr.State([])
+
+            story_generation_information = gr.Markdown(
+                label="Story Generation Status",
+                value="<h3>Generating Story Script ......</h3>",
+                visible=False)
+            with gr.Accordion(label="Story Content", open=False, visible=False) as story_accordion:
+                with gr.Row():
+                    prev_button = gr.Button("Previous Page",)
+                    next_button = gr.Button("Next Page",)
+                story_content = gr.Textbox(label="Page Content")
+            video_generation_information = gr.Markdown(label="Generation Status", value="<h3>Generating Video ......</h3>", visible=False)
+            image_gallery = gr.Gallery(label="Images", show_label=False, visible=False)
+            video_generation_btn = gr.Button("Generate Video")
+            video_output = gr.Video(label="Generated Story", interactive=False)
+
+    current_page = gr.State(0)
+
+    prev_button.click(
+        fn=update_page,
+        inputs=[gr.State("prev"), current_page, story_data],
+        outputs=[current_page, story_content]
+    )
+    next_button.click(
+        fn=update_page,
+        inputs=[gr.State("next"), current_page, story_data],
+        outputs=[current_page, story_content,])
+
+    # (possibly) update role description and scripts
+
+    video_generation_btn.click(
+        fn=set_generating_progress_text,
+        inputs=[gr.State("Generating Story ...")],
+        outputs=video_generation_information
+    ).then(
+        fn=write_story_fn,
+        inputs=[story_topic, main_role, scene,
+                chapter_num, temperature,
+                current_page,
+                config
+        ],
+        outputs=[story_data, story_accordion, story_content, video_output]
+    ).then(
+        fn=set_generating_progress_text,
+        inputs=[gr.State("Generating Modality Assets ...")],
+        outputs=video_generation_information
+    ).then(
+        fn=modality_assets_generation_fn,
+        inputs=[height, width, image_seed, sound_guidance_scale, sound_seed,
+                n_candidate_per_text, music_duration,
+                config,
+                story_data],
+        outputs=[image_gallery]
+    ).then(
+        fn=set_generating_progress_text,
+        inputs=[gr.State("Composing Video ...")],
+        outputs=video_generation_information
+    ).then(
+        fn=compose_storytelling_video_fn,
+        inputs=[fade_duration, slide_duration, zoom_speed, move_ratio,
+                sound_volume, music_volume, bg_speech_ratio, fps,
+                config,
+                story_data],
+        outputs=[video_output]
+    ).then(
+        fn=lambda : gr.update(visible=False),
+        inputs=[],
+        outputs=[image_gallery]
+    ).then(
+        fn=set_generating_progress_text,
+        inputs=[gr.State("Generation Finished!")],
+        outputs=video_generation_information
+    )
+
 
-
-
-    gr.HTML("""
-    <h1 style="text-align: center;">MM-StoryAgent</h1>
-    <p style="font-size: 16px;">This is a demo for generating attractive storytelling videos based on the given story setting.</p>
-    """)
-
-    with gr.Row():
-        with gr.Column():
-            story_topic = gr.Textbox(label="Story Topic", value=default_story_setting["story_topic"])
-            main_role = gr.Textbox(label="Main Role", value=default_story_setting["main_role"])
-            scene = gr.Textbox(label="Scene", value=default_story_setting["scene"])
-            chapter_num = gr.Number(label="Chapter Number", value=default_story_gen_config["num_outline"])
-            temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Temperature", value=default_story_gen_config["temperature"])
-
-            with gr.Accordion("Detailed Image Configuration (Optional)", open=False):
-                height = gr.Slider(label="Height", minimum=256, maximum=1024, step=32, value=default_image_config["obj_cfg"]['height'])
-                width = gr.Slider(label="Width", minimum=256, maximum=1024, step=32, value=default_image_config["obj_cfg"]['width'])
-                image_seed = gr.Number(label="Image Seed", value=default_image_config["call_cfg"]['seed'])
-
-            with gr.Accordion("Detailed Sound Configuration (Optional)", open=False):
-                sound_guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=7.0, step=0.5, value=default_sound_config["call_cfg"]['guidance_scale'])
-                sound_seed = gr.Number(label="Sound Seed", value=default_sound_config["call_cfg"]['seed'])
-                n_candidate_per_text = gr.Slider(label="Number of Candidates per Text", minimum=0, maximum=5, step=1, value=default_sound_config["call_cfg"]['n_candidate_per_text'])
-
-            with gr.Accordion("Detailed Music Configuration (Optional)", open=False):
-                music_duration = gr.Number(label="Music Duration", min_width=30.0, maximum=120.0, value=default_music_config["call_cfg"]["duration"])
-
-            with gr.Accordion("Detailed Slideshow Effect (Optional)", open=False):
-                fade_duration = gr.Slider(label="Fade Duration", minimum=0.1, maximum=1.5, step=0.1, value=default_slideshow_effect['fade_duration'])
-                slide_duration = gr.Slider(label="Slide Duration", minimum=0.1, maximum=1.0, step=0.1, value=default_slideshow_effect['slide_duration'])
-                zoom_speed = gr.Slider(label="Zoom Speed", minimum=0.1, maximum=2.0, step=0.1, value=default_slideshow_effect['zoom_speed'])
-                move_ratio = gr.Slider(label="Move Ratio", minimum=0.8, maximum=1.0, step=0.05, value=default_slideshow_effect['move_ratio'])
-                sound_volume = gr.Slider(label="Sound Volume", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['sound_volume'])
-                music_volume = gr.Slider(label="Music Volume", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['music_volume'])
-                bg_speech_ratio = gr.Slider(label="Background / Speech Ratio", minimum=0.0, maximum=1.0, step=0.1, value=default_slideshow_effect['bg_speech_ratio'])
-                fps = gr.Slider(label="FPS", minimum=1, maximum=30, step=1, value=default_slideshow_effect['fps'])
-
-
-        with gr.Column():
-            story_data = gr.State([])
-
-            story_generation_information = gr.Markdown(
-                label="Story Generation Status",
-                value="<h3>Generating Story Script ......</h3>",
-                visible=False)
-            with gr.Accordion(label="Story Content", open=False, visible=False) as story_accordion:
-                with gr.Row():
-                    prev_button = gr.Button("Previous Page",)
-                    next_button = gr.Button("Next Page",)
-                story_content = gr.Textbox(label="Page Content")
-            video_generation_information = gr.Markdown(label="Generation Status", value="<h3>Generating Video ......</h3>", visible=False)
-            image_gallery = gr.Gallery(label="Images", show_label=False, visible=False)
-            video_generation_btn = gr.Button("Generate Video")
-            video_output = gr.Video(label="Generated Story", interactive=False)
-
-    current_page = gr.State(0)
-
-    prev_button.click(
-        fn=update_page,
-        inputs=[gr.State("prev"), current_page, story_data],
-        outputs=[current_page, story_content]
-    )
-    next_button.click(
-        fn=update_page,
-        inputs=[gr.State("next"), current_page, story_data],
-        outputs=[current_page, story_content,])
-
-    video_generation_btn.click(
-        fn=set_generating_progress_text,
-        inputs=[gr.State("Generating Story")],
-        outputs=video_generation_information
-    ).then(
-        fn=write_story_fn,
-        inputs=[story_topic, main_role, scene,
-                chapter_num, temperature,
-                current_page],
-        outputs=[story_data, story_accordion, story_content, video_output]
-    ).then(
-        fn=set_generating_progress_text,
-        inputs=[gr.State("Generating Modality Assets")],
-        outputs=video_generation_information
-    ).then(
-        fn=speech_generation_fn,
-        inputs=[story_data]
-    ).then(
-        fn=sound_generation_fn,
-        inputs=[sound_guidance_scale, sound_seed, n_candidate_per_text, story_data]
-    ).then(
-        fn=music_generation_fn,
-        inputs=[music_duration, story_data]
-    ).then(
-        fn=image_generation_fn,
-        inputs=[height, width, image_seed, story_data],
-        outputs=[image_gallery]
-    ).then(
-        fn=set_generating_progress_text,
-        inputs=[gr.State("Composing Video")],
-        outputs=video_generation_information
-    ).then(
-        fn=compose_storytelling_video_fn,
-        inputs=[fade_duration, slide_duration, zoom_speed, move_ratio,
-                sound_volume, music_volume, bg_speech_ratio, fps,
-                story_data],
-        outputs=[video_output]
-    ).then(
-        fn=lambda : gr.update(visible=False),
-        inputs=[],
-        outputs=[image_gallery]
-    ).then(
-        fn=set_generating_progress_text,
-        inputs=[gr.State("Generation Finished")],
-        outputs=video_generation_information
-    )
-
-demo.launch()
+if __name__ == "__main__":
+    demo.launch()

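The app.py rewrite above replaces module-level mutation of a shared config with a per-session copy held in gr.State, and folds the separate @spaces.GPU handlers into one modality_assets_generation_fn that receives that state alongside the widget values. A minimal sketch of the gr.State pattern, assuming a plain dict config and a recursive deep_update helper equivalent to the repo's (base_config and set_size below are illustrative names, not the repo's API):

# Sketch: per-session mutable config passed through gr.State.
from copy import deepcopy

import gradio as gr

base_config = {"image_generation": {"obj_cfg": {"height": 512, "width": 1024}}}

def deep_update(target, updates):
    # Recursively merge `updates` into `target` in place.
    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            deep_update(target[key], value)
        else:
            target[key] = value
    return target

def set_size(height, width, config):
    # `config` is this session's copy of the state value; mutating it
    # does not affect other users of the Space.
    deep_update(config, {"image_generation": {"obj_cfg": {"height": int(height), "width": int(width)}}})
    return f"Config now: {config['image_generation']['obj_cfg']}"

with gr.Blocks() as demo:
    config = gr.State(deepcopy(base_config))  # gr.State keeps one value per user session
    height = gr.Slider(256, 1024, step=32, value=512, label="Height")
    width = gr.Slider(256, 1024, step=32, value=1024, label="Width")
    status = gr.Markdown()
    gr.Button("Apply").click(fn=set_size, inputs=[height, width, config], outputs=status)

if __name__ == "__main__":
    demo.launch()
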
configs/mm_story_agent.yaml
CHANGED
@@ -1,10 +1,9 @@
-story_dir: generated_stories/20240808_1130
 audio_sample_rate: &audio_sample_rate 16000
 audio_codec: mp3 # [mp3, aac, ...]
 
 
 story_setting:
-  story_topic: "
+  story_topic: "learn to use computer"
   main_role: "(no main role specified)"
   scene: "(no scene specified)"
 
@@ -26,7 +25,7 @@ sound_generation:
   call_cfg:
     guidance_scale: 3.5
     seed: 0
-    ddim_steps:
+    ddim_steps: 100
     n_candidate_per_text: 3
   revise_cfg:
     num_turns: 3
@@ -44,7 +43,7 @@ image_generation:
     num_turns: 3
   obj_cfg:
     model_name: stabilityai/stable-diffusion-xl-base-1.0
-    id_length:
+    id_length: 1
     height: 512
     width: 1024
   call_cfg:
@@ -56,6 +55,7 @@ image_generation:
 music_generation:
   revise_cfg:
     num_turns: 3
+  obj_cfg: {}
   call_cfg:
     duration: 60.0
 

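For reference, these defaults are what app.py loads before applying the per-session overrides shown above. A small sketch of reading the file, assuming PyYAML is available in the environment (the printed fields simply echo values visible in this diff):

# Sketch: load the default config and inspect a few fields updated by this commit.
import yaml  # PyYAML, assumed available

with open("configs/mm_story_agent.yaml") as f:
    config = yaml.safe_load(f)

print(config["story_setting"]["story_topic"])                # "learn to use computer"
print(config["sound_generation"]["call_cfg"]["ddim_steps"])  # 100
print(config["image_generation"]["obj_cfg"]["id_length"])    # 1
print(config["music_generation"]["obj_cfg"])                 # {}
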
mm_story_agent/__init__.py
CHANGED
@@ -22,10 +22,16 @@ class MMStoryAgent:
             "speech": CosyVoiceAgent,
             "music": MusicGenAgent
         }
+        self.modality_devices = {
+            "image": "cuda:0",
+            "sound": "cuda:1",
+            "music": "cuda:2",
+            "speech": "cuda:3"
+        }
         self.agents = {}
 
-    def call_modality_agent(self, agent, pages, save_path, return_dict):
-        result = agent.call(pages, save_path)
+    def call_modality_agent(self, agent, device, pages, save_path, return_dict):
+        result = agent.call(pages, device, save_path)
         modality = result["modality"]
         return_dict[modality] = result
 
@@ -73,7 +79,7 @@ class MMStoryAgent:
         return_dict = mp.Manager().dict()
 
         for modality in self.modalities:
-            p = mp.Process(target=self.call_modality_agent, args=(agents[modality], pages, story_dir / modality, return_dict), daemon=False)
+            p = mp.Process(target=self.call_modality_agent, args=(agents[modality], self.modality_devices[modality], pages, story_dir / modality, return_dict), daemon=False)
             processes.append(p)
             p.start()
 

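This device map is the heart of the multi-GPU change: each modality agent is pinned to one GPU and run in its own process. A condensed, CPU-only sketch of the same fan-out, with a placeholder agent in place of the real synthesizers (the device strings and daemon=False mirror the diff; EchoAgent and the save paths are illustrative):

# Sketch: one worker process per modality, each handed its own device string.
import multiprocessing as mp

class EchoAgent:
    # Stand-in for StoryDiffusionAgent / AudioLDM2Agent / MusicGenAgent / CosyVoiceAgent.
    def __init__(self, modality):
        self.modality = modality

    def call(self, pages, device, save_path):
        # A real agent would move its model to `device` and write assets under `save_path`.
        return {"modality": self.modality, "device": device, "num_pages": len(pages)}

def call_modality_agent(agent, device, pages, save_path, return_dict):
    result = agent.call(pages, device, save_path)
    return_dict[result["modality"]] = result

if __name__ == "__main__":
    modality_devices = {"image": "cuda:0", "sound": "cuda:1", "music": "cuda:2", "speech": "cuda:3"}
    pages = ["page 1", "page 2"]
    return_dict = mp.Manager().dict()
    processes = []
    for modality, device in modality_devices.items():
        p = mp.Process(target=call_modality_agent,
                       args=(EchoAgent(modality), device, pages, f"story/{modality}", return_dict),
                       daemon=False)
        processes.append(p)
        p.start()
    for p in processes:
        p.join()
    print(dict(return_dict))

Note that the hard-coded cuda:0 through cuda:3 mapping assumes at least four visible GPUs; with fewer devices the map would have to be adjusted or devices shared.
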
mm_story_agent/modality_agents/image_agent.py
CHANGED
@@ -389,6 +389,7 @@ class StoryDiffusionSynthesizer:
                  num_pages: int,
                  height: int,
                  width: int,
+                 device: str,
                  model_name: str = "stabilityai/stable-diffusion-xl-base-1.0",
                  model_path: str = None,
                  id_length: int = 4,
@@ -404,7 +405,7 @@ class StoryDiffusionSynthesizer:
         self.total_length = num_pages
         self.height = height
         self.width = width
-        self.device =
+        self.device = device
         self.dtype = torch.float16
         self.num_steps = num_steps
         self.styles = {
@@ -525,7 +526,7 @@ class StoryDiffusionSynthesizer:
         return p.replace("{prompt}", positive)
 
     def call(self,
-             prompts: List[str],
+             prompts: List[str],
              input_id_images = None,
              start_merge_step = None,
              style_name: str = "Pixar/Disney Character",
@@ -581,7 +582,7 @@ class StoryDiffusionAgent:
         if llm_type == "qwen2":
             self.LLM = QwenAgent
 
-    def call(self, pages: List, save_path: str):
+    def call(self, pages: List, device: str, save_path: str):
         role_dict = self.extract_role_from_story(pages, **self.config["revise_cfg"])
         image_prompts = self.generate_image_prompt_from_story(pages, **self.config["revise_cfg"])
         image_prompts_with_role_desc = []
@@ -592,6 +593,7 @@ class StoryDiffusionAgent:
             image_prompts_with_role_desc.append(image_prompt)
         generation_agent = StoryDiffusionSynthesizer(
             num_pages=len(pages),
+            device=device,
             **self.config["obj_cfg"]
         )
         images = generation_agent.call(

mm_story_agent/modality_agents/music_agent.py
CHANGED
@@ -14,9 +14,10 @@ class MusicGenSynthesizer:
 
     def __init__(self,
                  model_name: str = 'facebook/musicgen-medium',
+                 device: str = 'cuda',
                  sample_rate: int = 16000,
                  ) -> None:
-        self.model = MusicGen.get_pretrained(model_name)
+        self.model = MusicGen.get_pretrained(model_name, device=device).to(device)
         self.sample_rate = sample_rate
 
     def call(self,
@@ -63,10 +64,10 @@ class MusicGenAgent:
 
         return music_prompt
 
-    def call(self, pages: List, save_path: str):
+    def call(self, pages: List, device: str, save_path: str):
         save_path = Path(save_path)
         music_prompt = self.generate_music_prompt_from_story(pages, **self.config["revise_cfg"])
-        generation_agent = MusicGenSynthesizer()
+        generation_agent = MusicGenSynthesizer(device=device)
         generation_agent.call(
             prompt=music_prompt,
             save_path=save_path / "music.wav",

mm_story_agent/modality_agents/sound_agent.py
CHANGED
@@ -14,8 +14,9 @@ class AudioLDM2Synthesizer:
 
     def __init__(self,
                  model_path: str = None,
+                 device: str = "cuda"
                  ) -> None:
-        self.device =
+        self.device = device
         self.pipe = AudioLDM2Pipeline.from_pretrained(
             model_path if model_path is not None else "cvssp/audioldm2",
             torch_dtype=torch.float16
@@ -49,7 +50,7 @@ class AudioLDM2Agent:
         if llm_type == "qwen2":
             self.LLM = QwenAgent
 
-    def call(self, pages: List, save_path: str):
+    def call(self, pages: List, device: str, save_path: str):
         sound_prompts = self.generate_sound_prompt_from_story(pages, **self.config["revise_cfg"])
         save_paths = []
         forward_prompts = []
@@ -59,7 +60,7 @@ class AudioLDM2Agent:
             save_paths.append(save_path / f"p{idx + 1}.wav")
             forward_prompts.append(sound_prompts[idx])
 
-        generation_agent = AudioLDM2Synthesizer()
+        generation_agent = AudioLDM2Synthesizer(device=device)
         if len(forward_prompts) > 0:
             sounds = generation_agent.call(
                 forward_prompts,

mm_story_agent/modality_agents/speech_agent.py
CHANGED
@@ -74,7 +74,7 @@ class CosyVoiceAgent:
     def __init__(self, config) -> None:
         self.config = config
 
-    def call(self, pages: List, save_path: str):
+    def call(self, pages: List, device: str, save_path: str):
         save_path = Path(save_path)
         generation_agent = CosyVoiceSynthesizer()
 

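With this commit, all four modality agents expose the same call(pages, device, save_path) signature, which is what lets MMStoryAgent.call_modality_agent (see the __init__.py diff above) dispatch them uniformly across devices. A toy illustration of that contract, with a dummy agent standing in for the real synthesizers (the class name, the extra return fields, and the paths are illustrative):

# Sketch of the shared agent interface assumed by MMStoryAgent.call_modality_agent.
from pathlib import Path
from typing import List

class DummySpeechAgent:
    def __init__(self, config: dict) -> None:
        self.config = config

    def call(self, pages: List[str], device: str, save_path: str) -> dict:
        save_dir = Path(save_path)
        save_dir.mkdir(parents=True, exist_ok=True)
        # A real agent would synthesize one asset per page on `device`.
        for idx, page in enumerate(pages):
            (save_dir / f"p{idx + 1}.txt").write_text(page)
        # The "modality" key is what call_modality_agent uses to file the result.
        return {"modality": "speech", "device": device, "items": len(pages)}

if __name__ == "__main__":
    agent = DummySpeechAgent(config={})
    print(agent.call(["Once upon a time ..."], "cuda:3", "generated_stories/demo/speech"))
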
mm_story_agent/prompts_en.py
CHANGED
@@ -89,10 +89,11 @@ The input consists of already written story content and the current chapter that
 Output the expanded story content for the current chapter. The result should be a list where each element corresponds to the plot of one page of the storybook.
 
 ## Notes
-1. Only expand the current chapter
-2. The expanded content should not be too
-3. Maintain the tone of the story
-4.
+1. Only expand the current chapter. Do not overwrite content from other chapters.
+2. The expanded story content should not be too long, with a maximum of 3 pages. Each page contains only 1 sentence.
+3. Maintain the tone of the story. Do not add extra annotations, explanations, settings, or comments.
+4. Use simple and straightforward language suitable for children's stories.
+5. If the story is already complete, no further writing is necessary.
 """.strip()
 
 

nls-1.0.0-py3-none-any.whl
DELETED
Binary file (47 kB)