-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathutil.py
More file actions
117 lines (88 loc) · 4.04 KB
/
util.py
File metadata and controls
117 lines (88 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from PIL import Image
from diffsynth.utils.data import save_video
from copy import deepcopy
import numpy as np
from PIL import Image, ImageDraw, ImageFont
def rgb_to_latent_shot_groups_list(shot_groups_list):
    """Map RGB-frame shot ranges to contiguous latent-frame ranges.

    Each shot of ``n`` RGB frames occupies ``1 + (n - 1) // 4`` latent frames
    (the video VAE compresses time by 4x, keeping one extra leading frame).
    Latent ranges are laid out back-to-back starting at 0.

    Args:
        shot_groups_list: list of ``[start_frame, end_frame]`` pairs in RGB
            frame indices (end exclusive).

    Returns:
        List of ``[latent_start, latent_end]`` pairs, one per input shot.

    Note: the original implementation looked up the previous shot's latent end
    via ``list[shot_index - 1][-1]``; for the first shot with a non-zero start
    that index wrapped around to the last group — a bug. We instead track the
    previous latent end explicitly, which is identical for the normal case of
    shots laid out contiguously from frame 0.
    """
    latent_groups = []
    prev_latent_end = 0
    for shot_start_frame, shot_end_frame in shot_groups_list:
        shot_frame_num = shot_end_frame - shot_start_frame
        # First shot (or any shot starting at frame 0) begins at latent 0;
        # otherwise continue right after the previous shot's latent range.
        latent_start = 0 if shot_start_frame == 0 else prev_latent_end
        latent_end = latent_start + 1 + max(0, shot_frame_num - 1) // 4
        latent_groups.append([latent_start, latent_end])
        prev_latent_end = latent_end
    return latent_groups
def pad_shot_groups_to_4n_plus_1(shot_groups_list):
    """Pad every shot to a length of the form 4n + 1 frames.

    Shots are re-laid-out back-to-back starting from the first shot's start,
    each one stretched to the smallest length ``>=`` its original length that
    satisfies ``(length - 1) % 4 == 0``.

    Args:
        shot_groups_list: non-empty list of ``[start, end]`` frame pairs.

    Returns:
        Tuple of (padded ``[start, end]`` pairs, original lengths per shot).
    """
    padded_groups = []
    original_lengths = []
    cursor = shot_groups_list[0][0]
    for start, end in shot_groups_list:
        length = end - start
        original_lengths.append(length)
        # Distance past the last 4n+1 boundary; 0 means already well-formed.
        overshoot = (length - 1) % 4
        padded_length = length if overshoot == 0 else length + (4 - overshoot)
        padded_groups.append([cursor, cursor + padded_length])
        cursor += padded_length
    return padded_groups, original_lengths
def get_user_wanted_frames(video, padded_shot_groups, save_shot_num_list):
    """Drop padding frames, keeping only the originally-requested frames.

    For each padded shot, keep the first ``save_shot_num_list[i]`` frames
    starting at its padded start offset, then stitch the kept segments back
    together along the frame axis.

    Args:
        video: array-like of frames, indexed along axis 0.
        padded_shot_groups: ``[start, end]`` pairs in the padded layout.
        save_shot_num_list: number of frames to keep per shot.

    Returns:
        np.ndarray of the concatenated kept frames.
    """
    kept_segments = [
        video[start: start + keep]
        for (start, _end), keep in zip(padded_shot_groups, save_shot_num_list)
    ]
    return np.concatenate(kept_segments, axis=0)
class TextImageCreator:
    """Renders word-wrapped caption text onto a black image.

    Produces fixed-size RGB images (as NumPy arrays) containing white text,
    wrapping character-by-character so CJK text without spaces also wraps.
    """

    def get_text_width(self, text, font):
        """Return the rendered pixel width of *text* for *font*.

        Tries the Pillow APIs from newest to oldest:
        ``font.getlength`` (Pillow >= 8), then ``font.getsize`` (removed in
        Pillow 10), then a crude ``len(text) * font.size`` estimate.

        Fix: the original fallbacks used a bare ``except:``, which also
        swallowed KeyboardInterrupt/SystemExit; narrowed to ``Exception``
        while keeping the same best-effort fallback chain.
        """
        try:
            return font.getlength(text)
        except AttributeError:
            try:
                return font.getsize(text)[0]
            except Exception:
                # Last resort: rough estimate assuming square glyph cells.
                return len(text) * font.size

    def create_text_image(self, text, width, height):
        """Render *text* onto a black ``width`` x ``height`` RGB image.

        Text is wrapped per character to fit within ``width - 20`` pixels
        (10 px margins), honoring explicit newlines, and drawn top-down in
        white with a fixed 19 px line spacing.

        Returns:
            np.ndarray of shape (height, width, 3), dtype uint8.
        """
        image = Image.new('RGB', (width, height), color='black')
        draw = ImageDraw.Draw(image)
        try:
            font = ImageFont.truetype("comic.ttf", 19)
        except OSError:
            # truetype raises OSError when the font file can't be found/read;
            # fall back to Pillow's built-in bitmap font.
            font = ImageFont.load_default()
        lines = []
        paragraphs = text.split('\n')
        for paragraph in paragraphs:
            if not paragraph.strip():
                # Preserve blank lines from explicit newlines in the input.
                lines.append('')
                continue
            current_line = ''
            for char in paragraph:
                test_line = current_line + char
                if self.get_text_width(test_line, font) > width - 20:
                    lines.append(current_line)
                    current_line = char
                else:
                    current_line = test_line
            if current_line:
                lines.append(current_line)
        y = 5
        line_spacing = 19
        for line in lines:
            draw.text((10, y), line, font=font, fill='white')
            y += line_spacing
        return np.array(image)
def save_video_with_caption(num_shots, shot_groups, now_multishot_video_caption_list, user_wanted_frames, save_path, target_width):
    """Stack a per-shot caption strip under the video frames and save to disk.

    For each shot, a 250 px tall caption image is rendered (with the shot
    label injected before "Now:") and repeated for every frame in that shot.
    The caption strip is concatenated below the video frames along the
    height axis and the result is written with ``save_video``.

    Args:
        num_shots: number of shots to caption.
        shot_groups: ``[start, end]`` frame pairs per shot.
        now_multishot_video_caption_list: caption string per shot.
        user_wanted_frames: video frames, shape (frames, H, W, C).
        save_path: output video path.
        target_width: pixel width of the caption strip (matches the video).
    """
    creator = TextImageCreator()
    caption_frames = []
    for shot_idx in range(num_shots):
        caption_text = now_multishot_video_caption_list[shot_idx].replace(
            "Now:", f"\n\n[Shot{num_shots}_{shot_idx+1}]-Now:"
        )
        strip = creator.create_text_image(caption_text, target_width, 250)
        first_frame, last_frame = shot_groups[shot_idx]
        # One identical caption frame per video frame of this shot.
        caption_frames.extend([strip] * (last_frame - first_frame))
    caption_video = np.stack(caption_frames)
    composite = np.concatenate((user_wanted_frames, caption_video), axis=1)
    save_video(composite, save_path, fps=15, quality=5)