MuseV-test / mmcm /utils /table_util.py
kevinwang676's picture
Upload folder using huggingface_hub
6755a2d verified
import io
from typing import List, Dict, Tuple, BinaryIO
from copy import deepcopy
import pandas as pd
from PIL import Image
from xlsxwriter.utility import xl_col_to_name
def read_image_data(path: str, target_image_height: int) -> Tuple[BinaryIO, int, int]:
"""读取图像,按照目标高做resize处理,并转化成二进制格式,返回最终图像的宽和高
Args:
path (str): 图像路径
target_image_height (int): 目标高
Returns:
Tuple[BinaryIO, int, int]: 图像二进制格式,返回最终图像的宽和高
"""
image = Image.open(path)
image_width_in_excel = int(image.width / (image.height / target_image_height))
image = image.resize(size=(image_width_in_excel, target_image_height))
image_byte = io.BytesIO()
image.save(image_byte, format="png")
return image_byte, image.width, image.height
def set_text_column_dynamic_width(worksheet, df, format, default_width=50):
"""将df在excel workshhet中的列按照实际内容长度设置列宽以及文本格式
Args:
worksheet (_type_): 待处理的excel worksheet
df (_type_): worksheet中原来对应的DataFrame格式
format (_type_): 对应列的文本格式
default_width (int, optional): 默认目标宽度. Defaults to 50.
"""
for column in df:
column_width = max(df[column].astype(str).map(len).max(), len(column))
col_idx = df.columns.get_loc(column)
width = min(column_width, default_width)
worksheet.set_column(col_idx, col_idx, width, format)
def convert_tasks2clean(tasks):
tasks = [{"prompt": task["prompt"]} for task in tasks]
return tasks
def split_tasks_by_images_lst(tasks, save_images_path_key: str = "save_images_path"):
new_tasks = []
for task in tasks:
for image_path in task[save_images_path_key]:
new_task = deepcopy(task)
new_task[save_images_path_key] = image_path
new_tasks.append(new_task)
return new_tasks
def save_texts_images_2_csv(tasks: List[Dict], save_path: str):
"""存储相关结果为csv表格
tasks (List[Dict]): 待转换的字典列表
save_path (str): 表格存储路径
"""
df = pd.DataFrame(tasks)
df.to_csv(save_path, encoding="utf_8_sig", index=False)
def add_multi_data_validation(workbook, worksheet, validates, validate_idxs, n_rows):
for i, validate in enumerate(validates):
validate_idx = validate_idxs[i]
worksheet = add_data_validation(
workbook=workbook,
worksheet=worksheet,
col=validate_idx,
head=validate["col_name"],
candidates=validate["candidates"],
colors=validate["colors"],
n_rows=n_rows,
)
return worksheet
def add_data_validation(
workbook, worksheet, col: int, head, candidates, n_rows, colors
):
col = xl_col_to_name(col)
# Adding the header and Datavalidation list
worksheet.write('{}1'.format(col), head)
colors_fmt = [workbook.add_format({'bg_color': color}) for color in colors]
for row in range(n_rows):
cell_idx = '{}{}'.format(col, row + 2)
worksheet.data_validation(cell_idx, {'validate': 'list', 'source': candidates})
for i_c in range(len(candidates)):
worksheet.conditional_format(
cell_idx,
{
'type': 'formula',
'criteria': '=${}=\"{}\"'.format(cell_idx, candidates[i_c]),
'format': colors_fmt[i_c],
},
)
return worksheet
def insert_cell_image(
worksheet,
row,
col,
image_path,
image_height_in_table,
text_format,
row_ratio,
col_ratio,
):
image_byte, new_image_width, new_image_height = read_image_data(
image_path, target_image_height=image_height_in_table
)
# TODO:现在的图像列并不是预期内的和图像等宽,而是宽了很多
worksheet.set_column(
col,
col,
int(new_image_width / col_ratio),
)
worksheet.insert_image(
row,
col,
image_path,
{"image_data": image_byte},
)
worksheet.set_row(row, int(new_image_height / row_ratio), text_format)
return worksheet
def save_texts_images_2_excel(
tasks: List[Dict],
save_path: str,
image_height_in_table: int = 120,
row_ratio: float = 1.3,
col_ratio: float = 5,
validates: List = None,
):
"""将任务列表和生成的图像统一存储在表格中,方便观看对比实验结果。
Args:
tasks (List[Dict]): 待转换的字典列表
save_path (str): 表格存储路径
image_height_in_table (int, optional): 表格中缩略图的高. Defaults to 120.
row_ratio (float, optional): excel的单元格宽高和实际图像边长需要做比例转换. Defaults to 1.2.
col_ratio (float, optional): excel的单元格宽高和实际图像边长需要做比例转换. Defaults to 7.5.
need_add_checker_column (bool, optional): 是否新增一列用于审核检查状态. Defaults to False.
"""
df = pd.DataFrame(tasks)
# 先找到需要插入图像的列,插入图像列
keys_with_image = [
k for k in tasks[0].keys() if "images_path" in k and k != "save_images_path"
]
high_priority_col_idx = 0
# 默认save_images_path是生成图像,放在后面
if "save_images_path" in tasks[0]:
keys_with_image.append("save_images_path")
for img_key in keys_with_image:
maxlen_img_key_value = max(
[
len(task[img_key]) if isinstance(task[img_key], list) else 1
for task in tasks
]
)
for i in range(maxlen_img_key_value):
column = "{}_{}".format(img_key, i)
if column not in df.columns:
df.insert(
loc=high_priority_col_idx,
column=column,
value="",
)
high_priority_col_idx += 1
validate_start_idx = high_priority_col_idx
if validates is not None:
for i, validate in enumerate(validates):
if validate["col_name"] not in df.columns:
col_idx = validate_start_idx + i
df.insert(loc=col_idx, column=validate["col_name"], value="")
validate_idxs = range(validate_start_idx, validate_start_idx + len(validates))
writer = pd.ExcelWriter(save_path, engine="xlsxwriter")
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name="Sheet1", index=False)
# Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets["Sheet1"]
n_col = len(df.columns)
n_row = len(df)
text_format = workbook.add_format({"text_wrap": True})
set_text_column_dynamic_width(worksheet=worksheet, df=df, format=text_format)
# Insert an image.
for row in range(len(df)):
task = tasks[row]
cell_col = 0
for im_idx, img_key in enumerate(keys_with_image):
images_path = task[img_key]
if not isinstance(images_path, list):
cell_row = 1 + row
worksheet.write(cell_row, cell_col, img_key)
if len(images_path) == 0:
continue
worksheet = insert_cell_image(
worksheet=worksheet,
row=cell_row,
col=cell_col,
image_path=images_path,
image_height_in_table=image_height_in_table,
text_format=text_format,
row_ratio=row_ratio,
col_ratio=col_ratio,
)
cell_col += 1
else:
for i, image_path in enumerate(images_path):
worksheet = insert_cell_image(
worksheet=worksheet,
row=cell_row,
col=cell_col,
image_path=image_path,
image_height_in_table=image_height_in_table,
text_format=text_format,
row_ratio=row_ratio,
col_ratio=col_ratio,
)
cell_col += 1
if validates is not None:
worksheet = add_multi_data_validation(
workbook,
worksheet,
validates=validates,
validate_idxs=validate_idxs,
n_rows=len(df),
)
# Close the Pandas Excel writer and output the Excel file.
writer.save()