Spaces:
Sleeping
Sleeping
File size: 4,973 Bytes
4fbb557 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
from itertools import count, islice
from typing import Any, Iterable
import gradio as gr
import pandas as pd
import requests
from gradio_huggingfacehub_search import HuggingfaceHubSearch
# HTTP session reused by every datasets-server request made in this file.
session = requests.Session()
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
empty_dataframe = pd.DataFrame({"1": [], "2": [], "3": []})
# Number of rows fetched for, and shown in, the input preview.
NUM_ROWS_PREVIEW = 5
# Declare the UI components inside a gr.Blocks context bound to `demo`.
# NOTE(review): indentation is missing in this listing, so which components are
# nested under gr.Row() / gr.Column(scale=3) cannot be determined from here —
# confirm against the original file.
with gr.Blocks() as demo:
# Page title and one-line description.
gr.Markdown(
"# 🤗 Dataset ReWriter ✍️✨\n\n"
"Adjust, translate or transform completely existing datasets.\n\n"
)
with gr.Row():
with gr.Column(scale=3):
# Search box used to pick the Hub dataset ID.
dataset_search = HuggingfaceHubSearch(
label="Hub Dataset ID",
placeholder="Search for dataset id on Huggingface",
search_type="dataset",
)
# Both dropdowns are created hidden; _resolve_dataset_selection shows each one
# only when it has more than one choice.
subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
# Free-text instruction describing the transformation to apply.
input_query = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
gr.Markdown("### Input")
# Non-interactive table filled with the first rows of the selected split.
input_preview = gr.DataFrame(interactive=False, wrap=True)
gr.Markdown("### Output")
# Non-interactive table that receives the result of the rewrite handler.
output_preview = gr.DataFrame(interactive=False, wrap=True)
# Button for saving the rewritten dataset; created disabled (interactive=False).
save_button = gr.Button("Save ReWritten Dataset", interactive=False)
############
#
# Utils
#
###########
def stream_rows(dataset: str, subset: str, split: str, batch_size: int = 100) -> Iterable[dict[str, Any]]:
    """Yield the rows of a Hub dataset split, fetched page by page.

    Pages of ``batch_size`` rows are requested from the datasets-server
    ``/rows`` endpoint until a page with no rows is returned.

    Args:
        dataset: Hub dataset ID, sent as the ``dataset`` query parameter.
        subset: Sent as the ``config`` query parameter.
        split: Sent as the ``split`` query parameter.
        batch_size: Number of rows requested per HTTP call.

    Yields:
        The ``"row"`` entry of each item in the response's ``"rows"`` list.

    Raises:
        RuntimeError: If the JSON response contains an ``"error"`` key.
    """
    for page in count():
        # Hand the values to requests via ``params`` so they are percent-encoded;
        # interpolating them into the URL breaks on names that contain
        # characters such as "&", "#", "+" or spaces.
        rows_resp = session.get(
            "https://datasets-server.huggingface.co/rows",
            params={
                "dataset": dataset,
                "config": subset,
                "split": split,
                "offset": page * batch_size,
                "length": batch_size,
            },
            timeout=10,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            # An empty page means the split is exhausted.
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]
############
#
# Events
#
###########
def _resolve_dataset_selection(
    dataset: str, default_subset: str, default_split: str
) -> "tuple[str | None, str | None, dict]":
    """Choose the subset and split to use for a dataset and build dropdown updates.

    Args:
        dataset: Hub dataset ID typed or selected in the search box.
        default_subset: Subset to use when the dataset has it; otherwise the
            first listed subset is used.
        default_split: Split to use when the chosen subset has it; otherwise
            the first listed split is used.

    Returns:
        A ``(subset, split, updates)`` tuple. ``updates`` maps
        ``subset_dropdown`` and ``split_dropdown`` to their new state. When the
        dataset ID is empty or has no "/" separator, when the ``/info`` response
        contains an ``"error"`` key, or when no subset or split is listed,
        ``subset`` and ``split`` are ``None`` and both dropdowns are hidden.
    """

    def hide_dropdowns() -> tuple:
        # Result shared by every "nothing to select" exit below.
        return None, None, {
            subset_dropdown: gr.Dropdown(visible=False),
            split_dropdown: gr.Dropdown(visible=False),
        }

    # Skip the HTTP call for an empty value or one without a "/" separator.
    if not dataset or "/" not in dataset.strip().strip("/"):
        return hide_dropdowns()
    # ``params`` lets requests percent-encode the dataset ID.
    info_resp = session.get(
        "https://datasets-server.huggingface.co/info",
        params={"dataset": dataset},
        timeout=3,
    ).json()
    if "error" in info_resp:
        return hide_dropdowns()
    subsets: list[str] = list(info_resp["dataset_info"])
    if not subsets:
        return hide_dropdowns()
    subset = default_subset if default_subset in subsets else subsets[0]
    # Normalize to a list of names: when "splits" is a mapping keyed by split
    # name, indexing it with [0] would raise KeyError instead of returning the
    # first split.
    splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
    if not splits:
        return hide_dropdowns()
    split = default_split if default_split in splits else splits[0]
    # A dropdown is only worth showing when there is an actual choice to make.
    return subset, split, {
        subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
        split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
    }
def _show_input_preview(dataset: str, default_subset: str, default_split: str) -> dict:
    """Build the component updates for the input preview and both dropdowns.

    The subset and split are resolved with ``_resolve_dataset_selection``. When
    either one is ``None`` only the dropdown updates are returned. Otherwise the
    result also maps ``input_preview`` to a DataFrame holding the first
    ``NUM_ROWS_PREVIEW`` rows of the split, with every value converted by
    ``str()``.
    """
    subset, split, dropdown_updates = _resolve_dataset_selection(
        dataset, default_subset=default_subset, default_split=default_split
    )
    if subset is None or split is None:
        return dropdown_updates

    # Fetch exactly one page sized to the preview and keep at most that many rows.
    first_rows = islice(
        stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW),
        NUM_ROWS_PREVIEW,
    )
    stringified_rows = [
        {column: str(value) for column, value in row.items()}
        for row in first_rows
    ]
    return {input_preview: pd.DataFrame(stringified_rows), **dropdown_updates}
@dataset_search.change(
    inputs=[dataset_search],
    outputs=[input_preview, subset_dropdown, split_dropdown],
)
def show_input_from_dataset_search(dataset: str) -> dict:
    """Refresh the preview and dropdowns when the dataset search value changes.

    No subset or split has been picked at this point, so "default" and "train"
    are passed as the preferred values.
    """
    updates = _show_input_preview(dataset, default_subset="default", default_split="train")
    return updates
@subset_dropdown.change(
    inputs=[dataset_search, subset_dropdown],
    outputs=[input_preview, subset_dropdown, split_dropdown],
)
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
    """Refresh the preview and dropdowns when the subset dropdown changes.

    The selected subset is passed as the preferred subset and "train" as the
    preferred split.
    """
    updates = _show_input_preview(dataset, default_subset=subset, default_split="train")
    return updates
@split_dropdown.change(
    inputs=[dataset_search, subset_dropdown, split_dropdown],
    outputs=[input_preview, subset_dropdown, split_dropdown],
)
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
    """Refresh the preview and dropdowns when the split dropdown changes.

    The currently selected subset and split are passed as the preferred values.
    """
    updates = _show_input_preview(dataset, default_subset=subset, default_split=split)
    return updates
@rewrite_button.click(
    inputs=[dataset_search, subset_dropdown, split_dropdown, input_preview],
    outputs=[output_preview],
)
def rewrite(dataset: str, subset: str, split: str, input_preview_df: pd.DataFrame) -> dict:
    """Handle a click on the rewrite button.

    Not implemented yet: the arguments are ignored and a fixed placeholder
    DataFrame is sent to ``output_preview``.
    """
    # TODO: implement
    placeholder = pd.DataFrame([{"TODO": ["implement"]}])
    return {output_preview: placeholder}
# Start the Gradio app defined by `demo` (runs at import time; there is no __main__ guard).
demo.launch()
|