davidberenstein1957 HF staff committed on
Commit
a69bbb8
1 Parent(s): 1df21c4

feat: add support for file uploads

Browse files
src/distilabel_dataset_generator/__init__.py CHANGED
@@ -7,7 +7,7 @@ from distilabel.utils.card.dataset_card import (
7
  DistilabelDatasetCard,
8
  size_categories_parser,
9
  )
10
- from huggingface_hub import DatasetCardData, HfApi
11
 
12
 
13
  class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
 
7
  DistilabelDatasetCard,
8
  size_categories_parser,
9
  )
10
+ from huggingface_hub import DatasetCardData, HfApi, upload_file
11
 
12
 
13
  class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
src/distilabel_dataset_generator/apps/sft.py CHANGED
@@ -1,9 +1,11 @@
 
1
  import multiprocessing
2
  import time
3
 
4
  import gradio as gr
5
  import pandas as pd
6
  from distilabel.distiset import Distiset
 
7
 
8
  from src.distilabel_dataset_generator.pipelines.sft import (
9
  DEFAULT_DATASET_DESCRIPTIONS,
@@ -140,7 +142,7 @@ def generate_dataset(
140
  distiset.push_to_hub(
141
  repo_id=repo_id,
142
  private=private,
143
- include_script=False,
144
  token=oauth_token,
145
  )
146
 
@@ -155,6 +157,18 @@ def generate_dataset(
155
  return pd.DataFrame(outputs)
156
 
157
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  css = """
159
  .main_ui_logged_out{opacity: 0.3; pointer-events: none}
160
  """
@@ -169,9 +183,9 @@ with gr.Blocks(
169
  "To push the dataset to the Hugging Face Hub you need to sign in. This will only be used for pushing the dataset not for data generation."
170
  )
171
  with gr.Row():
172
- gr.Column(scale=0.5)
173
  get_login_button()
174
- gr.Column(scale=0.5)
175
 
176
  gr.Markdown("## Iterate on a sample dataset")
177
  with gr.Column() as main_ui:
@@ -304,6 +318,17 @@ with gr.Blocks(
304
  def hide_success_message():
305
  return gr.Markdown(visible=False)
306
 
 
 
 
 
 
 
 
 
 
 
 
307
  sample_dataset.change(
308
  fn=lambda x: x,
309
  inputs=[sample_dataset],
@@ -326,23 +351,16 @@ with gr.Blocks(
326
  ],
327
  outputs=[final_dataset],
328
  show_progress=True,
 
 
 
 
329
  ).success(
330
  fn=show_success_message,
331
  inputs=[org_name, repo_name],
332
  outputs=[success_message],
333
  )
334
 
335
- gr.Markdown("## Or run this pipeline locally with distilabel")
336
-
337
- with gr.Accordion("Run this pipeline using distilabel", open=False):
338
- pipeline_code = gr.Code(
339
- value=generate_pipeline_code(
340
- system_prompt.value, num_turns.value, num_rows.value
341
- ),
342
- language="python",
343
- label="Distilabel Pipeline Code",
344
- )
345
-
346
  system_prompt.change(
347
  fn=generate_pipeline_code,
348
  inputs=[system_prompt, num_turns, num_rows],
 
1
+ import io
2
  import multiprocessing
3
  import time
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from distilabel.distiset import Distiset
8
+ from huggingface_hub import upload_file
9
 
10
  from src.distilabel_dataset_generator.pipelines.sft import (
11
  DEFAULT_DATASET_DESCRIPTIONS,
 
142
  distiset.push_to_hub(
143
  repo_id=repo_id,
144
  private=private,
145
+ include_script=True,
146
  token=oauth_token,
147
  )
148
 
 
157
  return pd.DataFrame(outputs)
158
 
159
 
160
+ def upload_pipeline_code(pipeline_code, org_name, repo_name, oauth_token):
161
+ with io.BytesIO(pipeline_code.encode("utf-8")) as f:
162
+ upload_file(
163
+ path_or_fileobj=f,
164
+ path_in_repo="pipeline.py",
165
+ repo_id=f"{org_name}/{repo_name}",
166
+ repo_type="dataset",
167
+ token=oauth_token,
168
+ commit_message="Include pipeline script",
169
+ )
170
+
171
+
172
  css = """
173
  .main_ui_logged_out{opacity: 0.3; pointer-events: none}
174
  """
 
183
  "To push the dataset to the Hugging Face Hub you need to sign in. This will only be used for pushing the dataset not for data generation."
184
  )
185
  with gr.Row():
186
+ gr.Column()
187
  get_login_button()
188
+ gr.Column()
189
 
190
  gr.Markdown("## Iterate on a sample dataset")
191
  with gr.Column() as main_ui:
 
318
  def hide_success_message():
319
  return gr.Markdown(visible=False)
320
 
321
+ gr.Markdown("## Or run this pipeline locally with distilabel")
322
+
323
+ with gr.Accordion("Run this pipeline using distilabel", open=False):
324
+ pipeline_code = gr.Code(
325
+ value=generate_pipeline_code(
326
+ system_prompt.value, num_turns.value, num_rows.value
327
+ ),
328
+ language="python",
329
+ label="Distilabel Pipeline Code",
330
+ )
331
+
332
  sample_dataset.change(
333
  fn=lambda x: x,
334
  inputs=[sample_dataset],
 
351
  ],
352
  outputs=[final_dataset],
353
  show_progress=True,
354
+ ).then(
355
+ fn=upload_pipeline_code,
356
+ inputs=[pipeline_code, org_name, repo_name, oauth_token],
357
+ outputs=[],
358
  ).success(
359
  fn=show_success_message,
360
  inputs=[org_name, repo_name],
361
  outputs=[success_message],
362
  )
363
 
 
 
 
 
 
 
 
 
 
 
 
364
  system_prompt.change(
365
  fn=generate_pipeline_code,
366
  inputs=[system_prompt, num_turns, num_rows],
src/distilabel_dataset_generator/utils.py CHANGED
@@ -33,8 +33,7 @@ else:
33
 
34
  def get_login_button():
35
  return gr.LoginButton(
36
- value="Sign in with Hugging Face!",
37
- size="lg",
38
  ).activate()
39
 
40
 
 
33
 
34
  def get_login_button():
35
  return gr.LoginButton(
36
+ value="Sign in with Hugging Face!", size="lg", scale=2
 
37
  ).activate()
38
 
39