TeacherPuffy committed on
Commit
1153ecb
·
verified ·
1 Parent(s): decfc82

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from datasets import load_dataset
3
+ import tempfile
4
+
5
def combine_dataset_texts(dataset_name, split, text_column):
    """Write one column of a Hugging Face dataset into a single text file.

    The dataset is loaded with ``load_dataset(dataset_name, split=split)``,
    the requested column is checked against ``dataset.column_names``, and the
    column's values are written to a temporary ``.txt`` file separated by
    blank lines.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        text_column: Name of the column whose values are combined.

    Returns:
        Path of the temporary file. It is created with ``delete=False``, so
        it outlives this call and is not removed by this function on success.

    Raises:
        gr.Error: If the column is missing, or if loading or writing fails.
    """
    import os  # local import: only needed for cleanup on failure

    tmp_path = None
    try:
        # Load the dataset from Hugging Face Hub
        dataset = load_dataset(dataset_name, split=split)

        # Verify the text column exists
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Stream rows straight into the file instead of first building one
        # giant string; explicit UTF-8 avoids locale-dependent encode errors.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as f:
            tmp_path = f.name
            for index, example in enumerate(dataset):
                if index:
                    f.write("\n\n")
                value = example[text_column]
                # Null cells become empty entries and non-string cells are
                # stringified, rather than aborting the whole export.
                f.write("" if value is None else str(value))
        return tmp_path

    except gr.Error:
        # Already a user-facing error (e.g. missing column): do not re-wrap.
        raise
    except Exception as e:
        # Do not leave a partially written temp file behind.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
24
+
25
# Gradio UI: three text inputs, a trigger button and a downloadable file output.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")

    # Input row: dataset identifier, split name and text column name.
    with gr.Row():
        dataset_input = gr.Textbox(
            label="Dataset Name",
            placeholder="username/dataset-name",
        )
        split_input = gr.Textbox(
            label="Split",
            value="train",
        )
        column_input = gr.Textbox(
            label="Text Column",
            value="text",
        )

    submit_btn = gr.Button("Combine Texts")

    # Output row. The hidden textbox is created here but is not connected to
    # any event in this file.
    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        error_out = gr.Textbox(
            label="Error Output",
            visible=False,
        )

    # Clicking the button calls combine_dataset_texts with the three textbox
    # values and routes its return value to the file component.
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts",
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()