openfree committed on
Commit
5322786
·
verified ·
1 Parent(s): 4a831bc

Create app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +179 -0
app-backup.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ from io import BytesIO
5
+ import requests
6
+
7
def _to_jsonable(val):
    """Recursively convert *val* into a value ``json.dumps`` can serialize.

    - ``bytes`` are decoded as UTF-8 (undecodable bytes are replaced).
    - ``dict`` / ``list`` / ``tuple`` are walked recursively.
    - Objects exposing ``tolist()`` (NumPy arrays and scalars) are unwrapped
      into plain Python containers/scalars first.
    - Missing scalars (``None``, ``NaN``, ``NaT``, ``pd.NA``) become ``None``
      so the output is valid JSON (``null``) rather than a bare ``NaN``.
    - Objects exposing ``isoformat()`` (dates, timestamps) become ISO strings.
    """
    if isinstance(val, bytes):
        return val.decode("utf-8", errors="replace")
    if isinstance(val, dict):
        return {k: _to_jsonable(v) for k, v in val.items()}
    if isinstance(val, (list, tuple)):
        return [_to_jsonable(item) for item in val]
    if hasattr(val, "tolist"):
        # numpy.int64 / numpy.bool_ / numpy.ndarray are rejected by json.dumps;
        # tolist() yields native Python objects, which are then re-checked.
        return _to_jsonable(val.tolist())
    # Guard with is_scalar so pd.isna never returns an array here.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return None
    if hasattr(val, "isoformat"):
        return val.isoformat()
    return val


def dataset_converter(input_file, conversion_type, parquet_url):
    """Convert an uploaded dataset file and return the result plus a preview.

    Args:
        input_file: ``None``, a file-like object exposing ``read()`` and
            ``name``, or a filesystem path string.
        conversion_type: One of ``"CSV to Parquet"``, ``"Parquet to CSV"``,
            ``"CSV to JSONL"`` or ``"Parquet to JSONL"``.
        parquet_url: URL of a Parquet file; only consulted for
            ``"Parquet to JSONL"`` when no file was uploaded.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the written file
        (a fixed name relative to the current working directory) and a text
        summary containing the first 10 rows.

    Raises:
        ValueError: If the conversion type is unknown or the required input
            (file with the right extension, or URL) is missing.
        requests.HTTPError: If downloading ``parquet_url`` fails.
    """
    file_bytes = None
    file_name = None
    file_extension = None

    if input_file is not None:
        try:
            file_bytes = input_file.read()
            file_name = input_file.name
        except AttributeError:
            # Not a file-like object: treat the value as a path on disk.
            file_name = input_file
            with open(file_name, "rb") as f:
                file_bytes = f.read()
        file_extension = file_name.lower().split('.')[-1]

    if conversion_type == "CSV to Parquet":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file. πŸ“„")
        df = pd.read_csv(BytesIO(file_bytes))
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    elif conversion_type == "Parquet to CSV":
        if input_file is None or file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file. πŸ“„")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    elif conversion_type == "CSV to JSONL":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to JSONL conversion, please upload a CSV file. πŸ“„")
        # latin1 maps every byte to a character, so decoding never fails.
        df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
        output_file = "metadata.jsonl"
        # to_dict(orient="records") keeps each column's own dtype; iterrows()
        # would coerce a whole row to one dtype (ints turning into floats) and
        # hand back numpy.int64 values that json.dumps cannot serialize.
        with open(output_file, 'w', encoding='utf-8') as f:
            for record in df.to_dict(orient="records"):
                data = _to_jsonable(record)
                row_data = {
                    # None when the CSV has no 'file_name' column.
                    "file_name": data.get("file_name"),
                    "ground_truth": json.dumps(data),
                }
                f.write(json.dumps(row_data) + '\n')
        converted_format = "JSONL"

    elif conversion_type == "Parquet to JSONL":
        # An uploaded file takes precedence over the URL.
        if input_file is not None:
            df = pd.read_parquet(BytesIO(file_bytes))
        elif parquet_url:
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider restricting allowed hosts if this is exposed publicly.
            # The timeout keeps a stalled server from hanging the request.
            response = requests.get(parquet_url, timeout=30)
            response.raise_for_status()
            df = pd.read_parquet(BytesIO(response.content))
            file_name = "from_url.parquet"
        else:
            raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. 🌐")

        output_file = "output.jsonl"
        with open(output_file, "w", encoding="utf-8") as f:
            for record in df.to_dict(orient="records"):
                f.write(json.dumps(_to_jsonable(record), ensure_ascii=False) + "\n")
        converted_format = "JSONL"

    else:
        raise ValueError("Invalid conversion type selected. ⚠️")

    # Every successful branch previews the same way, so do it once here.
    preview_str = df.head(10).to_string(index=False)

    info_message = (
        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview_str}\n\n"
        "Community: https://discord.gg/openfreeai πŸš€"
    )
    return output_file, info_message
111
+
112
# Stylesheet passed to gr.Blocks: page background, a centred white card for
# the app container, and a green call-to-action button (.gradio-button).
custom_css = """
body {
background-color: #f4f4f4;
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
max-width: 900px;
margin: 40px auto;
padding: 20px;
background-color: #ffffff;
border-radius: 12px;
box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
color: #333333;
}
.gradio-input, .gradio-output {
margin-bottom: 20px;
}
.gradio-button {
background-color: #4CAF50 !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-size: 16px !important;
border-radius: 6px !important;
cursor: pointer;
}
.gradio-button:hover {
background-color: #45a049 !important;
}
"""

# Build the UI: header, inputs, convert button, outputs, community footer.
with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
    gr.Markdown(value="# Datasets Convertor πŸš€")
    gr.Markdown(
        value=(
            "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
            "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. ✨"
        )
    )

    # Inputs: file upload on the left, conversion selector on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File πŸ“„")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                label="Conversion Type πŸ”„",
                choices=[
                    "CSV to Parquet",
                    "Parquet to CSV",
                    "CSV to JSONL",
                    "Parquet to JSONL",
                ],
            )

    # URL alternative to an upload; only read by the Parquet to JSONL path.
    parquet_url = gr.Textbox(
        placeholder="Enter URL if not uploading a file",
        label="Parquet File URL (Optional) 🌐",
    )

    # elem_classes ties the button to the .gradio-button rules in custom_css.
    convert_button = gr.Button(value="Convert ⚑", elem_classes=["gradio-button"])

    # Outputs: downloadable result next to the text preview.
    with gr.Row():
        output_file = gr.File(label="Converted File πŸ’Ύ")
        preview = gr.Textbox(lines=15, label="Preview (Top 10 Rows) πŸ”")

    # dataset_converter returns (path, info text), matching the two outputs.
    convert_button.click(
        outputs=[output_file, preview],
        inputs=[input_file, conversion_type, parquet_url],
        fn=dataset_converter,
    )

    gr.Markdown(
        value="**Join our Community:** [https://discord.gg/openfreeai](https://discord.gg/openfreeai) 🀝"
    )

demo.launch()