ola13 committed on
Commit
de3513e
·
1 Parent(s): a7117c1

Init space

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  title: Dataset Explorer
3
- emoji: 😻
4
- colorFrom: green
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.23.0
 
1
  ---
2
  title: Dataset Explorer
3
+ emoji: 🌘
4
+ colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.23.0
app.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import jsonlines
3
+ import os
4
+ import uuid
5
+
6
+
7
+ from datetime import datetime
8
+ from huggingface_hub import HfApi
9
+ from pprint import pprint
10
+
11
+
12
# Names of the datasets the explorer can show. The list is used both as the
# choices of the "Dataset" dropdown and as the keys of ``line_generators``.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
24
+
25
+
26
def line_generator(dataset):
    """Yield the example records stored for ``dataset``, one at a time.

    The examples of every dataset named in the module-level ``datasets`` list
    are read from ``data/<dataset>_examples_with_stats.json`` with
    ``jsonlines``. Deriving the path from the name replaces a chain of
    copy-pasted branches, one of which read the ``reddit_threaded`` file when
    ``cc_filtered_text`` was requested.

    Args:
        dataset: Name of the dataset whose examples should be read.

    Yields:
        The records of the dataset's file, in file order, as returned by the
        ``jsonlines`` reader. Nothing is yielded when ``dataset`` is not one
        of the names in ``datasets``.
    """
    if dataset not in datasets:
        # Unknown names give an empty generator instead of an error.
        return

    path = "data/{}_examples_with_stats.json".format(dataset)
    # The file is opened lazily, on the first ``next()`` call, and stays open
    # until the generator is exhausted or closed.
    with jsonlines.open(path, "r") as reader:
        yield from reader
74
+
75
+
76
# One generator per dataset name, created once when the module is loaded.
# Every lookup in this dict returns the same generator object, so each
# ``next()`` call continues where the previous one stopped.
line_generators = {name: line_generator(name) for name in datasets}
77
+
78
+
79
def send_report(sample, dataset, reason, annotator):
    """Upload a feedback report about ``sample`` to the feedback dataset repo.

    The report is printed, written to the local file ``report.jsonl`` and then
    uploaded to the ``HuggingFaceGECLM/data_feedback`` dataset repository under
    a unique ``report-<uuid4>.jsonl`` name.

    Args:
        sample: Dict describing the flagged sample. It must contain a
            ``"text"`` entry; every other entry is reported as metadata.
            The dict is not modified.
        dataset: Name of the dataset the sample comes from.
        reason: Free-text reason given for flagging the sample.
        annotator: Name of the person flagging the sample.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` entry.
    """
    # Work on a copy so the caller's dict keeps its "text" entry.
    metadata = dict(sample)
    text = metadata.pop("text")

    # Prefer an explicit id, fall back to the title, and finally to "".
    if "id" in metadata:
        sample_id = metadata["id"]
    else:
        sample_id = metadata.get("title", "")

    # Build the report once so the printed and the uploaded copy are identical
    # (including the timestamp).
    report = {
        "dataset": dataset,
        "docid": sample_id,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
        "timestamp": str(datetime.now()),
    }

    print("submitting")
    pprint(report)
    with jsonlines.open("report.jsonl", "w") as f:
        f.write(report)

    # The access token comes from the ``geclm_token`` environment variable.
    # It is a secret and is deliberately not printed or logged.
    api = HfApi()
    api.upload_file(
        path_or_fileobj="report.jsonl",
        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=os.environ.get("geclm_token"),
    )
123
+
124
+
125
if __name__ == "__main__":
    # Local import: only the rendering helper below needs it.
    import html

    demo = gr.Blocks()

    with demo:
        # Holds the sample currently on screen so it can be reported.
        current_sample_state = gr.State(dict())

        with gr.Row():
            annotator = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Type your name here if you'd like it to be recorded.",
                label="Annotator",
            )
        with gr.Row():
            dataset = gr.Dropdown(
                choices=datasets,
                value="Pick a dataset below",
                label="Dataset",
            )
        # The widgets below start hidden; next_line reveals them once a
        # dataset has been picked.
        with gr.Row():
            reason_txt = gr.Textbox(
                label="Flagging reason",
                placeholder="Provide the reason for flagging if you think the sample is bad.",
                visible=False,
            )
        with gr.Row():
            bad_btn = gr.Button("Bad", visible=False)
            good_btn = gr.Button("Next", visible=False)
        with gr.Row():
            text = gr.Markdown(visible=False)

        def fetch_sample(dataset_name):
            """Return the next sample of ``dataset_name``.

            When the dataset's generator is exhausted it is recreated, so the
            samples start over from the beginning of the file instead of the
            callback failing with ``StopIteration``. A file without any
            records still raises ``StopIteration``.
            """
            try:
                return next(line_generators[dataset_name])
            except StopIteration:
                line_generators[dataset_name] = line_generator(dataset_name)
                return next(line_generators[dataset_name])

        def render_sample(sample):
            """Return the sample text as a ``<pre>`` block for the Markdown widget.

            The text is raw dataset content, so it is HTML-escaped to be shown
            literally rather than interpreted as markup.
            """
            return "<pre>" + html.escape(sample["text"], quote=False) + "</pre>"

        def next_line(dataset):
            """Show the next sample of ``dataset`` and reveal the review widgets.

            Returns updates for, in order: the text display, the current
            sample state, the reason box, the "Next" button and the "Bad"
            button.
            """
            sample = fetch_sample(dataset)
            return [
                gr.update(value=render_sample(sample), visible=True),
                sample,
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=True),
            ]

        def bad_line(current_sample, dataset, reason, annotator):
            """Report the current sample as bad, then move on to the next one.

            Returns values for, in order: the text display, the reason box
            (cleared) and the current sample state.
            """
            send_report(current_sample, dataset, reason, annotator)
            sample = fetch_sample(dataset)
            return [
                render_sample(sample),
                gr.update(
                    value="",
                    placeholder="Provide the reason for flagging if you think the sample is bad.",
                ),
                sample,
            ]

        good_btn.click(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        dataset.change(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        bad_btn.click(
            bad_line,
            inputs=[current_sample_state, dataset, reason_txt, annotator],
            outputs=[text, reason_txt, current_sample_state],
        )

    demo.launch(enable_queue=False, debug=True)
data/anton_cc_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d51628275c8d69b4093333f25124d1739530610d5afab21aa7ce65ae884d101a
3
+ size 28676983
data/bigcode_python_code_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e681c3ae57ba5342a5df6fb426d6f75b0db857d6dfba249c2d8f7a0f1c358888
3
+ size 9894655
data/bigcode_python_github_issues_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1a08c977cfe4a19cc0dbde6fdd76e17b44966d7e7a0fd09725c4ca8d4ee2cb8
3
+ size 17823834
data/bigcode_python_jupyter_markdowned_clean_dedup_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cc8ed48cd9f3513113608210b154aa44bc19007bfe6a3dc7450f0710db61e2d
3
+ size 10827004
data/bigcode_python_jupyter_scripts_dedup_filtered_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:930708072ae21b8e3234423a0d8a738f31a55e4af01477aefabb86e6b928b17e
3
+ size 8820911
data/books3_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3096e0c9f10abf1a35f93fb26c0ab5bde41ea20a9b41d677577e58cd5fc1657c
3
+ size 505731876
data/c4_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e62506224d8090b48ec2dc724e8943f6e969ad8dfd6c6ae5b8d33478fc815d13
3
+ size 2657133
data/cc_filtered_text_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:766f2fa24b0d89d6e9a140416fb95b068ab348a4b860bea0ca7ba37f12d8bfc5
3
+ size 6953247
data/gutenberg_raw_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85d3804c498fe5624446e222d85918438e09a3604307a1523be131c3890259b3
3
+ size 172318302
data/helen_cc_bad_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67a4b70e0eabe8dd270667b74fff3d0e9c426dd02f98b8f6149245a0f682f019
3
+ size 751776
data/helen_cc_good_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f633d4b01f07b6bee8e90ba9285f7c3a6c71b98d92d43dcbfe490880a0ac4fc
3
+ size 28075699
data/reddit_threaded_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60955d5f50d6643af8bf7253e2beb9b1b703a3059968e3d9d2d424954291b64f
3
+ size 2295871
data/s2orc_raw_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbcb36fd24dca3e62696609327dcb28e2f38e78d9e32c0d24439e66a5c84b191
3
+ size 25281345
data/stackexchange2_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67e4519399245b056952ec073f00dd6e6e94895e2b1052def39044625537a794
3
+ size 5947625
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ huggingface_hub
2
+ jsonlines