#714 by Tennish - opened

Files changed:
- README.md +3 -23
- app.py +42 -506
- requirements.txt +3 -16
README.md
CHANGED
@@ -1,24 +1,4 @@
----
-
-emoji: 🏆
-colorFrom: green
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.9.0
-app_file: app.py
-pinned: true
-license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
-fullWidth: true
-startup_duration_timeout: 1h
-space_ci:
-  private: true
-  secrets:
-    - HF_TOKEN
-    - H4_TOKEN
-tags:
-  - leaderboard
-short_description: Track, rank and evaluate open LLMs and chatbots
----
+# pdf-table-extraction-streamlit
+Streamlit App using Camelot
 
+https://huggingface.co/spaces/Amrrs/pdf-table-extractor
app.py
CHANGED
@@ -1,526 +1,62 @@
-import logging
-import os
-import gradio as gr
-import pandas as pd
-
-from apscheduler.schedulers.background import BackgroundScheduler
-from gradio_space_ci import enable_space_ci
-from huggingface_hub import snapshot_download
-
-from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    FAQ_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    Precision,
-    WeightType,
-    fields,
-)
-from src.envs import (
-    API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
-    EVAL_REQUESTS_PATH,
-    EVAL_RESULTS_PATH,
-    H4_TOKEN,
-    IS_PUBLIC,
-    QUEUE_REPO,
-    REPO_ID,
-    RESULTS_REPO,
-)
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.scripts.update_all_request_files import update_dynamic_files
-from src.submission.submit import add_new_eval
-from src.tools.collections import update_collections
-from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
-
-
-# Start ephemeral Spaces on PRs (see config in README.md)
-enable_space_ci()
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
-
-
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
-    """Attempt to download dataset with retries."""
-    attempt = 0
-    while attempt < max_attempts:
-        try:
-            print(f"Downloading {repo_id} to {local_dir}")
-            snapshot_download(
-                repo_id=repo_id,
-                local_dir=local_dir,
-                repo_type=repo_type,
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-            return
-        except Exception as e:
-            logging.error(f"Error downloading {repo_id}: {e}")
-            attempt += 1
-            if attempt == max_attempts:
-                restart_space()
-
-
-def init_space(full_init: bool = True):
-    """Initializes the application space, loading only necessary data."""
-    if full_init:
-        # These downloads only occur on full initialization
-        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
-
-    raw_data, original_df = get_leaderboard_df(
-        results_path=EVAL_RESULTS_PATH,
-        requests_path=EVAL_REQUESTS_PATH,
-        dynamic_path=DYNAMIC_INFO_FILE_PATH,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS,
-    )
-
-    # Collection update only happens on full initialization
-    update_collections(original_df)
-
-    leaderboard_df = original_df.copy()
-    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-    return leaderboard_df, raw_data, original_df, eval_queue_dfs
-
-
-# This controls whether a full initialization should be performed.
-do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
-
-#
-leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
-finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
-
-
-def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(raw_data))
-    return plot_df
-
-
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    hide_models: list,
-    query: str,
-):
-    filtered_df = filter_models(
-        df=hidden_df,
-        type_query=type_query,
-        size_query=size_query,
-        precision_query=precision_query,
-        hide_models=hide_models,
-    )
-    filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
-    return df
-
-
-def load_query(request: gr.Request):  # triggered only once at startup => read query parameter if it exists
-    query = request.query_params.get("query") or ""
-    return (
-        query,
-        query,
-    )  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
-
-
-def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.fullname.name].str.contains(query, case=False, na=False))]
-
-
-def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
-
-
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-    dummy_col = [AutoEvalColumn.fullname.name]
-    filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
-    return filtered_df
-
-
-def filter_queries(query: str, df: pd.DataFrame):
-    tmp_result_df = []
-
-    # Empty query return the same df
-    if query == "":
-        return df
-
-    # all_queries = [q.strip() for q in query.split(";")]
-    # license_queries = []
-    all_queries = [q.strip() for q in query.split(";") if q.strip() != ""]
-    model_queries = [q for q in all_queries if not q.startswith("licence")]
-    license_queries_raw = [q for q in all_queries if q.startswith("license")]
-    license_queries = [
-        q.replace("license:", "").strip() for q in license_queries_raw if q.replace("license:", "").strip() != ""
-    ]
-
-    # Handling model name search
-    for query in model_queries:
-        tmp_df = search_model(df, query)
-        if len(tmp_df) > 0:
-            tmp_result_df.append(tmp_df)
-
-    if not tmp_result_df and not license_queries:
-        # Nothing is found, no license_queries -> return empty df
-        return pd.DataFrame(columns=df.columns)
-
-    if tmp_result_df:
-        df = pd.concat(tmp_result_df)
-        df = df.drop_duplicates(
-            subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-        )
-
-    if not license_queries:
-        return df
-
-    # Handling license search
-    tmp_result_df = []
-    for query in license_queries:
-        tmp_df = search_license(df, query)
-        if len(tmp_df) > 0:
-            tmp_result_df.append(tmp_df)
-
-    if not tmp_result_df:
-        # Nothing is found, return empty df
-        return pd.DataFrame(columns=df.columns)
-
-    df = pd.concat(tmp_result_df)
-    df = df.drop_duplicates(
-        subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-    )
-
-    return df
-
-
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, hide_models: list
-) -> pd.DataFrame:
-    # Show all models
-    if "Private or deleted" in hide_models:
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-    else:
-        filtered_df = df
-
-    if "Contains a merge/moerge" in hide_models:
-        filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
-
-    if "MoE" in hide_models:
-        filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
-
-    if "Flagged" in hide_models:
-        filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
-
-
-leaderboard_df = filter_models(
-    df=leaderboard_df,
-    type_query=[t.to_str(" : ") for t in ModelType],
-    size_query=list(NUMERIC_INTERVALS.keys()),
-    precision_query=[i.value.name for i in Precision],
-    hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"],  # Deleted, merges, flagged, MoEs
-)
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder="🔍 Search models or licenses (e.g., 'model_name; license: MIT') and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden and not c.dummy
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        hide_models = gr.CheckboxGroup(
-                            label="Hide models",
-                            choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-                            value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
-                            interactive=True,
-                        )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                    + [AutoEvalColumn.fullname.name]
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    hide_models,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-
-            # Define a hidden component that will trigger a reload only if a query parameter has been set
-            hidden_search_bar = gr.Textbox(value="", visible=False)
-            hidden_search_bar.change(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    hide_models,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            # Check query parameter once at startup and update search bar + hidden component
-            demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
-
-            for selector in [
-                shown_columns,
-                filter_columns_type,
-                filter_columns_precision,
-                filter_columns_size,
-                hide_models,
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        hide_models,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
-        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
-            with gr.Row():
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        [AutoEvalColumn.average.name],
-                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        BENCHMARK_COLS,
-                        title="Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
-            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Row():
-                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-                with gr.Row():
-                    with gr.Column():
-                        model_name_textbox = gr.Textbox(label="Model name")
-                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                        private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
-                        model_type = gr.Dropdown(
-                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                            label="Model type",
-                            multiselect=False,
-                            value=ModelType.FT.to_str(" : "),
-                            interactive=True,
-                        )
-
-                    with gr.Column():
-                        precision = gr.Dropdown(
-                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                            label="Precision",
-                            multiselect=False,
-                            value="float16",
-                            interactive=True,
-                        )
-                        weight_type = gr.Dropdown(
-                            choices=[i.value.name for i in WeightType],
-                            label="Weights type",
-                            multiselect=False,
-                            value="Original",
-                            interactive=True,
-                        )
-                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            with gr.Column():
-                with gr.Accordion(
-                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        finished_eval_table = gr.components.Dataframe(
-                            value=finished_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-                with gr.Accordion(
-                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        running_eval_table = gr.components.Dataframe(
-                            value=running_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-
-                with gr.Accordion(
-                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        pending_eval_table = gr.components.Dataframe(
-                            value=pending_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    private,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
-scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hours
-scheduler.start()
-
-demo.queue(default_concurrency_limit=40).launch()
+import streamlit as st  # data app development
+import subprocess  # process in the os
+from subprocess import STDOUT, check_call  # os process manipulation
+import os  # os process manipulation
+import base64  # byte object into a pdf file
+import camelot as cam  # extracting tables from PDFs
+import cv2
+
+# to run this only once and it's cached
+@st.cache
+def gh():
+    """install ghostscript on the linux machine"""
+    proc = subprocess.Popen("apt-get install -y ghostscript", shell=True, stdin=None, stdout=open(os.devnull, "wb"), stderr=STDOUT, executable="/bin/bash")
+    proc.wait()
+
+gh()
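One note on the `gh()` helper above: Camelot only shells out to Ghostscript for its lattice flavor, and re-running `apt-get` on every cold start is avoidable. A minimal sketch (editorial, not part of this PR; `ghostscript_available` is a hypothetical helper):

```python
import shutil

def ghostscript_available() -> bool:
    """Return True if a Ghostscript binary is already on PATH."""
    return shutil.which("gs") is not None

if not ghostscript_available():
    gh()  # fall back to the apt-get install above
```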
+st.title("PDF Table Extractor")
+st.subheader("with `Camelot` Python library")
+
+st.image("https://raw.githubusercontent.com/camelot-dev/camelot/master/docs/_static/camelot.png", width=200)
+
+# file uploader on streamlit
+input_pdf = st.file_uploader(label="Upload your PDF here", type="pdf")
+
+st.markdown("### Page Number")
+page_number = st.text_input("Enter the page # from which you want to extract tables, e.g. 3", value="1")
+
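`st.text_input` returns a string, and Camelot accepts page numbers as a string such as "3", so the value can be passed through as-is; still, validating it early gives a friendlier error than a Camelot traceback. A small sketch (editorial addition, not in the PR):

```python
if not str(page_number).strip().isdigit():
    st.error("Please enter a positive page number, e.g. 3")
    st.stop()  # halt this script run until the input is fixed
```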
+# run this only when a PDF is uploaded
+if input_pdf is not None:
+    # byte object into a PDF file
+    with open("input.pdf", "wb") as f:
+        base64_pdf = base64.b64encode(input_pdf.read()).decode("utf-8")
+        f.write(base64.b64decode(base64_pdf))
+
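The base64 encode/decode round-trip above returns the original bytes unchanged, so the upload can be written out directly. A simpler equivalent (editorial sketch, not part of this PR):

```python
if input_pdf is not None:
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())  # UploadedFile exposes the raw bytes
```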
+    # read the pdf and parse it using stream
+    table = cam.read_pdf("input.pdf", pages=page_number, flavor="stream")
+
+    st.markdown("### Number of Tables")
+
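`flavor="stream"` suits whitespace-separated tables and needs no Ghostscript; `flavor="lattice"` (Camelot's default) works better when cells are separated by ruled lines, and is the mode that actually requires the Ghostscript installed by `gh()`. Each parsed table also carries a parsing report for a quick quality check. A sketch under the same assumptions as the app ("input.pdf" already written):

```python
tables = cam.read_pdf("input.pdf", pages=page_number, flavor="lattice")
st.write(tables[0].parsing_report)  # e.g. {'accuracy': 99.0, 'whitespace': 2.5, 'order': 1, 'page': 1}
```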
+    # display the output after parsing
+    st.write(table)
+
+    # display the table
+    if len(table) > 0:
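`st.write(table)` renders the `TableList` repr (something like `<TableList n=2>`); the count is also available programmatically, which reads better in the UI. A sketch (editorial, not in the PR):

```python
st.write(f"Detected {table.n} table(s) on page {page_number}")  # TableList.n is the table count
```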
+        # extract the index value of the table (tables are numbered from 1 in the UI)
+        option = st.selectbox(label="Select the Table to be displayed", options=range(1, len(table) + 1))
+
+        st.markdown("### Output Table")
+
+        # display the dataframe (selectbox is 1-based, TableList is 0-based)
+        st.dataframe(table[int(option) - 1].df)
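A natural extension after displaying the dataframe (editorial sketch, not part of this PR): `Table.df` is a regular pandas DataFrame, so it serializes with `to_csv` and can be offered through Streamlit's built-in download button.

```python
csv_bytes = table[int(option) - 1].df.to_csv(index=False).encode("utf-8")
st.download_button(
    label="Download table as CSV",
    data=csv_bytes,
    file_name=f"table_page{page_number}_{option}.csv",  # hypothetical naming scheme
    mime="text/csv",
)
```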
requirements.txt
CHANGED
@@ -1,16 +1,3 @@
-
-
-
-datasets==2.14.5
-huggingface-hub>=0.18.0
-matplotlib==3.8.4
-numpy==1.26.0
-pandas==2.2.2
-plotly==5.14.1
-python-dateutil==2.8.2
-requests==2.28.2
-sentencepiece
-tqdm==4.65.0
-transformers==4.40.0
-tokenizers>=0.15.0
-gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected]  # CI !!!
+opencv-python-headless
+camelot-py
+streamlit
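One deployment note: since `camelot-py`'s lattice mode needs Ghostscript, and the app currently installs it with `apt-get` at runtime, Hugging Face Spaces would normally pick up Debian packages from a `packages.txt` at the repo root instead (editorial suggestion, not part of this PR):

```
ghostscript
```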