Samuel Mueller committed on
Commit
a833f5f
·
1 Parent(s): 8c2994a

eval demo w/ shuffling

Browse files
Files changed (1) hide show
  1. app.py +43 -56
app.py CHANGED
@@ -8,89 +8,76 @@ import pandas as pd
8
  import torch
9
  import gradio as gr
10
  import openml
 
11
 
12
 
13
def compute(table: np.array):
    """Fill the blank cells of one column of ``table`` with TabPFN predictions.

    Rows in which every cell is an empty string are discarded. All remaining
    blank cells must sit in a single column; that column is used as the label
    and the rows with a blank label are predicted from the rows without one.

    Returns a ``(message, table)`` pair. On invalid input ``message`` is a
    Markdown string and ``table`` is None. On success ``message`` is None and
    ``table`` is a string copy of the filtered input whose blank cells read
    ``"<label> (p=<probability>)"``.
    """
    # Drop rows whose cells are all empty strings.
    cell_lengths = np.vectorize(len)(table)
    table = table[cell_lengths.sum(1) != 0]

    blank_positions = np.where(table == '')
    blank_rows = blank_positions[0]
    if len(blank_rows) == 0:
        return "**Please leave at least one field blank for prediction.**", None

    blank_cols = blank_positions[1]
    if not np.all(blank_cols[0] == blank_cols):
        return "**Please only leave fields of one column blank for prediction.**", None

    label_col = blank_cols[0]
    labelled_rows = np.delete(table, blank_rows, axis=0)
    unlabelled_rows = table[blank_rows]

    try:
        x_train = torch.tensor(
            np.delete(labelled_rows, label_col, axis=1).astype(np.float32))
        x_eval = torch.tensor(
            np.delete(unlabelled_rows, label_col, axis=1).astype(np.float32))
        y_train = labelled_rows[:, label_col]
    except ValueError:
        # Raised by the float32 conversion when a feature cell is not numeric.
        return "**Please only add numbers (to the inputs) or leave fields empty.**", None

    classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
    classifier.fit(x_train, y_train)
    y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)

    out_table = table.copy().astype(str)
    out_table[blank_rows, label_col] = [
        f"{y_e} (p={p_e:.2f})" for y_e, p_e in zip(y_eval, p_eval)
    ]
    return None, out_table
45
 
46
 
47
def upload_file(file):
    """Load an uploaded dataset file into a DataFrame for the input table.

    Args:
        file: Uploaded file object exposing the path as ``file.name``, or
            None when the file input has been cleared.

    Returns:
        A ``pandas.DataFrame`` for ``.arff`` files (columns named after the
        dataset attributes) and for ``.csv``/``.data`` files (read without a
        header, columns numbered from 0). None when ``file`` is None or the
        extension is not supported.
    """
    # The change event also fires when the file is cleared; nothing to load then.
    if file is None:
        return None

    path = file.name
    if path.endswith('.arff'):
        dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=path)
        X_, _, _, attribute_names_ = dataset.get_data(dataset_format="array")
        return pd.DataFrame(X_, columns=attribute_names_)

    if path.endswith(('.csv', '.data')):
        df = pd.read_csv(path, header=None)
        df.columns = np.arange(len(df.columns))
        return df

    return None
60
 
61
 
62
# Toy benchmark shown in the table on start-up: rows whose two features sum up
# to 3 carry label 1, larger sums carry label 2. The blank cell in the last
# row is the one to be predicted.
example = [
    [1, 2, 1],
    [2, 1, 1],
    [1, 1, 1],
    [2, 2, 2],
    [3, 4, 2],
    [3, 2, 2],
    [2, 3, ''],
]

with gr.Blocks() as demo:
    gr.Markdown("""This demo allows you to play with the **TabPFN**.
    You can either change the table manually (we have filled it with a toy benchmark, sum up to 3 has label 1 and over that label 2).
    The network predicts fields you leave empty. Only one column can have empty entries that are predicted.
    Please, provide everything but the label column as numeric values. It is ok to encode classes as integers.
    """)

    # Inputs: an editable table, optionally filled from an uploaded file.
    inp_table = gr.DataFrame(type='numpy', value=example, headers=['', '', ''])
    inp_file = gr.File(
        label='Drop either a .csv (without header, only numeric values for all but the labels) or a .arff file.',
    )
    examples = gr.Examples(
        examples=['iris.csv', 'balance-scale.arff'],
        inputs=[inp_file],
        outputs=[inp_table],
        fn=upload_file,
        cache_examples=True,
    )
    btn = gr.Button("Predict Empty Table Cells")

    # A newly selected file replaces the table contents.
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=inp_table)

    # Outputs: a validation message and the table with predictions filled in.
    out_text = gr.Markdown()
    out_table = gr.DataFrame()

    btn.click(fn=compute, inputs=inp_table, outputs=[out_text, out_table])

demo.launch()
 
8
  import torch
9
  import gradio as gr
10
  import openml
11
+ from sklearn.model_selection import cross_val_score
12
 
13
 
14
def compute(file, y_attribute, cv_folds):
    """Cross-validate TabPFN on an uploaded .arff dataset.

    Args:
        file: Uploaded file object exposing the path as ``file.name``, or
            None when nothing has been uploaded.
        y_attribute: Name of the attribute to use as the prediction target.
        cv_folds: Number of cross-validation folds.

    Returns:
        A ``(message, y_attribute)`` pair, one value per wired output.
        ``message`` is either a hint about invalid input or the mean and
        per-fold ROC AUC (one-vs-one) scores; ``y_attribute`` is passed
        through unchanged.
    """
    if file is None or not file.name.endswith('.arff'):
        return 'Please upload a .arff file', y_attribute

    dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
    # Load once without a target, only to learn which attribute names exist.
    _, _, _, attribute_names_ = dataset.get_data(dataset_format="array")
    if y_attribute not in attribute_names_:
        # Every return path must provide a value for both outputs.
        return f"**Select attribute from {', '.join(attribute_names_)}**", y_attribute
    X, y, _, _ = dataset.get_data(dataset_format="array", target=y_attribute)

    cv_folds = int(cv_folds)

    # Shuffle with a fixed seed so the folds are reproducible. A local
    # generator is used so the process-wide NumPy RNG is not reseeded.
    order = np.arange(y.shape[0])
    np.random.RandomState(13).shuffle(order)
    X, y = torch.tensor(X[order]), torch.tensor(y[order])

    classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
    scores = cross_val_score(classifier, X, y, cv=cv_folds, scoring='roc_auc_ovo')
    print(scores)

    message = f"ROC AUC OVO Cross Val mean is {sum(scores) / len(scores)} from {scores}. "
    # Each fold trains on everything except its held-out part, so the largest
    # training split is the dataset size minus the smallest test fold.
    largest_train_split = len(y) - len(y) // cv_folds
    if largest_train_split > 1024:
        message += ("The PFN is only trained for datasets with up to 1024 training examples "
                    "and it had to extrapolate to greater datasets for this evaluation.")
    return message, y_attribute
44
 
45
 
46
def upload_file(file):
    """Inspect an uploaded .arff file and suggest a target attribute.

    Args:
        file: Uploaded file object exposing the path as ``file.name``, or
            None when the file input has been cleared.

    Returns:
        A ``(message, y_attribute)`` pair, one value per wired output. For a
        .arff file the message lists the attribute names and the last
        attribute is proposed as target. For any other file the message asks
        for a .arff file and the target is None. Both values are None when
        ``file`` is None.
    """
    # The change event also fires when the file is cleared; still return one
    # value per output component.
    if file is None:
        return None, None

    if not file.name.endswith('.arff'):
        return 'Please upload a .arff file', None

    dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
    _, _, _, attribute_names_ = dataset.get_data(dataset_format="array")
    return f"Select attribute from {', '.join(attribute_names_)}", attribute_names_[-1]
 
 
 
 
 
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
59
with gr.Blocks() as demo:
    # The description and labels below describe the cross-validation workflow
    # implemented by compute(); the earlier table-prediction wording no longer
    # matched what the handlers do.
    gr.Markdown("""This demo allows you to evaluate the **TabPFN** on your own dataset.
    Upload a .arff file, pick the attribute that should be predicted and the number of cross-validation folds.
    The dataset is shuffled and the cross-validated ROC AUC (one-vs-one) of the TabPFN is reported.
    """)

    inp_file = gr.File(label='Drop a .arff file.')
    cv_folds = gr.Dropdown([2, 3, 4, 5], value=2, label='Number of CV folds')
    out_text = gr.Markdown()

    y_attribute = gr.Textbox(label='y attribute')

    # NOTE(review): 'iris.csv' is answered with 'Please upload a .arff file'
    # by upload_file; kept as in the original example list.
    examples = gr.Examples(examples=['iris.csv', 'balance-scale.arff'],
                           inputs=[inp_file],
                           outputs=[out_text, y_attribute],
                           fn=upload_file,
                           cache_examples=True)
    btn = gr.Button("Run Cross-Validation")

    # Uploading a file lists its attributes and pre-fills the target attribute.
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=[out_text, y_attribute])

    btn.click(fn=compute, inputs=[inp_file, y_attribute, cv_folds], outputs=[out_text, y_attribute])

demo.launch()