"""Generate BigBIO data-card figures and LaTeX snippets.

For each config of the dataset named on the command line, draw token-length
and label-count charts (saved as a PDF under figures/data_card/) and write a
matching LaTeX data-card section under tex/.
"""
import json
import sys
from collections import Counter

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from bigbio.dataloader import BigBioConfigHelpers
from datasets import load_dataset
from ngram import get_tuples_manual_sentences
from plotly.subplots import make_subplots
from rich import print as rprint

# disable MathJax so kaleido does not stamp a "Loading [MathJax]" box on the PDFs
pio.kaleido.scope.mathjax = None


# vanilla whitespace tokenizer; `counter` is accepted but currently unused
def tokenizer(text, counter):
    if not text:
        return text, []
    text = text.strip()
    text = text.replace("\t", "")
    text = text.replace("\n", "")
    text_list = text.split(" ")
    return text, text_list


def norm(lengths):
    mu = np.mean(lengths)
    sigma = np.std(lengths)
    return mu, sigma


def load_helper(local=""):
    if local != "":
        # local metadata JSON: a dict keyed by dataset name
        with open(local, "r") as file:
            conhelps = json.load(file)
        rprint("loaded local metadata for {} datasets".format(len(conhelps)))
    else:
        conhelps = BigBioConfigHelpers()
        conhelps = conhelps.filtered(lambda x: x.dataset_name != "pubtator_central")
        conhelps = conhelps.filtered(lambda x: x.is_bigbio_schema)
        conhelps = conhelps.filtered(lambda x: not x.is_local)
        rprint(
            "loaded {} configs from {} datasets".format(
                len(conhelps),
                len({helper.dataset_name for helper in conhelps}),
            )
        )
    return conhelps


# text fields to measure, per BigBIO schema
_TEXT_MAPS = {
    "bigbio_kb": ["text"],
    "bigbio_text": ["text"],
    "bigbio_qa": ["question", "context"],
    "bigbio_te": ["premise", "hypothesis"],
    "bigbio_tp": ["text_1", "text_2"],
    "bigbio_pairs": ["text_1", "text_2"],
    "bigbio_t2t": ["text_1", "text_2"],
}

IBM_COLORS = [
    "#648fff",  # train
    "#dc267f",  # validation
    "#ffb000",  # test
    "#fe6100",
    "#785ef0",
    "#000000",
    "#ffffff",
]

SPLIT_COLOR_MAP = {
    "train": "#648fff",
    "validation": "#dc267f",
    "test": "#ffb000",
}

N = 3  # n-gram order


def token_length_per_entry(entry, schema, counter):
    result = {}
    entry_id = entry["id"]
    if schema == "bigbio_kb":
        for passage in entry["passages"]:
            result_key = passage["type"]
            for key in _TEXT_MAPS[schema]:
                text = passage[key][0] if passage[key] else None
                if not text:
                    print(f"WARNING: empty text in entry {entry_id}")
                    result["token_length"] = 0
                    result["text_type"] = result_key
                    continue
                # sents: list of token lists; ngrams: list of N-token tuples
                sents, ngrams = get_tuples_manual_sentences(text.lower(), N)
                toks = [tok for sent in sents for tok in sent]
                tups = ["_".join(tup) for tup in ngrams]
                counter.update(tups)
                result["token_length"] = len(toks)
                result["text_type"] = result_key
    else:
        for key in _TEXT_MAPS[schema]:
            text = entry[key]
            if not text:
                print(f"WARNING: empty text in entry {entry_id}")
                result["token_length"] = 0
                result["text_type"] = key
                continue
            sents, ngrams = get_tuples_manual_sentences(text.lower(), N)
            toks = [tok for sent in sents for tok in sent]
            result["token_length"] = len(toks)
            result["text_type"] = key
            tups = ["_".join(tup) for tup in ngrams]
            counter.update(tups)
    return result, counter


def parse_token_length_and_n_gram(dataset, schema_type):
    hist_data = []
    n_gram_counters = []
    for split, data in dataset.items():
        n_gram_counter = Counter()
        for entry in data:
            result, n_gram_counter = token_length_per_entry(
                entry, schema_type, n_gram_counter
            )
            result["split"] = split
            hist_data.append(result)
        n_gram_counters.append(n_gram_counter)
    return pd.DataFrame(hist_data), n_gram_counters


def resolve_splits(df_split):
    # keep only the official train/validation/test splits
    return set(df_split).intersection(SPLIT_COLOR_MAP.keys())


def draw_box(df, col_name, row, col, fig):
    splits = resolve_splits(df["split"].unique())
    for split in splits:
        split_count = df.loc[df["split"] == split, col_name].tolist()
        fig.add_trace(
            go.Box(
                x=split_count,
                name=split,
                marker_color=SPLIT_COLOR_MAP[split.split("_")[0]],
            ),
            row=row,
            col=col,
        )


def draw_bar(df, col_name, y_name, row, col, fig):
    splits = resolve_splits(df["split"].unique())
    for split in splits:
        split_count = df.loc[df["split"] == split, col_name].tolist()
        y_list = df.loc[df["split"] == split, y_name].tolist()
        fig.add_trace(
            go.Bar(
                x=split_count,
                y=y_list,
                name=split,
                marker_color=SPLIT_COLOR_MAP[split.split("_")[0]],
                showlegend=False,
            ),
            row=row,
            col=col,
        )
    fig.update_traces(orientation="h")  # horizontal box/bar plots


def parse_counters(metadata):
    # use the first split (typically train) to fetch the counter names
    metadata = metadata[list(metadata.keys())[0]]
    counters = []
    for k, v in metadata.__dict__.items():
        if "counter" in k and len(v) > 0:
            counters.append(k)
    return counters


# generate the df for the label-count histogram
def parse_label_counter(metadata, counter_type):
    hist_data = []
    for split, m in metadata.items():
        metadata_counter = getattr(m, counter_type)
        for k, v in metadata_counter.items():
            row = {"labels": k, counter_type: v, "split": split}
            hist_data.append(row)
    return pd.DataFrame(hist_data)


def gen_latex(dataset_name, helper, splits, schemas, fig_path):
    if type(helper.description) is dict:
        # TODO hacky; change this to include all descriptions
        descriptions = helper.description[list(helper.description.keys())[0]]
    else:
        descriptions = helper.description
    descriptions = descriptions.replace("\n", "").replace("\t", "")
    languages = " ".join(lang.value for lang in helper.languages)
    if type(helper.license) is dict:
        license_name = helper.license.value.name
    else:
        license_name = helper.license.name
    tasks = [" ".join(t.name.lower().split("_")) for t in helper.tasks]
    tasks = ", ".join(tasks)
    schemas = " ".join([r"{\tt "] + list(schemas) + ["}"])  # TODO \tt
    splits = ", ".join(list(splits))
    dataset_name_display = " ".join(dataset_name.split("_"))
    latex_bod = (
        r"\clearpage"
        + "\n"
        + r"\section*{"
        + fr"{dataset_name_display}"
        + " Data Card"
        + r"}"
        + "\n"
    )
    latex_bod += (
        r"\begin{figure}[ht!]"
        + "\n"
        + r"\centering"
        + "\n"
        + r"\includegraphics[width=\linewidth]{"
    )
    latex_bod += f"{fig_path}" + r"}" + "\n"
    latex_bod += r"\caption{\label{fig:"
    latex_bod += fr"{dataset_name}" + r"}"
    latex_bod += (
        r"Token frequency distribution by split (top) and frequency of"
        + r" different kinds of instances (bottom).}"
        + "\n"
    )
    latex_bod += r"\end{figure}" + "\n" + r"\textbf{Dataset Description} "
    latex_bod += (
        fr"{descriptions}"
        + "\n"
        + r"\textbf{Homepage:} "
        + f"{helper.homepage}"
        + "\n"
        + r"\textbf{URL:} "
        + f"{helper.homepage}"  # TODO change this later
        + "\n"
        + r"\textbf{Licensing:} "
        + f"{license_name}"
        + "\n"
        + r"\textbf{Languages:} "
        + f"{languages}"
        + "\n"
        + r"\textbf{Tasks:} "
        + f"{tasks}"
        + "\n"
        + r"\textbf{Schemas:} "
        + f"{schemas}"
        + "\n"
        + r"\textbf{Splits:} "
        + f"{splits}"
    )
    return latex_bod

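
# For illustration only: the LaTeX emitted by gen_latex has roughly the shape
# below (the dataset name, config, and paths are placeholders, not real output):
#
#   \clearpage
#   \section*{example dataset Data Card}
#   \begin{figure}[ht!]
#   \centering
#   \includegraphics[width=\linewidth]{figures/data_card/example_dataset_config.pdf}
#   \caption{\label{fig:example_dataset}Token frequency distribution by split
#   (top) and frequency of different kinds of instances (bottom).}
#   \end{figure}
#   \textbf{Dataset Description} ...
#   \textbf{Homepage:} ... \textbf{Licensing:} ... \textbf{Splits:} ...
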
def write_latex(latex_body, latex_name):
    with open(f"tex/{latex_name}", "w") as text_file:
        text_file.write(latex_body)


def draw_figure(data_name, data_config_name, schema_type):
    helper = conhelps.for_config_name(data_config_name)  # uses the global conhelps
    metadata_helper = helper.get_metadata()  # calls load_dataset for meta parsing
    rprint(metadata_helper)
    splits = metadata_helper.keys()
    # calls HF load_dataset _again_ for token parsing
    dataset = load_dataset(
        f"bigbio/biodatasets/{data_name}/{data_name}.py", name=data_config_name
    )
    # general token length
    tok_hist_data, ngram_counters = parse_token_length_and_n_gram(dataset, schema_type)
    rprint(helper)
    # general counter(s)
    # TODO generate the pdf and fix latex
    counters = parse_counters(metadata_helper)
    print(counters)
    rows = len(counters) // 3
    # the first (full-width) row holds the token-length box plot; at most the
    # first three counters get their own subplot in the rows below
    if len(counters) >= 3:
        counters = counters[:3]
        cols = 3
        specs = [[{"colspan": 3}, None, None]] + [[{}, {}, {}]] * (rows + 1)
    elif len(counters) == 2:
        cols = 2
        specs = [[{"colspan": 2}, None]] + [[{}, {}]] * (rows + 1)
    else:
        # one counter, or none: a single column is enough
        cols = 1
        specs = [[{}], [{}]]
    counters.sort()
    counter_titles = ["Label Counts by Type: " + ct.split("_")[0] for ct in counters]
    titles = ("token length",) + tuple(counter_titles)
    # make the figure with subplots
    fig = make_subplots(
        rows=rows + 2,
        cols=cols,
        subplot_titles=titles,
        specs=specs,
        vertical_spacing=0.10,
        horizontal_spacing=0.10,
    )
    # draw the token-length distribution
    if "token_length" in tok_hist_data.columns:
        draw_box(tok_hist_data, "token_length", row=1, col=1, fig=fig)
    for i, ct in enumerate(counters):
        row = i // 3 + 2
        col = i % 3 + 1
        label_df = parse_label_counter(metadata_helper, ct)
        label_min = int(label_df[ct].min())
        # filter_value = int((label_max - label_min) * 0.01 + label_min)
        # placeholder threshold: filtering at the minimum keeps every row
        label_df = label_df[label_df[ct] >= label_min]
        print(label_df.head(5))
        # draw the bar chart for this counter
        draw_bar(label_df, ct, "labels", row=row, col=col, fig=fig)

    fig.update_annotations(font_size=12)
    fig.update_layout(
        margin=dict(l=25, r=25, t=25, b=25, pad=2),
        # showlegend=False,
        # title_text=data_name,
        height=600,
        width=1000,
    )

    # fig.show()
    fig_name = f"{data_name}_{data_config_name}.pdf"
    fig_path = f"figures/data_card/{fig_name}"
    fig.write_image(fig_path)
    dataset.cleanup_cache_files()

    return helper, splits, fig_path


if __name__ == "__main__":
    # load helpers: the local metadata JSON is keyed by dataset name,
    # while BigBioConfigHelpers holds one entry per config
    dc_local = load_helper(local="scripts/bigbio-public-metadatas-6-8.json")
    conhelps = load_helper()
    dc = list()
    # TODO uncomment this
    # for conhelper in conhelps:
    #     # print(f"{conhelper.dataset_name}-{conhelper.config.subset_id}-{conhelper.config.schema}")
    #     dc.append(conhelper.dataset_name)

    # data card per dataset, metadata chart per config
    # for data_name, meta in dc_local.items():
    #     config_metas = meta['config_metas']
    #     config_metas_keys = config_metas.keys()
    #     if len(config_metas_keys) > 1:
    #         print(f'dataset {data_name} has more than one config')
    #     schemas = set()
    #     for config_name, config in config_metas.items():
    #         bigbio_schema = config['bigbio_schema']
    #         helper, splits, fig_path = draw_figure(data_name, config_name, bigbio_schema)
    #         schemas.add(helper.bigbio_schema_caps)
    #         latex_bod = gen_latex(data_name, helper, splits, schemas, fig_path)
    #         latex_name = f"{data_name}_{config_name}.tex"
    #         write_latex(latex_bod, latex_name)
    #     print(latex_bod)

    # TODO try this code first, then use it for the whole loop
    # skipped medal (too large); nagel/pcr/pubtator_central/spl_adr_200db are
    # missing from the local metadata
    data_name = sys.argv[1]
    schemas = set()

    # LOCAL
    # meta = dc_local[data_name]
    # config_metas = meta['config_metas']
    # config_metas_keys = config_metas.keys()
    # if len(config_metas_keys) > 1:
    #     print(f'dataset {data_name} has more than one config')
    # for config_name, config in config_metas.items():
    #     bigbio_schema = config['bigbio_schema']
    #     helper, splits, fig_path = draw_figure(data_name, config_name, bigbio_schema)
    #     schemas.add(helper.bigbio_schema_caps)
    #     latex_bod = gen_latex(data_name, helper, splits, schemas, fig_path)
    #     latex_name = f"{data_name}_{config_name}.tex"
    #     write_latex(latex_bod, latex_name)
    #     print(latex_bod)

    # NON-LOCAL
    config_helpers = conhelps.for_dataset(data_name)
    for config_helper in config_helpers:
        rprint(config_helper)
        bigbio_schema = config_helper.config.schema
        config_name = config_helper.config.name
        helper, splits, fig_path = draw_figure(
            data_name,
            config_name,
            bigbio_schema,
        )
        schemas.add(helper.bigbio_schema_caps)
        latex_bod = gen_latex(data_name, helper, splits, schemas, fig_path)
        latex_name = f"{data_name}_{config_name}.tex"
        write_latex(latex_bod, latex_name)
        print(latex_bod)
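
# Usage sketch (the script filename and dataset name below are placeholders;
# pass any non-local BigBIO dataset known to BigBioConfigHelpers). Note that
# the tex/ and figures/data_card/ directories must already exist:
#
#   python <this_script>.py scitail
#
# For each config this writes figures/data_card/<dataset>_<config>.pdf and
# tex/<dataset>_<config>.tex.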