from utils import *
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import unicodedata
import re
import sys
import json
import gc
import random
import numpy as np
import pandas as pd
from copy import deepcopy
from datetime import datetime
from torch.optim import Adam

# Undesirable patterns within texts (section headers and artifacts stripped
# from abstracts before tokenization)
patterns = {
    'CONCLUSIONS AND IMPLICATIONS': '',
    'BACKGROUND AND PURPOSE': '',
    'EXPERIMENTAL APPROACH': '',
    'KEY RESULTS AEA': '',
    '©': '',
    '®': '',
    'μ': '',
    '(C)': '',
    'OBJECTIVE:': '',
    'MATERIALS AND METHODS:': '',
    'SIGNIFICANCE:': '',
    'BACKGROUND:': '',
    'RESULTS:': '',
    'METHODS:': '',
    'CONCLUSIONS:': '',
    'AIM:': '',
    'STUDY DESIGN:': '',
    'CLINICAL RELEVANCE:': '',
    'CONCLUSION:': '',
    'HYPOTHESIS:': '',
    'Questions/Purposes:': '',
    'Introduction:': '',
    'PURPOSE:': '',
    'PATIENTS AND METHODS:': '',
    'FINDINGS:': '',
    'INTERPRETATIONS:': '',
    'FUNDING:': '',
    'PROGRESS:': '',
    'CONTEXT:': '',
    'MEASURES:': '',
    'DESIGN:': '',
    'BACKGROUND AND OBJECTIVES:': '',
    '\n\n': '',
    '<>': '',
    '+/-': '',
}
# Match case-insensitively: keys are lowercased here and the text is
# lowercased before substitution
patterns = {x.lower(): y for x, y in patterns.items()}


# Regex multiple-replace: build a single alternation from the dict keys and
# substitute every match with its mapped value
def multiple_replace(mapping, text):
    regex = re.compile("(%s)" % "|".join(map(re.escape, mapping.keys())))
    return regex.sub(lambda mo: mapping[mo.group(0)], text)


class treat_text:
    """Normalizes, lowercases and cleans a text according to `patterns`."""
    def __init__(self, patterns):
        self.patterns = patterns

    def __call__(self, text):
        text = unicodedata.normalize("NFKD", str(text))
        text = multiple_replace(self.patterns, text.lower())
        # Drop parenthesized/bracketed spans, isolated digits and stray symbols
        text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )', '', text)
        text = re.sub(r'( +)', ' ', text)
        text = re.sub(r'(, ,)|(,,)', ',', text)
        text = re.sub(r'(%)|(per cent)', ' percent', text)
        return text


treat_text_fun = treat_text(patterns)
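# Quick, illustrative sanity check for the cleaning step (hypothetical input;
# not part of the app). Set DEMO_CLEAN=1 in the environment to run it:
import os
if os.environ.get('DEMO_CLEAN'):
    sample = "BACKGROUND: We studied outcomes in 40% of patients (n = 120)."
    print(treat_text_fun(sample))
    # -> " we studied outcomes in 40 percent of patients ."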
sys.path.append('ML-SLRC/')

path = 'ML-SLRC/'
model_path = path + 'model.pt'
info_path = path + 'Info.json'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the serialized model (the ML-SLRC class definitions must be importable
# from the path appended above)
model = torch.load(model_path, map_location=device)

# Load the trained model's meta-information
with open(info_path, 'r') as f:
    Info = json.load(f)

rand_seed = 2003

# Timestamp used to tag this run
now = datetime.now()
time_stamp = now.strftime("%d_%m_%Y_HR_%H_%M_%S")

config = {
    "shots_per_class": 8,
    "batch_size": 4,
    "epochs": 8,
    "learning_rate": 5e-05,
    "weight_decay": 0.85,
    "rand_seed": rand_seed,
    'pos_weight': 3.5,
    'p_incld': 0.2,
    'p_excld': 0.01,
}

NAME = str(config['shots_per_class']) + '-shots-Learner' + '_' + time_stamp

# Kept from the original experiment setup (not all are used below)
num_workers = 0
val_batch = 100
p_included = 0.7
p_notincluded = 0.3
sample_valid = 300

gen_seed = torch.Generator().manual_seed(rand_seed)
np.random.seed(rand_seed)
torch.manual_seed(rand_seed)
random.seed(rand_seed)


def treat_data_input(data, etailment_txt):
    # Labelled rows only (groupby drops NaN labels), shuffled within each class
    data_train = data.groupby('test').sample(frac=1)

    # Full dataset to be ranked; unlabelled rows get a placeholder label
    dataload_all = data.copy()
    dataload_all.test = dataload_all.test.replace({np.nan: 'NANN'})

    # SLR_DataSet, initializer_model_scibert and LABEL_MAP are expected to
    # come from `utils` (star-imported above) or the ML-SLRC package
    dataset_train = SLR_DataSet(data=data_train,
                                input='text',
                                output='test',
                                tokenizer=initializer_model_scibert.tokenizer,
                                LABEL_MAP=LABEL_MAP,
                                treat_text=treat_text_fun,
                                etailment_txt=etailment_txt)
    dataset_remain = SLR_DataSet(data=dataload_all,
                                 input='text',
                                 output='test',
                                 tokenizer=initializer_model_scibert.tokenizer,
                                 LABEL_MAP=LABEL_MAP,
                                 treat_text=treat_text_fun,
                                 etailment_txt=etailment_txt)

    dataload_train = DataLoader(dataset_train,
                                batch_size=config['batch_size'],
                                drop_last=False,
                                num_workers=num_workers)
    dataload_remain = DataLoader(dataset_remain,
                                 batch_size=200,
                                 drop_last=False,
                                 num_workers=num_workers)

    return dataload_train, dataload_remain, dataload_all


def treat_train_evaluate(dataload_train, dataload_remain):
    gc.collect()
    torch.cuda.empty_cache()

    # Fine-tune a copy so the base model stays reusable across uploads
    model_few = deepcopy(model)
    model_few.loss_fn = nn.BCEWithLogitsLoss(
        reduction='mean',
        pos_weight=torch.FloatTensor([config['pos_weight']]).to(device))

    optimizer = Adam(model_few.parameters(),
                     lr=config['learning_rate'],
                     weight_decay=config['weight_decay'])

    model_few.to(device)
    model_few.train()

    trainlog = model_few.fit(optimizer=optimizer,
                             scheduler=None,
                             data_train_loader=dataload_train,
                             epochs=config['epochs'],
                             print_info=1,
                             metrics=False,
                             log=None,
                             metrics_print=False)

    (loss, features_out, (logits, outputs)) = model_few.evaluate(dataload_remain)
    return logits


def treat_sort(dataload_all, logits):
    # Rank every record by predicted inclusion probability
    dataload_all['prediction'] = torch.sigmoid(logits).detach().cpu().numpy().flatten()
    dataload_all = dataload_all.sort_values(by=['prediction'],
                                            ascending=False).reset_index(drop=True)
    dataload_all.to_excel("output.xlsx")
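# Illustrative manual run of the three steps above, outside Gradio (assumes an
# Excel file with a 'text' column and a 'test' label column where NaN marks
# the unlabelled rows to be ranked; the file name is hypothetical):
#
#   data = pd.read_excel('reviews.xlsx')
#   train_loader, remain_loader, all_rows = treat_data_input(data, "its a great text")
#   logits = treat_train_evaluate(train_loader, remain_loader)
#   treat_sort(all_rows, logits)   # writes the ranked sheet to output.xlsx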
def pipeline(data):
    # Gradio may hand over a file object or a path string; pandas accepts both
    data = pd.read_excel(data.name if hasattr(data, 'name') else data)
    dataload_train, dataload_remain, dataload_all = treat_data_input(data, "its a great text")
    logits = treat_train_evaluate(dataload_train, dataload_remain)
    treat_sort(dataload_all, logits)
    return "output.xlsx"


import gradio as gr

with gr.Blocks() as demo:
    fil = gr.File(label="input data")
    output = gr.File(label="output data")
    run_btn = gr.Button("Rank")
    run_btn.click(fn=pipeline, inputs=fil, outputs=output)

demo.launch()
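# To expose the interface publicly (e.g. when running on a remote machine),
# Gradio can create a temporary share link instead:
#
#   demo.launch(share=True)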