RaTE-NER-Deberta
This model is a fine-tuned version of DeBERTa on the RaTE-NER dataset.
Model description
This model is trained to serve the RaTEScore metric. If you are interested in our pipeline, please refer to our paper and GitHub repository.
This model can also be used to extract Abnormality, Non-Abnormality, Anatomy, Disease, and Non-Disease entities from medical radiology reports.
Usage
Click to expand the usage of this model.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch


def post_process(tokenized_text, predicted_entities, tokenizer):
    """Decode per-token BIO labels into ``(entity_text, entity_type)`` pairs.

    Args:
        tokenized_text: Tokens of one sequence. ``"[CLS]"`` and ``"[SEP]"``
            tokens are skipped and never start or close an entity themselves.
        predicted_entities: One BIO label per position (``"B-X"``, ``"I-X"``
            or ``"O"``). May be longer than ``tokenized_text`` (e.g. when it
            still covers padding positions); the label *after* each token is
            used as look-ahead to decide whether an entity ends there.
        tokenizer: Object providing ``convert_tokens_to_string`` to join the
            tokens of a span back into text.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance. The type of a multi-token entity is taken from its
        ``B-`` label; ``I-`` labels without a preceding ``B-`` are ignored.

    Note:
        The end-of-sequence fallback assumes the final token is ``"[SEP]"``,
        so the last real token sits at index ``len(tokenized_text) - 2``.
    """
    special_tokens = ("[CLS]", "[SEP]")
    entity_spans = []
    # Start index and type of the multi-token entity that is currently open.
    # Both are cleared every time an entity is closed, so leftover state from
    # an earlier entity can never mask one that is still open when the loop
    # ends (previously a stale end index caused that entity to be dropped).
    open_start = None
    open_type = None

    last_label_index = len(predicted_entities) - 1
    labels = predicted_entities[:len(tokenized_text)]
    for i, (token, label) in enumerate(zip(tokenized_text, labels)):
        if token in special_tokens:
            continue
        # The final label position has no look-ahead label and is skipped.
        if label == "O" or i >= last_label_index:
            continue

        next_label = predicted_entities[i + 1]
        entity_ends_here = next_label.startswith(("B-", "O"))

        if label.startswith("B-"):
            if next_label.startswith("I-"):
                # Multi-token entity: remember where it starts and its type.
                open_start, open_type = i, label[2:]
            elif entity_ends_here:
                # Single-token entity.
                entity_spans.append((i, i, label[2:]))
                open_start = open_type = None
        elif label.startswith("I-") and entity_ends_here:
            if open_start is not None:
                entity_spans.append((open_start, i, open_type))
            open_start = open_type = None

    # An entity is still open when the label look-ahead never saw a closing
    # "B-"/"O" (e.g. the "[SEP]" position was labelled "I-..."): close it on
    # the last real token.
    if open_start is not None:
        entity_spans.append((open_start, len(tokenized_text) - 2, open_type))

    return [
        (tokenizer.convert_tokens_to_string(tokenized_text[first:last + 1]), kind)
        for first, last, kind in entity_spans
    ]
def run_ner(texts, idx2label, tokenizer, model, device):
    """Run the token-classification model on ``texts`` and collect entities.

    Args:
        texts: A list of sentences. A single string is accepted too and is
            treated as a one-sentence batch.
        idx2label: Mapping from predicted class index to BIO label string.
        tokenizer: Tokenizer used to encode the batch (padded, truncated to
            512 tokens) and to map ids back to tokens.
        model: Model called on the encoded batch; its output must expose
            ``logits``.
        device: Device the encoded batch is moved to before the forward pass.

    Returns:
        A flat list of ``(entity_text, entity_type)`` tuples covering all
        sentences, in input order. Empty when ``texts`` is empty.
    """
    # A bare string would be tokenized as one sequence but iterated per
    # character below, so normalise it to a one-item batch first.
    if isinstance(texts, str):
        texts = [texts]
    # Nothing to tag: skip tokenization and the forward pass entirely.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=2).tolist()

    save_pairs = []
    for input_ids, label_ids in zip(inputs["input_ids"], predicted_labels):
        predicted_entities = [idx2label[label] for label in label_ids]
        # Keep only the leading non-padding ids; this relies on padding being
        # appended after the real tokens.
        non_pad_length = (input_ids != tokenizer.pad_token_id).sum().item()
        tokenized_text = tokenizer.convert_ids_to_tokens(input_ids[:non_pad_length])
        save_pairs.extend(post_process(tokenized_text, predicted_entities, tokenizer))
    return save_pairs
# BIO tag inventory. A label's position in this list is the class index that
# idx2label maps back to it.
# NOTE(review): presumably this order must match the checkpoint's own label
# order - verify against the model config.
ner_labels = [
    'B-ABNORMALITY', 'I-ABNORMALITY',
    'B-NON-ABNORMALITY', 'I-NON-ABNORMALITY',
    'B-DISEASE', 'I-DISEASE',
    'B-NON-DISEASE', 'I-NON-DISEASE',
    'B-ANATOMY', 'I-ANATOMY',
    'O',
]
idx2label = dict(enumerate(ner_labels))

# Load the tokenizer and the token-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained('Angelakeke/RaTE-NER-Deberta')
model = AutoModelForTokenClassification.from_pretrained('Angelakeke/RaTE-NER-Deberta')

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Switch to evaluation mode for inference.
model.eval()
We recommend running inference sentence by sentence.
# Placeholder input: put the report text to analyse here.
text = ""

# Simple sentence segmentation: cut wherever a period is followed by a space,
# then tag all resulting sentences in one batch.
texts = text.split('. ')
save_pair = run_ner(
    texts=texts,
    idx2label=idx2label,
    tokenizer=tokenizer,
    model=model,
    device=device,
)
Author
Author: Weike Zhao
If you have any questions, please feel free to contact [email protected].
Citation
@inproceedings{zhao2024ratescore,
title={RaTEScore: A Metric for Radiology Report Generation},
author={Zhao, Weike and Wu, Chaoyi and Zhang, Xiaoman and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
pages={15004--15019},
year={2024}
}
- Downloads last month
- 15,663