f1: 88.43
| | precision | recall | f1-score | support |
|---|---|---|---|---|
| DAT | 0.96 | 0.97 | 0.96 | 182 |
| DUR | 0.79 | 0.82 | 0.80 | 50 |
| LOC | 0.70 | 0.79 | 0.74 | 206 |
| MNY | 0.87 | 1.00 | 0.93 | 20 |
| NOH | 0.91 | 0.93 | 0.92 | 1007 |
| ORG | 0.86 | 0.89 | 0.88 | 795 |
| PER | 0.92 | 0.95 | 0.94 | 853 |
| PNT | 0.78 | 0.78 | 0.78 | 60 |
| POH | 0.64 | 0.71 | 0.68 | 214 |
| TIM | 0.76 | 1.00 | 0.86 | 19 |
| micro avg | 0.87 | 0.90 | 0.88 | 3406 |
| macro avg | 0.82 | 0.89 | 0.85 | 3406 |
| weighted avg | 0.87 | 0.90 | 0.89 | 3406 |
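The table above matches the entity-level report format of the seqeval library. As a rough illustration of how such a report and the overall F1 score could be computed — assuming seqeval is installed, and with toy y_true/y_pred values that are ours, not the model's:

from seqeval.metrics import classification_report, f1_score

# Toy example only: each inner list is the BIO tag sequence of one sentence.
y_true = [['B-LOC', 'O', 'B-PER', 'I-PER', 'O']]
y_pred = [['B-LOC', 'O', 'B-PER', 'O', 'O']]
print(f1_score(y_true, y_pred))
print(classification_report(y_true, y_pred))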
from transformers import TFBertModel, BertTokenizer
import os
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from konlpy.tag import Mecab
mecab = Mecab()
checkpoint_path = "./cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)
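# The cp-{epoch:04d}.ckpt files are assumed to have been written during training,
# e.g. by a weights-only ModelCheckpoint callback along these lines (shown only to
# document where `latest` comes from; it is not part of this prediction script):
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
#                                                  save_weights_only=True, verbose=1)
# model.fit(train_inputs, train_labels, callbacks=[cp_callback])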
index_to_tag = {0: 'B-PER', 1: 'B-LOC', 2: 'I-ORG', 3: 'B-DAT', 4: 'O', 5: 'I-DUR', 6: 'I-TIM', 7: 'I-NOH', 8: 'B-MNY', 9: 'B-PNT', 10: 'I-PER', 11: 'I-PNT', 12: 'I-LOC', 13: 'I-DAT', 14: 'B-TIM', 15: 'B-POH', 16: 'B-NOH', 17: 'I-POH', 18: 'I-MNY', 19: 'B-ORG', 20: 'B-DUR'}
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")
class TFBertForTokenClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(num_labels,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Use the sequence output (one hidden state per token) for token-level classification.
        all_output = outputs[0]
        prediction = self.classifier(all_output)
        return prediction

model = TFBertForTokenClassification("klue/bert-base", num_labels=21)
model.load_weights(latest)
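The feature builder below marks subword continuations and padding with label id -100. For completeness, here is a sketch of the kind of masked loss such a labeling scheme would need at training time; it is only an illustration under that assumption and is never called in this prediction script:

def masked_sparse_categorical_crossentropy(y_true, y_pred):
    # Exclude positions labeled -100 (subword continuations and padding) from the loss.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    y_true_flat = tf.reshape(y_true, (-1,))
    active = tf.not_equal(y_true_flat, -100)
    labels = tf.boolean_mask(y_true_flat, active)
    logits = tf.boolean_mask(tf.reshape(y_pred, (-1, 21)), active)
    return loss_fn(labels, logits)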
def convert_examples_to_features_for_prediction(examples, max_seq_len, tokenizer,
                                                pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            # Each word may split into several subwords; only the first subword keeps a
            # real label position, the rest are masked with -100.
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) - 1))

        # Reserve room for the [CLS] and [SEP] special tokens.
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            label_mask = label_mask[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        label_mask += [pad_token_id_for_label]
        tokens = [cls_token] + tokens
        label_mask = [pad_token_id_for_label] + label_mask

        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)

        # Pad every sequence to max_seq_len.
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with labels length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label_mask)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks
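A quick sanity check of the feature builder, assuming the tokenizer loaded above; the pre-tokenized sample sentence is our own illustration, not data from the original text:

sample = [['대한민국', '의', '수도', '는', '서울', '이다', '.']]  # "The capital of South Korea is Seoul."
X_sample, sample_label_masks = convert_examples_to_features_for_prediction(sample, max_seq_len=128, tokenizer=tokenizer)
print(X_sample[0].shape, X_sample[1].shape, X_sample[2].shape, sample_label_masks.shape)  # each (1, 128)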
def ner_prediction(examples, max_seq_len, tokenizer, lang='ko'):
    # Korean input is split into morphemes with Mecab; other languages fall back to whitespace splitting.
    if lang == 'ko':
        examples = [mecab.morphs(sent) for sent in examples]
    else:
        examples = [sent.split() for sent in examples]

    X_pred, label_masks = convert_examples_to_features_for_prediction(
        examples, max_seq_len=max_seq_len, tokenizer=tokenizer)
    y_predicted = model.predict(X_pred)
    y_predicted = np.argmax(y_predicted, axis=2)

    pred_list = []
    result_list = []

    # Keep only the prediction for the first subword of each word (positions not masked with -100).
    for i in range(0, len(label_masks)):
        pred_tag = []
        for label_index, pred_index in zip(label_masks[i], y_predicted[i]):
            if label_index != -100:
                pred_tag.append(index_to_tag[pred_index])
        pred_list.append(pred_tag)

    # Pair each original word with its predicted tag.
    for example, pred in zip(examples, pred_list):
        one_sample_result = []
        for one_word, label_token in zip(example, pred):
            one_sample_result.append((one_word, label_token))
        result_list.append(one_sample_result)

    return result_list
sent1 = 'μΈμ°μμ νλνκ³ μλ μκ°μμ λΆμΌ κΉμ κ²½ μκ°λ μ΅κ·Ό μ§μ AI κΈ°μ μ½μ΄λ·ν¬λ°μ΄μμ νμμ ν΅ν νΉλ³ν μ μλ₯Ό μ΄μλ€.'
sent2 = 'κ°μΉκ΄μ΄λ μΈμμ λ°λΌ μΈμμ λΆμμ νκ² λ³΄λ μΈκ°μ΄ νμ΅μ ν΅ν΄ μΈμ§ν λΆλΆλ§μ μΈμνλ AIμ λΉμ·νλ€κ³ λ³΄κ³ μ μλ₯Ό κΈ°ννλ€.'
sent3 = 'λΆμ° κ΄μ리 ν΄λ³κ³Ό λ¬λ§μ΄ κ³ κ° λ± μ λ μΈκ΅¬μ μ°¨λ μ΄λμ΄ λ§μ μ§μ λͺ κ³³μ κ³¨λΌ CCTV λ°μ΄ν° μ μ 보λ₯Ό μ΄λ»κ² μΈμνλμ§, 곡κ°μ λν μ°°λλ₯Ό ννν μκ°μ μνμ μ΄λ»κ² μΈμνλμ§ μ°¨μ΄λ₯Ό λΉκ΅νλ€.'
test_samples = [sent1, sent2, sent3]
ner_prediction(test_samples, max_seq_len=128, tokenizer=tokenizer, lang='ko')
[[('μΈμ°', 'B-LOC'),
('μμ', 'O'),
('νλ', 'O'),
('ν', 'O'),
('κ³ ', 'O'),
('μ', 'O'),
('λ', 'O'),
('μκ°', 'O'),
('μμ ', 'O'),
('λΆμΌ', 'O'),
('κΉμ κ²½', 'B-PER'),
('μκ°', 'O'),
('λ', 'O'),
('μ΅κ·Ό', 'O'),
('μ§μ', 'O'),
('AI', 'O'),
('κΈ°μ', 'O'),
('μ½μ΄', 'B-ORG'),
('λ·', 'I-ORG'),
('ν¬λ°μ΄', 'I-ORG'),
('μ', 'O'),
('μ', 'O'),
('νμ', 'O'),
('μ', 'O'),
('ν΅ν', 'O'),
('νΉλ³', 'O'),
('ν', 'O'),
('μ μ', 'O'),
('λ₯Ό', 'O'),
('μ΄', 'O'),
('μ', 'O'),
('λ€', 'O'),
('.', 'O')],
[('κ°μΉκ΄', 'O'),
('μ΄λ', 'O'),
('μΈμ', 'O'),
('μ', 'O'),
('λ°λΌ', 'O'),
('μΈμ', 'O'),
('μ', 'O'),
('λΆ', 'O'),
('μμ ', 'O'),
('ν', 'O'),
('κ²', 'O'),
('보', 'O'),
('λ', 'O'),
('μΈκ°', 'O'),
('μ΄', 'O'),
('νμ΅', 'O'),
('μ', 'O'),
('ν΅ν΄', 'O'),
('μΈμ§', 'O'),
('ν', 'O'),
('λΆλΆ', 'O'),
('λ§', 'O'),
('μ', 'O'),
('μΈμ', 'O'),
('ν', 'O'),
('λ', 'O'),
('AI', 'O'),
('μ', 'O'),
('λΉμ·', 'O'),
('ν', 'O'),
('λ€κ³ ', 'O'),
('보', 'O'),
('κ³ ', 'O'),
('μ μ', 'O'),
('λ₯Ό', 'O'),
('κΈ°ν', 'O'),
('ν', 'O'),
('λ€', 'O'),
('.', 'O')],
[('λΆμ°', 'B-LOC'),
('κ΄μ리', 'I-LOC'),
('ν΄λ³', 'I-LOC'),
('κ³Ό', 'O'),
('λ¬λ§μ΄', 'B-LOC'),
('κ³ κ°', 'I-LOC'),
('λ±', 'O'),
('μ λ', 'O'),
('μΈκ΅¬', 'O'),
('μ', 'O'),
('μ°¨λ', 'O'),
('μ΄λ', 'O'),
('μ΄', 'O'),
('λ§', 'O'),
('μ', 'O'),
('μ§μ', 'O'),
('λͺ', 'O'),
('κ³³', 'O'),
('μ', 'O'),
('골λΌ', 'O'),
('CCTV', 'O'),
('λ°μ΄ν°', 'O'),
('μ', 'O'),
('μ 보', 'O'),
('λ₯Ό', 'O'),
('μ΄λ»κ²', 'O'),
('μΈμ', 'O'),
('ν', 'O'),
('λμ§', 'O'),
(',', 'O'),
('곡κ°', 'O'),
('μ', 'O'),
('λν', 'O'),
('μ°°λ', 'O'),
('λ₯Ό', 'O'),
('νν', 'O'),
('ν', 'O'),
('μκ°', 'O'),
('μ', 'O'),
('μν', 'O'),
('μ', 'O'),
('μ΄λ»κ²', 'O'),
('μΈμ', 'O'),
('ν', 'O'),
('λμ§', 'O'),
('μ°¨μ΄', 'O'),
('λ₯Ό', 'O'),
('λΉκ΅', 'O'),
('ν', 'O'),
('λ€', 'O'),
('.', 'O')]]
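The output above is a flat list of (word, BIO tag) pairs per sentence. If whole entity spans are wanted instead, consecutive B-/I- tags of the same type can be merged; a minimal sketch follows (the helper name is ours, not part of the original code):

def group_entities(tagged_words):
    # Merge consecutive B-XXX / I-XXX tags into (entity_text, entity_type) spans.
    entities, span_words, span_type = [], [], None
    for word, tag in tagged_words:
        if tag.startswith('B-'):
            if span_words:
                entities.append((' '.join(span_words), span_type))
            span_words, span_type = [word], tag[2:]
        elif tag.startswith('I-') and span_words and tag[2:] == span_type:
            span_words.append(word)
        else:
            if span_words:
                entities.append((' '.join(span_words), span_type))
            span_words, span_type = [], None
    if span_words:
        entities.append((' '.join(span_words), span_type))
    return entities

# e.g. for the third sentence above, the consecutive B-LOC / I-LOC tags collapse into two location spans.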
tensorflow-estimator==2.5.0
tensorflow-gpu==2.5.3
transformers @ git+https://github.com/davidegazze/transformers@cf28c1db00410f0df3e654d9866e0ff1d3a45f29
numpy==1.24.3
konlpy==0.6.0