f1: 88.43
| | precision | recall | f1-score | support |
|---|---|---|---|---|
| DAT | 0.96 | 0.97 | 0.96 | 182 |
| DUR | 0.79 | 0.82 | 0.80 | 50 |
| LOC | 0.70 | 0.79 | 0.74 | 206 |
| MNY | 0.87 | 1.00 | 0.93 | 20 |
| NOH | 0.91 | 0.93 | 0.92 | 1007 |
| ORG | 0.86 | 0.89 | 0.88 | 795 |
| PER | 0.92 | 0.95 | 0.94 | 853 |
| PNT | 0.78 | 0.78 | 0.78 | 60 |
| POH | 0.64 | 0.71 | 0.68 | 214 |
| TIM | 0.76 | 1.00 | 0.86 | 19 |
| micro avg | 0.87 | 0.90 | 0.88 | 3406 |
| macro avg | 0.82 | 0.89 | 0.85 | 3406 |
| weighted avg | 0.87 | 0.90 | 0.89 | 3406 |
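The table above matches the entity-level report format of the seqeval library. As a rough illustration of how such a report and the overall F1 score could be computed — assuming seqeval is installed, and with toy y_true/y_pred values that are ours, not the model's:

from seqeval.metrics import classification_report, f1_score

# Toy example only: each inner list is the BIO tag sequence of one sentence.
y_true = [['B-LOC', 'O', 'B-PER', 'I-PER', 'O']]
y_pred = [['B-LOC', 'O', 'B-PER', 'O', 'O']]
print(f1_score(y_true, y_pred))
print(classification_report(y_true, y_pred))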
from transformers import TFBertModel, BertTokenizer
import os
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from konlpy.tag import Mecab
mecab = Mecab()
checkpoint_path = "./cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)
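# The cp-{epoch:04d}.ckpt files are assumed to have been written during training,
# e.g. by a weights-only ModelCheckpoint callback along these lines (shown only to
# document where `latest` comes from; it is not part of this prediction script):
# cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
#                                                  save_weights_only=True, verbose=1)
# model.fit(train_inputs, train_labels, callbacks=[cp_callback])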
index_to_tag = {0: 'B-PER', 1: 'B-LOC', 2: 'I-ORG', 3: 'B-DAT', 4: 'O', 5: 'I-DUR', 6: 'I-TIM', 7: 'I-NOH', 8: 'B-MNY', 9: 'B-PNT', 10: 'I-PER', 11: 'I-PNT', 12: 'I-LOC', 13: 'I-DAT', 14: 'B-TIM', 15: 'B-POH', 16: 'B-NOH', 17: 'I-POH', 18: 'I-MNY', 19: 'B-ORG', 20: 'B-DUR'}
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")
class TFBertForTokenClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(num_labels,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Use the sequence output (one hidden state per token) for token-level classification.
        all_output = outputs[0]
        prediction = self.classifier(all_output)
        return prediction

model = TFBertForTokenClassification("klue/bert-base", num_labels=21)
model.load_weights(latest)
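The feature builder below marks subword continuations and padding with label id -100. For completeness, here is a sketch of the kind of masked loss such a labeling scheme would need at training time; it is only an illustration under that assumption and is never called in this prediction script:

def masked_sparse_categorical_crossentropy(y_true, y_pred):
    # Exclude positions labeled -100 (subword continuations and padding) from the loss.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    y_true_flat = tf.reshape(y_true, (-1,))
    active = tf.not_equal(y_true_flat, -100)
    labels = tf.boolean_mask(y_true_flat, active)
    logits = tf.boolean_mask(tf.reshape(y_pred, (-1, 21)), active)
    return loss_fn(labels, logits)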
def convert_examples_to_features_for_prediction(examples, max_seq_len, tokenizer,
                                                pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            # Each word may split into several subwords; only the first subword keeps a
            # real label position, the rest are masked with -100.
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) - 1))

        # Reserve room for the [CLS] and [SEP] special tokens.
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            label_mask = label_mask[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        label_mask += [pad_token_id_for_label]
        tokens = [cls_token] + tokens
        label_mask = [pad_token_id_for_label] + label_mask

        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)

        # Pad every sequence to max_seq_len.
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with labels length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label_mask)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks
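A quick sanity check of the feature builder, assuming the tokenizer loaded above; the pre-tokenized sample sentence is our own illustration, not data from the original text:

sample = [['대한민국', '의', '수도', '는', '서울', '이다', '.']]  # "The capital of South Korea is Seoul."
X_sample, sample_label_masks = convert_examples_to_features_for_prediction(sample, max_seq_len=128, tokenizer=tokenizer)
print(X_sample[0].shape, X_sample[1].shape, X_sample[2].shape, sample_label_masks.shape)  # each (1, 128)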
def ner_prediction(examples, max_seq_len, tokenizer, lang='ko'):
    # Korean input is split into morphemes with Mecab; other languages fall back to whitespace splitting.
    if lang == 'ko':
        examples = [mecab.morphs(sent) for sent in examples]
    else:
        examples = [sent.split() for sent in examples]

    X_pred, label_masks = convert_examples_to_features_for_prediction(
        examples, max_seq_len=max_seq_len, tokenizer=tokenizer)
    y_predicted = model.predict(X_pred)
    y_predicted = np.argmax(y_predicted, axis=2)

    pred_list = []
    result_list = []

    # Keep only the prediction for the first subword of each word (positions not masked with -100).
    for i in range(0, len(label_masks)):
        pred_tag = []
        for label_index, pred_index in zip(label_masks[i], y_predicted[i]):
            if label_index != -100:
                pred_tag.append(index_to_tag[pred_index])
        pred_list.append(pred_tag)

    # Pair each original word with its predicted tag.
    for example, pred in zip(examples, pred_list):
        one_sample_result = []
        for one_word, label_token in zip(example, pred):
            one_sample_result.append((one_word, label_token))
        result_list.append(one_sample_result)

    return result_list
sent1 = 'μΈμ°μμ νλνκ³ μλ μκ°μμ λΆμΌ κΉμ κ²½ μκ°λ μ΅κ·Ό μ§μ AI κΈ°μ μ½μ΄λ·ν¬λ°μ΄μμ νμμ ν΅ν νΉλ³ν μ μλ₯Ό μ΄μλ€.'
sent2 = 'κ°μΉκ΄μ΄λ μΈμμ λ°λΌ μΈμμ λΆμμ νκ² λ³΄λ μΈκ°μ΄ νμ΅μ ν΅ν΄ μΈμ§ν λΆλΆλ§μ μΈμνλ AIμ λΉμ·νλ€κ³ λ³΄κ³ μ μλ₯Ό κΈ°ννλ€.'
sent3 = 'λΆμ° κ΄μ리 ν΄λ³κ³Ό λ¬λ§μ΄ κ³ κ° λ± μ λ μΈκ΅¬μ μ°¨λ μ΄λμ΄ λ§μ μ§μ λͺ κ³³μ κ³¨λΌ CCTV λ°μ΄ν° μ μ 보λ₯Ό μ΄λ»κ² μΈμνλμ§, 곡κ°μ λν μ°°λλ₯Ό ννν μκ°μ μνμ μ΄λ»κ² μΈμνλμ§ μ°¨μ΄λ₯Ό λΉκ΅νλ€.'
test_samples = [sent1, sent2, sent3]
ner_prediction(test_samples, max_seq_len=128, tokenizer=tokenizer, lang='ko')
[[('μΈμ°', 'B-LOC'),
('μμ', 'O'),
('νλ', 'O'),
('ν', 'O'),
('κ³ ', 'O'),
('μ', 'O'),
('λ', 'O'),
('μκ°', 'O'),
('μμ ', 'O'),
('λΆμΌ', 'O'),
('κΉμ κ²½', 'B-PER'),
('μκ°', 'O'),
('λ', 'O'),
('μ΅κ·Ό', 'O'),
('μ§μ', 'O'),
('AI', 'O'),
('κΈ°μ', 'O'),
('μ½μ΄', 'B-ORG'),
('λ·', 'I-ORG'),
('ν¬λ°μ΄', 'I-ORG'),
('μ', 'O'),
('μ', 'O'),
('νμ', 'O'),
('μ', 'O'),
('ν΅ν', 'O'),
('νΉλ³', 'O'),
('ν', 'O'),
('μ μ', 'O'),
('λ₯Ό', 'O'),
('μ΄', 'O'),
('μ', 'O'),
('λ€', 'O'),
('.', 'O')],
[('κ°μΉκ΄', 'O'),
('μ΄λ', 'O'),
('μΈμ', 'O'),
('μ', 'O'),
('λ°λΌ', 'O'),
('μΈμ', 'O'),
('μ', 'O'),
('λΆ', 'O'),
('μμ ', 'O'),
('ν', 'O'),
('κ²', 'O'),
('보', 'O'),
('λ', 'O'),
('μΈκ°', 'O'),
('μ΄', 'O'),
('νμ΅', 'O'),
('μ', 'O'),
('ν΅ν΄', 'O'),
('μΈμ§', 'O'),
('ν', 'O'),
('λΆλΆ', 'O'),
('λ§', 'O'),
('μ', 'O'),
('μΈμ', 'O'),
('ν', 'O'),
('λ', 'O'),
('AI', 'O'),
('μ', 'O'),
('λΉμ·', 'O'),
('ν', 'O'),
('λ€κ³ ', 'O'),
('보', 'O'),
('κ³ ', 'O'),
('μ μ', 'O'),
('λ₯Ό', 'O'),
('κΈ°ν', 'O'),
('ν', 'O'),
('λ€', 'O'),
('.', 'O')],
[('λΆμ°', 'B-LOC'),
('κ΄μ리', 'I-LOC'),
('ν΄λ³', 'I-LOC'),
('κ³Ό', 'O'),
('λ¬λ§μ΄', 'B-LOC'),
('κ³ κ°', 'I-LOC'),
('λ±', 'O'),
('μ λ', 'O'),
('μΈκ΅¬', 'O'),
('μ', 'O'),
('μ°¨λ', 'O'),
('μ΄λ', 'O'),
('μ΄', 'O'),
('λ§', 'O'),
('μ', 'O'),
('μ§μ', 'O'),
('λͺ', 'O'),
('κ³³', 'O'),
('μ', 'O'),
('골λΌ', 'O'),
('CCTV', 'O'),
('λ°μ΄ν°', 'O'),
('μ', 'O'),
('μ 보', 'O'),
('λ₯Ό', 'O'),
('μ΄λ»κ²', 'O'),
('μΈμ', 'O'),
('ν', 'O'),
('λμ§', 'O'),
(',', 'O'),
('곡κ°', 'O'),
('μ', 'O'),
('λν', 'O'),
('μ°°λ', 'O'),
('λ₯Ό', 'O'),
('νν', 'O'),
('ν', 'O'),
('μκ°', 'O'),
('μ', 'O'),
('μν', 'O'),
('μ', 'O'),
('μ΄λ»κ²', 'O'),
('μΈμ', 'O'),
('ν', 'O'),
('λμ§', 'O'),
('μ°¨μ΄', 'O'),
('λ₯Ό', 'O'),
('λΉκ΅', 'O'),
('ν', 'O'),
('λ€', 'O'),
('.', 'O')]]
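The output above is a flat list of (word, BIO tag) pairs per sentence. If whole entity spans are wanted instead, consecutive B-/I- tags of the same type can be merged; a minimal sketch follows (the helper name is ours, not part of the original code):

def group_entities(tagged_words):
    # Merge consecutive B-XXX / I-XXX tags into (entity_text, entity_type) spans.
    entities, span_words, span_type = [], [], None
    for word, tag in tagged_words:
        if tag.startswith('B-'):
            if span_words:
                entities.append((' '.join(span_words), span_type))
            span_words, span_type = [word], tag[2:]
        elif tag.startswith('I-') and span_words and tag[2:] == span_type:
            span_words.append(word)
        else:
            if span_words:
                entities.append((' '.join(span_words), span_type))
            span_words, span_type = [], None
    if span_words:
        entities.append((' '.join(span_words), span_type))
    return entities

# e.g. for the third sentence above, the consecutive B-LOC / I-LOC tags collapse into two location spans.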
tensorflow-estimator==2.5.0
tensorflow-gpu==2.5.3
transformers @ git+https://github.com/davidegazze/transformers@cf28c1db00410f0df3e654d9866e0ff1d3a45f29
numpy==1.24.3
konlpy==0.6.0