Library versions
- transformers: 4.21.1
- datasets: 2.4.0
- tokenizers: 0.12.1
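The installed versions can be verified at runtime; a minimal check (not part of the original card):

import datasets
import tokenizers
import transformers

# Expected: 4.21.1 / 2.4.0 / 0.12.1
print(transformers.__version__, datasets.__version__, tokenizers.__version__)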
Training code
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(unicode_normalizer="nfkc", trim_offsets=True)

ds = load_dataset("Bingsu/my-korean-training-corpus", use_auth_token=True)
# If you want to use a public dataset instead:
# ds = load_dataset("cc100", lang="ko")  # 50GB

# This corpus is about 35GB; using all of it overwhelms my machine, so only a subset is used.
ds_sample = ds["train"].train_test_split(0.35, seed=20220819)["test"]

def gen_text(batch_size: int = 5000):
    # Yield batches of raw text for iterator-based training.
    for i in range(0, len(ds_sample), batch_size):
        yield ds_sample[i : i + batch_size]["text"]

tokenizer.train_from_iterator(
    gen_text(),
    vocab_size=50265,  # same vocab size as roberta-base
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)
tokenizer.save("my_tokenizer.json")
Training took about 7 hours (i5-12600, non-K).
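As a quick sanity check (a sketch, not part of the original run), the saved file can be reloaded and applied to an arbitrary sentence:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("my_tokenizer.json")
enc = tok.encode("안녕하세요, 토크나이저 테스트입니다.")  # any sample sentence
print(enc.tokens)  # byte-level BPE pieces; no <s>/</s> yet, since the post-processor is replaced below
print(enc.ids)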
Afterwards, the tokenizer's post-processor is replaced with RobertaProcessing.
from tokenizers import Tokenizer
from tokenizers.processors import RobertaProcessing

tokenizer = Tokenizer.from_file("my_tokenizer.json")
tokenizer.post_processor = RobertaProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
    add_prefix_space=False,
)
tokenizer.save("my_tokenizer2.json")
The add_prefix_space=False option is there to match roberta-base exactly.
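To confirm that the new post-processor actually attaches the special tokens (again just a sketch):

from tokenizers import Tokenizer

tok = Tokenizer.from_file("my_tokenizer2.json")
enc = tok.encode("후처리 확인용 문장입니다.")  # any sample sentence
print(enc.tokens[0], enc.tokens[-1])  # expected: <s> </s>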
I also set model_max_length:
from transformers import RobertaTokenizerFast

rt = RobertaTokenizerFast(tokenizer_file="my_tokenizer2.json")
rt.save_pretrained("./my_roberta_tokenizer")
Then add "model_max_length": 512, to the tokenizer_config.json file in the saved folder.
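Equivalently (a sketch, assuming the same file names as above), the value can be set in Python before saving, and save_pretrained will write it into tokenizer_config.json:

from transformers import RobertaTokenizerFast

rt = RobertaTokenizerFast(tokenizer_file="my_tokenizer2.json", model_max_length=512)
rt.save_pretrained("./my_roberta_tokenizer")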
Usage
1.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Bingsu/BBPE_tokenizer_test")
# tokenizer is an instance of RobertaTokenizerFast.
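A small usage example (the sentence is arbitrary; with the special-token order used during training, <s> is id 0 and </s> is id 2):

enc = tokenizer("안녕하세요, 반갑습니다.")
print(enc.input_ids)                     # starts with 0 (<s>) and ends with 2 (</s>)
print(tokenizer.decode(enc.input_ids))   # should round-trip to the input wrapped in <s> ... </s>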
2.
Download the tokenizer.json file first.
from transformers import BartTokenizerFast, BertTokenizerFast
bart_tokenizer = BartTokenizerFast(tokenizer_file="tokenizer.json")
bert_tokenizer = BertTokenizerFast(tokenizer_file="tokenizer.json")
It can of course be loaded into BART, which uses byte-level BPE like RoBERTa, and it can even be loaded into BERT. However, when loaded this way, model_max_length is not set, so you have to set it yourself (see the sketch below).
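A minimal sketch, reusing the 512 value chosen above:

from transformers import BartTokenizerFast

bart_tokenizer = BartTokenizerFast(tokenizer_file="tokenizer.json", model_max_length=512)
# or set it after loading:
bart_tokenizer.model_max_length = 512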