Edit model card

Japanese BERT-base (Sudachi + Unigram)

How to load the tokenizer

Please download the dictionary file for Sudachi + Unigram from our GitHub repository. Then load the tokenizer by setting dict_path to the path of the downloaded dictionary file.

from typing import Optional

from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast

from sudachipy import tokenizer
from sudachipy import dictionary
import textspan

class SudachiPreTokenizer:
    """Pre-tokenizer that cuts text at the token boundaries produced by Sudachi.

    Exposes ``pre_tokenize``, which splits a ``PreTokenizedString`` in place
    using ``custom_split``.
    """

    def __init__(self, mecab_dict_path: Optional[str] = None):
        """Build the Sudachi tokenizer used for splitting.

        Args:
            mecab_dict_path: Accepted but never read by this class; the
                Sudachi dictionary is always created with default arguments.
        """
        sudachi_dictionary = dictionary.Dictionary()
        self.sudachi = sudachi_dictionary.create()

    def tokenize(self, sequence: str) -> list[str]:
        """Return the surface string of every token Sudachi yields for *sequence*."""
        surfaces = []
        for morpheme in self.sudachi.tokenize(sequence):
            surfaces.append(morpheme.surface())
        return surfaces

    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        """Split *normalized_string* into one slice per token span.

        Args:
            i: Index passed in by ``PreTokenizedString.split``; not used here.
            normalized_string: The piece of text to split.

        Returns:
            Slices of *normalized_string*, one for each ``(start, end)`` span
            that ``textspan.get_original_spans`` reports for the tokens.
        """
        raw_text = str(normalized_string)
        surfaces = self.tokenize(raw_text)
        pieces = []
        # Each token maps to a list of (start, end) spans in the raw text;
        # slicing the NormalizedString itself (rather than the plain str)
        # is what the original code does for every span.
        for char_spans in textspan.get_original_spans(surfaces, raw_text):
            for start, end in char_spans:
                pieces.append(normalized_string[start:end])
        return pieces

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split *pretok* in place by applying ``custom_split`` to it."""
        pretok.split(self.custom_split)

# load a pre-tokenizer
pre_tokenizer = SudachiPreTokenizer()

# load a tokenizer
# Replace this placeholder with the location of the downloaded dictionary file.
# The path must be a string literal; an unquoted path is a syntax error.
dict_path = "/path/to/sudachi_unigram.json"
# NOTE: from here on the name `tokenizer` refers to this object and no longer
# to the `tokenizer` module imported from sudachipy.
tokenizer = Tokenizer.from_file(dict_path)
# Wrap every encoded sequence as "[CLS] ... [SEP]".
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id('[CLS]')),
    sep=("[SEP]", tokenizer.token_to_id('[SEP]'))
)

# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]'
)

# set a pre-tokenizer
# NOTE(review): this is done after the conversion above, presumably because a
# custom Python pre-tokenizer cannot be serialized/copied — confirm before
# reordering.
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)

# Test
test_str = "こんにちは。私は形態素解析器について研究をしています。"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]','こんにち','は','。','私','は','形態','素','解','析','器','に','つい','て','研究','を','し','て','い','ま','す','。','[SEP]']

How to load the model

from transformers import AutoModelForMaskedLM
# The repository id has to match this model's name on the Hub, which
# includes the "-japanese" part.
model = AutoModelForMaskedLM.from_pretrained("hitachi-nlp/bert-base-japanese_sudachi-unigram")

See our repository for more details!

Downloads last month
2
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Datasets used to train hitachi-nlp/bert-base-japanese_sudachi-unigram