fasttext-classification / fasttext_jp_embedding.py

commit files to HF hub

cd708ec almost 2 years ago

2.23 kB

	from __future__ import annotations
	from transformers import PretrainedConfig
	from transformers import PreTrainedModel
	from torch import nn
	import torch
	from torchtyping import TensorType


	class FastTextJpConfig(PretrainedConfig):
	    """Configuration for ``FastTextJpModel``.

	    Carries the vocabulary size, the embedding width and the tokenizer
	    class name so that they are stored alongside the model.
	    """

	    model_type = "fasttext_jp"

	    def __init__(
	        self,
	        vocab_size=1,
	        hidden_size=1,
	        tokenizer_class="FastTextJpTokenizer",
	        **kwargs,
	    ):
	        """Initialise the configuration.

	        Args:
	            vocab_size (int, optional):
	                Number of entries in the vocabulary. Unless it is given,
	                the model cannot be loaded from a pipeline. It is written
	                to config.json.
	            hidden_size (int, optional):
	                Dimension of each word vector. Unless it is given, the
	                model cannot be loaded from a pipeline. It is written to
	                config.json.
	            tokenizer_class (str, optional):
	                Name of the tokenizer class. Unless it is given, the
	                tokenizer cannot be loaded from a pipeline. It is written
	                to config.json.
	            **kwargs:
	                Forwarded unchanged to ``PretrainedConfig.__init__``.
	        """
	        # Hand the explicit arguments to the base class together with the
	        # remaining keyword arguments so that it records all of them.
	        kwargs.update(
	            vocab_size=vocab_size,
	            hidden_size=hidden_size,
	            tokenizer_class=tokenizer_class,
	        )
	        super().__init__(**kwargs)


	class FastTextJpModel(PreTrainedModel):
	    """Perform the FastText word embedding.

	    The model holds a single ``nn.Embedding`` table with
	    ``config.vocab_size`` rows and ``config.hidden_size`` columns.
	    """

	    config_class = FastTextJpConfig

	    def __init__(self, config: FastTextJpConfig):
	        """Create the embedding table described by ``config``.

	        Args:
	            config: Supplies ``vocab_size`` (number of rows) and
	                ``hidden_size`` (width of each vector).
	        """
	        super().__init__(config)
	        self.word_embeddings = nn.Embedding(config.vocab_size,
	                                            config.hidden_size)

	    def forward(self, **inputs) -> TensorType["batch", "word", "vectors"]:
	        """Look up the embedding vector of every token id.

	        Args:
	            **inputs: Keyword inputs; only ``input_ids`` is read. It may
	                be an integer tensor or anything ``torch.as_tensor``
	                accepts (for example a nested list of ints).

	        Returns:
	            TensorType["batch", "word", "vectors"]: One vector per word.

	        Raises:
	            KeyError: If ``input_ids`` is not passed.
	        """
	        # ``torch.Tensor(...)`` is the legacy float32 constructor: it turns
	        # list/array ids into floats, which ``nn.Embedding`` rejects because
	        # indices must be integer typed. ``as_tensor`` with an explicit long
	        # dtype keeps an existing LongTensor as is and converts the rest.
	        input_ids = torch.as_tensor(inputs["input_ids"], dtype=torch.long)
	        return self.word_embeddings(input_ids)


	# The classes have to be registered with the Auto* classes, but the
	# recommended way of doing so seems to keep changing and has not settled
	# (as of 2022/11/6).
	# https://huggingface.co/docs/transformers/custom_models#sending-the-code-to-the-hub
	FastTextJpConfig.register_for_auto_class("AutoConfig")
	FastTextJpModel.register_for_auto_class("AutoModel")