paulhindemith committed
Commit cd708ec
1 Parent(s): 624864d

commit files to HF hub

fasttext_fsc.py CHANGED
@@ -34,7 +34,7 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
 
     def __init__(self, config: FastTextForSeuqenceClassificationConfig):
 
-        self.ngram = config.ngram
+        self.max_ngram = config.ngram
         super().__init__(config)
 
     def forward(self, **inputs) -> SequenceClassifierOutput:
@@ -58,7 +58,7 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
                                             attention_mask == 1)]
         candidate_label = output[torch.logical_and(token_type_ids == 1,
                                                    attention_mask == 1)]
-        sentence_words = self.split_ngram(sentence, self.ngram)
+        sentence_words = self.split_ngram(sentence, self.max_ngram)
         candidate_label_mean = torch.mean(candidate_label,
                                           dim=-2,
                                           keepdim=True)
@@ -76,7 +76,8 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
             self, sentence_words: TensorType["words", "vectors"],
             candidate_label_means: TensorType[1, "vectors"]) -> TensorType[1]:
         res = torch.tensor(0.)
-        for sw in sentence_words:
+        for i in range(len(sentence_words)):
+            sw = sentence_words[i]
             p = torch.nn.functional.cosine_similarity(sw,
                                                       candidate_label_means[0],
                                                       dim=0)
@@ -87,6 +88,8 @@ class FastTextForSeuqenceClassification(FastTextJpModel):
     def split_ngram(self, sentences: TensorType["word", "vectors"],
                     n: int) -> TensorType["word", "vectors"]:
         res = []
+        if len(sentences) <= n:
+            return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])
         for i in range(len(sentences) - n + 1):
             ngram = sentences[i:i + n]
             res.append(torch.mean(ngram, dim=0, keepdim=False))
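
The last hunk guards split_ngram against sentences shorter than the window size. A minimal sketch of the patched behavior follows; the trailing torch.stack return and the tensor shapes are assumptions, not taken from the file:

    import torch

    def split_ngram(sentences: torch.Tensor, n: int) -> torch.Tensor:
        # Average every length-n window of word vectors.
        if len(sentences) <= n:
            # New guard: a short sentence yields one mean vector instead of
            # an empty window list (torch.stack([]) would raise a RuntimeError).
            return torch.stack([torch.mean(sentences, dim=0, keepdim=False)])
        res = []
        for i in range(len(sentences) - n + 1):
            res.append(torch.mean(sentences[i:i + n], dim=0, keepdim=False))
        return torch.stack(res)  # assumed return; the visible diff cuts off above

    words = torch.randn(2, 300)          # two word vectors, fewer than n=3
    print(split_ngram(words, 3).shape)   # torch.Size([1, 300])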
fasttext_jp_embedding.py CHANGED
@@ -11,14 +11,26 @@ class FastTextJpConfig(PretrainedConfig):
     """
     model_type = "fasttext_jp"
 
-    def __init__(self, tokenizer_class="FastTextJpTokenizer", **kwargs):
+    def __init__(self,
+                 vocab_size=1,
+                 hidden_size=1,
+                 tokenizer_class="FastTextJpTokenizer",
+                 **kwargs):
         """Initializer.
 
         Args:
             tokenizer_class (str, optional):
                 Unless tokenizer_class is specified, the tokenizer cannot be
                 loaded by a pipeline. Recorded in config.json.
+            vocab_size (str, optional):
+                Unless vocab_size is specified, the model cannot be
+                loaded by a pipeline. Recorded in config.json.
+            hidden_size (str, optional):
+                Unless hidden_size is specified, the model cannot be
+                loaded by a pipeline. Recorded in config.json.
         """
+        kwargs["vocab_size"] = vocab_size
+        kwargs["hidden_size"] = hidden_size
         kwargs["tokenizer_class"] = tokenizer_class
         super().__init__(**kwargs)
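
Routing vocab_size and hidden_size through kwargs matters because PretrainedConfig serializes every kwarg into config.json, so a pipeline can rebuild the config without extra arguments. A quick round-trip sketch; the import path and the example dimensions are assumptions:

    from fasttext_jp_embedding import FastTextJpConfig  # assumed module path

    config = FastTextJpConfig(vocab_size=32000, hidden_size=300)
    config.save_pretrained("./fasttext_jp")              # writes config.json

    reloaded = FastTextJpConfig.from_pretrained("./fasttext_jp")
    assert reloaded.vocab_size == 32000
    assert reloaded.hidden_size == 300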
 
mecab_tokenizer.py CHANGED
@@ -12,6 +12,8 @@ class MeCabResult(NamedTuple):
 
 
 class MeCabTokenizer(PreTrainedTokenizer):
+    target_hinshi: list[str] | None
+    mecab: MeCab.Tagger
 
     def __init__(self,
                  hinshi: list[str] | None = None,
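
The two class-level annotations only declare the attributes that __init__ assigns: the hinshi (part-of-speech) filter and the MeCab tagger. For reference, a minimal sketch of what a MeCab.Tagger produces (mecab-python3; assumes a dictionary such as unidic-lite is installed):

    import MeCab  # pip install mecab-python3

    tagger = MeCab.Tagger()
    # Each output line is "surface\tfeature,..."; hinshi is the first
    # feature field, which the tokenizer filters on.
    print(tagger.parse("すもももももももものうち"))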