lijie.wang committed
Commit caf7c16
1 Parent(s): 25ba07f

update model and config 3T

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. .gitattributes +1 -0
  2. model_hubs/Skywork-13B-Base-3T/config.json +27 -0
  3. model_hubs/Skywork-13B-Base-3T/configuration_skywork.py +89 -0
  4. model_hubs/Skywork-13B-Base-3T/generation_config.json +10 -0
  5. model_hubs/Skywork-13B-Base-3T/modeling_skywork.py +911 -0
  6. model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin +3 -0
  7. model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin +3 -0
  8. model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin +3 -0
  9. model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin +3 -0
  10. model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin +3 -0
  11. model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin +3 -0
  12. model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin +3 -0
  13. model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin +3 -0
  14. model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin +3 -0
  15. model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin +3 -0
  16. model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin +3 -0
  17. model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin +3 -0
  18. model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin +3 -0
  19. model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin +3 -0
  20. model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin +3 -0
  21. model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin +3 -0
  22. model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin +3 -0
  23. model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin +3 -0
  24. model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin +3 -0
  25. model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin +3 -0
  26. model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin +3 -0
  27. model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin +3 -0
  28. model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin +3 -0
  29. model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin +3 -0
  30. model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin +3 -0
  31. model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin +3 -0
  32. model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin +3 -0
  33. model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin +3 -0
  34. model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin +3 -0
  35. model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin +3 -0
  36. model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin +3 -0
  37. model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin +3 -0
  38. model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin +3 -0
  39. model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin +3 -0
  40. model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin +3 -0
  41. model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin +3 -0
  42. model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin +3 -0
  43. model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin +3 -0
  44. model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin +3 -0
  45. model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin +3 -0
  46. model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin +3 -0
  47. model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin +3 -0
  48. model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin +3 -0
  49. model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin +3 -0
  50. model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin +3 -0
.gitattributes CHANGED
@@ -351,3 +351,4 @@ model_hubs/Skywork-13B-Base-1T/pytorch_model-00051-of-00053.bin filter=lfs diff=
 model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
 model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
 model_hubs/Skywork-13B-Base-2T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/model_hubs/Skywork-13B-Base-3T/*.bin filter=lfs diff=lfs merge=lfs -text
model_hubs/Skywork-13B-Base-3T/config.json ADDED
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "SkyworkForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_skywork.SkyworkConfig",
+    "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+  },
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "hidden_act": "silu",
+  "hidden_size": 4608,
+  "initializer_range": 0.01,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 131072,
+  "model_type": "skywork",
+  "num_attention_heads": 36,
+  "num_hidden_layers": 52,
+  "num_key_value_heads": 36,
+  "rms_norm_eps": 1e-06,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.33.1",
+  "use_cache": true,
+  "vocab_size": 65519
+}
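Note: the auto_map entries route AutoConfig/AutoModelForCausalLM to the bundled configuration_skywork.py and modeling_skywork.py, so loading requires trust_remote_code=True. A minimal loading sketch (the local path is illustrative, and it assumes the tokenizer files ship alongside the weights):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "model_hubs/Skywork-13B-Base-3T"  # illustrative local checkpoint path
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in config.json
    trust_remote_code=True,      # resolves the auto_map custom classes
)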
model_hubs/Skywork-13B-Base-3T/configuration_skywork.py ADDED
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+    model_type = "skywork"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'ntk'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
model_hubs/Skywork-13B-Base-3T/generation_config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "max_length": 4096,
+  "pad_token_id": 0,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.33.1"
+}
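Note: these are the checkpoint's default sampling settings; generate() applies them unless overridden per call. A usage sketch, assuming model and tokenizer loaded as in the note above:

inputs = tokenizer("The capital of France is", return_tensors="pt")
# do_sample=True, temperature=0.6, top_p=0.9 come from generation_config.json
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))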
model_hubs/Skywork-13B-Base-3T/modeling_skywork.py ADDED
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class SkyworkRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        SkyworkRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+    """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        t = t / self.scaling_factor
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+    """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base * scaling_factor
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
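Note: rotate_half plus the cached cos/sin implement rotary position embeddings; each channel pair is rotated by a position-dependent angle. A standalone sanity check, assuming only torch (rotate_half copied from above):

import torch

def rotate_half(x):
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

# For dim=2, q*cos + rotate_half(q)*sin is exactly a 2-D rotation of q.
q = torch.tensor([1.0, 0.0])
theta = torch.tensor(0.5)
cos, sin = theta.cos().repeat(2), theta.sin().repeat(2)
assert torch.allclose(q * cos + rotate_half(q) * sin,
                      torch.tensor([theta.cos(), theta.sin()]))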
+
+class SkyworkMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        if self.config.pretraining_tp > 1:
+            slice = self.intermediate_size // self.config.pretraining_tp
+            gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+            up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+            down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+            gate_proj = torch.cat(
+                [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+            )
+            up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+            down_proj = [
+                F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+            ]
+            down_proj = sum(down_proj)
+        else:
+            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+        return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
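Note: with this checkpoint's config (num_attention_heads == num_key_value_heads == 36) the group size is 1, so repeat_kv is a no-op; it only expands KV heads under grouped-query attention. A quick shape check, assuming torch and repeat_kv as defined above:

kv = torch.randn(2, 4, 16, 64)                    # (batch, kv_heads, seq, head_dim)
assert repeat_kv(kv, 1) is kv                     # n_rep == 1 returns the input as-is
assert repeat_kv(kv, 3).shape == (2, 12, 16, 64)  # each KV head repeated 3 times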
+
+class SkyworkAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: SkyworkConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = SkyworkRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "ntk":
+                self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+            print('-' * 80)
+            print(f"USING CUSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class SkyworkDecoderLayer(nn.Module):
+    def __init__(self, config: SkyworkConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = SkyworkAttention(config=config)
+        self.mlp = SkyworkMLP(config)
+        self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+    config_class = SkyworkConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["SkyworkDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, SkyworkModel):
+            module.gradient_checkpointing = value
+
+class SkyworkModel(SkyworkPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+    Args:
+        config: SkyworkConfig
+    """
+
+    def __init__(self, config: SkyworkConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+
+        hidden_states = inputs_embeds
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, past_key_value, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = SkyworkModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.pretraining_tp > 1:
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
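Note: the loss path shifts logits and labels by one position, so token t is predicted from tokens < t. The same shift in isolation, assuming only torch (toy sizes, not the real 65519-token vocab):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 8
logits = torch.randn(1, 5, vocab_size)        # (batch, seq, vocab)
labels = torch.randint(0, vocab_size, (1, 5))
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
loss = CrossEntropyLoss()(shift_logits, shift_labels)  # next-token loss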
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = SkyworkModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+                    logits.device
+                )
+            else:
+                sequence_lengths = -1
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
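Note: the classification head pools the logits at the last non-pad position: argmax over input_ids == pad_token_id finds the first pad token, and the index before it is the last real token. A standalone illustration, assuming torch and pad_token_id = 0 as in config.json:

import torch

pad_token_id = 0                              # matches "pad_token_id" in config.json
input_ids = torch.tensor([[5, 7, 9, 0, 0]])   # 3 real tokens, then padding
sequence_lengths = torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1
assert sequence_lengths.item() == 2           # index of the last non-pad token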
model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d092e2aeba4877c134bbc302ffe800ff72fabd7dfb8a1b7ca255ff4cfdb26de0
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b79b754b7a428242286a89725ad65e1860b360472394091ed531fbbc1e171fbf
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22e5b4469fdefb65bdc7caabcd749e1f103ac48fa522d5e1c1f80236c3c62a76
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1cda4a258856a9186373e182c7befecce32a72dd8fec1acb6f9b3f11897b8ee
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2275b4e33fd630282177bd629636d410cab800a31e572b5396789c70c69c92c4
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4a752ccd71220f2ec7d066efb28e28faf19bd5c0728ceee5f780433c34651e8
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b02461fc3b93e20d77c249f9c3dcb9f38f10165a3b4213c8525cd4d61bf875a
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0f814a64b2d23139871fb3b846be421426ce39d73d557e1af0c4e6ff17ff5fb
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c90240f14e0dc36e90b17ac1224e62655356e3e656e6b54b8859b89af660f92b
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6414bd7fac64792a80332f9475a49c1fb9e489daab5b45d8ba26143d6701bb2b
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce47824f844a384b1f10dc0d4c424934fce8f5796465c505b26beb9d0eba66b7
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd4e517aefff968a25e3f149cd7f80aea65620de2eed4750a965c9266d92ad63
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f318eb88ef0dc35978c4f5544aff58536e6c68519df4053611c13e81ba6828
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b621aef6b4f71d49d6d419d36fa8116c57c55b35ad1d8c4535560cc9a2321cd
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ebbc3800c17d523461d2bb784fb98cb819055c7ef0e04c302f765900f8b971a
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:401d29d06a4bb093f25459a40dac2a1950512ecb965d1f1b9102448f832f94da
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94b64bd20285e8914e88149508d24ffeec0cccad557278081306c640c0194578
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaaf84dba722b2d44db99a76322dd80ef5047071fc62e544842a843276dc39c4
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d994f002cff01abd65455b5ba4556600433a4a433d5315442989c298006cdab1
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2b96e62364f435b0143be7f6f8124d0aff25c50d208e0485aa7c6fda42d6c86
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e12e2709313bab576cc1dd112d610e70d3bba27781fa7ae0071e84c16e53cd8
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a96816b3c2a97885ca385b5cf52cdc7cfe530629fff80cc692d03ff636ec2ca
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86cb140414cc2ebffa98fc8952308d2d48bb0b210018f96f1e775a88c41d3404
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c43009affeff3e2d4931226d516bf4899c34f1111ba67718050c9f1cd09a5fb0
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91c619344db6b1d296754478881c3297f78ad456aea68c96aa8199cee79bde85
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1888e10c7b290e9f1616e9797ec117dd716bbbe285387e1171cd316286d521a6
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ea4aaa229e0498f2c99fae52e02426262a905b2b1d5e9bbf504595ff03bb312
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a745700ec8a5d94deb1d0418430ea1f3396090694a4fc9737d758451a10f030
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f98c09a79c5fbec097a72fa36b59a12b84f620e62148536cb36853429e3a7b6
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19f103a43d812050b736dd93b5c836ec706a44251737182b2248868e0ce39857
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32bc7edb191abd247ad28194aed084eb1ec7964f755251293c08d0493c264542
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df03de0ceb916770d781ddd05265039b3e70a0d55b8174dce552a355e0bfdf8c
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5838693871bee163df5bb7f3862f2ecfbe4a53b979f7ac30f3a3f3e1afb7e6a1
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48b90afa5524c7bf468ad4de5a05b99eed10f16a0333a8771a3fe7290f9c3104
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ec887104980935ed5258f6facc1b9e788b144030a50903d245fdefccd94784c
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3e201fabeb31ad7db6849a61396a40829de108c3c7e9c0ed642fb1db57c92e8
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:457df823d806af28959f9dd42d19a90cd952ae8da098e83034dbdfa886c734c6
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b596588e63748121ec2c9203b5b3d893bc57c23cc2c6a9b220afb13f45d5da67
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:520fff8c8e4219f332dde2ec6177faaa928fe81a1101c998405238640c52595b
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91db97e987a6b4073c089ca2c741bd36cbc1b312c3e22078f3e983543174e434
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3bcfda2aec610151c66733895d869bdfecb02109c74621241f15e4a215534fe
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81d28f7772ac1c8f9fe6db38a1fbd2f0bfce8e26b28f8d6007e0c6e822591a94
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:819e6e193546c58441363a55897f576a1719e86a946eb33966ab91f4fdc1ed05
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26cc5467e35420772862525c75de6212354b2d9a45a541b7aa50289eb2218199
+size 509630258
model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d6ade17f2664101a2e2c18ef016bb5cc1426e713b0f69489f66439b844c765
+size 509630258