test-00-switchllama-i3b-f10b-e4-init / modeling_switchllama.py

Update modeling_switchllama.py

479ce09 over 1 year ago

32.1 kB

	# copyright idek

	from transformers.models.llama.modeling_llama import *
	from torch import nn
	import torch
	from .configuration_switchllama import SwitchLlamaConfig


	def router_z_loss_func(router_logits: torch.Tensor) -> float:
	r"""
	Compute the router z-loss implemented in PyTorch.

	The router z-loss was introduced in [Designing Effective Sparse Expert Models](https://arxiv.org/abs/2202.08906).
	It encourages router logits to remain small in an effort to improve stability.

	Args:
	router_logits (`float`):
	Input logits of shape [batch_size, sequence_length, num_experts]

	Returns:
	Scalar router z-loss.
	"""
	num_groups, tokens_per_group, _ = router_logits.shape
	log_z = torch.logsumexp(router_logits, dim=-1)
	z_loss = log_z**2
	return torch.sum(z_loss) / (num_groups * tokens_per_group)


	def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
	r"""
	Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

	See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
	function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
	experts is too unbalanced.

	Args:
	router_probs (`torch.Tensor`):
	Probability assigned to each expert per token. Shape: [batch_size, seqeunce_length, num_experts].
	expert_indices (`torch.Tensor`):
	Indices tensor of shape [batch_size, seqeunce_length] identifying the selected expert for a given token.

	Returns:
	The auxiliary loss.
	"""
	num_experts = router_probs.shape[-1]

	# cast the expert indices to int64, otherwise one-hot encoding will fail
	if expert_indices.dtype != torch.int64:
	expert_indices = expert_indices.to(torch.int64)

	if len(expert_indices.shape) == 2:
	expert_indices = expert_indices.unsqueeze(2)

	expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts)

	# For a given token, determine if it was routed to a given expert.
	expert_mask = torch.max(expert_mask, axis=-2).values

	# cast to float32 otherwise mean will fail
	expert_mask = expert_mask.to(torch.float32)
	tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)

	router_prob_per_group_and_expert = torch.mean(router_probs, axis=-2)
	return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2)


	# Copied from transformers.models.bart.modeling_bart._make_causal_mask
	def _make_causal_mask(
	input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
	):
	"""
	Make causal mask used for bi-directional self-attention.
	"""
	bsz, tgt_len = input_ids_shape
	mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
	mask_cond = torch.arange(mask.size(-1), device=device)
	mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
	mask = mask.to(dtype)

	if past_key_values_length > 0:
	mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
	return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)

	def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
	"""
	Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
	"""
	bsz, src_len = mask.size()
	tgt_len = tgt_len if tgt_len is not None else src_len

	expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

	inverted_mask = 1.0 - expanded_mask

	return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)



	class SwitchLlamaTop1Router(nn.Module):
	"""
	Router using tokens choose top-1 experts assignment.

	This router uses the same mechanism as in Switch Transformer (https://arxiv.org/abs/2101.03961) and V-MoE
	(https://arxiv.org/abs/2106.05974): tokens choose their top experts. Items are sorted by router_probs and then
	routed to their choice of expert until the expert's expert_capacity is reached. **There is no guarantee that each
	token is processed by an expert**, or that each expert receives at least one token.

	"""

	def __init__(self, config: SwitchLlamaConfig):
	super().__init__()
	self.num_experts = config.num_experts
	self.expert_capacity = config.expert_capacity
	self.classifier = nn.Linear(config.hidden_size, self.num_experts, bias=config.router_bias)
	self.jitter_noise = config.router_jitter_noise
	self.ignore_padding_tokens = config.router_ignore_padding_tokens

	def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
	r"""
	Computes router probabilities from input hidden states.

	Args:
	hidden_states (`torch.Tensor`):
	(batch_size, sequence_length, hidden_dim) from which router probabilities are computed.
	Returns:
	router_probabilities (`torch.Tensor`):
	Tensor of shape (batch_size, sequence_length, num_experts) corresponding to the probabilities for each
	token and expert. Used for routing tokens to experts.
	router_logits (`torch.Tensor`):
	Logits tensor of shape (batch_size, sequence_length, num_experts) corresponding to raw router logits.
	This is used later for computing router z-loss.
	"""
	if self.jitter_noise > 0:
	# Get the lower and upper bound of the uniform distribution
	# Adapted from: https://stackoverflow.com/questions/44328530/how-to-get-a-uniform-distribution-in-a-range-r1-r2-in-pytorch
	distrib_lower_bound = 1.0 - self.jitter_noise
	distrib_upper_bound = 1.0 + self.jitter_noise

	uniform_distrib = torch.rand(hidden_states.shape, device=hidden_states.device, dtype=hidden_states.dtype)
	uniform_distrib = uniform_distrib * (distrib_lower_bound - distrib_upper_bound)

	uniform_distrib = uniform_distrib + distrib_upper_bound
	# Multiply the token inputs by the uniform distribution - adding some noise
	hidden_states *= uniform_distrib

	# Shape: [num_groups, tokens_per_group, num_experts]
	router_logits = self.classifier(hidden_states)

	# Apply Softmax
	router_probabilities = nn.functional.softmax(router_logits, dim=-1)
	return router_probabilities, router_logits

	def forward(self, hidden_states: torch.Tensor) -> Tuple:
	r"""
	Generic forward function for every Router class. Each Router expects to have the same input hidden states
	(`hidden_states`) corresponding to the hidden states for each token, the `expert_capacity` corresponding to the
	number of tokens the Router will send to each expert, some Routers can send up to few tokens to each expert.

	Each Router works as the following: it expects the hidden states for each token, gets the `router_probs` and
	`router_logits` from the `router_weights`. This will assign for each token, the raw probability to be assigned
	to an expert. Then each Router class will have to define its own `_compute_routing_instructions`.

	Args:
	hidden_states (`torch.Tensor`) :
	[num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
	Returns:
	Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`] Tuple containing the expert index, the router probs
	and the router logits. The router probabilities and logits are required to compute the loss.
	"""
	router_probs, router_logits = self._compute_router_probabilities(hidden_states)

	expert_index = torch.argmax(router_probs, dim=-1)
	expert_index = torch.nn.functional.one_hot(expert_index, num_classes=self.num_experts)

	# Mask tokens outside expert capacity. Sum over each sequence
	token_priority = torch.cumsum(expert_index, dim=-2)
	# mask if the token routed to to the expert will overflow
	expert_capacity_mask = token_priority <= self.expert_capacity
	expert_index = expert_index * expert_capacity_mask

	router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1)
	return expert_index, router_probs, router_logits

	class SwitchLlamaSparseMLP(nn.Module):
	r"""
	Implementation of the Switch Transformers Sparse MLP module.
	"""

	def __init__(self, config: SwitchLlamaConfig, expert_class: nn.Module = LlamaMLP):
	super().__init__()
	# Step 1: Get the correct router according to its class
	self.router = SwitchLlamaTop1Router(config)

	# Step 2: Get the experts
	self.experts = nn.ModuleDict()
	for idx in range(config.num_experts):
	self.experts[f"expert_{idx}"] = expert_class(config)

	def forward(self, hidden_states):
	r"""
	Hold on, this will be slightly tricky to understand In the correct order, a MoE layer does the following:

	1- Gets the `router_mask` from the router. The shape of the mask is `(batch_size, sequence_length, num_expert)`
	and corresponds to the argmax of the `router_probs`. The probabilities are needed in the computation of the
	hidden states : they are broadcasted to the hidden states values (can be interpreted as a scaling factor).

	2- Dispatch the tokens to its associated experts. We do a classic for loop over the experts and assign for each
	expert the corresponding hidden states.

	"""
	# Step 1: Get the router_mask from the router as wel as the probabilities
	router_mask, router_probs, router_logits = self.router(hidden_states)
	expert_index = torch.argmax(router_mask, dim=-1)

	# The routers introduced might not always map all the tokens, to a router, which means that some hidden states
	# can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the seleced ones.

	next_states = hidden_states.clone()
	for idx, expert in enumerate(self.experts.values()):
	token_indices = router_mask[:, :, idx].bool()
	next_states[token_indices] = expert(hidden_states[token_indices])

	hidden_states = router_probs * next_states
	return hidden_states, (router_logits, expert_index)

	class SwitchLlamaLayerFF(nn.Module):
	r"""
	Switch Transformers Feed Forward layer module. This is a wrapper around the Mixture of Experts module.

	Parameters:
	config : ([`SwitchTransformersConfig`]): Model configuration class with all the parameters of the model.
	Initializing with a config file does not load the weights associated with the model, only the
	configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
	is_sparse (`bool`):
	Whether the MLP layer is a `Sparse` layer (contains a Mixture of Experts) or not
	"""

	def __init__(self, config: SwitchLlamaConfig, is_sparse=True):
	super().__init__()
	self.is_sparse = is_sparse

	# Check if it is a sparse layer, if not then it is a dense layer
	if not self.is_sparse:
	self.mlp = LlamaMLP(config)
	else:
	self.mlp = SwitchLlamaSparseMLP(config)

	# self.layer_norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	self.dropout = nn.Dropout(config.dropout_rate)

	def forward(self, hidden_states, output_router_logits=False):
	# forwarded_states = self.layer_norm(hidden_states)
	forwarded_states = self.mlp(hidden_states)

	if isinstance(forwarded_states, tuple):
	forwarded_states, router_tuple = forwarded_states
	else:
	router_tuple = None

	output = hidden_states + self.dropout(forwarded_states)

	if output_router_logits and router_tuple is not None:
	output = (output, router_tuple)

	return output

	class SwitchLlamaDecoderLayer(nn.Module):
	def __init__(self, config: SwitchLlamaConfig):
	super().__init__()
	self.hidden_size = config.hidden_size
	self.self_attn = LlamaAttention(config=config)
	self.mlp = SwitchLlamaLayerFF(config)
	self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	def forward(
	self,
	hidden_states: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_value: Optional[Tuple[torch.Tensor]] = None,
	output_attentions: Optional[bool] = False,
	use_cache: Optional[bool] = False,
	output_router_logits = True
	) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
	"""
	Args:
	hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
	attention_mask (`torch.FloatTensor`, optional): attention mask of size
	`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
	output_attentions (`bool`, optional):
	Whether or not to return the attentions tensors of all attention layers. See `attentions` under
	returned tensors for more detail.
	use_cache (`bool`, optional):
	If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
	(see `past_key_values`).
	past_key_value (`Tuple(torch.FloatTensor)`, optional): cached past key and value projection states
	"""

	residual = hidden_states

	hidden_states = self.input_layernorm(hidden_states)

	# Self Attention
	hidden_states, self_attn_weights, present_key_value = self.self_attn(
	hidden_states=hidden_states,
	attention_mask=attention_mask,
	position_ids=position_ids,
	past_key_value=past_key_value,
	output_attentions=output_attentions,
	use_cache=use_cache,
	)
	hidden_states = residual + hidden_states

	# Fully Connected
	residual = hidden_states
	hidden_states = self.post_attention_layernorm(hidden_states)
	hidden_states = self.mlp(hidden_states, output_router_logits=output_router_logits)
	if type(hidden_states)==tuple:
	hidden_states, router_tuple = hidden_states
	else:
	router_tuple = (torch.tensor([0], device=hidden_states.device),)
	hidden_states = residual + hidden_states

	outputs = (hidden_states,)

	if output_attentions:
	outputs += (self_attn_weights,)

	if use_cache:
	outputs += (present_key_value,)

	# if output_router_logits:
	# outputs += (router_tuple,)
	return outputs + (router_tuple,)

	class SwitchLlamaPreTrainedModel(PreTrainedModel):
	config_class = SwitchLlamaConfig
	base_model_prefix = "model"
	supports_gradient_checkpointing = True
	_no_split_modules = ["SwitchLlamaDecoderLayer"]
	_skip_keys_device_placement = "past_key_values"

	def _init_weights(self, module):
	std = self.config.initializer_range
	if isinstance(module, nn.Linear):
	module.weight.data.normal_(mean=0.0, std=std)
	if module.bias is not None:
	module.bias.data.zero_()
	elif isinstance(module, nn.Embedding):
	module.weight.data.normal_(mean=0.0, std=std)
	if module.padding_idx is not None:
	module.weight.data[module.padding_idx].zero_()

	def _set_gradient_checkpointing(self, module, value=False):
	if isinstance(module, LlamaModel):
	module.gradient_checkpointing = value


	class SwitchLlamaModel(SwitchLlamaPreTrainedModel):
	"""
	Transformer decoder consisting of config.num_hidden_layers layers. Each layer is a [`LlamaDecoderLayer`]

	Args:
	config: SwitchLlamaConfig
	"""

	def __init__(self, config: SwitchLlamaConfig):
	super().__init__(config)
	self.padding_idx = config.pad_token_id
	self.vocab_size = config.vocab_size

	self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
	self.layers = nn.ModuleList([SwitchLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
	self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	self.gradient_checkpointing = False
	# Initialize weights and apply final processing
	self.post_init()

	def get_input_embeddings(self):
	return self.embed_tokens

	def set_input_embeddings(self, value):
	self.embed_tokens = value

	# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
	def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
	# create causal mask
	# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
	combined_attention_mask = None
	if input_shape[-1] > 1:
	combined_attention_mask = _make_causal_mask(
	input_shape,
	inputs_embeds.dtype,
	device=inputs_embeds.device,
	past_key_values_length=past_key_values_length,
	)

	if attention_mask is not None:
	# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
	expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
	inputs_embeds.device
	)
	combined_attention_mask = (
	expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
	)

	return combined_attention_mask

	@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
	def forward(
	self,
	input_ids: torch.LongTensor = None,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_values: Optional[List[torch.FloatTensor]] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	output_router_logits = False
	) -> Union[Tuple, BaseModelOutputWithPast]:
	output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	output_hidden_states = (
	output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	)
	use_cache = use_cache if use_cache is not None else self.config.use_cache

	return_dict = return_dict if return_dict is not None else self.config.use_return_dict
	all_router_probs = () if output_router_logits else None
	# retrieve input_ids and inputs_embeds
	if input_ids is not None and inputs_embeds is not None:
	raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
	elif input_ids is not None:
	batch_size, seq_length = input_ids.shape
	elif inputs_embeds is not None:
	batch_size, seq_length, _ = inputs_embeds.shape
	else:
	raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

	seq_length_with_past = seq_length
	past_key_values_length = 0

	if past_key_values is not None:
	past_key_values_length = past_key_values[0][0].shape[2]
	seq_length_with_past = seq_length_with_past + past_key_values_length

	if position_ids is None:
	device = input_ids.device if input_ids is not None else inputs_embeds.device
	position_ids = torch.arange(
	past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
	)
	position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
	else:
	position_ids = position_ids.view(-1, seq_length).long()

	if inputs_embeds is None:
	inputs_embeds = self.embed_tokens(input_ids)
	# embed positions
	if attention_mask is None:
	attention_mask = torch.ones(
	(batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
	)
	attention_mask = self._prepare_decoder_attention_mask(
	attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
	)

	hidden_states = inputs_embeds

	if self.gradient_checkpointing and self.training:
	if use_cache:
	logger.warning_once(
	"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
	)
	use_cache = False

	# decoder layers
	all_hidden_states = () if output_hidden_states else None
	all_self_attns = () if output_attentions else None
	next_decoder_cache = () if use_cache else None

	for idx, decoder_layer in enumerate(self.layers):
	if output_hidden_states:
	all_hidden_states += (hidden_states,)

	past_key_value = past_key_values[idx] if past_key_values is not None else None

	if self.gradient_checkpointing and self.training:

	def create_custom_forward(module):
	def custom_forward(*inputs):
	# None for past_key_value
	return module(*inputs, past_key_value, output_attentions)

	return custom_forward

	layer_outputs = torch.utils.checkpoint.checkpoint(
	create_custom_forward(decoder_layer),
	hidden_states,
	attention_mask,
	position_ids,
	)
	else:
	layer_outputs = decoder_layer(
	hidden_states,
	attention_mask=attention_mask,
	position_ids=position_ids,
	past_key_value=past_key_value,
	output_attentions=output_attentions,
	use_cache=use_cache,
	output_router_logits=output_router_logits
	)

	hidden_states = layer_outputs[0]
	router_probs = layer_outputs[-1]

	if use_cache:
	next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

	if output_attentions:
	all_self_attns += (layer_outputs[1],)

	if output_router_logits:
	all_router_probs = all_router_probs + (router_probs,)
	hidden_states = self.norm(hidden_states)

	# add hidden states from the last decoder layer
	if output_hidden_states:
	all_hidden_states += (hidden_states,)

	next_cache = next_decoder_cache if use_cache else None
	if not return_dict:
	return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

	from transformers.models.switch_transformers.modeling_switch_transformers import MoEModelOutputWithPastAndCrossAttentions
	return MoEModelOutputWithPastAndCrossAttentions(
	last_hidden_state=hidden_states,
	past_key_values=next_cache,
	hidden_states=all_hidden_states,
	attentions=all_self_attns,
	router_probs=all_router_probs,
	)

	class SwitchLlamaForCausalLM(SwitchLlamaPreTrainedModel):
	_tied_weights_keys = ["lm_head.weight"]

	def __init__(self, config):
	super().__init__(config)
	self.model = SwitchLlamaModel(config)
	self.vocab_size = config.vocab_size
	self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

	self.router_z_loss_coef = config.router_z_loss_coef
	self.router_aux_loss_coef = config.router_aux_loss_coef
	# Initialize weights and apply final processing
	self.post_init()
	def _unpack_router_logits(self, router_outputs):
	total_router_logits = []
	total_expert_indexes = []
	for router_output in router_outputs:
	if len(router_output[0].shape) > 1:
	router_logits, expert_indexes = router_output
	total_router_logits.append(router_logits)
	total_expert_indexes.append(expert_indexes)
	return torch.cat(total_router_logits, dim=1), torch.cat(total_expert_indexes, dim=1)


	def get_input_embeddings(self):
	return self.model.embed_tokens

	def set_input_embeddings(self, value):
	self.model.embed_tokens = value

	def get_output_embeddings(self):
	return self.lm_head

	def set_output_embeddings(self, new_embeddings):
	self.lm_head = new_embeddings

	def set_decoder(self, decoder):
	self.model = decoder

	def get_decoder(self):
	return self.model

	def forward(
	self,
	input_ids: torch.LongTensor = None,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_values: Optional[List[torch.FloatTensor]] = None,
	inputs_embeds: Optional[torch.FloatTensor] = None,
	labels: Optional[torch.LongTensor] = None,
	use_cache: Optional[bool] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = None,
	return_dict: Optional[bool] = None,
	output_router_logits = False,
	) -> Union[Tuple, CausalLMOutputWithPast]:
	r"""
	Args:
	labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
	config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
	(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

	Returns:

	Example:

	```python
	>>> from transformers import AutoTokenizer, LlamaForCausalLM

	>>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
	>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

	>>> prompt = "Hey, are you conscious? Can you talk to me?"
	>>> inputs = tokenizer(prompt, return_tensors="pt")

	>>> # Generate
	>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
	>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
	"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
	```"""

	output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	output_hidden_states = (
	output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	)
	return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
	outputs = self.model(
	input_ids=input_ids,
	attention_mask=attention_mask,
	position_ids=position_ids,
	past_key_values=past_key_values,
	inputs_embeds=inputs_embeds,
	use_cache=use_cache,
	output_attentions=output_attentions,
	output_hidden_states=output_hidden_states,
	return_dict=return_dict,
	output_router_logits=output_router_logits
	)

	hidden_states = outputs[0]
	if self.config.pretraining_tp > 1:
	lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
	logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
	logits = torch.cat(logits, dim=-1)
	else:
	logits = self.lm_head(hidden_states)
	logits = logits.float()

	loss = None
	decoder_z_loss = None
	decoder_aux_loss = None

	if output_router_logits:
	decoder_router_logits, decoder_expert_indexes = self._unpack_router_logits(outputs[-1])
	decoder_z_loss = router_z_loss_func(decoder_router_logits)
	decoder_router_probs = nn.Softmax(dim=-1)(decoder_router_logits)
	decoder_aux_loss = load_balancing_loss_func(decoder_router_probs, decoder_expert_indexes)

	if labels is not None:
	# Shift so that tokens < n predict n
	shift_logits = logits[..., :-1, :].contiguous()
	shift_labels = labels[..., 1:].contiguous()
	# Flatten the tokens
	loss_fct = CrossEntropyLoss()
	shift_logits = shift_logits.view(-1, self.config.vocab_size)
	shift_labels = shift_labels.view(-1)
	# Enable model parallelism
	shift_labels = shift_labels.to(shift_logits.device)
	loss = loss_fct(shift_logits, shift_labels)

	##########################
	if output_router_logits:
	z_loss = self.router_z_loss_coef * decoder_z_loss
	aux_loss = self.router_aux_loss_coef * decoder_aux_loss
	loss = loss + z_loss + aux_loss
	#########################
	if not return_dict:
	output = (logits,) + outputs[1:]
	return (loss,) + output if loss is not None else output

	return CausalLMOutputWithPast(
	loss=loss,
	logits=logits,
	past_key_values=outputs.past_key_values,
	hidden_states=outputs.hidden_states,
	attentions=outputs.attentions,
	)

	def prepare_inputs_for_generation(
	self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
	):
	if past_key_values:
	input_ids = input_ids[:, -1:]

	position_ids = kwargs.get("position_ids", None)
	if attention_mask is not None and position_ids is None:
	# create position_ids on the fly for batch generation
	position_ids = attention_mask.long().cumsum(-1) - 1
	position_ids.masked_fill_(attention_mask == 0, 1)
	if past_key_values:
	position_ids = position_ids[:, -1].unsqueeze(-1)

	# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
	if inputs_embeds is not None and past_key_values is None:
	model_inputs = {"inputs_embeds": inputs_embeds}
	else:
	model_inputs = {"input_ids": input_ids}

	model_inputs.update(
	{
	"position_ids": position_ids,
	"past_key_values": past_key_values,
	"use_cache": kwargs.get("use_cache"),
	"attention_mask": attention_mask,
	}
	)
	return model_inputs

	@staticmethod
	def _reorder_cache(past_key_values, beam_idx):
	reordered_past = ()
	for layer_past in past_key_values:
	reordered_past += (
	tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
	)
	return reordered_past