import timm
import os
from typing import Any
from pytorch_lightning.utilities.types import LRSchedulerTypeUnion
import torch as t
from torch import nn
import numpy as np
import transformers
import pytorch_lightning as plight
import torchmetrics
import einops as eo
from loss_functions import coral_loss, corn_loss, corn_label_from_logits, macro_soft_f1

t.set_float32_matmul_precision("medium")
global_settings = dict(try_using_torch_compile=False)


class EnsembleModel(plight.LightningModule):
    def __init__(self, models_without_norm_df, models_with_norm_df, learning_rate=0.0002, use_simple_average=False):
        super().__init__()
        self.models_without_norm = nn.ModuleList(list(models_without_norm_df))
        self.models_with_norm = nn.ModuleList(list(models_with_norm_df))
        self.learning_rate = learning_rate
        self.use_simple_average = use_simple_average

        if not self.use_simple_average:
            self.combiner = nn.Linear(
                self.models_with_norm[0].num_classes * (len(self.models_with_norm) + len(self.models_without_norm)),
                self.models_with_norm[0].num_classes,
            )

    def forward(self, x):
        x_unnormed, x_normed = x
        if not self.use_simple_average:
            out_unnormed = t.cat([model.model_step(x_unnormed, 0)[0] for model in self.models_without_norm], dim=-1)
            out_normed = t.cat([model.model_step(x_normed, 0)[0] for model in self.models_with_norm], dim=-1)
            out_avg = self.combiner(t.cat((out_unnormed, out_normed), dim=-1))
        else:
            out_unnormed = [model.model_step(x_unnormed, 0)[0] for model in self.models_without_norm]
            out_normed = [model.model_step(x_normed, 0)[0] for model in self.models_with_norm]

            out_avg = (t.stack(out_unnormed + out_normed, dim=-1) / 2).mean(-1)
        return {"out_avg": out_avg, "out_unnormed": out_unnormed, "out_normed": out_normed}, x_unnormed[-1]

    def training_step(self, batch, batch_idx):
        out, y = self(batch)
        loss = self.models_with_norm[0]._get_loss(out["out_avg"], y, batch[0])
        self.log("train_loss", loss, on_epoch=True, on_step=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        out, y = self(batch)
        preds, y_onecold, ignore_index_val = self.models_with_norm[0]._get_preds_reals(out["out_avg"], y)
        acc = torchmetrics.functional.accuracy(
            preds,
            y_onecold.to(t.long),
            ignore_index=ignore_index_val,
            num_classes=self.models_with_norm[0].num_classes,
            task="multiclass",
        )
        self.log("acc", acc * 100, prog_bar=True, sync_dist=True)
        loss = self.models_with_norm[0]._get_loss(out["out_avg"], y, batch[0])
        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
        return loss

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        out, y = self(batch)
        preds, y_onecold, ignore_index_val = self.models_with_norm[0]._get_preds_reals(out["out_avg"], y)
        return preds, out, y_onecold

    def configure_optimizers(self):
        return t.optim.Adam(self.parameters(), lr=self.learning_rate)


class TimmHeadReplace(nn.Module):
    def __init__(self, pooling=None, in_channels=512, pooling_output_dimension=1, all_identity=False) -> None:
        super().__init__()

        if all_identity:
            self.head = nn.Identity()
            self.pooling = None
        else:
            self.pooling = pooling
            if pooling is not None:
                self.pooling_output_dimension = pooling_output_dimension
                if self.pooling == "AdaptiveAvgPool2d":
                    self.pooling_layer = nn.AdaptiveAvgPool2d(pooling_output_dimension)
                elif self.pooling == "AdaptiveMaxPool2d":
                    self.pooling_layer = nn.AdaptiveMaxPool2d(pooling_output_dimension)
            self.head = nn.Flatten()

    def forward(self, x, pre_logits=False):
        if self.pooling is not None:
            if self.pooling == "stack_avg_max_attn":
                x = t.cat([layer(x) for layer in self.pooling_layer], dim=-1)
            else:
                x = self.pooling_layer(x)
        return self.head(x)


class CVModel(nn.Module):
    def __init__(
        self,
        modelname,
        in_shape,
        num_classes,
        loss_func,
        last_activation: str,
        input_padding_val=10,
        char_dims=2,
        max_seq_length=1000,
    ) -> None:
        super().__init__()
        self.modelname = modelname
        self.loss_func = loss_func
        self.in_shape = in_shape
        self.char_dims = char_dims
        self.x_shape = in_shape
        self.last_activation = last_activation
        self.max_seq_length = max_seq_length
        self.num_classes = num_classes
        if self.loss_func == "OrdinalRegLoss":
            self.out_shape = 1
        else:
            self.out_shape = num_classes

        self.cv_model = timm.create_model(modelname, pretrained=True, num_classes=0)
        self.cv_model.classifier = nn.Identity()
        with t.inference_mode():
            test_out = self.cv_model(t.ones(self.in_shape, dtype=t.float32))
        self.cv_model_out_dim = test_out.shape[1]
        self.cv_model.classifier = nn.Sequential(nn.Flatten(), nn.Linear(self.cv_model_out_dim, self.max_seq_length))
        if self.out_shape == 1:
            self.logit_norm = nn.Identity()
            self.out_project = nn.Identity()
        else:
            self.logit_norm = nn.LayerNorm(self.max_seq_length)
            self.out_project = nn.Linear(1, self.out_shape)

        if last_activation == "Softmax":
            self.final_activation = nn.Softmax(dim=-1)
        elif last_activation == "Sigmoid":
            self.final_activation = nn.Sigmoid()
        elif last_activation == "LogSigmoid":
            self.final_activation = nn.LogSigmoid()
        elif last_activation == "Identity":
            self.final_activation = nn.Identity()
        else:
            raise NotImplementedError(f"{last_activation} not implemented")

    def forward(self, x):
        if isinstance(x, list):
            x = x[0]
        x = self.cv_model(x)
        x = self.cv_model.classifier(x).unsqueeze(-1)
        x = self.out_project(x)
        return self.final_activation(x)


class LitModel(plight.LightningModule):
    def __init__(
        self,
        in_shape: tuple,
        hidden_dim: int,
        num_attention_heads: int,
        num_layers: int,
        loss_func: str,
        learning_rate: float,
        weight_decay: float,
        cfg: dict,
        use_lr_warmup: bool,
        use_reduce_on_plateau: bool,
        track_gradient_histogram=False,
        register_forw_hook=False,
        char_dims=2,
    ) -> None:
        super().__init__()
        if "only_use_2nd_input_stream" not in cfg:
            cfg["only_use_2nd_input_stream"] = False

        if "gamma_step_size" not in cfg:
            cfg["gamma_step_size"] = 5
        if "gamma_step_factor" not in cfg:
            cfg["gamma_step_factor"] = 0.5
        self.save_hyperparameters(
            dict(
                in_shape=in_shape,
                hidden_dim=hidden_dim,
                num_attention_heads=num_attention_heads,
                num_layers=num_layers,
                loss_func=loss_func,
                learning_rate=learning_rate,
                cfg=cfg,
                x_shape=in_shape,
                num_classes=cfg["num_classes"],
                use_lr_warmup=use_lr_warmup,
                num_warmup_steps=cfg["num_warmup_steps"],
                use_reduce_on_plateau=use_reduce_on_plateau,
                weight_decay=weight_decay,
                track_gradient_histogram=track_gradient_histogram,
                register_forw_hook=register_forw_hook,
                char_dims=char_dims,
                remove_timm_classifier_head_pooling=cfg["remove_timm_classifier_head_pooling"],
                change_pooling_for_timm_head_to=cfg["change_pooling_for_timm_head_to"],
                chars_conv_pooling_out_dim=cfg["chars_conv_pooling_out_dim"],
            )
        )
        self.model_to_use = cfg["model_to_use"]
        self.num_classes = cfg["num_classes"]
        self.x_shape = in_shape
        self.in_shape = in_shape
        self.hidden_dim = hidden_dim
        self.num_attention_heads = num_attention_heads
        self.num_layers = num_layers

        self.use_lr_warmup = use_lr_warmup
        self.num_warmup_steps = cfg["num_warmup_steps"]
        self.warmup_exponent = cfg["warmup_exponent"]

        self.use_reduce_on_plateau = use_reduce_on_plateau
        self.loss_func = loss_func
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.using_one_hot_targets = cfg["one_hot_y"]
        self.track_gradient_histogram = track_gradient_histogram
        self.register_forw_hook = register_forw_hook
        if self.loss_func == "OrdinalRegLoss":
            self.ord_reg_loss_max = cfg["ord_reg_loss_max"]
            self.ord_reg_loss_min = cfg["ord_reg_loss_min"]

        self.num_lin_layers = cfg["num_lin_layers"]
        self.linear_activation = cfg["linear_activation"]
        self.last_activation = cfg["last_activation"]

        self.max_seq_length = cfg["manual_max_sequence_for_model"]

        self.use_char_embed_info = cfg["use_embedded_char_pos_info"]

        self.method_chars_into_model = cfg["method_chars_into_model"]
        self.source_for_pretrained_cv_model = cfg["source_for_pretrained_cv_model"]
        self.method_to_include_char_positions = cfg["method_to_include_char_positions"]

        self.char_dims = char_dims
        self.char_sequence_length = cfg["max_len_chars_list"] if self.use_char_embed_info else 0

        self.chars_conv_lr_reduction_factor = cfg["chars_conv_lr_reduction_factor"]
        if self.use_char_embed_info:
            self.chars_bert_reduction_factor = cfg["chars_bert_reduction_factor"]

        self.use_in_projection_bias = cfg["use_in_projection_bias"]
        self.add_layer_norm_to_in_projection = cfg["add_layer_norm_to_in_projection"]

        self.hidden_dropout_prob = cfg["hidden_dropout_prob"]
        self.layer_norm_after_in_projection = cfg["layer_norm_after_in_projection"]
        self.method_chars_into_model = cfg["method_chars_into_model"]
        self.input_padding_val = cfg["input_padding_val"]
        self.cv_char_modelname = cfg["cv_char_modelname"]
        self.char_plot_shape = cfg["char_plot_shape"]

        self.remove_timm_classifier_head_pooling = cfg["remove_timm_classifier_head_pooling"]
        self.change_pooling_for_timm_head_to = cfg["change_pooling_for_timm_head_to"]
        self.chars_conv_pooling_out_dim = cfg["chars_conv_pooling_out_dim"]

        self.add_layer_norm_to_char_mlp = cfg["add_layer_norm_to_char_mlp"]
        if "profile_torch_run" in cfg:
            self.profile_torch_run = cfg["profile_torch_run"]
        else:
            self.profile_torch_run = False
        if self.loss_func == "OrdinalRegLoss":
            self.out_shape = 1
        else:
            self.out_shape = cfg["num_classes"]

        if not self.hparams.cfg["only_use_2nd_input_stream"]:
            if (
                self.method_chars_into_model == "dense"
                and self.use_char_embed_info
                and self.method_to_include_char_positions == "concat"
            ):
                self.project = nn.Linear(self.x_shape[-1], self.hidden_dim // 2, bias=self.use_in_projection_bias)
            elif (
                self.method_chars_into_model == "bert"
                and self.use_char_embed_info
                and self.method_to_include_char_positions == "concat"
            ):
                self.hidden_dim_chars = self.hidden_dim // 2
                self.project = nn.Linear(self.x_shape[-1], self.hidden_dim_chars, bias=self.use_in_projection_bias)
            elif (
                self.method_chars_into_model == "resnet"
                and self.method_to_include_char_positions == "concat"
                and self.use_char_embed_info
            ):
                self.project = nn.Linear(self.x_shape[-1], self.hidden_dim // 2, bias=self.use_in_projection_bias)
            elif self.model_to_use == "cv_only_model":
                self.project = nn.Identity()
            else:
                self.project = nn.Linear(self.x_shape[-1], self.hidden_dim, bias=self.use_in_projection_bias)
            if self.add_layer_norm_to_in_projection:
                self.project = nn.Sequential(
                    nn.Linear(self.project.in_features, self.project.out_features, bias=self.use_in_projection_bias),
                    nn.LayerNorm(self.project.out_features),
                )

        if hasattr(self, "project") and "posix" in os.name and global_settings["try_using_torch_compile"]:
            self.project = t.compile(self.project)

        if self.use_char_embed_info:
            self._create_char_model()

        if self.layer_norm_after_in_projection:
            if self.hparams.cfg["only_use_2nd_input_stream"]:
                self.layer_norm_in = nn.LayerNorm(self.hidden_dim // 2)
            else:
                self.layer_norm_in = nn.LayerNorm(self.hidden_dim)

            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.layer_norm_in = t.compile(self.layer_norm_in)

        self._create_main_seq_model(cfg)

        if register_forw_hook:
            self.register_hooks()
        if self.hparams.cfg["only_use_2nd_input_stream"]:
            linear_in_dim = self.hidden_dim // 2
        else:
            linear_in_dim = self.hidden_dim

        if self.num_lin_layers == 1:
            self.linear = nn.Linear(linear_in_dim, self.out_shape)
        else:
            lin_layers = []
            for _ in range(self.num_lin_layers - 1):
                lin_layers.extend(
                    [
                        nn.Linear(linear_in_dim, linear_in_dim),
                        getattr(nn, self.linear_activation)(),
                    ]
                )
            self.linear = nn.Sequential(*lin_layers, nn.Linear(linear_in_dim, self.out_shape))

        if "posix" in os.name and global_settings["try_using_torch_compile"]:
            self.linear = t.compile(self.linear)

        if self.last_activation == "Softmax":
            self.final_activation = nn.Softmax(dim=-1)
        elif self.last_activation == "Sigmoid":
            self.final_activation = nn.Sigmoid()
        elif self.last_activation == "Identity":
            self.final_activation = nn.Identity()
        else:
            raise NotImplementedError(f"{self.last_activation} not implemented")

        if self.profile_torch_run:
            self.profilerr = t.profiler.profile(
                schedule=t.profiler.schedule(wait=1, warmup=10, active=10, repeat=1),
                on_trace_ready=t.profiler.tensorboard_trace_handler("tblogs"),
                with_stack=True,
                record_shapes=True,
                profile_memory=False,
            )

    def _create_main_seq_model(self, cfg):
        if self.hparams.cfg["only_use_2nd_input_stream"]:
            hidden_dim = self.hidden_dim // 2
        else:
            hidden_dim = self.hidden_dim
        if self.model_to_use == "BERT":
            self.bert_config = transformers.BertConfig(
                vocab_size=self.x_shape[-1],
                hidden_size=hidden_dim,
                num_hidden_layers=self.num_layers,
                intermediate_size=hidden_dim,
                num_attention_heads=self.num_attention_heads,
                max_position_embeddings=self.max_seq_length,
            )
            self.bert_model = transformers.BertModel(self.bert_config)
        elif self.model_to_use == "cv_only_model":
            self.bert_model = CVModel(
                modelname=cfg["cv_modelname"],
                in_shape=self.in_shape,
                num_classes=cfg["num_classes"],
                loss_func=cfg["loss_function"],
                last_activation=cfg["last_activation"],
                input_padding_val=cfg["input_padding_val"],
                char_dims=self.char_dims,
                max_seq_length=cfg["manual_max_sequence_for_model"],
            )
        else:
            raise NotImplementedError(f"{self.model_to_use} not implemented")
        if "posix" in os.name and global_settings["try_using_torch_compile"]:
            self.bert_model = t.compile(self.bert_model)
        return 0

    def _create_char_model(self):
        if self.method_chars_into_model == "dense":
            self.chars_project_0 = nn.Linear(self.char_dims, 1, bias=self.use_in_projection_bias)
            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_project_0 = t.compile(self.chars_project_0)
            if self.method_to_include_char_positions == "concat":
                self.chars_project_1 = nn.Linear(
                    self.char_sequence_length, self.hidden_dim // 2, bias=self.use_in_projection_bias
                )
            else:
                self.chars_project_1 = nn.Linear(
                    self.char_sequence_length, self.hidden_dim, bias=self.use_in_projection_bias
                )

            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_project_1 = t.compile(self.chars_project_1)
        elif not self.method_chars_into_model == "resnet":
            self.chars_project = nn.Linear(self.char_dims, self.hidden_dim_chars, bias=self.use_in_projection_bias)
            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_project = t.compile(self.chars_project)

        if self.method_chars_into_model == "bert":
            if not hasattr(self, "hidden_dim_chars"):
                if self.hidden_dim // self.chars_bert_reduction_factor > 1:
                    self.hidden_dim_chars = self.hidden_dim // self.chars_bert_reduction_factor
                else:
                    self.hidden_dim_chars = self.hidden_dim
            self.num_attention_heads_chars = self.hidden_dim_chars // (self.hidden_dim // self.num_attention_heads)
            self.chars_bert_config = transformers.BertConfig(
                vocab_size=self.x_shape[-1],
                hidden_size=self.hidden_dim_chars,
                num_hidden_layers=self.num_layers,
                intermediate_size=self.hidden_dim_chars,
                num_attention_heads=self.num_attention_heads_chars,
                max_position_embeddings=self.char_sequence_length + 1,
                num_labels=1,
            )
            self.chars_bert = transformers.BertForSequenceClassification(self.chars_bert_config)

            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_bert = t.compile(self.chars_bert)
            self.chars_project_class_output = nn.Linear(1, self.hidden_dim_chars, bias=self.use_in_projection_bias)
            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_project_class_output = t.compile(self.chars_project_class_output)
        elif self.method_chars_into_model == "resnet":
            if self.source_for_pretrained_cv_model == "timm":
                self.chars_conv = timm.create_model(
                    self.cv_char_modelname,
                    pretrained=True,
                    num_classes=0,  # remove classifier nn.Linear
                )
                if self.remove_timm_classifier_head_pooling:
                    self.chars_conv.head = TimmHeadReplace(all_identity=True)
                    with t.inference_mode():
                        test_out = self.chars_conv(
                            t.ones((1, 3, self.char_plot_shape[0], self.char_plot_shape[1]), dtype=t.float32)
                        )
                    if test_out.ndim > 3:
                        self.chars_conv.head = TimmHeadReplace(
                            self.change_pooling_for_timm_head_to,
                            test_out.shape[1],
                        )
            elif self.source_for_pretrained_cv_model == "huggingface":
                self.chars_conv = transformers.AutoModelForImageClassification.from_pretrained(self.cv_char_modelname)
            elif self.source_for_pretrained_cv_model == "torch_hub":
                self.chars_conv = t.hub.load(*self.cv_char_modelname.split(","))

            if hasattr(self.chars_conv, "classifier"):
                self.chars_conv.classifier = nn.Identity()
            elif hasattr(self.chars_conv, "cls_classifier"):
                self.chars_conv.cls_classifier = nn.Identity()
            elif hasattr(self.chars_conv, "fc"):
                self.chars_conv.fc = nn.Identity()

            if hasattr(self.chars_conv, "distillation_classifier"):
                self.chars_conv.distillation_classifier = nn.Identity()
            with t.inference_mode():
                test_out = self.chars_conv(
                    t.ones((1, 3, self.char_plot_shape[0], self.char_plot_shape[1]), dtype=t.float32)
                )
            if hasattr(test_out, "last_hidden_state"):
                self.chars_conv_out_dim = test_out.last_hidden_state.shape[1]
            elif hasattr(test_out, "logits"):
                self.chars_conv_out_dim = test_out.logits.shape[1]
            elif isinstance(test_out, list):
                self.chars_conv_out_dim = test_out[0].shape[1]
            else:
                self.chars_conv_out_dim = test_out.shape[1]

            char_lin_layers = [nn.Flatten(), nn.Linear(self.chars_conv_out_dim, self.hidden_dim // 2)]
            if self.add_layer_norm_to_char_mlp:
                char_lin_layers.append(nn.LayerNorm(self.hidden_dim // 2))
            self.chars_classifier = nn.Sequential(*char_lin_layers)
            if hasattr(self.chars_conv, "distillation_classifier"):
                self.chars_conv.distillation_classifier = nn.Sequential(
                    nn.Flatten(), nn.Linear(self.chars_conv_out_dim, self.hidden_dim // 2)
                )

            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_classifier = t.compile(self.chars_classifier)
            if "posix" in os.name and global_settings["try_using_torch_compile"]:
                self.chars_conv = t.compile(self.chars_conv)
        return 0

    def register_hooks(self):
        def add_to_tb(layer):
            def hook(model, input, output):
                if hasattr(output, "detach"):
                    for logger in self.loggers:
                        if hasattr(logger.experiment, "add_histogram"):
                            logger.experiment.add_histogram(
                                tag=f"{layer}_{str(list(output.shape))}",
                                values=output.detach(),
                                global_step=self.trainer.global_step,
                            )

            return hook

        for layer_id, layer in dict([*self.named_modules()]).items():
            layer.register_forward_hook(add_to_tb(f"act_{layer_id}"))

    def on_after_backward(self) -> None:
        if self.track_gradient_histogram:
            if self.trainer.global_step % 200 == 0:
                for logger in self.loggers:
                    if hasattr(logger.experiment, "add_histogram"):
                        for layer_id, layer in dict([*self.named_modules()]).items():
                            parameters = layer.parameters()
                            for idx2, p in enumerate(parameters):
                                grad_val = p.grad
                                if grad_val is not None:
                                    grad_name = f"grad_{idx2}_{layer_id}_{str(list(p.grad.shape))}"
                                    logger.experiment.add_histogram(
                                        tag=grad_name, values=grad_val, global_step=self.trainer.global_step
                                    )

        return super().on_after_backward()

    def _fold_in_seq_dim(self, out, y):
        batch_size, seq_len, num_classes = out.shape
        out = eo.rearrange(out, "b s c -> (b s) c", s=seq_len)
        if y is None:
            return out, None
        if len(y.shape) > 2:
            y = eo.rearrange(y, "b s c -> (b s) c", s=seq_len)
        else:
            y = eo.rearrange(y, "b s -> (b s)", s=seq_len)
        return out, y

    def _get_loss(self, out, y, batch):
        attention_mask = batch[-2]
        if self.loss_func == "BCELoss":
            if self.last_activation == "Identity":
                loss = t.nn.functional.binary_cross_entropy_with_logits(out, y, reduction="none")
            else:
                loss = t.nn.functional.binary_cross_entropy(out, y, reduction="none")

            replace_tensor = t.zeros(loss[1, 1, :].shape, device=loss.device, dtype=loss.dtype, requires_grad=False)
            loss[~attention_mask.bool()] = replace_tensor
            loss = loss.mean()
        elif self.loss_func == "CrossEntropyLoss":
            if len(out.shape) > 2:
                out, y = self._fold_in_seq_dim(out, y)
                loss = t.nn.functional.cross_entropy(out, y, reduction="mean", ignore_index=-100)
            else:
                loss = t.nn.functional.cross_entropy(out, y, reduction="mean", ignore_index=-100)

        elif self.loss_func == "OrdinalRegLoss":
            loss = t.nn.functional.mse_loss(out, y, reduction="none")
            loss = loss[attention_mask.bool()].sum() * 10.0 / attention_mask.sum()
        elif self.loss_func == "macro_soft_f1":
            loss = macro_soft_f1(y, out, reduction="mean")
        elif self.loss_func == "coral_loss":
            loss = coral_loss(out, y)
        elif self.loss_func == "corn_loss":
            out, y = self._fold_in_seq_dim(out, y)
            loss = corn_loss(out, y.squeeze(), self.out_shape)
        else:
            raise ValueError("Loss Function not reckognized")
        return loss

    def training_step(self, batch, batch_idx):
        if self.profile_torch_run:
            self.profilerr.step()
        out, y = self.model_step(batch, batch_idx)
        loss = self._get_loss(out, y, batch)
        self.log("train_loss", loss, on_epoch=True, on_step=True, sync_dist=True)
        return loss

    def forward(*args):
        return forward(args[0], args[1:])

    def model_step(self, batch, batch_idx):
        out = self.forward(batch)
        return out, batch[-1]

    def optimizer_step(
        self,
        epoch,
        batch_idx,
        optimizer,
        optimizer_closure,
    ):
        optimizer.step(closure=optimizer_closure)

        if self.use_lr_warmup and self.hparams["cfg"]["lr_scheduling"] != "OneCycleLR":
            if self.trainer.global_step < self.num_warmup_steps:
                lr_scale = min(1.0, float(self.trainer.global_step + 1) / self.num_warmup_steps) ** self.warmup_exponent
                for pg in optimizer.param_groups:
                    pg["lr"] = lr_scale * self.hparams.learning_rate
        if self.trainer.global_step % 10 == 0 or self.trainer.global_step == 0:
            for idx, pg in enumerate(optimizer.param_groups):
                self.log(f"lr_{idx}", pg["lr"], prog_bar=True, sync_dist=True)

    def lr_scheduler_step(self, scheduler: LRSchedulerTypeUnion, metric: Any | None) -> None:
        if self.use_lr_warmup and self.hparams["cfg"]["lr_scheduling"] != "OneCycleLR":
            if self.trainer.global_step > self.num_warmup_steps:
                if metric is None:
                    scheduler.step()
                else:
                    scheduler.step(metric)
        else:
            if metric is None:
                scheduler.step()
            else:
                scheduler.step(metric)

    def _get_preds_reals(self, out, y):
        if self.loss_func == "corn_loss":
            seq_len = out.shape[1]
            out, y = self._fold_in_seq_dim(out, y)
            preds = corn_label_from_logits(out)
            preds = eo.rearrange(preds, "(b s) -> b s", s=seq_len)
            if y is not None:
                y = eo.rearrange(y.squeeze(), "(b s) -> b s", s=seq_len)

        elif self.loss_func == "OrdinalRegLoss":
            preds = out * (self.ord_reg_loss_max - self.ord_reg_loss_min)
            preds = (preds + self.ord_reg_loss_min).round().to(t.long)

        else:
            preds = t.argmax(out, dim=-1)
        if y is None:
            return preds, y, -100
        else:
            if self.using_one_hot_targets:
                y_onecold = t.argmax(y, dim=-1)
                ignore_index_val = 0
            elif self.loss_func == "OrdinalRegLoss":
                y_onecold = (y * self.num_classes).round().to(t.long)

                y_onecold = y * (self.ord_reg_loss_max - self.ord_reg_loss_min)
                y_onecold = (y_onecold + self.ord_reg_loss_min).round().to(t.long)
                ignore_index_val = t.min(y_onecold).to(t.long)
            else:
                y_onecold = y
                ignore_index_val = -100

            if len(preds.shape) > len(y_onecold.shape):
                preds = preds.squeeze()
            return preds, y_onecold, ignore_index_val

    def validation_step(self, batch, batch_idx):
        out, y = self.model_step(batch, batch_idx)
        preds, y_onecold, ignore_index_val = self._get_preds_reals(out, y)

        if self.loss_func == "OrdinalRegLoss":
            y_onecold = y_onecold.flatten()
            preds = preds.flatten()[y_onecold != ignore_index_val]
            y_onecold = y_onecold[y_onecold != ignore_index_val]
            acc = (preds == y_onecold).sum() / len(y_onecold)
        else:
            acc = torchmetrics.functional.accuracy(
                preds,
                y_onecold.to(t.long),
                ignore_index=ignore_index_val,
                num_classes=self.num_classes,
                task="multiclass",
            )
            self.log("acc", acc * 100, prog_bar=True, sync_dist=True)
        loss = self._get_loss(out, y, batch)
        self.log("val_loss", loss, prog_bar=True, sync_dist=True)

        return loss

    def predict_step(self, batch, batch_idx):
        out, y = self.model_step(batch, batch_idx)
        preds, y_onecold, ignore_index_val = self._get_preds_reals(out, y)
        return preds, y_onecold

    def configure_optimizers(self):
        params = list(self.named_parameters())

        def is_chars_conv(n):
            if "chars_conv" not in n:
                return False
            if "chars_conv" in n and "classifier" in n:
                return False
            else:
                return True

        grouped_parameters = [
            {
                "params": [p for n, p in params if is_chars_conv(n)],
                "lr": self.learning_rate / self.chars_conv_lr_reduction_factor,
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in params if not is_chars_conv(n)],
                "lr": self.learning_rate,
                "weight_decay": self.weight_decay,
            },
        ]
        opti = t.optim.AdamW(grouped_parameters, lr=self.learning_rate, weight_decay=self.weight_decay)
        if self.use_reduce_on_plateau:
            opti_dict = {
                "optimizer": opti,
                "lr_scheduler": {
                    "scheduler": t.optim.lr_scheduler.ReduceLROnPlateau(opti, mode="min", patience=2, factor=0.5),
                    "monitor": "val_loss",
                    "frequency": 1,
                    "interval": "epoch",
                },
            }
            return opti_dict
        else:
            cfg = self.hparams["cfg"]
            if cfg["use_reduce_on_plateau"]:
                scheduler = None
            elif cfg["lr_scheduling"] == "multistep":
                scheduler = t.optim.lr_scheduler.MultiStepLR(
                    opti, milestones=cfg["multistep_milestones"], gamma=cfg["gamma_multistep"], verbose=False
                )
                interval = "step" if cfg["use_training_steps_for_end_and_lr_decay"] else "epoch"
            elif cfg["lr_scheduling"] == "StepLR":
                scheduler = t.optim.lr_scheduler.StepLR(
                    opti, step_size=cfg["gamma_step_size"], gamma=cfg["gamma_step_factor"]
                )
                interval = "step" if cfg["use_training_steps_for_end_and_lr_decay"] else "epoch"
            elif cfg["lr_scheduling"] == "anneal":
                scheduler = t.optim.lr_scheduler.CosineAnnealingLR(
                    opti, 250, eta_min=cfg["min_lr_anneal"], last_epoch=-1, verbose=False
                )
                interval = "step"
            elif cfg["lr_scheduling"] == "ExponentialLR":
                scheduler = t.optim.lr_scheduler.ExponentialLR(opti, gamma=cfg["lr_sched_exp_fac"])
                interval = "step"
            else:
                scheduler = None
            if scheduler is None:
                return [opti]
            else:
                opti_dict = {
                    "optimizer": opti,
                    "lr_scheduler": {
                        "scheduler": scheduler,
                        "monitor": "global_step",
                        "frequency": 1,
                        "interval": interval,
                    },
                }
                return opti_dict

    def on_fit_start(self) -> None:
        if self.profile_torch_run:
            self.profilerr.start()
        return super().on_fit_start()

    def on_fit_end(self) -> None:
        if self.profile_torch_run:
            self.profilerr.stop()
        return super().on_fit_end()


def prep_model_input(self, batch):
    if len(batch) == 1:
        batch = batch[0]
    if self.use_char_embed_info:
        if len(batch) == 5:
            x, chars_coords, ims, attention_mask, _ = batch
        elif batch[1].ndim == 4:
            x, ims, attention_mask, _ = batch
        else:
            x, chars_coords, attention_mask, _ = batch
        padding_list = None
    else:
        if len(batch) > 3:
            x = batch[0]
            y = batch[-1]
            attention_mask = batch[1]
        else:
            x, attention_mask, y = batch

    if self.model_to_use != "cv_only_model" and not self.hparams.cfg["only_use_2nd_input_stream"]:
        x_embedded = self.project(x)
    else:
        x_embedded = x
    if self.use_char_embed_info:
        if self.method_chars_into_model == "dense":
            bool_mask = chars_coords == self.input_padding_val
            bool_mask = bool_mask[:, :, 0]
            chars_coords_projected = self.chars_project_0(chars_coords).squeeze(-1)
            chars_coords_projected = chars_coords_projected * bool_mask
            if self.chars_project_1.in_features == chars_coords_projected.shape[-1]:
                chars_coords_projected = self.chars_project_1(chars_coords_projected)
            else:
                chars_coords_projected = chars_coords_projected.mean(dim=-1)
                chars_coords_projected = chars_coords_projected.unsqueeze(1).repeat(1, x_embedded.shape[2])
        elif self.method_chars_into_model == "bert":
            chars_mask = chars_coords != self.input_padding_val
            chars_mask = t.cat(
                (
                    t.ones(chars_mask[:, :1, 0].shape, dtype=t.long, device=chars_coords.device),
                    chars_mask[:, :, 0].to(t.long),
                ),
                dim=1,
            )
            chars_coords_projected = self.chars_project(chars_coords)

            position_ids = t.arange(
                0, chars_coords_projected.shape[1] + 1, dtype=t.long, device=chars_coords_projected.device
            )
            token_type_ids = t.zeros(
                (chars_coords_projected.size()[0], chars_coords_projected.size()[1] + 1),
                dtype=t.long,
                device=chars_coords_projected.device,
            )  # +1 for CLS
            chars_coords_projected = t.cat(
                (t.ones_like(chars_coords_projected[:, :1, :]), chars_coords_projected), dim=1
            )  # to add CLS token
            chars_coords_projected = self.chars_bert(
                position_ids=position_ids,
                inputs_embeds=chars_coords_projected,
                token_type_ids=token_type_ids,
                attention_mask=chars_mask,
            )
            if hasattr(chars_coords_projected, "last_hidden_state"):
                chars_coords_projected = chars_coords_projected.last_hidden_state[:, 0, :]
            elif hasattr(chars_coords_projected, "logits"):
                chars_coords_projected = chars_coords_projected.logits
            else:
                chars_coords_projected = chars_coords_projected.hidden_states[-1][:, 0, :]
        elif self.method_chars_into_model == "resnet":
            chars_conv_out = self.chars_conv(ims)
            if isinstance(chars_conv_out, list):
                chars_conv_out = chars_conv_out[0]
            if hasattr(chars_conv_out, "logits"):
                chars_conv_out = chars_conv_out.logits
            chars_coords_projected = self.chars_classifier(chars_conv_out)

        chars_coords_projected = chars_coords_projected.unsqueeze(1).repeat(1, x_embedded.shape[1], 1)
        if hasattr(self, "chars_project_class_output"):
            chars_coords_projected = self.chars_project_class_output(chars_coords_projected)

        if self.hparams.cfg["only_use_2nd_input_stream"]:
            x_embedded = chars_coords_projected
        elif self.method_to_include_char_positions == "concat":
            x_embedded = t.cat((x_embedded, chars_coords_projected), dim=-1)
        else:
            x_embedded = x_embedded + chars_coords_projected
    return x_embedded, attention_mask


def forward(self, batch):
    prepped_input = prep_model_input(self, batch)

    if len(batch) > 5:
        x_embedded, padding_list, attention_mask, attention_mask_for_prediction = prepped_input
    elif len(batch) > 2:
        x_embedded, attention_mask = prepped_input
    else:
        x_embedded = prepped_input[0]
        attention_mask = prepped_input[-1]

    position_ids = t.arange(0, x_embedded.shape[1], dtype=t.long, device=x_embedded.device)
    token_type_ids = t.zeros(x_embedded.size()[:-1], dtype=t.long, device=x_embedded.device)

    if self.layer_norm_after_in_projection:
        x_embedded = self.layer_norm_in(x_embedded)

    if self.model_to_use == "LSTM":
        bert_out = self.bert_model(x_embedded)
    elif self.model_to_use in ["ProphetNet", "T5", "FunnelModel"]:
        bert_out = self.bert_model(inputs_embeds=x_embedded, attention_mask=attention_mask)
    elif self.model_to_use == "xBERT":
        bert_out = self.bert_model(x_embedded, mask=attention_mask.to(bool))
    elif self.model_to_use == "cv_only_model":
        bert_out = self.bert_model(x_embedded)
    else:
        bert_out = self.bert_model(
            position_ids=position_ids,
            inputs_embeds=x_embedded,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
    if hasattr(bert_out, "last_hidden_state"):
        last_hidden_state = bert_out.last_hidden_state
        out = self.linear(last_hidden_state)
    elif hasattr(bert_out, "logits"):
        out = bert_out.logits
    else:
        out = bert_out
    out = self.final_activation(out)
    return out