objects76
/

Phi-3.5-mini-instruct-fc

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniforge3/envs/unsloth_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token is valid (permission: write).\n",
+      "Your token has been saved in your configured git credential helpers (store).\n",
+      "Your token has been saved to /home/ubuntu/.cache/huggingface/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "if '__file__' not in globals():\n",
+    "    __file__, __name__ = globals()['__vsc_ipynb_file__'], '__ipynb__'\n",
+    "    import types, sys; sys.modules['__ipynb__'] = types.ModuleType('__ipynb__')\n",
+    "    from IPython.core.magic import register_cell_magic\n",
+    "    @register_cell_magic\n",
+    "    def skip_if(flag, cell): exec(cell, globals())if flag and not eval(flag) else print('Cell skipped...')\n",
+    "\n",
+    "import sys, os\n",
+    "if os.path.abspath('.') not in sys.path: sys.path.append(os.path.abspath('.'))\n",
+    "\n",
+    "import os, huggingface_hub # !pip install huggingface_hub[hf_transfer]\n",
+    "huggingface_hub.login(token = os.environ.get('HF_TOKEN'), add_to_git_credential=True)\n",
+    "\n",
+    "import inspect\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm\n",
+    "from glob import glob\n",
+    "import numpy as np; np.set_printoptions(precision=8, suppress=True); np.random.seed(42)\n",
+    "\n",
+    "class whitechar:\n",
+    "    def __ror__(self, x): return x.replace('\\n', '\\\\n\\n').replace('\\t', '\\\\t\\t').replace(' ', '⎵')\n",
+    "wc = whitechar()\n",
+    "\n",
+    "class text_color:\n",
+    "    black,red,green,yellow,blue,magenta,cyan,white,gray = [*range(30,38), 90] # fgclr,  [*range(90,98), ''] # light-fgclr\n",
+    "    bold, italic, underline, strike = 1, 3, 4, 9  # attrs supported on vscode notebook.\n",
+    "    def __init__(self, fg,bg=0,attr=0):\n",
+    "        attr = f'{attr};' if attr > 0 else ''\n",
+    "        bg = f'{bg+10};' if bg > 0 else ''\n",
+    "        self.clr = f'\\33[{attr}{bg}{fg}m'\n",
+    "\n",
+    "    def __ror__(self, obj): return self.clr + str(obj) + '\\33[0m'\n",
+    "    @staticmethod\n",
+    "    def all(): return (text_color(clr) for clr in [*range(30,38), 90])\n",
+    "\n",
+    "black,red,green,yellow,blue,magenta,cyan,white,gray = text_color.all()\n",
+    "\n",
+    "class cout:\n",
+    "    def __ror__(self, obj): print(f'[{inspect.stack()[1].lineno}] {str(obj)}')\n",
+    "    def __call__(self, *args, **kwds): print(f'[{inspect.stack()[1].lineno+1}]', *args, **kwds)\n",
+    "out = cout()\n",
+    "\n",
+    "\n",
+    "os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' #can help a little with VRAM reqs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import unsloth\n",
+    "import torch\n",
+    "\n",
+    "import wandb\n",
+    "wandb.init(project=\"phi-3.5-mini\", name='run-phi-3.5-mini')\n",
+    "os.environ[\"WANDB_NOTEBOOK_NAME\"] =__file__\n",
+    "\n",
+    "max_seq_length = 4096\n",
+    "use_4bit = False\n",
+    "\n",
+    "model, tokenizer = unsloth.FastLanguageModel.from_pretrained(\n",
+    "    model_name=\"microsoft/Phi-3.5-mini-instruct\",\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    dtype=None, # auto detect\n",
+    "    load_in_4bit=use_4bit,\n",
+    ")\n",
+    "\n",
+    "model = unsloth.FastLanguageModel.get_peft_model(\n",
+    "    model,\n",
+    "    r=16,\n",
+    "    target_modules=[\n",
+    "        \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+    "        \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+    "    lora_alpha=16,\n",
+    "    lora_dropout=0,\n",
+    "    bias=\"none\",\n",
+    "    use_gradient_checkpointing=\"unsloth\",\n",
+    "    random_state=3407,\n",
+    "    use_rslora=False, # True\n",
+    "    loftq_config=None,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer.padding_side = 'left' # right -> left\n",
+    "# tokenizer.add_bos_token = False\n",
+    "# tokenizer.truncation_side # right\n",
+    "tokenizer.special_tokens_map_extended\n",
+    "tokenizer.special_tokens_map\n",
+    "tokenizer.added_tokens_decoder\n",
+    "\n",
+    "tokenizer | out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%skip_if\n",
+    "tokenizer.apply_chat_template(\n",
+    "    [\n",
+    "        {\"role\": \"user\", \"content\": \"hello\"},\n",
+    "        {\"role\": \"assistant\", \"content\": \"hi\"},\n",
+    "        {\"role\": \"user\", \"content\": \"how are you?\"},\n",
+    "    ],\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True,\n",
+    ")|wc | out\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "data_collator = None\n",
+    "ds_xlam_fc = load_dataset('json', data_files={\n",
+    "    'train': 'xlam-dataset-60k-qwen2-train.jsonl',\n",
+    "})\n",
+    "\n",
+    "# sample 3000 datas from ds_xlam_fc\n",
+    "ds_xlam_fc3k = ds_xlam_fc['train'].shuffle(seed=42).select(range(3000))\n",
+    "ds_xlam_fc3k[0]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def formatting_prompts_func(examples):\n",
+    "    print( 'formatting_prompts_func:', len(examples) )\n",
+    "    convos = examples[\"messages\"]\n",
+    "    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]\n",
+    "    return {\"text\": texts}\n",
+    "\n",
+    "dataset_formatted = ds_xlam_fc3k.map(\n",
+    "    formatting_prompts_func, batched=True,\n",
+    "    remove_columns=[\"messages\", \"type\", \"source\"])\n",
+    "\n",
+    "dataset_formatted[199] | out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import trl\n",
+    "\n",
+    "def print_tokens_with_ids(txt):\n",
+    "    tokens = tokenizer.tokenize(txt, add_special_tokens=False)\n",
+    "    token_ids = tokenizer.encode(txt, add_special_tokens=False)\n",
+    "    return list(zip(tokens, token_ids))\n",
+    "\n",
+    "input_text = tokenizer.apply_chat_template(\n",
+    "    [dict(role=\"user\", content=\"\\n111 222\"),\n",
+    "     dict(role=\"assistant\", content=\"\\nxxx yyy\\n\"),\n",
+    "     dict(role=\"user\", content=\"444 555\\n\"),],\n",
+    "    tokenize=False, add_generation_prompt=True)\n",
+    "print_tokens_with_ids(input_text) | out\n",
+    "print_tokens_with_ids(\"\\n<|assistant|>\\n\") | green | out\n",
+    "\n",
+    "\n",
+    "data_collator = trl.DataCollatorForCompletionOnlyLM([32001], tokenizer=tokenizer)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import transformers\n",
+    "import unsloth\n",
+    "import trl\n",
+    "\n",
+    "train_args = trl.SFTConfig(\n",
+    "    per_device_train_batch_size=8,\n",
+    "    gradient_accumulation_steps=1,\n",
+    "\n",
+    "    warmup_steps=5,\n",
+    "    # max_steps=60,\n",
+    "    num_train_epochs = 1,\n",
+    "\n",
+    "    # learning_rate=2e-4,\n",
+    "    learning_rate = 5e-5,\n",
+    "    bf16= unsloth.is_bfloat16_supported(),\n",
+    "    optim= \"adamw_torch\", # \"adamw_8bit\",\n",
+    "\n",
+    "    weight_decay=0.01,\n",
+    "    lr_scheduler_type=\"linear\",\n",
+    "    seed=3407,\n",
+    "\n",
+    "    gradient_checkpointing = True,\n",
+    "    gradient_checkpointing_kwargs = {\"use_reentrant\": True},\n",
+    "\n",
+    "    output_dir = \"outputs_unslot\",\n",
+    "    run_name = \"phi35-inst\",\n",
+    "    logging_steps=1,\n",
+    "    report_to= 'wandb',\n",
+    ")\n",
+    "\n",
+    "trainer = trl.SFTTrainer(\n",
+    "    model=model,\n",
+    "    tokenizer=tokenizer,\n",
+    "\n",
+    "    train_dataset=dataset_formatted,\n",
+    "    dataset_text_field=\"text\",\n",
+    "    data_collator=data_collator,\n",
+    "    packing=False,\n",
+    "\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    dataset_num_proc=2,\n",
+    "\n",
+    "    args = train_args,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "gpu_stats = torch.cuda.get_device_properties(0)\n",
+    "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+    "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
+    "print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
+    "print(f\"{start_gpu_memory} GB of memory reserved.\")\n",
+    "\n",
+    "trainer_stats = trainer.train()\n",
+    "\n",
+    "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+    "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
+    "used_percentage = round(used_memory / max_memory * 100, 3)\n",
+    "lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
+    "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
+    "print(f\"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.\")\n",
+    "print(f\"Peak reserved memory = {used_memory} GB.\")\n",
+    "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
+    "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
+    "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")\n",
+    "\n",
+    "model.save_pretrained_merged('outputs_unslot/model', tokenizer, save_method = \"merged_16bit\",) # for best quality\n",
+    "\n",
+    "import unsloth\n",
+    "unsloth.FastLanguageModel.for_inference(model) # Enable native 2x faster inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model.save_pretrained_merged('outputs_unslot/model', tokenizer, save_method = \"merged_16bit\",) # for best quality\n",
+    "model.save_pretrained_merged('outputs_unslot/model/lora', tokenizer, save_method = \"lora\",)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### load weight from saved"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import transformers, peft, torch, unsloth\n",
+    "\n",
+    "try:\n",
+    "    del model\n",
+    "    del tokenizer\n",
+    "    torch.cuda.empty_cache()\n",
+    "except:\n",
+    "    pass\n",
+    "\n",
+    "if 1: # loading from hf\n",
+    "    repo_name = \"objects76/phi-3.5-fc\" # phi-3.5-mini\n",
+    "    repo_name = \"outputs_unslot/merged-model\"\n",
+    "    model = transformers.AutoModelForCausalLM.from_pretrained(\n",
+    "        repo_name, revision=\"main\",\n",
+    "        torch_dtype=torch.bfloat16,\n",
+    "        device_map=\"auto\",\n",
+    "        trust_remote_code=True,\n",
+    "        # attn_implementation=\"flash_attention_2\", #turn off if not supported by model or your GPU\n",
+    "    )\n",
+    "    model.config.use_cache = True\n",
+    "    model.eval()\n",
+    "\n",
+    "    tokenizer = transformers.AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)\n",
+    "\n",
+    "elif 1:\n",
+    "    max_seq_length = 4096\n",
+    "    dtype = None\n",
+    "    load_in_4bit = True\n",
+    "\n",
+    "    # model, tokenizer = unsloth.FastLanguageModel.from_pretrained(\n",
+    "    #     model_name = \"outputs_unslot/lora_model\", # YOUR MODEL YOU USED FOR TRAINING\n",
+    "    #     max_seq_length = max_seq_length,\n",
+    "    #     dtype = dtype,\n",
+    "    #     load_in_4bit = load_in_4bit,\n",
+    "    # )\n",
+    "    # unsloth.FastLanguageModel.for_inference(model)\n",
+    "\n",
+    "     # I highly do NOT suggest - use Unsloth if possible\n",
+    "    base_model = \"microsoft/Phi-3.5-mini-instruct\"\n",
+    "\n",
+    "    model = peft.AutoPeftModelForCausalLM.from_pretrained(\n",
+    "        \"outputs_unslot/lora_model\", # YOUR MODEL YOU USED FOR TRAINING\n",
+    "        load_in_4bit = False,\n",
+    "    )\n",
+    "    tokenizer = transformers.AutoTokenizer.from_pretrained(\"outputs_unslot/lora_model\")\n",
+    "    model.config.use_cache = True\n",
+    "    model.eval()\n",
+    "    print(model.config)\n",
+    "\n",
+    "tokenizer | out\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "\n",
+    "ds_test = datasets.load_dataset(\"json\", data_files=\"xlam-dataset-60k-qwen2-test.jsonl\")['train']\n",
+    "ds_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re,json\n",
+    "\n",
+    "def infer(M,T, messages):\n",
+    "    input_ids = T.apply_chat_template(\n",
+    "                    messages,\n",
+    "                    tokenize=True,\n",
+    "                    add_generation_prompt=True,\n",
+    "                    max_length=T.model_max_length,\n",
+    "                    padding=False,\n",
+    "                    truncation=True,\n",
+    "                    return_tensors='pt',\n",
+    "                ).to(M.device)\n",
+    "\n",
+    "    text_streamer = None # transformers.TextStreamer(tokenizer, skip_prompt = True)\n",
+    "    outputs = M.generate(\n",
+    "        input_ids = input_ids, # attention_mask=attention_mask,\n",
+    "        streamer = text_streamer,\n",
+    "        max_new_tokens=1024,\n",
+    "        eos_token_id=tokenizer.eos_token_id,\n",
+    "        pad_token_id=tokenizer.pad_token_id,\n",
+    "        do_sample=True, temperature=0.01, top_p= 0.01,\n",
+    "        use_cache=True)\n",
+    "\n",
+    "    # gen = T.batch_decode(outputs, skip_special_tokens=True)[0]\n",
+    "    gen = T.decode(outputs[0, input_ids.shape[-1]:], skip_special_tokens=True)\n",
+    "\n",
+    "    return input_ids, outputs, gen\n",
+    "\n",
+    "\n",
+    "\n",
+    "for i, sample in enumerate(ds_test):\n",
+    "    message = sample[\"messages\"]\n",
+    "    user_content = message[0][\"content\"]\n",
+    "    ans = message[1][\"content\"]\n",
+    "    _, _, gen = infer(model, tokenizer, message[:-1])\n",
+    "    gen = gen.replace('```json', '').replace('```', '')\n",
+    "\n",
+    "    # normalize = lambda s: re.sub(r\"\"\"\\s+\"\"\", \"\", s, flags=re.MULTILINE|re.DOTALL)\n",
+    "    # gen = normalize(gen.replace('```json', '').replace('```', ''))\n",
+    "    # ans = normalize(ans)\n",
+    "    true,false = True,False\n",
+    "    gen = json.dumps(eval(gen), indent=3)\n",
+    "    ans = json.dumps(eval(ans), indent=3)\n",
+    "    if gen != ans:\n",
+    "        # print(user_content|gray)\n",
+    "        print(f\"{i} ----------------\"|gray)\n",
+    "        print('gen:', gen|green)\n",
+    "        print('ans:', ans)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### no fc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate(input_text, system_prompt, max_length=0):\n",
+    "    messages = [\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": input_text}\n",
+    "    ]\n",
+    "    _, _, prediction = infer(model, tokenizer, messages)\n",
+    "    print(input_text|gray)\n",
+    "    print(prediction|green)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "(\"Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?\", 11),\n",
+    "(\"Yes or no: Would a pear sink in water?\",None),\n",
+    "(\"How would you bring me something that isn’t a fruit?\",None),\n",
+    "(\"How many keystrokes are needed to type the numbers from 1 to 500?\", 1392),\n",
+    "(\"The concert was scheduled to be on 06/01/1943, but was delayed by one day to today. What is the date 10 days ago in MM/DD/YYYY?\", \"05/23/1943\"),\n",
+    "(\"Take the last letters of the words in 'Lady Gaga' and concatenate them.\", 'ya'),\n",
+    "(\"Sammy wanted to go to where the people were. Where might he go?\",None),\n",
+    "(\"Is the following sentence plausible? 'Joao Moutinho caught the screen pass in the NFC championship.'\", 'not plausible'),\n",
+    "(\"A coin is heads up. Maybelle flips the coin. Shalonda does not flip the coin. Is the coin still heads up?\", \"No\"),\n",
+    "('Answer the following question by reasoning step by step. The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apple do they have?', 9)\n",
+    "]\n",
+    "\n",
+    "line = 2\n",
+    "generate(prompts[line-2][0],\n",
+    "         system_prompt=\"Write out your reasoning step-by-step to be sure you get the right answers!\",\n",
+    "         max_length=512)\n",
+    "\n",
+    "if prompts[line-2][1]:\n",
+    "    print('answer:', prompts[line-2][1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Saving weights"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### lora"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save_pretrained(\"outputs_unslot/lora_model\") # Local saving\n",
+    "tokenizer.save_pretrained(\"outputs_unslot/lora_model\")\n",
+    "# model.push_to_hub(\"your_name/lora_model\", token = \"...\") # Online saving\n",
+    "# tokenizer.push_to_hub(\"your_name/lora_model\", token = \"...\") # Online saving\n",
+    "\n",
+    "# loading\n",
+    "# from unsloth import FastLanguageModel\n",
+    "# model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "#     model_name = \"outputs_unslot/lora_model\", # YOUR MODEL YOU USED FOR TRAINING\n",
+    "#     max_seq_length = max_seq_length,\n",
+    "#     dtype = dtype,\n",
+    "#     load_in_4bit = load_in_4bit,\n",
+    "# )\n",
+    "# FastLanguageModel.for_inference(model)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### hf-model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model.save_pretrained_merged(\"outputs_unslot/hf-model\", tokenizer, save_method = \"merged_16bit\",)\n",
+    "# merge with lora model\n",
+    "# model.save_pretrained(\"outputs_unslot/hf-model\") # safe_serialization = None\n",
+    "# tokenizer.save_pretrained(\"outputs_unslot/hf-model\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### gguf\n",
+    "- it will make hf weight(safe tensor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "model.save_pretrained_gguf(\"outputs_unslot/model\", tokenizer, quantization_method=\"q8_0\")\n",
+    "model.save_pretrained_gguf(\"outputs_unslot/model\", tokenizer, quantization_method=\"q4_k_m\")\n",
+    "model.save_pretrained_gguf(\"outputs_unslot/model\", tokenizer, quantization_method=\"q5_k_m\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def create_modelfile(gguf_path, template, output_modelfile):\n",
+    "    strip_lines = lambda x : '\\n'.join(line.strip() for line in x.splitlines())\n",
+    "    assert Path(gguf_path).exists()\n",
+    "    output_modelfile = Path(gguf_path).parent / Path(output_modelfile).name\n",
+    "    gguf_path = Path(gguf_path).name\n",
+    "\n",
+    "    with open(output_modelfile, \"w\") as f:\n",
+    "        f.write(f\"FROM {gguf_path}\\n\\n\")\n",
+    "        f.write(f\"TEMPLATE \\\"\\\"\\\"{strip_lines(template)}\\\"\\\"\\\"\\n\")\n",
+    "        # f.write(strip_lines(\"\"\"\n",
+    "        #     SYSTEM \"You are a helpful assistant.\"\n",
+    "\n",
+    "        #     PARAMETER temperature 0.01\n",
+    "        #     PARAMETER top_p 0.01\n",
+    "        #     PARAMETER stop \"<|im_end|>\" \"\"\")+'\\n')\n",
+    "\n",
+    "phi_3_5_template = \"\"\"\\\n",
+    "{{ if .System }}<|system|>\n",
+    "{{ .System }}<|end|>\n",
+    "{{ end }}{{ if .Prompt }}<|user|>\n",
+    "{{ .Prompt }}<|end|>\n",
+    "{{ end }}<|assistant|>\n",
+    "{{ .Response }}<|end|>\"\"\"\n",
+    "\n",
+    "create_modelfile(\"outputs_unslot/model/unsloth.Q8_0.gguf\", phi_3_5_template, \"phi3.5-fc-Q8_0.mf\")\n",
+    "\n",
+    "!ollama create jjkim76/phi3.5-fc:Q8_0 -f outputs_unslot/model/phi3.5-fc-Q8_0.mf\n",
+    "!ollama push   jjkim76/phi3.5-fc:Q8_0\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### merge with lora"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import transformers, peft\n",
+    "\n",
+    "pretrained_path = 'microsoft/Phi-3.5-mini-instruct'\n",
+    "# model_max_length = 4096\n",
+    "\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
+    "    pretrained_path,\n",
+    "    # model_max_length=model_max_length,\n",
+    "    trust_remote_code=True,\n",
+    "    )\n",
+    "\n",
+    "config = transformers.AutoConfig.from_pretrained(\n",
+    "    pretrained_path\n",
+    ")\n",
+    "\n",
+    "model = transformers.AutoModelForCausalLM.from_pretrained(\n",
+    "    pretrained_path,\n",
+    "    # config=config,\n",
+    "    device_map=\"auto\",\n",
+    "    trust_remote_code=True,\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    "    # use_flash_attention_2=True,\n",
+    ")\n",
+    "\n",
+    "lora_path = 'outputs_unslot/model/lora'\n",
+    "lora_model = peft.PeftModel.from_pretrained(model, lora_path, torch_dtype=torch.float16)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged_model = lora_model.merge_and_unload()\n",
+    "merged_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# merged_model.save_pretrained('outputs_unslot/merged-model')\n",
+    "tokenizer.save_pretrained('outputs_unslot/merged-model')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### upload to hf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch, peft, transformers\n",
+    "\n",
+    "model_local = \"outputs_unslot/merged-model\"\n",
+    "\n",
+    "model = transformers.AutoModelForCausalLM.from_pretrained(\n",
+    "    model_local,\n",
+    "    device_map=\"auto\",\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    "    trust_remote_code=True,\n",
+    "    low_cpu_mem_usage=True,\n",
+    "    attn_implementation=\"flash_attention_2\",\n",
+    ")\n",
+    "\n",
+    "model.config.use_cache = True\n",
+    "model.eval()\n",
+    "\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained(model_local)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "model_id, revision=\"objects76/Phi-3.5-mini-instruct-fc\", \"main\"\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "tokenizer.model: 100%|██████████| 500k/500k [00:01<00:00, 463kB/s]\n",
+      "100%|██████████| 1/1 [00:01<00:00,  1.30s/it]\n",
+      "100%|██████████| 2/2 [00:15<00:00,  7.94s/it]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/objects76/Phi-3.5-mini-instruct-fc/commit/8c379468173a1f6b05d39488cb7c61a51eeed72e', commit_message='instruction following added. without system message.', commit_description='', oid='8c379468173a1f6b05d39488cb7c61a51eeed72e', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import HfApi, HfFolder\n",
+    "from datetime import datetime\n",
+    "\n",
+    "tag = 'main' # datetime.now().strftime(\"%m%d\")\n",
+    "repo_name = f\"objects76/Phi-3.5-mini-instruct-fc\"\n",
+    "print(f'model_id, revision=\"{repo_name}\", \"{tag}\"')\n",
+    "\n",
+    "\n",
+    "# Instantiate HfApi to interact with Hugging Face Hub\n",
+    "tokenizer.push_to_hub(repo_name, revision=tag)\n",
+    "model.push_to_hub(repo_name, revision=tag,\n",
+    "                max_shard_size=\"5GB\",\n",
+    "                #   safe_serialization=True, private=True,\n",
+    "                commit_message='instruction following added. without system message.')\n",
+    "\n",
+    "#\n",
+    "# upload additional files\n",
+    "#\n",
+    "# srcfiles = 'output_qwen/2024-08-09/source.tar.gz'\n",
+    "# !tar -czvf {srcfiles} alpaca*.jsonl xlam*.jsonl qwen2-xlam-5.py\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "def build_readme(readme_path, outpath):\n",
+    "    # get prompt sample\n",
+    "    test_samples = []\n",
+    "    with open(\"xlam-dataset-60k-qwen2-test.jsonl\") as fp:\n",
+    "        test_samples = [json.loads(line) for line in fp]\n",
+    "    messages = test_samples[5]['messages']\n",
+    "\n",
+    "    with open(readme_path) as fp:\n",
+    "        txt = fp.read()\n",
+    "    txt = txt.replace('USERMSG_PLACE_HOLDER', messages[0]['content'].replace('```', '[TRIPLE_BACKTICK]'))\n",
+    "    # txt = txt.replace('MESSAGE_PLACE_HOLDER', str(messages[:-1]))\n",
+    "    txt = txt.replace('RESPONSE_PLACE_HOLDER', str(messages[-1]['content']))\n",
+    "    with open(outpath, 'w') as fp:\n",
+    "        fp.write(txt)\n",
+    "\n",
+    "\n",
+    "    # tokenizer.apply_chat_template(messages, tokenize=False) | out\n",
+    "    # evals = []\n",
+    "    # for i, sample in enumerate(test_samples):\n",
+    "    #     print(f'-- sample {i} --'| magenta)\n",
+    "    #     evals.append( get_answer(sample['messages'], model, tokenizer) )\n",
+    "\n",
+    "build_readme('outputs_unslot/README_TEMPLATE.md', model_local + '/README.md')\n",
+    "\n",
+    "if 1: # update file\n",
+    "    local_files = [\n",
+    "        # \"output_qwen/README.md\",\n",
+    "        __file__\n",
+    "    ]\n",
+    "    api = HfApi()\n",
+    "    for file_path in local_files:\n",
+    "        # target_path = file_path.replace('output_qwen/2024-08-08/', '')\n",
+    "        target_path = Path(file_path).name\n",
+    "        api.upload_file(\n",
+    "            path_or_fileobj= file_path,\n",
+    "            path_in_repo= target_path,\n",
+    "            repo_id=repo_name, revision=tag,\n",
+    "            repo_type=\"model\",\n",
+    "            # commit_message=\"Add README.md file\"\n",
+    "        )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "fcv3-2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}