SUMMARY

Just a model using to learn Fine Tuning of 'DialoGPT-medium'

on a self made datasets
on a self made special tokens
on a multiple fine tuned with ~30K dataset (in progress mode)

If interested in how I got to this point and how I created the datasets you can visit:
Crafting GPT2 for Personalized AI-Preparing Data the Long Way

DECLARING NEW SPECIAL TOKENS

special_tokens_dict = {
    'eos_token': '<|STOP|>',
    'bos_token': '<|STOP|>',
    'pad_token': '<|PAD|>',
    'additional_special_tokens': ['<|BEGIN_QUERY|>', '<|BEGIN_QUERY|>', 
                                  '<|BEGIN_ANALYSIS|>', '<|END_ANALYSIS|>',
                                  '<|BEGIN_RESPONSE|>', '<|END_RESPONSE|>',
                                  '<|BEGIN_SENTIMENT|>', '<|END_SENTIMENT|>',
                                  '<|BEGIN_CLASSIFICATION|>', '<|END_CLASSIFICATION|>',]
}

tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<|STOP|>')
tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids('<|STOP|>')
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('<|PAD|>')

The order of tokens is as follows:

def combine_text(user_prompt, analysis, sentiment, new_response, classification):
    user_q = f"<|STOP|><|BEGIN_QUERY|>{user_prompt}<|END_QUERY|>"
    analysis = f"<|BEGIN_ANALYSIS|>{analysis}<|END_ANALYSIS|>"
    new_response = f"<|BEGIN_RESPONSE|>{new_response}<|END_RESPONSE|>"
    sentiment = f"<|BEGIN_SENTIMENT|>Sentiment: {sentiment}<|END_SENTIMENT|><|STOP|>"
    classification = f"<|BEGIN_CLASSIFICATION|>{classification}<|END_CLASSIFICATION|>"
    return user_q + analysis + new_response + classification + sentiment

INFERANCING

I am currently testing two ways, if anyone knows a better one, please let me know!

import torch
from transformers import AutoModelForCausalLLM, AutoTokenizer

models_folder = "Deeokay/DialoGPT-special-tokens-medium4"

model = AutoModelForCausalLM.from_pretrained(models_folder)
tokenizer = AutoTokenizer.from_pretrained(models_folder)

# Device configuration <<change as needed>> 
device = torch.device("cpu")
model.to(device)

OPTION 1 INFERFENCE

import time

class Stopwatch:
    def __init__(self):
        self.start_time = None
        self.end_time = None

    def start(self):
        self.start_time = time.time()

    def stop(self):
        self.end_time = time.time()

    def elapsed_time(self):
        if self.start_time is None:
            return "Stopwatch hasn't been started"
        if self.end_time is None:
            return "Stopwatch hasn't been stopped"
        return self.end_time - self.start_time

stopwatch1 = Stopwatch()

def generate_response(input_text, max_length=250):
    
    stopwatch1.start()
    
    # Prepare the input
    # input_text = f"<|BEGIN_QUERY|>{input_text}<|END_QUERY|><|BEGIN_ANALYSIS|>{input_text}<|END_ANALYSIS|><|BEGIN_RESPONSE|>"
    input_text = f"<|BEGIN_QUERY|>{input_text}<|END_QUERY|><|BEGIN_ANALYSIS|>"
    
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Create attention mask
    attention_mask = torch.ones_like(input_ids).to(device)
    
    # Generate
    output = model.generate(
        input_ids,
        max_new_tokens=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.convert_tokens_to_ids('<|STOP|>'),
    )
    
    stopwatch1.stop()
    return tokenizer.decode(output[0], skip_special_tokens=False)

OPTION 2 INFERNCE

import time

class Stopwatch:
    def __init__(self):
        self.start_time = None
        self.end_time = None

    def start(self):
        self.start_time = time.time()

    def stop(self):
        self.end_time = time.time()

    def elapsed_time(self):
        if self.start_time is None:
            return "Stopwatch hasn't been started"
        if self.end_time is None:
            return "Stopwatch hasn't been stopped"
        return self.end_time - self.start_time

stopwatch2 = Stopwatch()

def generate_response2(input_text, max_length=250):
    
    stopwatch2.start()
    
    # Prepare the input
    # input_text = f"<|BEGIN_QUERY|>{input_text}<|END_QUERY|><|BEGIN_ANALYSIS|>{input_text}<|END_ANALYSIS|><|BEGIN_RESPONSE|>"
    input_text = f"<|BEGIN_QUERY|>{input_text}<|END_QUERY|><|BEGIN_ANALYSIS|>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Create attention mask
    attention_mask = torch.ones_like(input_ids).to(device)

    # # 2ND OPTION FOR : Generate
    output = model.generate(
        input_ids,
        max_new_tokens=max_length,
        attention_mask=attention_mask,
        do_sample=True,
        temperature=0.4,
        top_k=60,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    stopwatch2.stop()
    return tokenizer.decode(output[0], skip_special_tokens=False)

DECODING ANSWER

When I need just the response

def decode(text):
    full_text = text
    
    # Extract the response part
    start_token = "<|BEGIN_RESPONSE|>"
    end_token = "<|END_RESPONSE|>"
    start_idx = full_text.find(start_token)
    end_idx = full_text.find(end_token)
    
    if start_idx != -1 and end_idx != -1:
        response = full_text[start_idx + len(start_token):end_idx].strip()
    else:
        response = full_text.strip()
    
    return response

MY SETUP

I use the stopwatch to time the responses and I use both inference to see the difference

input_text = "Who is Steve Jobs and what was contribution?"
response1_full = generate_response(input_text)
#response1 = decode(response1_full)
print(f"Input: {input_text}")
print("=======================================")
print(f"Response1: {response1_full}")
elapsed1 = stopwatch1.elapsed_time()
print(f"Process took {elapsed1:.4f} seconds")
print("=======================================")
response2_full = generate_response2(input_text)
#response2 = decode(response2_full)
print(f"Response2: {response2_full}")
elapsed2 = stopwatch2.elapsed_time()
print(f"Process took {elapsed2:.4f} seconds")
print("=======================================")

Out-of-Scope Use

Well everything that has a factual data.. trust at your own risk!

Never tested on mathamatical knowledge.

I quite enjoy how the response feels closer to what I had in mind..