File size: 8,428 Bytes

76770f7

---
license: mit
language:
- en
tags:
- llm
- safety
- jailbreak
- knowledge
---
# Introduction

This is a model for generating jailbreak prompts from knowledge-point texts. The model is built on the Llama-2-7b base model and fine-tuned on the Knowledge-to-Jailbreak dataset. It is intended to bridge the gap between theoretical vulnerabilities and real-world application scenarios by simulating sophisticated adversarial attacks that incorporate specialized knowledge.

Our proposed method and dataset serve as a critical starting point for both offensive and defensive research, enabling the development of new techniques to enhance the security and robustness of language models in practical settings.

# How to load the model and tokenizer

We provide two helper functions for loading the model and tokenizer.

```python

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification

import os

import json

from peft import PeftModel

# from trl import AutoModelForCausalLMWithValueHead

from transformers import AutoModelForCausalLM as AutoGPTQForCausalLM

def load_tokenizer(dir_or_model):
    """
    Load the tokenizer that belongs to a model directory, LoRA adapter directory, or hub id.

    The base model name is resolved in two steps:
      1. If ``dir_or_model`` contains ``adapter_config.json``, its
         ``base_model_name_or_path`` entry is used; otherwise ``dir_or_model`` itself.
      2. If ``dir_or_model`` contains ``config.json`` with a ``_name_or_path``
         entry, that entry takes precedence.

    The tokenizer is read from the local Llama-2-7b checkpoint path that this
    snippet was originally written against when that directory exists on the
    current machine; otherwise it is read from the resolved base model name, so
    the function also works outside the original environment.

    Args:
        dir_or_model: Path to a model / LoRA adapter directory, or a model identifier.

    Returns:
        The loaded tokenizer. If it has no pad token, the EOS token is used as
        the pad token.
    """
    def _read_json(path):
        # Context manager so the file handle is closed deterministically.
        with open(path, "r") as handle:
            return json.load(handle)

    model_name = dir_or_model

    adapter_config_path = os.path.join(dir_or_model, "adapter_config.json")
    if os.path.isfile(adapter_config_path):
        model_name = _read_json(adapter_config_path)["base_model_name_or_path"]

    config_path = os.path.join(dir_or_model, "config.json")
    if os.path.isfile(config_path):
        model_name = _read_json(config_path).get("_name_or_path", model_name)

    # Author-local checkpoint; only usable on machines where it actually exists.
    local_model_name = "/data3/MODELS/llama2-hf/llama-2-7b"
    tokenizer_source = local_model_name if os.path.isdir(local_model_name) else model_name

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

    # Batched tokenization with padding=True needs a pad token; fall back to EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return tokenizer

def load_model(dir_or_model, classification=False, token_classification=False, return_tokenizer=False, dtype=torch.bfloat16, load_dtype=True,
               rl=False, peft_config=None, device_map="auto", revision='main'):
    """
    Load a model for causal LM, sequence classification, or token classification.

    If ``dir_or_model`` is a directory containing ``adapter_config.json`` it is
    treated as a LoRA adapter: the base model named in that file is loaded first
    and the adapter is then applied with ``PeftModel.from_pretrained``.

    Args:
        dir_or_model: Path to a model / LoRA adapter directory, or a model identifier.
        classification (bool): If True, load with ``AutoModelForSequenceClassification``.
        token_classification (bool): If True (and ``classification`` is False),
            load with ``AutoModelForTokenClassification``.
        return_tokenizer (bool): If True, return ``(model, tokenizer)`` instead of ``model``.
        dtype: Requested parameter dtype.
            NOTE(review): this value is resolved but not forwarded; every load
            below passes ``torch.float32``. Confirm whether that is intentional
            before wiring it through, since it changes the default precision.
        load_dtype (bool): If False, ``dtype`` is reset to ``torch.float32``.
        rl (bool): Accepted for interface compatibility; not used by this function.
        peft_config: Accepted for interface compatibility; not used by this function.
        device_map: Forwarded to the underlying loader.
        revision: Forwarded to ``from_pretrained``.

    Returns:
        The loaded model, or ``(model, tokenizer)`` when ``return_tokenizer`` is True.

    Raises:
        Exception: Whatever the tokenizer loader raised, but only when
            ``return_tokenizer`` is True and the tokenizer could not be loaded.
    """
    import warnings  # function-scope import keeps the snippet's import list unchanged

    adapter_config_path = os.path.join(dir_or_model, "adapter_config.json")
    is_lora_dir = os.path.isfile(adapter_config_path)

    if not load_dtype:
        dtype = torch.float32

    if is_lora_dir:
        # Context manager so the file handle is closed deterministically.
        with open(adapter_config_path, "r") as handle:
            model_name = json.load(handle)["base_model_name_or_path"]
    else:
        model_name = dir_or_model
    original_model_name = model_name

    # Arguments shared by every ``from_pretrained`` call below.
    # The original author noted that passing the requested dtype failed for the
    # sequence-classification loader, so float32 is kept everywhere.
    pretrained_kwargs = dict(
        trust_remote_code=True,
        torch_dtype=torch.float32,
        use_auth_token=True,
        device_map=device_map,
        revision=revision,
    )

    if classification:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, **pretrained_kwargs)
    elif token_classification:
        model = AutoModelForTokenClassification.from_pretrained(model_name, **pretrained_kwargs)
    elif model_name.endswith(("GPTQ", "GGML")):
        # NOTE(review): in this snippet AutoGPTQForCausalLM is imported as an alias
        # of transformers.AutoModelForCausalLM, which presumably has no
        # ``from_quantized``; this branch likely needs the real GPTQ loader - confirm.
        model = AutoGPTQForCausalLM.from_quantized(
            model_name,
            use_safetensors=True,
            trust_remote_code=True,
            # use_triton=True is left out: it breaks currently; unfortunately
            # generation time of the GPTQ model is quite slow without it.
            quantize_config=None,
            device_map=device_map,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name, **pretrained_kwargs)

    if is_lora_dir:
        model = PeftModel.from_pretrained(model, dir_or_model)

    # Load the tokenizer once: it supplies the pad token id for the model config
    # and is reused as the return value when requested.
    tokenizer = None
    try:
        tokenizer = load_tokenizer(original_model_name)
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception as err:
        if tokenizer is None and return_tokenizer:
            # The caller explicitly asked for the tokenizer; do not hide the failure.
            raise
        # Best effort only: the model is still usable without a synced pad token id.
        warnings.warn(f"Could not set pad_token_id from the tokenizer of {original_model_name!r}: {err}")

    if return_tokenizer:
        return model, tokenizer
    return model

# Model identifier passed to both helper functions defined above.
model_name = 'tsq2000/Jailbreak-generator'

# Load the model with the helper's default arguments.
model = load_model(model_name)

# Load the tokenizer separately; load_model only returns it when return_tokenizer=True.
tokenizer = load_tokenizer(model_name)

```

# How to generate jailbreak prompts

Here is an example of how to generate jailbreak prompts based on knowledge point texts.

```python

# Uses load_model / load_tokenizer from the snippet above.
model_name = 'tsq2000/Jailbreak-generator'
model = load_model(model_name)
tokenizer = load_tokenizer(model_name)

max_length = 2048  # total token budget: prompt plus generated tokens
max_tokens = 64    # number of new tokens generated per prompt

knowledge_points = ["Kettling Kettling (also known as containment or corralling) is a police tactic for controlling large crowds during demonstrations or protests. It involves the formation of large cordons of police officers who then move to contain a crowd within a limited area. Protesters are left only one choice of exit controlled by the police – or are completely prevented from leaving, with the effect of denying the protesters access to food, water and toilet facilities for a time period determined by the police forces. The tactic has proved controversial, in part because it has resulted in the detention of ordinary bystanders."]

# Wrap every knowledge point in the "### Input / ### Response" prompt template.
batch_texts = [f'### Input:\n{input_}\n\n### Response:\n' for input_ in knowledge_points]

# Truncate prompts so that prompt + generated tokens stay within max_length.
inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length - max_tokens).to(model.device)

# Greedy decoding (do_sample=False); temperature and top_p are pinned to 1.
outputs = model.generate(**inputs, max_new_tokens=max_tokens, num_return_sequences=1, do_sample=False, temperature=1, top_p=1, eos_token_id=tokenizer.eos_token_id)

# For a causal LM, each output row is the (padded) prompt followed by the new
# tokens. Slice by token count rather than by character count of the prompt
# string, so the result stays correct when a prompt was truncated or when
# decoding does not reproduce the prompt text exactly.
prompt_length = inputs["input_ids"].shape[1]
generated_texts = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

print(generated_texts)

```

# Citation

If you find this model useful, please cite the following paper:

```
@misc{tu2024knowledgetojailbreak,

      title={Knowledge-to-Jailbreak: One Knowledge Point Worth One Attack}, 

      author={Shangqing Tu and Zhuoran Pan and Wenxuan Wang and Zhexin Zhang and Yuliang Sun and Jifan Yu and Hongning Wang and Lei Hou and Juanzi Li},

      year={2024},

      eprint={2406.11682},

      archivePrefix={arXiv},

      primaryClass={cs.CL},

      url={https://arxiv.org/abs/2406.11682}, 

}

```