Yeyito committed on
Commit 2a135fe
1 Parent(s): 5b3849a

Upload 16 files
detect-pretrain-code-contamination DELETED
@@ -1 +0,0 @@
- Subproject commit 616114e2334dc8dc8b7b538f6dbcc639cc42cb2c
 
 
detect-pretrain-code-contamination/README.md ADDED
@@ -0,0 +1,17 @@
+ # Detect-Pretrain-Code-Contamination
+
+ This repository contains scripts for detecting pretraining data contamination in datasets.
+
+ ## Datasets
+ You can specify the dataset for analysis. Example datasets include `truthful_qa` and `cais/mmlu`.
+
+ ## Usage
+ Run the script with the desired target model, reference model, and dataset. Below is an example using the `truthful_qa` dataset; a second invocation with a different target model appears in `src/scripts/run.sh`.
+
+ ### Example 1:
+ ```bash
+ DATASET=truthful_qa
+ python src/run.py --target_model Fredithefish/ReasonixPajama-3B-HF --ref_model huggyllama/llama-7b --data $DATASET --output_dir out/$DATASET --ratio_gen 0.4
+ ```
+
+ The script outputs a contamination metric: the fraction of evaluated examples whose score satisfies `result < 0.1`. If this fraction is greater than 0.85, it is highly likely that the model was trained on the dataset.
detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc ADDED
Binary file (2.16 kB)

detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-39.pyc ADDED
Binary file (1.27 kB)

detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc ADDED
Binary file (9.99 kB)

detect-pretrain-code-contamination/src/__pycache__/eval.cpython-39.pyc ADDED
Binary file (4.68 kB)

detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc ADDED
Binary file (2.46 kB)

detect-pretrain-code-contamination/src/__pycache__/options.cpython-39.pyc ADDED
Binary file (1.45 kB)

detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc ADDED
Binary file (13.5 kB)

detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (3.49 kB)

detect-pretrain-code-contamination/src/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.53 kB)
detect-pretrain-code-contamination/src/analyze.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ import statistics
+
+ def load_jsonl(path):
+     with open(path) as f:
+         data = [json.loads(line) for line in f]
+     return data
+
+ def analyze_data(data):
+     all_rmia = []
+     all_large_1 = []
+     for ex in data:
+         # Min_20.0% Prob
+         score = ex["pred"]["minkprob_w/_ref"]  # minkprob_w/_ref
+         all_rmia.append(score)
+         if score < 0.1:
+             all_large_1.append(score)
+     # result is a tuple: (label, fraction of examples whose score is below 0.1)
+     result = "result < 0.1, %: ", len(all_large_1) / len(all_rmia)
+     print(result)
+     return result
+     # print(f"RMIA mean: {statistics.mean(all_rmia)}")
+     # print(f"RMIA std: {statistics.stdev(all_rmia)}")
+     # print(f"RMIA min: {min(all_rmia)}")
+     # print(f"RMIA max: {max(all_rmia)}")
+     # # 25% percentile
+     # print(f"RMIA 25%: {statistics.quantiles(all_rmia)[0]}")
+     # # 50% percentile
+     # print(f"RMIA 50%: {statistics.quantiles(all_rmia)[1]}")
+     # # 75% percentile
+     # print(f"RMIA 75%: {statistics.quantiles(all_rmia)[2]}")
+
+
+ if __name__ == "__main__":
+     print("contaminated model")
+     task = "ai2_arc"  # ai2_arc cais/mmlu truthful_qa
+     # /fsx-onellm/swj0419/attack/test_contamination/detect-pretrain-code/out/ai2_arc/Fredithefish/ReasonixPajama-3B-HF_togethercomputer/RedPajama-INCITE-Chat-3B-v1/input/all_output.jsonl
+     path = f"/fsx-onellm/swj0419/attack/test_contamination/detect-pretrain-code/out/{task}/Fredithefish/ReasonixPajama-3B-HF_huggyllama/llama-7b/input/all_output.jsonl"
+     data = load_jsonl(path)
+     analyze_data(data)
+
+     print("raw model")
+     path = f"/fsx-onellm/swj0419/attack/test_contamination/detect-pretrain-code/out/{task}/togethercomputer/RedPajama-INCITE-Chat-3B-v1_huggyllama/llama-7b/input/all_output.jsonl"
+     data = load_jsonl(path)
+     analyze_data(data)
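For orientation, here is a minimal sketch of consuming this module from Python against an `all_output.jsonl` produced by `src/run.py`; the path below is an assumed example, not a file in this commit.

```python
# Hypothetical usage; the output path is an assumption for illustration.
from analyze import load_jsonl, analyze_data

records = load_jsonl("out/truthful_qa/target_model_ref_model/input/all_output.jsonl")
_, fraction = analyze_data(records)  # analyze_data returns (label, fraction of scores below 0.1)
if fraction > 0.85:                  # threshold suggested in the README
    print("Dataset was very likely seen during training.")
```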
detect-pretrain-code-contamination/src/eval.py ADDED
@@ -0,0 +1,178 @@
+ import logging
+ logging.basicConfig(level='ERROR')
+ import numpy as np
+ from tqdm import tqdm
+ import json
+ from collections import defaultdict
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import auc, roc_curve
+ import matplotlib
+ import random
+ from ipdb import set_trace as bp
+ import time
+
+ matplotlib.rcParams['pdf.fonttype'] = 42
+ matplotlib.rcParams['ps.fonttype'] = 42
+
+
+ # plot data
+ def sweep(score, x):
+     """
+     Compute a ROC curve and then return the FPR, TPR, AUC, and ACC.
+     """
+     fpr, tpr, _ = roc_curve(x, -score)
+     acc = np.max(1 - (fpr + (1 - tpr)) / 2)
+     return fpr, tpr, auc(fpr, tpr), acc
+
+
+ def do_plot(prediction, answers, sweep_fn=sweep, metric='auc', legend="", output_dir=None):
+     """
+     Generate the ROC curves by using ntest models as test models and the rest to train.
+     """
+     fpr, tpr, auc, acc = sweep_fn(np.array(prediction), np.array(answers, dtype=bool))
+
+     low = tpr[np.where(fpr < .05)[0][-1]]
+     # bp()
+     print('Attack %s AUC %.4f, Accuracy %.4f, TPR@5%%FPR of %.4f\n' % (legend, auc, acc, low))
+
+     metric_text = ''
+     if metric == 'auc':
+         metric_text = 'auc=%.3f' % auc
+     elif metric == 'acc':
+         metric_text = 'acc=%.3f' % acc
+
+     plt.plot(fpr, tpr, label=legend + metric_text)
+     return legend, auc, acc, low
+
+
+ def fig_fpr_tpr(all_output, output_dir):
+     print("output_dir", output_dir)
+     answers = []
+     metric2predictions = defaultdict(list)
+     for ex in all_output:
+         answers.append(ex["label"])
+         for metric in ex["pred"].keys():
+             if ("raw" in metric) and ("clf" not in metric):
+                 continue
+             metric2predictions[metric].append(ex["pred"][metric])
+
+     plt.figure(figsize=(4, 3))
+     with open(f"{output_dir}/auc.txt", "w") as f:
+         for metric, predictions in metric2predictions.items():
+             legend, auc, acc, low = do_plot(predictions, answers, legend=metric, metric='auc', output_dir=output_dir)
+             f.write('%s AUC %.4f, Accuracy %.4f, TPR@0.1%%FPR of %.4f\n' % (legend, auc, acc, low))
+
+     plt.semilogx()
+     plt.semilogy()
+     plt.xlim(1e-5, 1)
+     plt.ylim(1e-5, 1)
+     plt.xlabel("False Positive Rate")
+     plt.ylabel("True Positive Rate")
+     plt.plot([0, 1], [0, 1], ls='--', color='gray')
+     plt.subplots_adjust(bottom=.18, left=.18, top=.96, right=.96)
+     plt.legend(fontsize=8)
+     plt.savefig(f"{output_dir}/auc.png")
+
+
+ def load_jsonl(input_path):
+     with open(input_path, 'r') as f:
+         data = [json.loads(line) for line in tqdm(f)]
+     random.seed(0)
+     random.shuffle(data)
+     return data
+
+ def dump_jsonl(data, path):
+     with open(path, 'w') as f:
+         for line in tqdm(data):
+             f.write(json.dumps(line) + "\n")
+
+ def read_jsonl(path):
+     with open(path, 'r') as f:
+         return [json.loads(line) for line in tqdm(f)]
+
+ def convert_huggingface_data_to_list_dic(dataset):
+     all_data = []
+     for i in range(len(dataset)):
+         ex = dataset[i]
+         all_data.append(ex)
+     return all_data
+
+
+ def process_truthful_qa(data):
+     new_data = []
+     for ex in data:
+         new_ex = {}
+         label = ex["mc2_targets"]["labels"].index(1)
+         output = ex["mc2_targets"]["choices"][label]
+         # We use mc2 instead of mc1, as that is what the Open LLM Leaderboard uses (to verify).
+         new_ex["output"] = output
+         new_ex["input"] = ex["question"] + " " + output
+         new_data.append(new_ex)
+     return new_data
+
+
+ def process_mmlu(data):
+     new_data = []
+     for ex in data:
+         new_ex = {}
+         label = ex["choices"][ex["answer"]]
+         output = label
+         new_ex["output"] = output
+         new_ex["input"] = ex["question"] + " " + output
+         new_data.append(new_ex)
+     return new_data
+
+
+ def process_arc(data):
+     new_data = []
+     choice2label = {"A": 0, "B": 1, "C": 2, "D": 3}
+     for ex in data:
+         new_ex = {}
+         # bp()
+         # print(ex["answerKey"])
+         if ex["answerKey"] not in choice2label:
+             continue
+         label = choice2label[ex["answerKey"]]
+         output = ex["choices"]["text"][label]
+         new_ex["output"] = output
+         new_ex["input"] = ex["question"] + " " + output
+         new_data.append(new_ex)
+     return new_data
+
+ def process_gsm8k(data):
+     new_data = []
+     for ex in data:
+         new_ex = {}
+         output = ex["answer"]
+         new_ex["output"] = output
+         new_ex["input"] = ex["question"] + " " + output
+         new_data.append(new_ex)
+     return new_data
+
+ def process_winogrande(data):
+     new_data = []
+     for ex in data:
+         new_ex = {}
+         label = int(ex["answer"])
+         output = ex[f"option{label}"]
+         new_ex["output"] = output
+         new_ex["input"] = ex["sentence"] + " " + output
+         new_data.append(new_ex)
+     return new_data
+ # I'm not sure if this is the correct format for winogrande, given how the dataset works.
+
+ def process_hellaswag(data):
+     new_data = []
+     for ex in data:
+         new_ex = {}
+         label = int(ex["label"])  # the label field is a string, so cast it to int
+         output = ex["endings"][label]
+         new_ex["output"] = output
+         new_ex["input"] = ex["ctx"] + " " + output
+         new_data.append(new_ex)
+     return new_data
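To make the record shape produced by these `process_*` helpers concrete, here is a small illustration of `process_truthful_qa` on a hand-made row; the values are invented for the example and only mirror the shape of the `multiple_choice` config.

```python
# Illustration only; the row below is fabricated to mirror truthful_qa's schema.
row = {
    "question": "What is the capital of France?",
    "mc2_targets": {"choices": ["Paris.", "Lyon."], "labels": [1, 0]},
}
processed = process_truthful_qa([row])
assert processed[0]["output"] == "Paris."
assert processed[0]["input"] == "What is the capital of France? Paris."
```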
detect-pretrain-code-contamination/src/options.py ADDED
@@ -0,0 +1,23 @@
+ import argparse
+ import os
+ from pathlib import Path
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class Options():
+     def __init__(self):
+         self.parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+         self.initialize_parser()
+
+     def initialize_parser(self):
+         self.parser.add_argument('--target_model', type=str, default="text-davinci-003", help="the model to attack: huggyllama/llama-65b, text-davinci-003")
+         self.parser.add_argument('--ref_model', type=str, default="huggyllama/llama-7b")
+         self.parser.add_argument('--output_dir', type=str, default="out")
+         self.parser.add_argument('--data', type=str, default="swj0419/WikiMIA", help="the dataset to evaluate: default is WikiMIA")
+         self.parser.add_argument('--length', type=int, default=64, help="the length of the input text to evaluate. Choose from 32, 64, 128, 256")
+         self.parser.add_argument('--key_name', type=str, default="input", help="the key name corresponding to the input text. Select from: input, paraphrase")
+         self.parser.add_argument('--ratio_gen', type=float, default=0.4)
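For reference, a minimal sketch of how this parser might be wired up by a caller; this glue code is an assumption for illustration and is not part of the hunks shown in this commit.

```python
# Hypothetical wiring; not part of this commit's diff.
from options import Options

args = Options().parser.parse_args()
print(args.target_model, args.ref_model, args.data, args.output_dir, args.ratio_gen)
```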
detect-pretrain-code-contamination/src/run.py ADDED
@@ -0,0 +1,230 @@
+ import logging
+ logging.basicConfig(level='ERROR')
+ import numpy as np
+ from pathlib import Path
+ import openai
+ import torch
+ import zlib
+ import statistics
+ import time
+ from torch.utils.data import DataLoader
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from tqdm import tqdm
+ import math
+ from datasets import load_dataset
+ from options import Options
+ from ipdb import set_trace as bp
+ from eval import *
+ from utils import evaluate_model
+ from analyze import analyze_data
+ import argparse
+ import os
+ import sys
+ import gc
+ import pickle
+
+ def save_data(filename, data):
+     with open(filename, 'wb') as filehandle:
+         # store the data as a binary data stream
+         pickle.dump(data, filehandle)
+
+ def load_data(filename):
+     with open(filename, 'rb') as filehandle:
+         # read the data as a binary data stream
+         loaded_data = pickle.load(filehandle)
+     return loaded_data
+
+ def unload_model(model, tokenizer):
+     model = model.cpu()
+     del model
+     del tokenizer
+     time.sleep(0.5)
+     gc.collect()
+     torch.cuda.empty_cache()
+
+ def load_model(name1):
+     model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
+     model1.eval()
+     tokenizer1 = AutoTokenizer.from_pretrained(name1)
+     tokenizer1.pad_token = tokenizer1.eos_token
+     return model1, tokenizer1
+
+ def calculatePerplexity(sentence, model, tokenizer, gpu):
+     """
+     exp(loss)
+     """
+     input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
+     input_ids = input_ids.to(gpu)
+     with torch.no_grad():
+         outputs = model(input_ids, labels=input_ids)
+     loss, logits = outputs[:2]
+
+     '''
+     extract logits:
+     '''
+     # Apply log-softmax to the logits to get per-token log-probabilities
+     probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
+     # probabilities = torch.nn.functional.softmax(logits, dim=-1)
+     all_prob = []
+     input_ids_processed = input_ids[0][1:]
+     for i, token_id in enumerate(input_ids_processed):
+         probability = probabilities[0, i, token_id].item()
+         all_prob.append(probability)
+     return torch.exp(loss).item(), all_prob, loss.item()
+
+ def sample_generation(sentence, model, tokenizer, args):
+     half_sentence_index = math.ceil(len(sentence.split()) * args['prefix_length'])
+
+     if half_sentence_index > 0:
+         prefix = " ".join(sentence.split()[:half_sentence_index])
+     else:
+         prefix = '<|startoftext|> '
+
+     input_ids = torch.tensor(tokenizer.encode(prefix)).unsqueeze(0)
+     input_ids = input_ids.to(model.device)
+
+     output = model.generate(input_ids, max_new_tokens=len(sentence.split())-half_sentence_index, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
+     # print(output)
+     complete_generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
+     return complete_generated_text
+
+
+ def RMIA_1(text, target_loss, ref_loss, model1, tokenizer1, ratio_gen, neighbors_dl):
+     target_losses_z = evaluate_model(model1, tokenizer1, neighbors_dl)
+     result = torch.count_nonzero(target_losses_z < target_loss).item() / len(target_losses_z)
+     return result
+
+ def get_neighbors(text, ref_loss, model2, tokenizer2, ratio_gen):
+     cur_args = {'prefix_length': ratio_gen, 'num_z': 100, 'generate_args': {'do_sample': True}}
+     neighbors = sample_generation(text, model2, tokenizer2, cur_args)
+     neighbors_dl = DataLoader(neighbors, batch_size=32, shuffle=False)
+     return neighbors_dl
+
+ def evaluate_data(test_data, col_name, target_model, ref_model, ratio_gen, data_name):
+     print(f"all data size: {len(test_data)}")
+     random.seed(0)
+     random.shuffle(test_data)
+     test_data = test_data[:100]
+
+     inference2_pass = None
+     neighbors_dls = None
+     ref_model_clean = ref_model.replace("/", "-")
+     data_name_clean = data_name.replace("/", "-")
+     os.makedirs(os.path.join(f"saves/{ref_model_clean}", f"{data_name_clean}"), exist_ok=True)
+     try:
+         inference2_pass = load_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt')
+         neighbors_dls = load_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt')
+     except:
+         ### MODEL 2 likelihoods
+         model2, tokenizer2 = load_model(ref_model)
+         inference2_pass = []  # 0: p_ref, 1: all_prob_ref, 2: p_ref_likelihood
+         for ex in tqdm(test_data):
+             text = ex[col_name]
+             new_ex = inference_model2(model2, tokenizer2, text)
+             inference2_pass.append(new_ex)
+             # Invariant. Doesn't take in model1 so I'm good
+
+         ### Neighbors:
+         neighbors_dls = []
+         counter = 0
+         for ex in tqdm(test_data):
+             text = ex[col_name]
+             new_ex = get_neighbors(text, inference2_pass[counter][2], model2, tokenizer2, ratio_gen)
+             counter = counter + 1
+             neighbors_dls.append(new_ex)
+         unload_model(model2, tokenizer2)
+         # Because it uses temp it is not invariant; however, taking a snapshot in time should be just fine.
+         save_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt', inference2_pass)
+         save_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt', neighbors_dls)
+         print("Saved ref data.")
+
+     ### MODEL 1 likelihoods
+     model1, tokenizer1 = load_model(target_model)
+     inference1_pass = []  # 0: p1, 1: all_prob, 2: p1_likelihood, 3: p_lower, 4: p_lower_likelihood
+     for ex in tqdm(test_data):
+         text = ex[col_name]
+         new_ex = inference_model1(model1, tokenizer1, text)
+         inference1_pass.append(new_ex)
+
+     ### RMIA results (the target model loaded above is reused here)
+     counter = 0
+     results = []
+     for ex in tqdm(test_data):
+         text = ex[col_name]
+         new_ex = RMIA_1(text, inference1_pass[counter][2], inference2_pass[counter][2], model1, tokenizer1, ratio_gen, neighbors_dls[counter])
+         counter = counter + 1
+         results.append(new_ex)
+     unload_model(model1, tokenizer1)
+
+     ### Inference ex
+     all_output = []
+     counter = 0
+     for ex in tqdm(test_data):
+         text = ex[col_name]
+         pred = {}
+         pred["minkprob_w/_ref"] = results[counter]
+         pred["ppl"] = inference1_pass[counter][0]
+         pred["ppl/Ref_ppl (calibrate PPL to the reference model)"] = inference1_pass[counter][2] - inference2_pass[counter][2]
+         pred["ppl/lowercase_ppl"] = -(np.log(inference1_pass[counter][3]) / np.log(inference1_pass[counter][0])).item()
+         zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))
+         pred["ppl/zlib"] = np.log(inference1_pass[counter][0]) / zlib_entropy
+         ex["pred"] = pred
+         counter = counter + 1
+         all_output.append(ex)
+     return all_output
+
+ def inference_model1(model1, tokenizer1, text):
+     p1, all_prob, p1_likelihood = calculatePerplexity(text, model1, tokenizer1, gpu=model1.device)
+     p_lower, _, p_lower_likelihood = calculatePerplexity(text.lower(), model1, tokenizer1, gpu=model1.device)
+     return [p1, all_prob, p1_likelihood, p_lower, p_lower_likelihood]
+
+ def inference_model2(model2, tokenizer2, text):
+     p_ref, all_prob_ref, p_ref_likelihood = calculatePerplexity(text, model2, tokenizer2, gpu=model2.device)
+     return [p_ref, all_prob_ref, p_ref_likelihood]
+
+ def main(target_model, ref_model, output_dir, data, length, key_name, ratio_gen):
+     output_dir = f"{output_dir}/{target_model}_{ref_model}/{key_name}"
+     Path(output_dir).mkdir(parents=True, exist_ok=True)
+     # load model and data
+     data_name = data
+     if "jsonl" in data:
+         data = load_jsonl(f"{data}")
+     elif data == "truthful_qa":
+         # bp()
+         dataset = load_dataset(data, "multiple_choice", split="validation")
+         data = convert_huggingface_data_to_list_dic(dataset)
+         data = process_truthful_qa(data)
+     elif data == "cais/mmlu":
+         dataset = load_dataset(data, "all", split="test")
+         data = convert_huggingface_data_to_list_dic(dataset)
+         data = process_mmlu(data)
+     elif data == "ai2_arc":
+         dataset = load_dataset(data, "ARC-Challenge", split="test")
+         data = convert_huggingface_data_to_list_dic(dataset)
+         data = process_arc(data)
+     elif data == "gsm8k":
+         dataset = load_dataset(data, "main", split="test")
+         data = convert_huggingface_data_to_list_dic(dataset)
+         data = process_gsm8k(data)
+     elif data == "Rowan/hellaswag":
+         dataset = load_dataset(data, "default", split="validation")
+         # We use validation since labels for the test set are not available.
+         data = convert_huggingface_data_to_list_dic(dataset)
+         data = process_hellaswag(data)
+     elif data == "winogrande":
+         dataset = load_dataset(data, "winogrande_debiased", split="validation")
+         data = convert_huggingface_data_to_list_dic(dataset)
+         data = process_winogrande(data)
+
+     # model1, model2, tokenizer1, tokenizer2 = load_model(target_model, ref_model)
+
+     all_output = evaluate_data(data, key_name, target_model, ref_model, ratio_gen, data_name)
+     dump_jsonl(all_output, f"{output_dir}/all_output.jsonl")
+     return analyze_data(all_output)
+     # fig_fpr_tpr(all_output, output_dir)
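Since the hunk above ends without a CLI entry point, here is a hedged sketch of driving the pipeline programmatically; the argument values mirror `scripts/run.sh` and the defaults in `options.py`, and this glue is an assumption rather than part of the commit.

```python
# Hypothetical driver; equivalent in spirit to the shell command in scripts/run.sh.
from run import main

result = main(
    target_model="Fredithefish/ReasonixPajama-3B-HF",
    ref_model="huggyllama/llama-7b",
    output_dir="out/truthful_qa",
    data="truthful_qa",
    length=64,         # default from options.py (not used inside main itself)
    key_name="input",  # default from options.py
    ratio_gen=0.4,
)
print(result)  # ("result < 0.1, %: ", fraction of examples below 0.1)
```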
detect-pretrain-code-contamination/src/scripts/run.sh ADDED
@@ -0,0 +1,8 @@
+
+ DATASET=truthful_qa #cais/mmlu #truthful_qa
+ python src/run.py --target_model Fredithefish/ReasonixPajama-3B-HF --ref_model huggyllama/llama-7b --data $DATASET --output_dir out/$DATASET --ratio_gen 0.4
+
+
+ # DATASET=cais/mmlu #cais/mmlu #truthful_qa
+ DATASET=truthful_qa #cais/mmlu #truthful_qa
+ python src/run.py --target_model togethercomputer/RedPajama-INCITE-Chat-3B-v1 --ref_model huggyllama/llama-7b --data $DATASET --output_dir out/$DATASET --ratio_gen 0.4
detect-pretrain-code-contamination/src/utils.py ADDED
@@ -0,0 +1,28 @@
+ from tqdm import tqdm
+ import torch
+ from torch.nn import CrossEntropyLoss
+
+ def evaluate_model(model, tokenizer, dl):
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model = model.to(device)
+     losses = []
+     for batch in dl:
+         batch = tokenizer(batch, padding=True, return_tensors='pt', truncation=True, max_length=150)
+         # Mask padding positions with -100 so CrossEntropyLoss ignores them.
+         labels = torch.tensor([
+             [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch['attention_mask'], batch['input_ids'])]
+         ])
+         batch['labels'] = labels
+         batch = {k: v.to(device) for k, v in batch.items()}
+
+         with torch.no_grad():
+             outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
+         shift_logits = outputs.logits[..., :-1, :].contiguous()
+         shift_labels = batch['labels'][..., 1:].contiguous()
+         loss_fct = CrossEntropyLoss(reduction='none')
+         loss = loss_fct(shift_logits.transpose(1, 2), shift_labels)
+         num_tokens = torch.sum(shift_labels != -100, dim=1)
+         loss_sum = torch.sum(loss, dim=1)
+         loss = loss_sum / num_tokens
+         losses.append(loss)
+     losses = torch.cat(losses)
+     return losses
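A small usage sketch for `evaluate_model`: it expects an iterable of string batches, such as the `DataLoader` built by `get_neighbors` in `src/run.py`, and returns one average token-level loss per input string. The model name and sentences below are placeholders; loading mirrors `load_model` in `src/run.py`.

```python
# Illustration only; any causal LM with a tokenizer works the same way.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils import evaluate_model

name = "huggyllama/llama-7b"  # placeholder model name
tokenizer = AutoTokenizer.from_pretrained(name)
tokenizer.pad_token = tokenizer.eos_token  # llama tokenizers ship without a pad token
model = AutoModelForCausalLM.from_pretrained(name)

texts = ["The quick brown fox jumps over the lazy dog.",
         "Paris is the capital of France."]
losses = evaluate_model(model, tokenizer, DataLoader(texts, batch_size=2))
print(losses)  # one mean negative log-likelihood per sentence
```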