|
import json |
|
import statistics |
|
|
|
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each line of the file at *path* must be a standalone JSON document;
    the documents are parsed in file order.
    """
    records = []
    with open(path) as handle:
        for line in handle:
            records.append(json.loads(line))
    return records
|
|
|
def analyze_data(data):
    """Report the fraction of examples whose Min-K% prob (w/ ref) score is < 0.1.

    Parameters
    ----------
    data : list of dict
        Each record must contain ``record["pred"]["minkprob_w/_ref"]``
        (as produced by the detect-pretrain attack pipeline).

    Returns
    -------
    str
        The printed summary line, e.g. ``"result < 0.1, %: 0.5"``.

    Raises
    ------
    ValueError
        If *data* is empty (the fraction would be undefined; the original
        code died with an unhelpful ``ZeroDivisionError``).
    """
    if not data:
        raise ValueError("analyze_data() requires at least one example")

    scores = [ex["pred"]["minkprob_w/_ref"] for ex in data]
    below_threshold = [s for s in scores if s < 0.1]

    # BUG FIX: the original wrote
    #     result = "result < 0.1, %: ", len(...)/len(...)
    # which, due to the stray comma, bound ``result`` to a 2-tuple of
    # (label, fraction) instead of the single formatted message that was
    # clearly intended and printed.
    result = f"result < 0.1, %: {len(below_threshold) / len(scores)}"
    print(result)
    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Compare a suspected-contaminated model against its raw base model on
    # the same task by running the identical analysis on both output files.
    # Refactored: the two copy-pasted load/analyze stanzas are now one
    # data-driven loop; the long shared path prefix is hoisted out.
    task = "ai2_arc"
    base = "/fsx-onellm/swj0419/attack/test_contamination/detect-pretrain-code/out"

    runs = [
        (
            "contaminated model",
            f"{base}/{task}/Fredithefish/ReasonixPajama-3B-HF_huggyllama/llama-7b/input/all_output.jsonl",
        ),
        (
            "raw model",
            f"{base}/{task}/togethercomputer/RedPajama-INCITE-Chat-3B-v1_huggyllama/llama-7b/input/all_output.jsonl",
        ),
    ]

    for label, path in runs:
        print(label)
        data = load_jsonl(path)
        analyze_data(data)
|
|
|
|