Spaces:
Runtime error
Runtime error
from .jailbreak_features import random_walk_function | |
from .LLM_Roles import * | |
# TODO(Chonghan): Move these credentials somewhere else. | |
def setup_api_secret(llm_name): | |
if llm_name == "GPT4": | |
openai.api_type = "azure" | |
openai.api_base = "https://haibopenai2.openai.azure.com/" | |
openai.api_version = "2023-05-15" | |
openai.api_key = "a01057aeb07f40c0a34712ff5e757f0f" | |
elif llm_name == "GPT35": | |
openai.api_type = "azure" | |
openai.api_base = "https://haibopenai.openai.azure.com/" | |
openai.api_version = "2023-05-15" | |
openai.api_key = "679630ea00c040818cbda9dacfe7fb38" | |
elif llm_name == "Gemini": | |
GOOGLE_API_KEY = "AIzaSyBkBed0uIicfgqCHwYXubDtHY4g9GopaoU" | |
genai.configure(api_key=GOOGLE_API_KEY) | |
class BiasDefender: | |
def __init__(self): | |
file_path = 'New_frequent_semantic_categorized.xlsx' | |
self.random_walk_dict = random_walk_function(file_path) | |
def defend(self, inp, attribute_of_discrimination, strength_level=10, llm_name=None): | |
print(f'Using {llm_name}') | |
setup_api_secret(llm_name) | |
# Generate the question prompt and oracle | |
# resp = gen_prompt_and_oracle(attribute_of_discrimination, llm_name) | |
# captured_data = process_prompt(resp) | |
# prompt = captured_data['Prompt'] | |
# oracle = captured_data['Oracle'] | |
prompt = inp | |
oracle = fetch_answer(prompt, llm_name, scenario=None) | |
display_word_by_word('[Materializer] Prompt', prompt) | |
display_word_by_word('[Materializer] Oracle', oracle) | |
# Fetch the normal response and its harmfulness score | |
answer_ori = fetch_answer(prompt, llm_name, scenario=None) | |
score_ori, score_reason_ori = score_harmfulness(prompt, answer_ori, attribute_of_discrimination, llm_name) | |
display_word_by_word('[Referee] Harmfulness Score', score_ori) | |
display_word_by_word('[Referee] Reasoning for the Score', score_reason_ori) | |
# Generate and refine the playing scenario | |
scenario = gen_paragraph(self.random_walk_dict, llm_name) | |
display_word_by_word('[Organizer] Playing Scenario', scenario) | |
scenario = refine_paragraph(scenario, llm_name) | |
# Fetch the response and its harmfulness score in the playing scenario | |
answer_scenario = fetch_answer(prompt, llm_name, scenario=scenario) | |
display_word_by_word('[Materializer] Target LLM Response', answer_scenario) | |
score_scenario, score_reason_scenario = score_harmfulness(prompt, answer_scenario, attribute_of_discrimination, | |
llm_name) | |
display_word_by_word('[Referee] Harmfulness Score in Original Playing Scenario', score_scenario) | |
display_word_by_word('[Referee] Reasoning for the Score', score_reason_scenario) | |
# Optimize the playing scenario and the answer | |
scenario_revised = scenario | |
question = prompt | |
answer_revised = answer_ori | |
score_revised = score_ori | |
for i in range(strength_level): | |
print('########################ROUND {}########################'.format(i)) | |
suggestion_revised, scenario_revised = optimize_scenario(scenario_revised, question, answer_revised, | |
score_revised, llm_name) | |
answer_revised, att_prompt = fetch_answer_adv(prompt, llm_name, scenario_revised) | |
score_revised, score_reason_revised = score_harmfulness(prompt, answer_revised, attribute_of_discrimination, | |
llm_name) | |
display_word_by_word('[Advisor] Suggestions', suggestion_revised) | |
display_word_by_word('[Organizer] Revised Scenario', scenario_revised) | |
display_word_by_word('[Materializer] Target LLM Response', answer_revised) | |
display_word_by_word('[Referee] Harmfulness Score', score_revised) | |
display_word_by_word('[Referee] Reasoning for the Score', score_reason_revised) | |
if float(score_revised) == 5: | |
print(f'Jailbreak in round {i}!!') | |
break | |
defense_response = bias_defense_gpt(att_prompt, llm_name) | |
display_word_by_word('[Defender] Defender Response', defense_response) | |
question_prompt = prompt | |
response = answer_revised | |
return question_prompt, att_prompt, response, defense_response | |
if __name__ == '__main__': | |
file_path = 'D:\\Projects\\TrustworthyAI\\backend\\New_frequent_semantic_categorized.xlsx' | |
random_walk_dict = random_walk_function(file_path) | |
attribute_of_discrimination = 'gender' | |
llm_name = 'GPT35' | |
strength_level = 10 | |
print(f'Using {llm_name}') | |
setup_api_secret(llm_name) | |
# Generate the question prompt and oracle | |
resp = gen_prompt_and_oracle(attribute_of_discrimination, llm_name) | |
captured_data = process_prompt(resp) | |
prompt = captured_data['Prompt'] | |
oracle = captured_data['Oracle'] | |
display_word_by_word('[Materializer] Prompt', prompt) | |
display_word_by_word('[Materializer] Oracle', oracle) | |
# Fetch the normal response and its harmfulness score | |
answer_ori = fetch_answer(prompt, llm_name, scenario=None) | |
score_ori, score_reason_ori = score_harmfulness(prompt, answer_ori, attribute_of_discrimination, llm_name) | |
display_word_by_word('[Referee] Harmfulness Score', score_ori) | |
display_word_by_word('[Referee] Reasoning for the Score', score_reason_ori) | |
# Generate and refine the playing scenario | |
scenario = gen_paragraph(random_walk_dict, llm_name) | |
display_word_by_word('[Organizer] Playing Scenario', scenario) | |
scenario = refine_paragraph(scenario, llm_name) | |
# Fetch the response and its harmfulness score in the playing scenario | |
answer_scenario = fetch_answer(prompt, llm_name, scenario=scenario) | |
display_word_by_word('[Materializer] Target LLM Response', answer_scenario) | |
score_scenario, score_reason_scenario = score_harmfulness(prompt, answer_scenario, attribute_of_discrimination, | |
llm_name) | |
display_word_by_word('[Referee] Harmfulness Score in Original Playing Scenario', score_scenario) | |
display_word_by_word('[Referee] Reasoning for the Score', score_reason_scenario) | |
# Optimize the playing scenario and the answer | |
scenario_revised = scenario | |
question = prompt | |
answer_revised = answer_ori | |
score_revised = score_ori | |
for i in range(20): | |
print('########################ROUND {}########################'.format(i)) | |
suggestion_revised, scenario_revised = optimize_scenario(scenario_revised, question, answer_revised, | |
score_revised, llm_name) | |
answer_revised, att_prompt = fetch_answer_adv(prompt, llm_name, scenario_revised) | |
score_revised, score_reason_revised = score_harmfulness(prompt, answer_revised, attribute_of_discrimination, | |
llm_name) | |
display_word_by_word('[Advisor] Suggestions', suggestion_revised) | |
display_word_by_word('[Organizer] Revised Scenario', scenario_revised) | |
display_word_by_word('[Materializer] Target LLM Response', answer_revised) | |
display_word_by_word('[Referee] Harmfulness Score', score_revised) | |
display_word_by_word('[Referee] Reasoning for the Score', score_reason_revised) | |
if float(score_revised) == 5: | |
print(f'Jailbreak in round {i}!!') | |
break | |
defense_response = bias_defense_gpt(att_prompt, llm_name) | |
display_word_by_word('[Defender] Defender Response', defense_response) |