|
import pandas as pd |
|
from sentence_transformers.util import cos_sim |
|
|
|
from utils.models import SBert |
|
|
|
|
|
def p0_originality(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Add an ``originality`` column scoring each response against its prompt.

    For every row the prompt and the response are embedded with
    ``SBert(model_name)`` and the score is ``1 - cos_sim(prompt, response)``.

    Args:
        df: Frame with at least the columns ``prompt`` and ``response``.
            It is modified in place: the ``originality`` column is added
            (or overwritten).
        model_name: Model identifier forwarded to ``SBert``.

    Returns:
        The same ``df`` object, now carrying the ``originality`` column.

    Raises:
        AssertionError: If ``prompt`` or ``response`` is missing from ``df``.
    """
    assert 'prompt' in df.columns, "df must contain a 'prompt' column"
    assert 'response' in df.columns, "df must contain a 'response' column"
    model = SBert(model_name)

    def get_cos_sim(prompt: str, response: str) -> float:
        """Return the cosine similarity between the two embedded texts."""
        # .item() unwraps the single similarity value returned by cos_sim.
        return cos_sim(model(prompt), model(response)).item()

    # Pair the two columns directly instead of using a row-wise apply:
    # on a frame with zero rows, apply(axis=1) yields a DataFrame rather than
    # a Series, which cannot be assigned to a single column.
    scores = [
        1 - get_cos_sim(prompt, response)
        for prompt, response in zip(df['prompt'], df['response'])
    ]
    # Reusing df.index keeps the assignment positional; dtype=float keeps the
    # column numeric even when there are no rows.
    df['originality'] = pd.Series(scores, index=df.index, dtype=float)
    return df
|
|
|
|
|
def p1_flexibility(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Score every (id, prompt) group by how similar its responses are.

    Rows are grouped by ``id`` and ``prompt``. Within each group every
    response is embedded with ``SBert(model_name)`` and the group's score is
    the mean cosine similarity over all unordered pairs of responses.

    NOTE(review): unlike ``p0_originality`` the similarity is not inverted
    (no ``1 - ...``), so a higher ``flexibility`` value means more similar
    responses -- confirm this is the intended direction.

    Args:
        df: Frame with at least the columns ``id``, ``prompt`` and
            ``response``. It is not modified.
        model_name: Model identifier forwarded to ``SBert``.

    Returns:
        A new frame with one row per (id, prompt) group and the columns
        ``id``, ``prompt`` and ``flexibility``. Groups with fewer than two
        responses get ``NaN`` because no pair exists to compare.

    Raises:
        AssertionError: If ``prompt``, ``response`` or ``id`` is missing
            from ``df``.
    """
    assert 'prompt' in df.columns, "df must contain a 'prompt' column"
    assert 'response' in df.columns, "df must contain a 'response' column"
    assert 'id' in df.columns, "df must contain an 'id' column"
    model = SBert(model_name)

    def get_cos_sim(responses: pd.Series) -> float:
        """Return the mean cosine similarity over all pairs of responses."""
        vectors = [model(text) for text in responses]
        # Visit each unordered pair exactly once. Cosine similarity is
        # symmetric, so this equals the mean over all ordered pairs, and no
        # pair is weighted more heavily than another.
        similarities = [
            cos_sim(first, second).item()
            for position, first in enumerate(vectors)
            for second in vectors[position + 1:]
        ]
        if not similarities:
            # A single response has no pair; avoid dividing by zero.
            return float('nan')
        return sum(similarities) / len(similarities)

    df_out = (
        df.groupby(by=['id', 'prompt'])
        .agg({'id': 'first', 'prompt': 'first', 'response': get_cos_sim})
        .rename(columns={'response': 'flexibility'})
        .reset_index(drop=True)
    )
    return df_out
|
|
|
|
|
if __name__ == '__main__':
    # Demo run: score the example CSV with both metrics using one model.
    # The resulting frames are only bound to names, not printed or saved.
    _model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
    _df_input = pd.read_csv('data/example_3.csv')

    # p0_originality writes its column into _df_input itself, so the frame
    # passed to p1_flexibility below already carries 'originality'.
    _df_0 = p0_originality(_df_input, _model_name)
    _df_1 = p1_flexibility(_df_input, _model_name)
|
|