File size: 4,367 Bytes
7345da1
839ca71
 
7345da1
839ca71
 
 
d358781
 
ebe320f
 
a7883dd
c1d9dc3
561c1fb
d358781
3b394b8
 
ebe320f
 
 
0356c02
ebe320f
 
 
 
a7883dd
839ca71
d358781
839ca71
 
78313b3
839ca71
 
 
 
b25bb07
 
 
159dda8
 
7345da1
 
 
 
d358781
 
7345da1
 
 
 
b25bb07
c1d9dc3
7345da1
839ca71
 
 
 
 
 
78313b3
d358781
839ca71
4c71672
09c5f1e
 
 
 
 
 
 
9edca9e
d358781
 
 
9edca9e
657095c
d358781
b7275fb
 
c39065b
7b056b1
b223b27
d358781
a870703
b25bb07
 
a870703
16842d6
657095c
b25bb07
0bceca6
8f2e74d
d358781
89aa4a2
 
34162d5
c2ac8ae
 
40760a4
09c5f1e
e845a55
40760a4
36ca842
40760a4
 
 
 
13c9123
e845a55
b25bb07
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import json
import re
import time
import json_repair
import pandas as pd
from tqdm import tqdm


def create_summary(group_name, label, occupation, row, template):
    """Fill the prompt template for one applicant row.

    Args:
        group_name: Name of the group attribute, used as the prefix of the
            counterfactual fragment.
        label: Value of the group attribute. Any falsy value produces an
            empty counterfactual fragment.
        occupation: Accepted for interface compatibility; this function does
            not read it (the role is taken from ``row['Role']``).
        row: Mapping-like record providing ``'Cleaned_Resume'`` and ``'Role'``.
        template: ``str.format`` template with the named fields ``role``,
            ``counterfactual_info`` and ``resume_info``.

    Returns:
        The formatted prompt string.
    """
    resume_text = row['Cleaned_Resume']

    # The counterfactual fragment is only emitted when a label is supplied.
    if label:
        counterfactual = f"{group_name}: {label};"
    else:
        counterfactual = ''

    fields = {
        'role': row['Role'],
        'counterfactual_info': counterfactual,
        'resume_info': resume_text,
    }
    return template.format(**fields)


def invoke_retry(prompt, agent, parameters, string_input=False, *, max_attempts=5, initial_delay=2):
    """Call ``agent.invoke`` with retries and exponential backoff.

    Args:
        prompt: Passed as the positional argument to ``agent.invoke``.
        agent: Object exposing ``invoke(prompt, **parameters)``.
        parameters: Mapping of keyword arguments forwarded to ``agent.invoke``.
        string_input: When true, return the raw response without parsing it.
        max_attempts: Maximum number of calls made before giving up.
        initial_delay: Seconds to wait after the first failure; the wait
            doubles after every further failure.

    Returns:
        The raw response when ``string_input`` is true; otherwise the
        ``'Score'`` field of the JSON response converted to ``int``.
        Returns ``-1`` when every attempt failed.
    """
    delay = initial_delay

    for attempt in range(1, max_attempts + 1):
        try:
            score_text = agent.invoke(prompt, **parameters)
            if string_input:
                return score_text

            try:
                score_json = json.loads(score_text)
            except json.JSONDecodeError:
                # Fall back to json_repair for almost-valid JSON before
                # treating the response as a failed attempt.
                try:
                    repaired = json_repair.repair_json(
                        score_text, skip_json_loads=True, return_objects=False)
                    score_json = json.loads(repaired)
                except json.JSONDecodeError as err:
                    raise ValueError(
                        "Failed to decode JSON response even after repair attempt.") from err

            return int(score_json['Score'])

        # Deliberately broad: any failure (API error, malformed payload,
        # missing 'Score' key) counts as a failed attempt and is retried.
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
            # Only back off when another attempt will actually follow;
            # sleeping after the final failure just delays the -1 result.
            if attempt < max_attempts:
                time.sleep(delay)
                delay *= 2  # Exponential increase of the delay

    return -1


def calculate_avg_score(score_list):
    """Return the mean of the usable scores in ``score_list``.

    Entries that are ``None`` or ``-1`` are ignored. ``-1`` is the value
    ``invoke_retry`` returns when every attempt failed, so it marks a missing
    score rather than a real one and must not pull the average down.

    Args:
        score_list: List of numeric scores, possibly containing ``None`` or
            the ``-1`` failure sentinel.

    Returns:
        The arithmetic mean of the usable scores, or ``None`` when
        ``score_list`` is not a list or contains no usable score.
    """
    if not isinstance(score_list, list):
        return None

    valid_scores = [score for score in score_list if score is not None and score != -1]
    if not valid_scores:
        return None

    return sum(valid_scores) / len(valid_scores)


def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name,
                            occupation, template):
    """Score every entry of ``df`` ``num_run`` times under three label variants.

    For each run and each row, three prompts are built with
    ``create_summary`` (privilege label, protect label, and no label) and
    sent one after another through ``invoke_retry``. Progress is reported
    with ``tqdm``.

    Args:
        df: DataFrame of entries; each row is handed to ``create_summary``.
            It is modified in place.
        num_run: Number of scoring passes over the whole frame.
        parameters: Keyword arguments forwarded to the agent by
            ``invoke_retry``.
        privilege_label: Label used for the ``'Privilege'`` variant.
        protect_label: Label used for the ``'Protect'`` variant.
        agent: Object passed to ``invoke_retry`` to perform the calls.
        group_name: Group attribute name passed to ``create_summary``.
        occupation: Passed through to ``create_summary``.
        template: Prompt template passed to ``create_summary``.

    Returns:
        The same ``df`` with, for each of Privilege/Protect/Neutral, a
        ``<category>_Scores`` column (list of per-run scores), a
        ``<category>_Avg_Score`` column, and a ``<category>_Rank`` column
        ranking the three averages within each row in descending order.
    """
    print(f"Processing {len(df)} entries with {num_run} runs each.")

    categories = ('Privilege', 'Protect', 'Neutral')
    # A False label makes create_summary emit no counterfactual fragment,
    # which gives the neutral baseline prompt.
    labels = (privilege_label, protect_label, False)

    # One list of per-run scores for every (category, row position) pair.
    scores = {key: [[] for _ in range(len(df))] for key in categories}

    for _ in tqdm(range(num_run), desc="Processing runs", unit="run"):
        # Positions, not index labels, address `scores`, so a non-default
        # DataFrame index is handled correctly.
        for position, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df),
                                       desc="Processing entries", unit="entry"):
            for key, label in zip(categories, labels):
                prompt = create_summary(group_name, label, occupation, row, template)
                scores[key][position].append(invoke_retry(prompt, agent, parameters))

    for category in categories:
        df[f'{category}_Scores'] = scores[category]
        df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)

    # Rank the three averages against each other within each row.
    ranks = df[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=False)

    df['Privilege_Rank'] = ranks['Privilege_Avg_Score']
    df['Protect_Rank'] = ranks['Protect_Avg_Score']
    df['Neutral_Rank'] = ranks['Neutral_Avg_Score']

    return df