Optimise evaluation logic
- .gitignore +1 -0
- README.md +56 -0
- app.py +26 -23
- data/amazon_reviews.json +0 -9
- data/imdb.json +0 -9
- data/imdb_100.csv +0 -0
- data/tweet_eval.json +0 -9
- data/z_animal.csv +0 -11
- scripts/genbit.py +3 -3
- scripts/gender_divide.py +34 -65
- scripts/gender_profession_bias.py +11 -12
.gitignore CHANGED
@@ -5,6 +5,7 @@ flagged/
 check_gender_tagging.py
 *.py[cod]
 *$py.class
+playground.ipynb
 
 # C extensions
 *.so
README.md CHANGED
@@ -11,3 +11,59 @@ license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+{
+    gender: "307",
+    no gender: "193",
+    equal gender: "2",
+    female pg: "0",
+    male pg: "2",
+    female spg: "0",
+    male spg: "300"
+}
+
+{
+    both_gender_prof_match: "94",
+    count_male_pronoun: "738",
+    count_female_pronoun: "435",
+    count_male_pronoun_profession: "63",
+    count_female_pronoun_profession: "44",
+    total_sentence: "5224"
+}
+
+{
+    "genbit_score": 0.7511277214181936,
+    "percentage_of_female_gender_definition_words": 0.28019425675675674,
+    "percentage_of_male_gender_definition_words": 0.39125844594594594,
+    "percentage_of_non_binary_gender_definition_words": 0.3285472972972973,
+    "percentage_of_trans_gender_definition_words": 1,
+    "percentage_of_cis_gender_definition_words": 0,
+    "additional_metrics": {
+        "avg_bias_ratio": 0.30200560886941735,
+        "avg_bias_conditional": 0.24803272381904817,
+        "avg_bias_ratio_absolute": 0.7634929817138464,
+        "avg_bias_conditional_absolute": 0.7511277214181936,
+        "avg_non_binary_bias_ratio": 0.6588525475408009,
+        "avg_non_binary_bias_conditional": 0.5042607755622402,
+        "avg_non_binary_bias_ratio_absolute": 0.8116124669471863,
+        "avg_non_binary_bias_conditional_absolute": 0.7191749038014791,
+        "avg_trans_cis_bias_ratio": -1.4428077555033436,
+        "avg_trans_cis_bias_conditional": -1.1196624683470209,
+        "avg_trans_cis_bias_ratio_absolute": 1.4428077555033436,
+        "avg_trans_cis_bias_conditional_absolute": 1.1683757280093239,
+        "std_dev_bias_ratio": 0,
+        "std_dev_bias_conditional": 0,
+        "std_dev_non_binary_bias_ratio": 0,
+        "std_dev_non_binary_bias_conditional": 0,
+        "std_dev_trans_cis_bias_ratio": 0,
+        "std_dev_trans_cis_bias_conditional": 0
+    },
+    "statistics": {
+        "frequency_cutoff": 7.433655937499999,
+        "num_words_considered": 1539,
+        "freq_of_female_gender_definition_words": 1327,
+        "freq_of_male_gender_definition_words": 1853,
+        "freq_of_non_binary_gender_definition_words": 1556,
+        "jsd": 0.067655503412491
+    }
+}
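The three added blocks appear to be sample outputs of the Space's three methodologies (gender-divide counts, gender-profession counts, and GenBit metrics); note the first two are printed dict-style rather than as strict JSON, so their keys are unquoted. The reworked `evaluate` in app.py below renders such a flat mapping as a two-column table; a minimal sketch of that conversion, reusing the first block's values:

    import pandas as pd

    result_json = {"gender": "307", "no gender": "193", "equal gender": "2"}
    result_df = pd.DataFrame.from_dict(result_json, orient="index").reset_index()
    result_df.columns = ["Metric", "Value"]
    print(result_df)  # one row per metric: ("gender", "307"), ("no gender", "193"), ...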
app.py CHANGED
@@ -9,7 +9,8 @@ from scripts.gender_divide import *
 
 methodologies = json.load(open("config/methodologies.json", "r"))
 
-MAX_THRESHOLD = …
+MAX_THRESHOLD = 5000
+DATASET_CACHE = {}
 
 
 def evaluate(dataset, sampling_method, sampling_size, column, methodology):
@@ -17,7 +18,9 @@ def evaluate(dataset, sampling_method, sampling_size, column, methodology):
         print(
             f"[{dataset.name.split('/')[-1]}::{column}] - {sampling_method} {sampling_size} entries"
         )
-        data = pd.read_csv(dataset.name)
+        data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))[
+            [column]
+        ]
 
         if sampling_method == "First":
             data = data.head(sampling_size)
@@ -26,24 +29,25 @@ def evaluate(dataset, sampling_method, sampling_size, column, methodology):
         elif sampling_method == "Random":
            data = data.sample(n=sampling_size, random_state=42)
 
-        …
-
-        …
+        result_json = globals()[methodologies.get(methodology).get("fx")](data)
+
+        result_df = pd.DataFrame.from_dict(result_json, orient="index").reset_index()
+        result_df.columns = ["Metric", "Value"]
+
+        return gr.Dataframe.update(result_df, visible=True)
     except Exception as e:
         return gr.JSON.update(
-            {
-                "error": f"An error occurred while processing the dataset. Please check the dataset and try again. Error: {e}"
-            },
+            {"error": f"An error occurred while processing the dataset. {e}"},
             visible=True,
         )
 
 
 def display_dataset_config(dataset):
     try:
-        data = pd.read_csv(dataset.name)
+        data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))
 
         columns = data.select_dtypes(include=["object"]).columns.tolist()
-        corpus = data[columns[0]].tolist()
+        corpus = data[columns[0]].tolist()[0:5]
 
         return (
             gr.Radio.update(
@@ -59,7 +63,7 @@ def display_dataset_config(dataset):
                 info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
                 minimum=1,
                 maximum=min(data.shape[0], MAX_THRESHOLD),
-                value=min(data.shape[0], MAX_THRESHOLD)
+                value=min(data.shape[0], MAX_THRESHOLD),
                 visible=True,
                 interactive=True,
             ),
@@ -72,7 +76,7 @@ def display_dataset_config(dataset):
                 interactive=True,
             ),
             gr.DataFrame.update(
-                value=pd.DataFrame({f"…
+                value=pd.DataFrame({f"{columns[0]}": corpus}), visible=True
             ),
         )
     except:
@@ -85,12 +89,10 @@ def display_dataset_config(dataset):
 
 
 def update_column_metadata(dataset, column):
-    data = pd.read_csv(dataset.name)
-    corpus = data[column].tolist()
+    data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))
+    corpus = data[column].tolist()[0:5]
 
-    return gr.Dataframe.update(
-        value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
-    )
+    return gr.Dataframe.update(value=pd.DataFrame({f"{column}": corpus}), visible=True)
 
 
 def get_methodology_metadata(methodology):
@@ -109,7 +111,11 @@ BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
 
 with BiasAware:
     gr.Markdown(
-        "…"
+        """
+        # BiasAware: Dataset Bias Detection
+
+        BiasAware is a specialized tool for detecting and quantifying biases within datasets used for Natural Language Processing (NLP) tasks. NLP training datasets frequently mirror the inherent biases of their source materials, resulting in AI models that unintentionally perpetuate stereotypes, exhibit underrepresentation, and showcase skewed perspectives.
+        """
     )
 
     with gr.Row():
@@ -119,7 +125,7 @@ with BiasAware:
             dataset_file = gr.File(label="Dataset", file_types=["csv"])
             dataset_examples = gr.Examples(
                 [
-                    os.path.join(os.path.dirname(__file__), "data/…"),
+                    os.path.join(os.path.dirname(__file__), "data/imdb_100.csv"),
                     os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
                     os.path.join(os.path.dirname(__file__), "data/z_sentences.csv"),
                 ],
@@ -151,10 +157,7 @@ with BiasAware:
         with gr.Column(scale=4):
             gr.Markdown("## Result")
 
-
-            result = gr.DataFrame(
-                row_count=(5, "fixed"), col_count=(3, "fixed"), visible=False
-            )
+            result = gr.DataFrame(visible=False)
 
         dataset_file.change(
             fn=display_dataset_config,
@@ -188,7 +191,7 @@ with BiasAware:
                 dataset_column,
                 methodology,
             ],
-            outputs=[…],
+            outputs=[result],
         )
 
 BiasAware.launch()
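The recurring `DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))` call memoizes the uploaded CSV per file path, so the three callbacks share one DataFrame instead of re-parsing on every UI event. One caveat worth knowing: `dict.setdefault` evaluates its default argument eagerly, so this form still runs `pd.read_csv` on every call; it guarantees a single cached object, not a single read. A lazy variant of the same pattern (a sketch; `load_dataset` is a hypothetical helper, not part of the app):

    import pandas as pd

    DATASET_CACHE = {}

    def load_dataset(path):
        # Read the CSV only on a cache miss; later calls reuse the cached frame.
        if path not in DATASET_CACHE:
            DATASET_CACHE[path] = pd.read_csv(path)
        return DATASET_CACHE[path]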
data/amazon_reviews.json DELETED
@@ -1,9 +0,0 @@
-{
-    "gender" : 14500,
-    "no gender" : 195500,
-    "equal gender" : 253,
-    "female pg" : 125,
-    "male pg" : 117,
-    "female spg" : 7196,
-    "male spg" : 6809
-}
data/imdb.json DELETED
@@ -1,9 +0,0 @@
-{
-    "gender" : 36174,
-    "no gender" : 13826,
-    "equal gender" : 2160,
-    "female pg" : 2776,
-    "male pg" : 3440,
-    "female spg" : 6918,
-    "male spg" : 20880
-}
data/imdb_100.csv ADDED
The diff for this file is too large to render; see the raw file.
data/tweet_eval.json DELETED
@@ -1,9 +0,0 @@
-{
-    "gender" : 10247,
-    "no gender" : 49652,
-    "equal gender" : 141,
-    "female pg" : 37,
-    "male pg" : 42,
-    "female spg" : 2478,
-    "male spg" : 7549
-}
data/z_animal.csv DELETED
@@ -1,11 +0,0 @@
-AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
-1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
-2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
-3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
-4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
-5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
-6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
-7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
-8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
-9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
-10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
scripts/genbit.py CHANGED
@@ -6,9 +6,9 @@ def eval_genbit(data):
         language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80
     )
 
-    data …
+    data = data[data.columns[0]].to_list()
 
     genbit_metrics.add_data(data, tokenized=False)
-    …
+    genbit_metrics_dict = genbit_metrics.get_metrics(output_word_list=False)
 
-    return …
+    return genbit_metrics_dict
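Assembled from the hunks above, the post-change `eval_genbit` is a straight pass through Microsoft's GenBit API: flatten the first column to a list of sentences, feed it in untokenized, and return the metrics dict. A self-contained sketch (the import path follows the genbit package's documented usage and is an assumption here):

    import pandas as pd
    from genbit.genbit_metrics import GenBitMetrics  # assumed import path

    def eval_genbit(data: pd.DataFrame) -> dict:
        genbit_metrics = GenBitMetrics(
            language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80
        )
        # GenBit expects raw strings, so take the first (text) column as a list.
        sentences = data[data.columns[0]].to_list()
        genbit_metrics.add_data(sentences, tokenized=False)
        return genbit_metrics.get_metrics(output_word_list=False)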
scripts/gender_divide.py CHANGED
@@ -4,78 +4,47 @@ import json
 gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
 
 
-def count_male_terms(text, male_terms):
-    pattern = r"\b({})\b".format("|".join(male_terms))
-    match = re.findall(pattern, str(text))
-    return len(match)
-
-
-def count_female_terms(text, female_terms):
-    pattern = r"\b({})\b".format("|".join(female_terms))
-    match = re.findall(pattern, str(text))
-    return len(match)
+def count_gender_terms(text, gender_terms):
+    pattern = r"\b({})\b".format("|".join(gender_terms))
+    matches = re.findall(pattern, str(text))
+    return len(matches)
 
 
 def get_gender_tag(count_m_term, count_f_term):
-    …
-    if …
-    elif count_m_term == count_f_term:
-        tag = "Equal Gender"
-    …
-        tag = "Male Strongly Positive Gender"
-    …
-        tag = "Female Strongly Positive Gender"
-
-    return …
+    total_terms = count_m_term + count_f_term
+    if total_terms == 0:
+        return "No Gender"
+
+    m_proportion = (count_m_term / total_terms) * 100
+    if m_proportion >= 75:
+        return "Male Strongly Positive Gender"
+    elif m_proportion >= 50:
+        return "Male Positive Gender"
+
+    f_proportion = (count_f_term / total_terms) * 100
+    if f_proportion >= 75:
+        return "Female Strongly Positive Gender"
+    elif f_proportion >= 50:
+        return "Female Positive Gender"
+
+    return "Equal Gender"
 
 
 def get_pg_spg(sample_df):
-    …
-    count_male_spg = sample_df[
-        sample_df["gender_cat"] == "Male Strongly Positive Gender"
-    ]["gender_cat"].count()
-
-    count_female_pg = sample_df[sample_df["gender_cat"] == "Female Positive Gender"][
-        "gender_cat"
-    ].count()
-    count_female_spg = sample_df[
-        sample_df["gender_cat"] == "Female Stronly Positive Gender"
-    ]["gender_cat"].count()
-
-    return {
-        "gender": str(count_gender_sentences),
-        "no gender": str(count_no_gender_sentences),
-        "equal gender": str(count_equal_gender),
-        "female pg": str(count_female_pg),
-        "male pg": str(count_male_pg),
-        "female spg": str(count_female_spg),
-        "male spg": str(count_male_spg),
-    }
+    gender_labels = [
+        "Gender",
+        "No Gender",
+        "Equal Gender",
+        "Female Positive Gender",
+        "Male Positive Gender",
+        "Female Strongly Positive Gender",
+        "Male Strongly Positive Gender",
+    ]
+
+    gender_counts = sample_df["gender_cat"].value_counts()
+    result = {label: str(gender_counts.get(label, 0)) for label in gender_labels}
+
+    return result
 
 
 def eval_gender_divide(data):
@@ -85,10 +54,10 @@ def eval_gender_divide(data):
     data[data.columns[0]] = data[data.columns[0]].str.lower().str.strip()
 
     data["count_male_term"] = data.apply(
-        lambda x: …
+        lambda x: count_gender_terms(x[data.columns[0]], male_terms), axis=1
     )
     data["count_female_term"] = data.apply(
-        lambda x: …
+        lambda x: count_gender_terms(x[data.columns[0]], female_terms), axis=1
     )
 
     data["gender_cat"] = data.apply(
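The rewrite collapses the twin term counters into one regex helper and turns the tag cascade into proportion thresholds: at least 75% male terms yields "Male Strongly Positive Gender", at least 50% "Male Positive Gender", mirrored for female, with "No Gender" for zero hits. The `value_counts` version of `get_pg_spg` also retires the old misspelled "Female Stronly Positive Gender" filter, which could only ever count zero. One edge case remains: the two proportions sum to 100, so a 50/50 tie is caught by the `m_proportion >= 50` branch and the trailing `return "Equal Gender"` is unreachable as written. A variant that keeps ties distinct (a sketch, not the committed code):

    def get_gender_tag(count_m_term, count_f_term):
        total_terms = count_m_term + count_f_term
        if total_terms == 0:
            return "No Gender"
        if count_m_term == count_f_term:
            return "Equal Gender"  # handle ties before the >= thresholds
        m_proportion = (count_m_term / total_terms) * 100
        if m_proportion >= 75:
            return "Male Strongly Positive Gender"
        elif m_proportion > 50:
            return "Male Positive Gender"
        f_proportion = 100 - m_proportion
        if f_proportion >= 75:
            return "Female Strongly Positive Gender"
        return "Female Positive Gender"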
scripts/gender_profession_bias.py CHANGED
@@ -85,20 +85,19 @@ def call_multiprocessing_pool(df_text):
 
 
 def get_statistics(result):
-    conditions = {
-        "both_gender_prof_match": result["Both Match"] …
-        "count_male_pronoun": result["Male Pronoun"] …
-        "count_female_pronoun": result["Female Pronoun"] …
-        "count_male_pronoun_profession": …
-        …
+    stats = {
+        "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
+        "count_male_pronoun": str((result["Male Pronoun"] != "").sum()),
+        "count_female_pronoun": str((result["Female Pronoun"] != "").sum()),
+        "count_male_pronoun_profession": str(
+            ((result["Male Pronoun"] != "") & (result["Profession"] != "")).sum()
+        ),
+        "count_female_pronoun_profession": str(
+            ((result["Female Pronoun"] != "") & (result["Profession"] != "")).sum()
+        ),
+        "total_sentence": str(len(result)),
     }
 
-    stats = {key: str(value.sum()) for key, value in conditions.items()}
-
-    stats["total_sentence"] = str(len(result))
-
     return stats
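The rewrite inlines the boolean masks and stringifies each count directly instead of summing a `conditions` dict in a second pass. A toy illustration of the mask-and-sum idiom (column names from the diff, values invented):

    import pandas as pd

    result = pd.DataFrame({
        "Male Pronoun": ["he", "", "he"],
        "Female Pronoun": ["", "she", ""],
        "Profession": ["doctor", "", "nurse"],
        "Both Match": ["Yes", "No", "No"],
    })

    # Comparisons yield boolean Series; .sum() counts the True entries.
    (result["Male Pronoun"] != "").sum()                                   # 2
    ((result["Male Pronoun"] != "") & (result["Profession"] != "")).sum()  # 2
    (result["Both Match"] == "Yes").sum()                                  # 1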