Spaces:
Runtime error
Runtime error
magnolia-pm
committed on
Commit
•
a746976
1
Parent(s):
ddae422
added scatter plot
Browse files- .gitignore +2 -0
- .streamlit/config.toml +6 -0
- app.py +177 -85
- data.feather +0 -0
- requirements.txt +2 -1
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
tmp.ipynb
|
2 |
+
__pycache__
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
primaryColor="#4361ee"
|
3 |
+
backgroundColor="#FFFFFF"
|
4 |
+
secondaryBackgroundColor="#F0F2F6"
|
5 |
+
textColor="#262730"
|
6 |
+
font="sans serif"
|
app.py
CHANGED
@@ -1,10 +1,13 @@
|
|
1 |
import os
|
2 |
import torch
|
|
|
3 |
import streamlit as st
|
|
|
4 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
5 |
from transformers import pipeline
|
6 |
from plotly.subplots import make_subplots
|
7 |
import plotly.graph_objects as go
|
|
|
8 |
|
9 |
|
10 |
def z_score(y, mean=.04853076, sd=.9409466):
|
@@ -42,103 +45,192 @@ def indicator_plot(value, title, value_range, domain):
|
|
42 |
|
43 |
return plot
|
44 |
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
# NLP for Item Desirability Ratings
|
47 |
-
This web application accompanies the paper *
|
48 |
-
A Machine-Based Alternative to Human Judges* submitted to the Journal *Personality and Individual Differences*.
|
49 |
|
50 |
## What is this research about?
|
51 |
Researchers use personality scales to measure people's traits and behaviors, but biases can affect the accuracy of these scales.
|
52 |
Socially desirable responding is a common bias that can skew results. To overcome this, researchers gather item desirability ratings, e.g., to ensure that questions are neutral.
|
53 |
Recently, advancements in natural language processing have made it possible to use machines to estimate social desirability ratings,
|
54 |
-
which can provide a viable alternative to human ratings and help researchers, scale developers, and practitioners improve the accuracy of personality scales.
|
|
|
55 |
|
|
|
|
|
56 |
## Try it yourself!
|
57 |
-
Use the text field below to enter a statement that might be part of a psychological questionnaire (e.g., "I love a good fight.").
|
58 |
-
The left dial
|
59 |
-
The right dial indicates sentiment (i.e., valence) as estimated by regular sentiment analysis (using the `cardiffnlp/twitter-xlm-roberta-base-sentiment` model).
|
60 |
-
"""
|
61 |
-
|
62 |
-
st.markdown(body)
|
63 |
-
|
64 |
-
input_text = st.text_input(
|
65 |
-
label='Estimate item desirability:',
|
66 |
-
value='I love a good fight.',
|
67 |
-
placeholder='Enter item'
|
68 |
-
)
|
69 |
-
|
70 |
-
# desirability model
|
71 |
-
# remote or local?
|
72 |
-
if os.environ.get("item-desirability"):
|
73 |
-
model_path = 'magnolia-psychometrics/item-desirability'
|
74 |
-
else:
|
75 |
-
model_path = '/nlp/nlp/models/finetuned/twitter-xlm-roberta-base-regressive-desirability-ft-4'
|
76 |
-
|
77 |
-
auth_token = os.environ.get("item-desirability") or True
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
pretrained_model_name_or_path=model_path,
|
82 |
-
use_fast=True,
|
83 |
-
use_auth_token=auth_token
|
84 |
-
)
|
85 |
-
|
86 |
-
if 'model' not in globals():
|
87 |
-
model = AutoModelForSequenceClassification.from_pretrained(
|
88 |
-
pretrained_model_name_or_path=model_path,
|
89 |
-
num_labels=1,
|
90 |
-
ignore_mismatched_sizes=True,
|
91 |
-
use_auth_token=auth_token
|
92 |
-
)
|
93 |
-
|
94 |
-
# sentiment classifier
|
95 |
-
if 'classifier' not in globals():
|
96 |
-
sentiment_model = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'
|
97 |
-
classifier = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_model, use_fast=False, top_k=3)
|
98 |
-
|
99 |
-
classifier_output = classifier(input_text)
|
100 |
-
classifier_output_dict = {x['label']: x['score'] for x in classifier_output[0]}
|
101 |
-
classifier_score = classifier_output_dict['positive'] - classifier_output_dict['negative']
|
102 |
-
|
103 |
-
if input_text:
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
)
|
117 |
|
118 |
-
|
119 |
-
value=classifier_score,
|
120 |
-
title=f"Item Sentiment",
|
121 |
-
value_range=[-1, 1],
|
122 |
-
domain={'x': [.55, 1], 'y': [0, 1]}
|
123 |
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
-
|
142 |
-
"""
|
143 |
|
144 |
-
|
|
|
1 |
import os
|
2 |
import torch
|
3 |
+
import dash
|
4 |
import streamlit as st
|
5 |
+
import pandas as pd
|
6 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
7 |
from transformers import pipeline
|
8 |
from plotly.subplots import make_subplots
|
9 |
import plotly.graph_objects as go
|
10 |
+
import plotly.express as px
|
11 |
|
12 |
|
13 |
def z_score(y, mean=.04853076, sd=.9409466):
|
|
|
45 |
|
46 |
return plot
|
47 |
|
48 |
+
def scatter_plot(df, group_var):
|
49 |
+
|
50 |
+
colors = ['#36def1', '#4361ee'] if group_var else ['#4361ee']
|
51 |
+
|
52 |
+
plot = px.scatter(
|
53 |
+
df,
|
54 |
+
x='Machine-ratings',
|
55 |
+
y='Human-ratings',
|
56 |
+
color=group_var,
|
57 |
+
facet_col='x_group',
|
58 |
+
facet_col_wrap=2,
|
59 |
+
trendline='ols',
|
60 |
+
trendline_scope='trace',
|
61 |
+
hover_data={
|
62 |
+
'Text': df.text,
|
63 |
+
'Language': False,
|
64 |
+
'x_group': False,
|
65 |
+
'Human-ratings': ':.2f',
|
66 |
+
'Machine-ratings': ':.2f',
|
67 |
+
'Study': df.study,
|
68 |
+
'Instrument': df.instrument,
|
69 |
+
},
|
70 |
+
width=400,
|
71 |
+
height=400,
|
72 |
+
color_discrete_sequence=colors
|
73 |
+
)
|
74 |
+
|
75 |
+
plot.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
|
76 |
+
plot.update_layout(
|
77 |
+
legend={
|
78 |
+
'orientation':'h',
|
79 |
+
'yanchor': 'bottom',
|
80 |
+
'y': -.30
|
81 |
+
})
|
82 |
+
plot.update_xaxes(title_standoff = 0)
|
83 |
+
|
84 |
+
return plot
|
85 |
+
|
86 |
+
# data import and wrangling
|
87 |
+
covariate_columns = {
|
88 |
+
'content_domain': 'Content Domain',
|
89 |
+
'language': 'Language',
|
90 |
+
'rater_group': 'Rater Group',
|
91 |
+
}
|
92 |
+
|
93 |
+
df = (
|
94 |
+
pd
|
95 |
+
.read_feather(path='data.feather').query('partition == "test" | partition == "dev"')
|
96 |
+
.melt(
|
97 |
+
value_vars=['sentiment_model', 'desirability_model'],
|
98 |
+
var_name='x_group',
|
99 |
+
value_name='x',
|
100 |
+
id_vars=['mean_z', 'text', 'content_domain', 'language', 'rater_group', 'study', 'instrument']
|
101 |
+
)
|
102 |
+
.replace(
|
103 |
+
to_replace={
|
104 |
+
'en': 'English',
|
105 |
+
'de': 'German',
|
106 |
+
'other': 'Other',
|
107 |
+
'personality': 'Personality',
|
108 |
+
'laypeople': 'Laypeople',
|
109 |
+
'students': 'Students',
|
110 |
+
'sentiment_model': 'Sentiment Model',
|
111 |
+
'desirability_model': 'Desirability Model'
|
112 |
+
}
|
113 |
+
)
|
114 |
+
.rename(columns=covariate_columns)
|
115 |
+
.rename(
|
116 |
+
columns={
|
117 |
+
'mean_z': 'Human-ratings',
|
118 |
+
'x': 'Machine-ratings',
|
119 |
+
}
|
120 |
+
)
|
121 |
+
)
|
122 |
+
|
123 |
+
st.markdown("""
|
124 |
# NLP for Item Desirability Ratings
|
125 |
+
This web application accompanies the paper "*Expanding the Methodological Toolbox: Machine-Based Item Desirability Ratings as an Alternative to Human-Based Ratings*".
|
|
|
126 |
|
127 |
## What is this research about?
|
128 |
Researchers use personality scales to measure people's traits and behaviors, but biases can affect the accuracy of these scales.
|
129 |
Socially desirable responding is a common bias that can skew results. To overcome this, researchers gather item desirability ratings, e.g., to ensure that questions are neutral.
|
130 |
Recently, advancements in natural language processing have made it possible to use machines to estimate social desirability ratings,
|
131 |
+
which can provide a viable alternative to human ratings and help researchers, scale developers, and practitioners improve the accuracy of personality scales.
|
132 |
+
""")
|
133 |
|
134 |
+
# demo
|
135 |
+
st.markdown("""
|
136 |
## Try it yourself!
|
137 |
+
Use the text field below to enter a statement that might be part of a psychological questionnaire (e.g., "I love a good fight.").
|
138 |
+
The left dial indicates how socially desirable it might be to endorse this item.
|
139 |
+
The right dial indicates sentiment (i.e., valence) as estimated by regular sentiment analysis (using the `cardiffnlp/twitter-xlm-roberta-base-sentiment` model).
|
140 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
+
## desirability model
|
143 |
+
with st.spinner('Processing...'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
+
if os.environ.get('item-desirability'):
|
146 |
+
model_path = 'magnolia-psychometrics/item-desirability'
|
147 |
+
else:
|
148 |
+
model_path = '/nlp/nlp/models/finetuned/twitter-xlm-roberta-base-regressive-desirability-ft-4'
|
149 |
+
|
150 |
+
auth_token = os.environ.get('item-desirability') or True
|
151 |
+
|
152 |
+
if 'tokenizer' not in globals():
|
153 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
154 |
+
pretrained_model_name_or_path=model_path,
|
155 |
+
use_fast=True,
|
156 |
+
use_auth_token=auth_token
|
157 |
+
)
|
158 |
+
|
159 |
+
if 'model' not in globals():
|
160 |
+
model = AutoModelForSequenceClassification.from_pretrained(
|
161 |
+
pretrained_model_name_or_path=model_path,
|
162 |
+
num_labels=1,
|
163 |
+
ignore_mismatched_sizes=True,
|
164 |
+
use_auth_token=auth_token
|
165 |
+
)
|
166 |
+
|
167 |
+
## sentiment model
|
168 |
+
if 'classifier' not in globals():
|
169 |
+
sentiment_model = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'
|
170 |
+
classifier = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_model, use_fast=False, top_k=3)
|
171 |
+
|
172 |
+
input_text = st.text_input(
|
173 |
+
label='Estimate item desirability:',
|
174 |
+
value='I love a good fight.',
|
175 |
+
placeholder='Enter item text'
|
176 |
)
|
177 |
|
178 |
+
if input_text:
|
|
|
|
|
|
|
|
|
179 |
|
180 |
+
classifier_output = classifier(input_text)
|
181 |
+
classifier_output_dict = {x['label']: x['score'] for x in classifier_output[0]}
|
182 |
+
classifier_score = classifier_output_dict['positive'] - classifier_output_dict['negative']
|
183 |
+
|
184 |
+
inputs = tokenizer(input_text, padding=True, return_tensors='pt')
|
185 |
+
|
186 |
+
with torch.no_grad():
|
187 |
+
score = model(**inputs).logits.squeeze().tolist()
|
188 |
+
z = z_score(score)
|
189 |
+
|
190 |
+
p1 = indicator_plot(
|
191 |
+
value=classifier_score,
|
192 |
+
title=f'Item Sentiment',
|
193 |
+
value_range=[-1, 1],
|
194 |
+
domain={'x': [.55, 1], 'y': [0, 1]}
|
195 |
+
)
|
196 |
+
|
197 |
+
p2 = indicator_plot(
|
198 |
+
value=z,
|
199 |
+
title=f'Item Desirability',
|
200 |
+
value_range=[-4, 4],
|
201 |
+
domain={'x': [0, .45], 'y': [0, 1]},
|
202 |
+
)
|
203 |
+
|
204 |
+
fig = go.Figure()
|
205 |
+
fig.add_trace(p1)
|
206 |
+
fig.add_trace(p2)
|
207 |
+
|
208 |
+
fig.update_layout(
|
209 |
+
title=dict(text=f'"{input_text}"', font=dict(size=36),yref='paper'),
|
210 |
+
paper_bgcolor = "white",
|
211 |
+
font = {'color': "black", 'family': "Arial"})
|
212 |
+
|
213 |
+
st.plotly_chart(fig, theme=None, use_container_width=True)
|
214 |
+
|
215 |
+
st.markdown("""
|
216 |
+
Item sentiment: Raw difference between the positive and negative sentiment scores.
|
217 |
+
Item desirability: z-transformed values, 0 indicates "neutral".
|
218 |
+
""")
|
219 |
+
|
220 |
+
## plot
|
221 |
+
st.markdown("""
|
222 |
+
## Explore the data
|
223 |
+
Figures show the accuracy of predictions of human-rated item desirability by the sentiment model (left) and the desirability model (right), using `test`-partition data only.
|
224 |
+
""")
|
225 |
+
|
226 |
+
|
227 |
+
show_covariates = st.checkbox('Show covariates', value=True)
|
228 |
+
|
229 |
+
if show_covariates:
|
230 |
+
option = st.selectbox('Group by', options=list(covariate_columns.values()))
|
231 |
+
else:
|
232 |
+
option = None
|
233 |
|
234 |
+
plot = scatter_plot(df, option)
|
|
|
235 |
|
236 |
+
st.plotly_chart(plot, theme=None, use_container_width=True)
|
data.feather
ADDED
Binary file (557 kB). View file
|
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
torch
|
2 |
transformers
|
3 |
plotly
|
4 |
-
|
|
|
|
1 |
torch
|
2 |
transformers
|
3 |
plotly
|
4 |
+
dash
|
5 |
+
statsmodels
|