ProfessorLeVesseur committed on
Commit
c60d2d1
·
verified ·
1 Parent(s): 978b3e6

Update data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +201 -101
data_processor.py CHANGED
@@ -1,7 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import os
3
  import re
4
  from huggingface_hub import InferenceClient
 
5
 
6
  class DataProcessor:
7
  INTERVENTION_COLUMN = 'Did the intervention happen today?'
@@ -9,11 +179,12 @@ class DataProcessor:
9
  PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
10
  NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'
11
 
12
- def __init__(self):
13
  self.hf_api_key = os.getenv('HF_API_KEY')
14
  if not self.hf_api_key:
15
  raise ValueError("HF_API_KEY not set in environment variables")
16
  self.client = InferenceClient(api_key=self.hf_api_key)
 
17
 
18
  def read_excel(self, uploaded_file):
19
  return pd.read_excel(uploaded_file)
@@ -23,15 +194,11 @@ class DataProcessor:
23
  df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
24
  df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
25
  df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
26
- df = df[['Date of Session', 'Timestamp'] + [col for col in df.columns if col not in ['Date of Session', 'Timestamp']]]
27
  return df
28
 
29
  def safe_convert_to_time(self, series, format_str='%I:%M %p'):
30
  try:
31
- converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
32
- if format_str:
33
- return converted.dt.strftime(format_str)
34
- return converted
35
  except Exception as e:
36
  print(f"Error converting series to time: {e}")
37
  return series
@@ -53,11 +220,7 @@ class DataProcessor:
53
  match = re.match(r'Student Attendance \[(.+?)\]', col)
54
  if match:
55
  name = match.group(1)
56
- name_parts = name.split()
57
- if len(name_parts) == 1:
58
- initials = name_parts[0][0]
59
- else:
60
- initials = ''.join([part[0] for part in name_parts])
61
  updated_columns.append(f'Student Attendance [{initials}]')
62
  else:
63
  updated_columns.append(col)
@@ -69,98 +232,35 @@ class DataProcessor:
69
  def compute_intervention_statistics(self, df):
70
  total_days = len(df)
71
  sessions_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('yes').sum()
72
- sessions_not_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('no').sum()
73
  intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
74
- intervention_frequency = round(intervention_frequency, 0)
75
-
76
- stats = {
77
- 'Intervention Frequency (%)': [intervention_frequency],
78
  'Intervention Sessions Held': [sessions_held],
79
- 'Intervention Sessions Not Held': [sessions_not_held],
80
  'Total Number of Days Available': [total_days]
81
- }
82
- return pd.DataFrame(stats)
83
-
84
- def compute_student_metrics(self, df):
85
- intervention_df = df[df[self.INTERVENTION_COLUMN].str.strip().str.lower() == 'yes']
86
- intervention_sessions_held = len(intervention_df)
87
- student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
88
-
89
- student_metrics = {}
90
- for col in student_columns:
91
- student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
92
- student_data = intervention_df[[col]].copy()
93
- student_data[col] = student_data[col].fillna('Absent')
94
-
95
- attendance_values = student_data[col].apply(lambda x: 1 if x in [
96
- self.ENGAGED_STR,
97
- self.PARTIALLY_ENGAGED_STR,
98
- self.NOT_ENGAGED_STR
99
- ] else 0)
100
-
101
- sessions_attended = attendance_values.sum()
102
- attendance_pct = (sessions_attended / intervention_sessions_held) * 100 if intervention_sessions_held > 0 else 0
103
- attendance_pct = round(attendance_pct)
104
-
105
- engagement_counts = {
106
- 'Engaged': 0,
107
- 'Partially Engaged': 0,
108
- 'Not Engaged': 0,
109
- 'Absent': 0
110
- }
111
-
112
- for x in student_data[col]:
113
- if x == self.ENGAGED_STR:
114
- engagement_counts['Engaged'] += 1
115
- elif x == self.PARTIALLY_ENGAGED_STR:
116
- engagement_counts['Partially Engaged'] += 1
117
- elif x == self.NOT_ENGAGED_STR:
118
- engagement_counts['Not Engaged'] += 1
119
- else:
120
- engagement_counts['Absent'] += 1 # Count as Absent if not engaged
121
 
122
- # Calculate percentages for engagement states
123
- total_sessions = sum(engagement_counts.values())
124
-
125
- # Engagement (%)
126
- engagement_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
127
- engagement_pct = round(engagement_pct)
128
-
129
- engaged_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
130
- engaged_pct = round(engaged_pct)
131
-
132
- partially_engaged_pct = (engagement_counts['Partially Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
133
- partially_engaged_pct = round(partially_engaged_pct)
134
-
135
- not_engaged_pct = (engagement_counts['Not Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
136
- not_engaged_pct = round(not_engaged_pct)
137
 
138
- absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
139
- absent_pct = round(absent_pct)
140
-
141
- # Store metrics in the required order
142
- student_metrics[student_name] = {
143
- 'Attendance (%)': attendance_pct,
144
- 'Attendance #': sessions_attended, # Raw number of sessions attended
145
- 'Engagement (%)': engagement_pct,
146
- 'Engaged (%)': engaged_pct,
147
- 'Partially Engaged (%)': partially_engaged_pct,
148
- 'Not Engaged (%)': not_engaged_pct,
149
- 'Absent (%)': absent_pct
150
- }
151
-
152
- # Create a DataFrame from student_metrics
153
- student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
154
- student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
155
- return student_metrics_df
156
-
157
- def compute_average_metrics(self, student_metrics_df):
158
- # Calculate the attendance and engagement average percentages across students
159
- attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Calculate the average attendance percentage
160
- engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Calculate the average engagement percentage
161
-
162
- # Round the averages to make them whole numbers
163
- attendance_avg_stats = round(attendance_avg_stats)
164
- engagement_avg_stats = round(engagement_avg_stats)
165
-
166
- return attendance_avg_stats, engagement_avg_stats
 
1
+ # import pandas as pd
2
+ # import os
3
+ # import re
4
+ # from huggingface_hub import InferenceClient
5
+
6
+ # class DataProcessor:
7
+ # INTERVENTION_COLUMN = 'Did the intervention happen today?'
8
+ # ENGAGED_STR = 'Engaged (Respect, Responsibility, Effort)'
9
+ # PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
10
+ # NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'
11
+
12
+ # def __init__(self):
13
+ # self.hf_api_key = os.getenv('HF_API_KEY')
14
+ # if not self.hf_api_key:
15
+ # raise ValueError("HF_API_KEY not set in environment variables")
16
+ # self.client = InferenceClient(api_key=self.hf_api_key)
17
+
18
+ # def read_excel(self, uploaded_file):
19
+ # return pd.read_excel(uploaded_file)
20
+
21
+ # def format_session_data(self, df):
22
+ # df['Date of Session'] = self.safe_convert_to_datetime(df['Date of Session'], '%m/%d/%Y')
23
+ # df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
24
+ # df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
25
+ # df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
26
+ # df = df[['Date of Session', 'Timestamp'] + [col for col in df.columns if col not in ['Date of Session', 'Timestamp']]]
27
+ # return df
28
+
29
+ # def safe_convert_to_time(self, series, format_str='%I:%M %p'):
30
+ # try:
31
+ # converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
32
+ # if format_str:
33
+ # return converted.dt.strftime(format_str)
34
+ # return converted
35
+ # except Exception as e:
36
+ # print(f"Error converting series to time: {e}")
37
+ # return series
38
+
39
+ # def safe_convert_to_datetime(self, series, format_str=None):
40
+ # try:
41
+ # converted = pd.to_datetime(series, errors='coerce')
42
+ # if format_str:
43
+ # return converted.dt.strftime(format_str)
44
+ # return converted
45
+ # except Exception as e:
46
+ # print(f"Error converting series to datetime: {e}")
47
+ # return series
48
+
49
+ # def replace_student_names_with_initials(self, df):
50
+ # updated_columns = []
51
+ # for col in df.columns:
52
+ # if col.startswith('Student Attendance'):
53
+ # match = re.match(r'Student Attendance \[(.+?)\]', col)
54
+ # if match:
55
+ # name = match.group(1)
56
+ # name_parts = name.split()
57
+ # if len(name_parts) == 1:
58
+ # initials = name_parts[0][0]
59
+ # else:
60
+ # initials = ''.join([part[0] for part in name_parts])
61
+ # updated_columns.append(f'Student Attendance [{initials}]')
62
+ # else:
63
+ # updated_columns.append(col)
64
+ # else:
65
+ # updated_columns.append(col)
66
+ # df.columns = updated_columns
67
+ # return df
68
+
69
+ # def compute_intervention_statistics(self, df):
70
+ # total_days = len(df)
71
+ # sessions_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('yes').sum()
72
+ # sessions_not_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('no').sum()
73
+ # intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
74
+ # intervention_frequency = round(intervention_frequency, 0)
75
+
76
+ # stats = {
77
+ # 'Intervention Frequency (%)': [intervention_frequency],
78
+ # 'Intervention Sessions Held': [sessions_held],
79
+ # 'Intervention Sessions Not Held': [sessions_not_held],
80
+ # 'Total Number of Days Available': [total_days]
81
+ # }
82
+ # return pd.DataFrame(stats)
83
+
84
+ # def compute_student_metrics(self, df):
85
+ # intervention_df = df[df[self.INTERVENTION_COLUMN].str.strip().str.lower() == 'yes']
86
+ # intervention_sessions_held = len(intervention_df)
87
+ # student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
88
+
89
+ # student_metrics = {}
90
+ # for col in student_columns:
91
+ # student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
92
+ # student_data = intervention_df[[col]].copy()
93
+ # student_data[col] = student_data[col].fillna('Absent')
94
+
95
+ # attendance_values = student_data[col].apply(lambda x: 1 if x in [
96
+ # self.ENGAGED_STR,
97
+ # self.PARTIALLY_ENGAGED_STR,
98
+ # self.NOT_ENGAGED_STR
99
+ # ] else 0)
100
+
101
+ # sessions_attended = attendance_values.sum()
102
+ # attendance_pct = (sessions_attended / intervention_sessions_held) * 100 if intervention_sessions_held > 0 else 0
103
+ # attendance_pct = round(attendance_pct)
104
+
105
+ # engagement_counts = {
106
+ # 'Engaged': 0,
107
+ # 'Partially Engaged': 0,
108
+ # 'Not Engaged': 0,
109
+ # 'Absent': 0
110
+ # }
111
+
112
+ # for x in student_data[col]:
113
+ # if x == self.ENGAGED_STR:
114
+ # engagement_counts['Engaged'] += 1
115
+ # elif x == self.PARTIALLY_ENGAGED_STR:
116
+ # engagement_counts['Partially Engaged'] += 1
117
+ # elif x == self.NOT_ENGAGED_STR:
118
+ # engagement_counts['Not Engaged'] += 1
119
+ # else:
120
+ # engagement_counts['Absent'] += 1 # Count as Absent if not engaged
121
+
122
+ # # Calculate percentages for engagement states
123
+ # total_sessions = sum(engagement_counts.values())
124
+
125
+ # # Engagement (%)
126
+ # engagement_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
127
+ # engagement_pct = round(engagement_pct)
128
+
129
+ # engaged_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
130
+ # engaged_pct = round(engaged_pct)
131
+
132
+ # partially_engaged_pct = (engagement_counts['Partially Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
133
+ # partially_engaged_pct = round(partially_engaged_pct)
134
+
135
+ # not_engaged_pct = (engagement_counts['Not Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
136
+ # not_engaged_pct = round(not_engaged_pct)
137
+
138
+ # absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
139
+ # absent_pct = round(absent_pct)
140
+
141
+ # # Store metrics in the required order
142
+ # student_metrics[student_name] = {
143
+ # 'Attendance (%)': attendance_pct,
144
+ # 'Attendance #': sessions_attended, # Raw number of sessions attended
145
+ # 'Engagement (%)': engagement_pct,
146
+ # 'Engaged (%)': engaged_pct,
147
+ # 'Partially Engaged (%)': partially_engaged_pct,
148
+ # 'Not Engaged (%)': not_engaged_pct,
149
+ # 'Absent (%)': absent_pct
150
+ # }
151
+
152
+ # # Create a DataFrame from student_metrics
153
+ # student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
154
+ # student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
155
+ # return student_metrics_df
156
+
157
+ # def compute_average_metrics(self, student_metrics_df):
158
+ # # Calculate the attendance and engagement average percentages across students
159
+ # attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Calculate the average attendance percentage
160
+ # engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Calculate the average engagement percentage
161
+
162
+ # # Round the averages to make them whole numbers
163
+ # attendance_avg_stats = round(attendance_avg_stats)
164
+ # engagement_avg_stats = round(engagement_avg_stats)
165
+
166
+ # return attendance_avg_stats, engagement_avg_stats
167
+
168
+
169
+
170
  import pandas as pd
171
  import os
172
  import re
173
  from huggingface_hub import InferenceClient
174
+ from graphviz import Digraph
175
 
176
  class DataProcessor:
177
  INTERVENTION_COLUMN = 'Did the intervention happen today?'
 
179
  PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
180
  NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'
181
 
182
+ def __init__(self, student_metrics_df=None):
183
  self.hf_api_key = os.getenv('HF_API_KEY')
184
  if not self.hf_api_key:
185
  raise ValueError("HF_API_KEY not set in environment variables")
186
  self.client = InferenceClient(api_key=self.hf_api_key)
187
+ self.student_metrics_df = student_metrics_df
188
 
189
  def read_excel(self, uploaded_file):
190
  return pd.read_excel(uploaded_file)
 
194
  df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
195
  df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
196
  df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
 
197
  return df
198
 
199
  def safe_convert_to_time(self, series, format_str='%I:%M %p'):
200
  try:
201
+ return pd.to_datetime(series, format=format_str, errors='coerce')
 
 
 
202
  except Exception as e:
203
  print(f"Error converting series to time: {e}")
204
  return series
 
220
  match = re.match(r'Student Attendance \[(.+?)\]', col)
221
  if match:
222
  name = match.group(1)
223
+ initials = ''.join([part[0] for part in name.split()])
 
 
 
 
224
  updated_columns.append(f'Student Attendance [{initials}]')
225
  else:
226
  updated_columns.append(col)
 
232
  def compute_intervention_statistics(self, df):
233
  total_days = len(df)
234
  sessions_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('yes').sum()
 
235
  intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
236
+ return pd.DataFrame({
237
+ 'Intervention Frequency (%)': [round(intervention_frequency, 0)],
 
 
238
  'Intervention Sessions Held': [sessions_held],
239
+ 'Intervention Sessions Not Held': [total_days - sessions_held],
240
  'Total Number of Days Available': [total_days]
241
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
+ def compute_student_metrics(self):
244
+ # Add metrics processing logic here
245
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
def evaluate_student(self, row, attendance_threshold=90, engagement_threshold=80):
    """Return the recommended next step for a single student.

    Args:
        row: Mapping-like record (e.g. a dict or a pandas Series) holding
            "Yes"/"No" flags. The attendance flag is read from
            "Attended ≥ 90%" and the engagement flag from "Engagement ≥ 80%".
            The spellings without the "≥" sign ("Attended 90%",
            "Engagement 80%") are accepted as fallbacks, because these
            column labels have been written both ways.
        attendance_threshold: Kept for interface compatibility. It is not
            consulted here; the decision relies on the precomputed flags.
        engagement_threshold: Kept for interface compatibility. It is not
            consulted here; the decision relies on the precomputed flags.

    Returns:
        "Address Attendance" when the attendance flag is "No",
        "Address Engagement" when the engagement flag is "No",
        otherwise the general follow-up recommendation.

    Raises:
        KeyError: If a required flag is missing under every accepted spelling.
    """
    def flag(*labels):
        # Return the value stored under the first label present in the row.
        for label in labels:
            if label in row:
                return row[label]
        raise KeyError(labels[0])

    if flag("Attended ≥ 90%", "Attended 90%") == "No":
        return "Address Attendance"
    # Engagement is only looked up once attendance is satisfactory, so a row
    # that fails attendance does not need an engagement flag at all.
    if flag("Engagement ≥ 80%", "Engagement 80%") == "No":
        return "Address Engagement"
    return "Consider addressing logistical barriers, improving fidelity, and/or collecting progress monitoring data"
253
+
254
def build_tree_diagram(self, row):
    """Build a Graphviz decision tree showing the path taken for one student.

    Args:
        row: Mapping-like record (e.g. a dict or a pandas Series) holding
            "Yes"/"No" flags. The attendance flag is read from
            "Attended ≥ 90%" and the engagement flag from "Engagement ≥ 80%".
            The spellings without the "≥" sign are accepted as fallbacks,
            because these column labels have been written both ways.

    Returns:
        A ``Digraph`` containing both question nodes, all three outcome
        nodes, and only the edges on the path this student follows.

    Raises:
        KeyError: If a required flag is missing under every accepted spelling.
    """
    def flag(*labels):
        # Return the value stored under the first label present in the row.
        for label in labels:
            if label in row:
                return row[label]
        raise KeyError(labels[0])

    dot = Digraph()
    dot.node("Q1", "Has the student attended ≥ 90% of interventions?")
    dot.node("Q2", "Has the student been engaged ≥ 80% of intervention time?")
    dot.node("A1", "Address Attendance", shape="box")
    dot.node("A2", "Address Engagement", shape="box")
    dot.node("A3", "Consider addressing logistical barriers", shape="box")

    if flag("Attended ≥ 90%", "Attended 90%") == "No":
        dot.edge("Q1", "A1", label="No")
    else:
        dot.edge("Q1", "Q2", label="Yes")
        # Label the second edge with the answer actually given: the branch
        # to "Address Engagement" is the "No" answer to the engagement question.
        if flag("Engagement ≥ 80%", "Engagement 80%") == "No":
            dot.edge("Q2", "A2", label="No")
        else:
            dot.edge("Q2", "A3", label="Yes")
    return dot