ProfessorLeVesseur committed on
Commit
74ec333
·
verified ·
1 Parent(s): ab77f08

Update data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +235 -80
data_processor.py CHANGED
@@ -1,3 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import pandas as pd
3
  import os
@@ -81,7 +313,6 @@ class DataProcessor:
81
  df.columns = updated_columns
82
  return df
83
 
84
-
85
  def find_intervention_column(self, df):
86
  for column in self.INTERVENTION_COLUMN_OPTIONS:
87
  if column in df.columns:
@@ -117,83 +348,6 @@ class DataProcessor:
117
  else:
118
  return 'Unknown'
119
 
120
- # def compute_student_metrics(self, df):
121
- # intervention_column = self.get_intervention_column(df)
122
- # intervention_df = df[df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES)] # Modified line
123
- # intervention_sessions_held = len(intervention_df)
124
- # student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
125
-
126
- # student_metrics = {}
127
- # for col in student_columns:
128
- # student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
129
- # student_data = intervention_df[[col]].copy()
130
- # student_data[col] = student_data[col].fillna('Absent')
131
-
132
- # attendance_values = student_data[col].apply(lambda x: 1 if self.classify_engagement(x) in [
133
- # self.ENGAGED_STR,
134
- # self.PARTIALLY_ENGAGED_STR,
135
- # self.NOT_ENGAGED_STR
136
- # ] else 0)
137
-
138
- # sessions_attended = attendance_values.sum()
139
- # attendance_pct = (sessions_attended / intervention_sessions_held * 100) if intervention_sessions_held > 0 else 0
140
- # attendance_pct = round(attendance_pct)
141
-
142
- # engagement_counts = {
143
- # self.ENGAGED_STR: 0,
144
- # self.PARTIALLY_ENGAGED_STR: 0,
145
- # self.NOT_ENGAGED_STR: 0,
146
- # 'Absent': 0
147
- # }
148
-
149
- # for x in student_data[col]:
150
- # classified_engagement = self.classify_engagement(x)
151
- # if classified_engagement in engagement_counts:
152
- # engagement_counts[classified_engagement] += 1
153
- # else:
154
- # engagement_counts['Absent'] += 1 # Count as Absent if not engaged
155
-
156
- # total_sessions = sum(engagement_counts.values())
157
-
158
- # engaged_pct = (engagement_counts[self.ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
159
- # engaged_pct = round(engaged_pct)
160
-
161
- # partially_engaged_pct = (engagement_counts[self.PARTIALLY_ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
162
- # partially_engaged_pct = round(partially_engaged_pct)
163
-
164
- # not_engaged_pct = (engagement_counts[self.NOT_ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
165
- # not_engaged_pct = round(not_engaged_pct)
166
-
167
- # absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
168
- # absent_pct = round(absent_pct)
169
-
170
- # # Engagement percentage is based on Engaged and Partially Engaged sessions
171
- # engagement_pct = ((engagement_counts[self.ENGAGED_STR] + engagement_counts[self.PARTIALLY_ENGAGED_STR]) / total_sessions * 100) if total_sessions > 0 else 0
172
- # engagement_pct = round(engagement_pct)
173
-
174
- # # Determine if the student attended ≥ 90% of sessions
175
- # attended_90 = "Yes" if attendance_pct >= 90 else "No"
176
-
177
- # # Determine if the student was engaged ≥ 80% of the time
178
- # engaged_80 = "Yes" if engagement_pct >= 80 else "No"
179
-
180
- # # Store metrics in the required order
181
- # student_metrics[student_name] = {
182
- # 'Attended ≥ 90%': attended_90,
183
- # 'Engagement ≥ 80%': engaged_80,
184
- # 'Attendance (%)': attendance_pct,
185
- # 'Engagement (%)': engagement_pct,
186
- # f'{self.ENGAGED_STR} (%)': engaged_pct,
187
- # f'{self.PARTIALLY_ENGAGED_STR} (%)': partially_engaged_pct,
188
- # f'{self.NOT_ENGAGED_STR} (%)': not_engaged_pct,
189
- # 'Absent (%)': absent_pct
190
- # }
191
-
192
- # # Create a DataFrame from student_metrics
193
- # student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
194
- # student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
195
- # return student_metrics_df
196
-
197
  def compute_student_metrics(self, df):
198
  intervention_column = self.get_intervention_column(df)
199
  intervention_df = df[df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES)]
@@ -274,7 +428,7 @@ class DataProcessor:
274
  'Attended ≥ 90%': attended_90,
275
  'Engagement ≥ 80%': engaged_80,
276
  'Attendance (%)': attendance_pct,
277
- 'Engagement (%)': engagement_pct,
278
  f'{self.ENGAGED_STR} (%)': engaged_pct,
279
  f'{self.PARTIALLY_ENGAGED_STR} (%)': partially_engaged_pct,
280
  f'{self.NOT_ENGAGED_STR} (%)': not_engaged_pct,
@@ -289,7 +443,8 @@ class DataProcessor:
289
  def compute_average_metrics(self, student_metrics_df):
290
  # Calculate the attendance and engagement average percentages across students
291
  attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Average attendance percentage
292
- engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Average engagement percentage
 
293
 
294
  # Round the averages to whole numbers
295
  attendance_avg_stats = round(attendance_avg_stats)
 
1
+ # import re
2
+ # import pandas as pd
3
+ # import os
4
+ # from huggingface_hub import InferenceClient
5
+
6
+ # class DataProcessor:
7
+ # INTERVENTION_COLUMN_OPTIONS = [
8
+ # 'Did the intervention happen today?',
9
+ # 'Did the intervention take place today?'
10
+ # ]
11
+ # YES_RESPONSES = ['yes', 'assessment day'] # Added this line
12
+ # ENGAGED_STR = 'Engaged'
13
+ # PARTIALLY_ENGAGED_STR = 'Partially Engaged'
14
+ # NOT_ENGAGED_STR = 'Not Engaged'
15
+
16
+ # def __init__(self, student_metrics_df=None):
17
+ # self.hf_api_key = os.getenv('HF_API_KEY')
18
+ # if not self.hf_api_key:
19
+ # raise ValueError("HF_API_KEY not set in environment variables")
20
+ # self.client = InferenceClient(api_key=self.hf_api_key)
21
+ # self.student_metrics_df = student_metrics_df
22
+ # self.intervention_column = None # Will be set when processing data
23
+
24
+ # def read_excel(self, uploaded_file):
25
+ # return pd.read_excel(uploaded_file)
26
+
27
+ # def format_session_data(self, df):
28
+ # date_column = next((col for col in df.columns if col in ["Date of Session", "Date"]), None)
29
+ # if date_column:
30
+ # df[date_column] = pd.to_datetime(df[date_column], errors='coerce').dt.date
31
+ # else:
32
+ # print("Warning: Neither 'Date of Session' nor 'Date' column found in the dataframe.")
33
+
34
+ # df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
35
+ # df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
36
+ # df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
37
+ # return df
38
+
39
+ # def safe_convert_to_time(self, series, format_str='%I:%M %p'):
40
+ # try:
41
+ # converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
42
+ # if format_str:
43
+ # return converted.dt.strftime(format_str)
44
+ # return converted
45
+ # except Exception as e:
46
+ # print(f"Error converting series to time: {e}")
47
+ # return series
48
+
49
+ # def safe_convert_to_datetime(self, series, format_str=None):
50
+ # try:
51
+ # converted = pd.to_datetime(series, errors='coerce')
52
+ # if format_str:
53
+ # return converted.dt.strftime(format_str)
54
+ # return converted
55
+ # except Exception as e:
56
+ # print(f"Error converting series to datetime: {e}")
57
+ # return series
58
+
59
+ # def replace_student_names_with_initials(self, df):
60
+ # updated_columns = []
61
+ # for col in df.columns:
62
+ # if 'Student Attendance' in col:
63
+ # # Search for the last occurrence of text within square brackets at the end of the string
64
+ # match = re.search(r'\[(.+?)\]$', col)
65
+ # if not match:
66
+ # # Handle cases where the closing bracket might be missing
67
+ # match = re.search(r'\[(.+)$', col)
68
+ # if match:
69
+ # name = match.group(1).strip()
70
+ # # Remove any trailing closing bracket if it wasn't matched earlier
71
+ # name = name.rstrip(']')
72
+ # # Get initials
73
+ # initials = ''.join([part[0] for part in name.strip().split()])
74
+ # updated_col = f'Student Attendance [{initials}]'
75
+ # updated_columns.append(updated_col)
76
+ # else:
77
+ # # If no match is found, keep the column name as is
78
+ # updated_columns.append(col)
79
+ # else:
80
+ # updated_columns.append(col)
81
+ # df.columns = updated_columns
82
+ # return df
83
+
84
+
85
+ # def find_intervention_column(self, df):
86
+ # for column in self.INTERVENTION_COLUMN_OPTIONS:
87
+ # if column in df.columns:
88
+ # self.intervention_column = column
89
+ # return column
90
+ # raise ValueError("No intervention column found in the dataframe.")
91
+
92
+ # def get_intervention_column(self, df):
93
+ # if self.intervention_column is None:
94
+ # self.intervention_column = self.find_intervention_column(df)
95
+ # return self.intervention_column
96
+
97
+ # def compute_intervention_statistics(self, df):
98
+ # intervention_column = self.get_intervention_column(df)
99
+ # total_days = len(df)
100
+ # sessions_held = df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES).sum() # Modified line
101
+ # intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
102
+ # return pd.DataFrame({
103
+ # 'Intervention Dosage (%)': [round(intervention_frequency, 0)],
104
+ # 'Intervention Sessions Held': [sessions_held],
105
+ # 'Intervention Sessions Not Held': [total_days - sessions_held],
106
+ # 'Total Number of Days Available': [total_days]
107
+ # })
108
+
109
+ # def classify_engagement(self, engagement_str):
110
+ # engagement_str = str(engagement_str).lower()
111
+ # if engagement_str.startswith(self.ENGAGED_STR.lower()):
112
+ # return self.ENGAGED_STR
113
+ # elif engagement_str.startswith(self.PARTIALLY_ENGAGED_STR.lower()):
114
+ # return self.PARTIALLY_ENGAGED_STR
115
+ # elif engagement_str.startswith(self.NOT_ENGAGED_STR.lower()):
116
+ # return self.NOT_ENGAGED_STR
117
+ # else:
118
+ # return 'Unknown'
119
+
120
+ # def compute_student_metrics(self, df):
121
+ # intervention_column = self.get_intervention_column(df)
122
+ # intervention_df = df[df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES)]
123
+ # intervention_sessions_held = len(intervention_df)
124
+ # student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
125
+
126
+ # student_metrics = {}
127
+ # for col in student_columns:
128
+ # student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
129
+ # student_data = intervention_df[[col]].copy()
130
+ # student_data[col] = student_data[col].fillna('Absent')
131
+
132
+ # # Classify each entry
133
+ # student_data['Engagement'] = student_data[col].apply(self.classify_engagement)
134
+
135
+ # # Calculate attendance
136
+ # attendance_values = student_data['Engagement'].apply(
137
+ # lambda x: 1 if x in [self.ENGAGED_STR, self.PARTIALLY_ENGAGED_STR, self.NOT_ENGAGED_STR] else 0
138
+ # )
139
+
140
+ # sessions_attended = attendance_values.sum()
141
+ # attendance_pct = (sessions_attended / intervention_sessions_held * 100) if intervention_sessions_held > 0 else 0
142
+ # attendance_pct = round(attendance_pct)
143
+
144
+ # # Engagement counts (excluding 'Absent')
145
+ # engagement_counts = {
146
+ # self.ENGAGED_STR: 0,
147
+ # self.PARTIALLY_ENGAGED_STR: 0,
148
+ # self.NOT_ENGAGED_STR: 0
149
+ # }
150
+
151
+ # # Count the engagement types, excluding 'Absent'
152
+ # for x in student_data['Engagement']:
153
+ # if x in engagement_counts:
154
+ # engagement_counts[x] += 1
155
+ # # 'Absent' is not counted in engagement_counts
156
+
157
+ # total_present_sessions = sum(engagement_counts.values())
158
+
159
+ # engaged_pct = (
160
+ # (engagement_counts[self.ENGAGED_STR] / total_present_sessions * 100)
161
+ # if total_present_sessions > 0 else 0
162
+ # )
163
+ # engaged_pct = round(engaged_pct)
164
+
165
+ # partially_engaged_pct = (
166
+ # (engagement_counts[self.PARTIALLY_ENGAGED_STR] / total_present_sessions * 100)
167
+ # if total_present_sessions > 0 else 0
168
+ # )
169
+ # partially_engaged_pct = round(partially_engaged_pct)
170
+
171
+ # not_engaged_pct = (
172
+ # (engagement_counts[self.NOT_ENGAGED_STR] / total_present_sessions * 100)
173
+ # if total_present_sessions > 0 else 0
174
+ # )
175
+ # not_engaged_pct = round(not_engaged_pct)
176
+
177
+ # # Engagement percentage is based on Engaged and Partially Engaged sessions
178
+ # engagement_pct = (
179
+ # ((engagement_counts[self.ENGAGED_STR] + engagement_counts[self.PARTIALLY_ENGAGED_STR]) / total_present_sessions * 100)
180
+ # if total_present_sessions > 0 else 0
181
+ # )
182
+ # engagement_pct = round(engagement_pct)
183
+
184
+ # # Absent percentage (for reference, not used in engagement calculation)
185
+ # absent_sessions = student_data['Engagement'].value_counts().get('Absent', 0)
186
+ # absent_pct = (absent_sessions / intervention_sessions_held * 100) if intervention_sessions_held > 0 else 0
187
+ # absent_pct = round(absent_pct)
188
+
189
+ # # Determine if the student attended ≥ 90% of sessions
190
+ # attended_90 = "Yes" if attendance_pct >= 90 else "No"
191
+
192
+ # # Determine if the student was engaged ≥ 80% of the time
193
+ # engaged_80 = "Yes" if engagement_pct >= 80 else "No"
194
+
195
+ # # Store metrics
196
+ # student_metrics[student_name] = {
197
+ # 'Attended ≥ 90%': attended_90,
198
+ # 'Engagement ≥ 80%': engaged_80,
199
+ # 'Attendance (%)': attendance_pct,
200
+ # 'Engagement (%)': engagement_pct,
201
+ # f'{self.ENGAGED_STR} (%)': engaged_pct,
202
+ # f'{self.PARTIALLY_ENGAGED_STR} (%)': partially_engaged_pct,
203
+ # f'{self.NOT_ENGAGED_STR} (%)': not_engaged_pct,
204
+ # 'Absent (%)': absent_pct
205
+ # }
206
+
207
+ # # Create a DataFrame from student_metrics
208
+ # student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
209
+ # student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
210
+ # return student_metrics_df
211
+
212
+ # def compute_average_metrics(self, student_metrics_df):
213
+ # # Calculate the attendance and engagement average percentages across students
214
+ # attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Average attendance percentage
215
+ # engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Average engagement percentage
216
+
217
+ # # Round the averages to whole numbers
218
+ # attendance_avg_stats = round(attendance_avg_stats)
219
+ # engagement_avg_stats = round(engagement_avg_stats)
220
+
221
+ # return attendance_avg_stats, engagement_avg_stats
222
+
223
+ # def evaluate_student(self, row, attendance_threshold=90, engagement_threshold=80):
224
+ # if row["Attended ≥ 90%"] == "No":
225
+ # return "Address Attendance"
226
+ # elif row["Engagement ≥ 80%"] == "No":
227
+ # return "Address Engagement"
228
+ # else:
229
+ # return "Consider barriers, fidelity, and progress monitoring"
230
+
231
+
232
+
233
  import re
234
  import pandas as pd
235
  import os
 
313
  df.columns = updated_columns
314
  return df
315
 
 
316
  def find_intervention_column(self, df):
317
  for column in self.INTERVENTION_COLUMN_OPTIONS:
318
  if column in df.columns:
 
348
  else:
349
  return 'Unknown'
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  def compute_student_metrics(self, df):
352
  intervention_column = self.get_intervention_column(df)
353
  intervention_df = df[df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES)]
 
428
  'Attended ≥ 90%': attended_90,
429
  'Engagement ≥ 80%': engaged_80,
430
  'Attendance (%)': attendance_pct,
431
+ # 'Engagement (%)': engagement_pct, REMOVED REMOVED
432
  f'{self.ENGAGED_STR} (%)': engaged_pct,
433
  f'{self.PARTIALLY_ENGAGED_STR} (%)': partially_engaged_pct,
434
  f'{self.NOT_ENGAGED_STR} (%)': not_engaged_pct,
 
443
  def compute_average_metrics(self, student_metrics_df):
444
  # Calculate the attendance and engagement average percentages across students
445
  attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Average attendance percentage
446
+ # engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Average engagement percentage REMOVED REMOVED
447
+ engagement_avg_stats = student_metrics_df[f'{self.ENGAGED_STR} (%)'].mean() # Average engagement percentage
448
 
449
  # Round the averages to whole numbers
450
  attendance_avg_stats = round(attendance_avg_stats)