pwc-india committed
Commit 8ef4d78
1 parent: 6b589e1

Update pages/2DATA PROFILER.py

Files changed (1):
  pages/2DATA PROFILER.py  +694 -683
pages/2DATA PROFILER.py CHANGED
@@ -19,6 +19,7 @@ import base64
  from sdv.datasets.local import load_csvs
  import pyodbc
  import pymssql
+ from streamlit_app import sidebar
 
  warnings.filterwarnings('ignore')
  st.set_page_config(
@@ -54,733 +55,743 @@ st.markdown("""
  </style>
  """, unsafe_allow_html=True)
 
- def load_dataframe_to_sqlserver(df, table_name, connection_string):
-     # Establish a connection to the database
-     conn = pyodbc.connect(connection_string)
-     cursor = conn.cursor()
-
-     # Drop table if it exists
-     drop_table_sql = f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}"
-
-     try:
-         cursor.execute(drop_table_sql)
-         conn.commit()
-     except Exception as e:
-         st.error(f"Error dropping table. Please try with a different name.")
-
-     # Create table SQL statement based on DataFrame columns and types
-     create_table_sql = f"CREATE TABLE {table_name} ("
-     for column in df.columns:
-         dtype = str(df[column].dtype)
-         sql_dtype = 'NVARCHAR(MAX)'
-         create_table_sql += f"{column} {sql_dtype}, "
-     create_table_sql = create_table_sql.rstrip(', ') + ')'
-
-     try:
-         # Execute table creation
-         cursor.execute(create_table_sql)
-         conn.commit()
-     except Exception as e:
-         st.error(f"Error Creating table. Please try with a different name.")
-
-     # Insert DataFrame data into the table using bulk insert
-     insert_sql = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({', '.join(['?' for _ in df.columns])})"
-
-     try:
-         # Using `fast_executemany` for bulk inserts
-         cursor.fast_executemany = True
-         cursor.executemany(insert_sql, df.values.tolist())
-         conn.commit()
-         st.success(f"Data Imported with table name: '{table_name}' successfully.")
-     except Exception as e:
-         st.error(f"Error Inserting Data. Please try with a different name.")
-
-     cursor.close()
-     conn.close()
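Note: this loader is invoked once, at the bottom of this file, by the "Save Data For Further Processing" button. A minimal sketch of the call, assuming a pyodbc-style connection string; the DRIVER attribute is an assumption on my part, since the connection string built later in this file omits it, and the table name here is a placeholder:

    connection_string = ('DRIVER={ODBC Driver 17 for SQL Server};'
                         'SERVER=sql-ext-dev-uks-001.database.windows.net;'
                         'DATABASE=sqldb-ext-dev-uks-001;'
                         'UID=dbadmin;PWD=...')
    load_dataframe_to_sqlserver(df, '[dbo].[PROFILED_MY_TABLE]', connection_string)

Every column is created as NVARCHAR(MAX), so the saved table holds strings only; accordingly, the caller casts the DataFrame with .astype(str) before saving.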
 
-
- def clear_cache():
-     keys = list(st.session_state.keys())
-     for key in keys:
-         st.session_state.pop(key)
-
- def set_bg_hack(main_bg):
-     '''
-     A function to unpack an image from root folder and set as bg.
-
-     Returns
-     -------
-     The background.
-     '''
-     # set bg name
-     main_bg_ext = "png"
-
-     st.markdown(
-         f"""
-         <style>
-         .stApp {{
-             background: url(data:image/{main_bg_ext};base64,{base64.b64encode(open(main_bg, "rb").read()).decode()});
-             background-size: cover
-         }}
-         </style>
-         """,
-         unsafe_allow_html=True
-     )
- #set_bg_hack("bg2.png")
- header_style = """
-     <style>
-         .header {
-             color: black; /* Soft dark gray text color for readability */
-             width: 103%;
-             font-size: 60px; /* Large font size */
-             font-weight: bold; /* Bold text */
-             line-height: 1.2; /* Improved readability */
-             margin-bottom: 30px; /* Add some space below the header */
-             padding: 20px; /* Add padding for better spacing */
-             background-image:
-                 linear-gradient(to right, rgba(255, 140, 0, 0.3) 25%, transparent 75%), /* Darker orange with higher opacity */
-                 linear-gradient(to bottom, rgba(255, 140, 0, 0.3) 15%, transparent 75%),
-                 linear-gradient(to left, rgba(255, 140, 0, 0.3) 25%, transparent 55%),
-                 linear-gradient(to top, rgba(255, 140, 0, 0.3) 25%, transparent 95%);
-             background-blend-mode: overlay;
-             background-size: 250px 350px;
-             border-radius: 10px; /* Add border radius for rounded corners */
-             box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
-         }
-     </style>
- """
-
- content_style = """
-     <style>
-         .content {
-             font-size: 40px; /* Larger font size for content */
-             line-height: 1.6; /* Improved readability */
-             width: 103%;
-             padding: 10px; /* Add padding for better spacing */
-             margin-bottom: 20px;
-             background-color: sky-blue; /* Background color for the header */
-             border-radius: 10px; /* Add border radius for rounded corners */
-             box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
-         }
-     </style>
- """
-
- small_style = """
-     <style>
-         .small {
-             color: black;
-             font-size: 30px; /* Larger font size for content */
-             line-height: 1.6; /* Improved readability */
-             width: 100%;
-             padding: 10px; /* Add padding for better spacing */
-             margin-bottom: 10px;
-             background-color: white; /* Background color for the header */
-             border-radius: 10px; /* Add border radius for rounded corners */
-         }
-     </style>
- """
-
- def update_column_dtype(df, column_name, dtype):
-     error_entries = pd.DataFrame()
-     flag = None
-     if dtype == 'System Detected':
-         pass
-     elif dtype == 'int64':
-         try:
-             df[column_name] = df[column_name].astype('int64')
-         except ValueError:
-             error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
-             st.error('Unable to convert some entries to integer. Please Clean the column.')
-     elif dtype == 'float64/numeric':
-         try:
-             df[column_name] = df[column_name].astype('float64')
-         except ValueError:
-             error_entries = df[pd.to_numeric(df[column_name], errors='coerce').isna()]
-             st.error('Unable to convert some entries to float. Please Clean the column.')
-     elif dtype == 'id':
-         try:
-             df[column_name] = df[column_name].astype('int64')
-         except ValueError:
-             error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
-             st.error('Unable to convert some entries to id. Please Clean the column.')
-     elif dtype == 'categorical/string':
-         df[column_name] = df[column_name].astype('category')
-     elif dtype == 'datetime':
-         try:
-             df[column_name] = pd.to_datetime(df[column_name], errors='raise', infer_datetime_format=True)
-         except ValueError:
-             error_entries = df[pd.to_datetime(df[column_name], errors='coerce', infer_datetime_format=True).isna()]
-             custom_format = st.text_input("Please provide the datetime format (e.g., %Y-%m-%d):")
-             if custom_format:
-                 try:
-                     df[column_name] = pd.to_datetime(df[column_name], errors='raise', format=custom_format)
-                 except ValueError:
-                     error_entries = df[pd.to_datetime(df[column_name], errors='coerce', format=custom_format).isna()]
-                     st.error('Unable to parse datetime with the provided format. Please Clean the column.')
-     elif dtype == 'email':
-         df[column_name] = df[column_name].astype('category')
-         flag = 'email'
-     elif dtype == 'phone_number':
-         df[column_name] = df[column_name].astype('category')
-         flag = 'phone_number'
-
-     return df, error_entries, flag
-
- def convert_to_special_representation(value):
-     value = str(value)
-     special_chars = set("!@#$%^&*()_+-=[]{}|;:,.<>?`~")
-     result = ''
-     for char in value:
-         if char.isdigit():
-             result += 'N'
-         elif char.isalpha():
-             result += 'A'
-         elif char in special_chars:
-             result += char
-         else:
-             # Handle other characters as needed
-             result += char
-     return result
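Note: convert_to_special_representation reduces each value to a shape mask: digits become N, letters become A, and special characters pass through unchanged. This mask drives both the Pattern Exploration tab and the character-level encoding used by the anomaly detector further down. A worked example with made-up values:

    convert_to_special_representation('AB-1234')    # 'AA-NNNN'
    convert_to_special_representation('ab3@x.com')  # 'AAN@A.AAA'

Values sharing a mask group together, so rare masks stand out as formatting outliers.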
- with st.container(border=True):
-     st.subheader('SELECT TABLE')
-     metadata = SingleTableMetadata()
-     conn = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;"
-                            "Database=sqldb-ext-dev-uks-001;"
-                            "UID=dbadmin;"
-                            "PWD=mYpa$$w0rD")
-     query1_1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME in ('TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM') ORDER BY TABLE_NAME ASC"
-     query1_2 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME LIKE 'PROFILED%' ORDER BY TABLE_NAME ASC"
-     tab_names = list(pd.read_sql_query(query1_1, con=conn)['TABLE_NAME'])
-     tab_names_edited = list(pd.read_sql_query(query1_2, con=conn)['TABLE_NAME'])
-     sample_selector = st.selectbox('SELECT SAMPLE SIZE', ['100','10K','100K','1M','Full Table'], index=None, placeholder='Select sample size for the table(s)', on_change=clear_cache)
-     mode_selector = st.selectbox("Select How you want to Proceed", ["Start Profiling with Source Data", "Load Previously Profiled Data For Further Processing"], on_change=clear_cache, placeholder='Show Options')
-     if mode_selector == "Start Profiling with Source Data":
-         table_selector = st.selectbox('SELECT TABLE NAME', tab_names, index=None, on_change=clear_cache, placeholder='Select table name')
-
-     if mode_selector == "Load Previously Profiled Data For Further Processing":
-         table_selector = st.selectbox('SELECT TABLE NAME', tab_names_edited, index=None, on_change=clear_cache, placeholder='Select table name')
-
- if table_selector is not None and sample_selector is not None:
-     if sample_selector == '100':
-         count = "top 100"
-     elif sample_selector == '10K':
-         count = "top 10000"
-     elif sample_selector == '100K':
-         count = "top 100000"
-     elif sample_selector == '1M':
-         count = "top 1000000"
-     else:
-         count = ""
-     query2 = "select " + count + " * from [dbo].[" + table_selector + "]"
-     df = pd.read_sql_query(query2, con=conn)
-     main_list = df.columns.to_list()
-     sub_list = ['ID','LOADID','FILE_NAME']
-     if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
-         df = df.drop(['ID','LOADID','FILE_NAME'], axis=1)
-     conn.close()
-     if 'data' not in st.session_state:
-         st.session_state.data = df
-     metadata.detect_from_dataframe(st.session_state.data)
-     st.sidebar.header("DataFrame Live Preview")
-     st.sidebar.markdown("*This Window keeps the live status of the dataframe under processing. You can review this dataframe after all the changes.*")
-     df_preview = st.sidebar.empty()
-     df_preview.write(st.session_state.data)
-     st.markdown(content_style, unsafe_allow_html=True)
      with st.container(border=True):
-         cols = df.columns.to_list()
-         primary_key = metadata.primary_key
-         sugg_primary_keys = [col for col in cols if df[col].is_unique and df[col].dtype != 'float' and not df[col].isnull().any()]
-         prob_key = sugg_primary_keys
-         if primary_key in sugg_primary_keys:
-             default_index = sugg_primary_keys.index(primary_key)
          else:
-             sugg_primary_keys.append(primary_key)
-             default_index = sugg_primary_keys.index(primary_key)
-         no_y_data = []
-         email_cols = []
-         phone_cols = []
-         # cols_select= st.multiselect('Please select column(s) for Profiling and Cleansing', cols, default= cols[:5])
-         tabs3 = st.tabs(cols)
-         for i, tab in enumerate(tabs3):
-             with tab:
-                 col = cols[i]
-                 scol1, scol2 = st.columns([4,1])
-                 with scol1:
-                     taba, tabb, tabc, tabd, tabe = st.tabs(["📝 DataType Validation", "🧹 Missing Value Handling", "📈 Statistical Profiling", " Pattern Exploration", "🤖 AI Assisted Data Cleansing"])
-                     with taba:
-                         if st.session_state.data[col].dtype.name == 'category':
-                             st.session_state.data[col] = st.session_state.data[col].astype('str')
-                         dtypes = ['System Detected', 'int64', 'float64/numeric', 'id', 'categorical/string', 'datetime', 'email', 'phone_number']
-                         no_dtypes = ['int64', 'float64/numeric', 'id', 'categorical/string', 'datetime', 'email', 'phone_number']
-                         no_dtype = False
-                         if metadata.columns[col]['sdtype'] != "unknown":
-                             datatype = metadata.columns[col]['sdtype']
-                             st.info(f"System Identified DataType: {datatype}")
-                         elif str(df[col].dtype) != 'object' and metadata.columns[col]['sdtype'] == "unknown":
-                             datatype = str(df[col].dtype)
-                             st.info(f"System Identified DataType: {datatype}")
-                         else:
-                             datatype = 'NA'
-                             #st.warning("System Could Not Understand Datatype. Please Specify the Datatype")
-                             no_dtype = True
-                         if datatype in ['int64']:
-                             def_index = 1
-                         if datatype in ['float64', 'numerical']:
-                             def_index = 2
-                         if datatype in ['id']:
-                             def_index = 3
-                         if datatype in ['categorical', 'string']:
-                             def_index = 4
-                         if datatype in ['datetime']:
-                             def_index = 5
-                         if datatype in ['email']:
-                             def_index = 6
-                         if datatype in ['phone_number']:
-                             def_index = 7
-
-                         if col == primary_key:
-                             st.success("This is System Identified Primary Key")
-                         elif col in prob_key:
-                             st.warning("This is System suggested potential Primary Key")
-                         if f'dtype_{col}' not in st.session_state:
-                             st.session_state[f'dtype_{col}'] = 'initiate'
-                         if st.session_state[f'dtype_{col}'] not in ['email', 'phone_number']:
-                             st.session_state.flag = None
-
-                         if no_dtype == True:
-                             fin_datatype = st.selectbox(f"Please Change/Define the Datatype of column: {col}:", no_dtypes, index=3, key=f'datatype_{col}')
-                         else:
-                             fin_datatype = st.selectbox(f"Please Change/Define the Datatype of column: {col}:", dtypes, index=def_index, key=f'datatype_{col}')
-                         st.session_state[f'dtype_{col}'] = st.session_state[f'datatype_{col}']
-                         st.session_state.data, error_df, st.session_state.flag = update_column_dtype(st.session_state.data, col, fin_datatype)
-
-                         if error_df.empty:
-                             st.success("No Datatype Validation Errors For Current Datatype")
-                             try:
-                                 df_preview.write(st.session_state.data)
-                             except:
-                                 st.warning("DataFrame Updated. But Could Not Load Preview")
-                         else:
-                             st.subheader("Prepare the Column for Conversion:")
                              try:
-                                 edited_error_df = st.data_editor(error_df, num_rows="dynamic", column_config={
                                      col: st.column_config.TextColumn(
                                          col,
                                          width="medium",
                                      )
-                                 }, key=f'dtype_error_{col}')
                              except:
-                                 edited_error_df = st.data_editor(error_df, num_rows="dynamic", column_config={
-                                     col: st.column_config.TextColumn(
                                          col,
                                          width="medium",
                                      )
-                                 }, key=f'dtype_error_{col}')
-                             check = st.button("Fix Error", key=f"Fix{col}")
-                             if check:
-                                 st.session_state.data = st.session_state.data.drop(error_df.index)
-                                 st.session_state.data = pd.concat([st.session_state.data, edited_error_df])
-                                 df_preview.write(st.session_state.data)
-                         if fin_datatype in ['id', 'email', 'phone_number']:
-                             no_y_data.append(col)
-                         if fin_datatype in ['email']:
-                             email_cols.append(col)
-                         if fin_datatype in ['phone_number']:
-                             phone_cols.append(col)
-                     no_y_data.extend(['Validity','Validity_phone','Validity_email'])
-                     total_records = len(st.session_state.data)
-                     with tabc:
-                         if col not in no_y_data:
-                             y_data_col = st.session_state.data[[col]]
-                             pr = ProfileReport(y_data_col, dark_mode=True, explorative=False, config_file=r"ydata_config.yml")
-                             pr.config.html.style.primary_colors = ['#e41a1c']
-                             with st.container(border=True):
-                                 st_profile_report(pr, navbar=False, key=f'profile{col}')
-                         elif col in email_cols:
-                             unique_emails = st.session_state.data[col].nunique()
-                             duplicate_emails = total_records - unique_emails
-                             # Extract email domains
-                             email_domains = st.session_state.data[col].str.extract(r'@(.+)$')[0]
-                             # Count occurrences of each domain
-                             email_domain_counts = email_domains.value_counts()
-                             # Get the top 5 email domains
-                             top_email_domains = email_domain_counts.head(5)
-
-                             # Format the top email domains for display
-                             top_email_domains_str = '\n|\n'.join([f"{domain}: {count}" for domain, count in top_email_domains.items()])
-                             if f'invalid_em_{col}' in st.session_state:
-                                 invalid_emails = len(st.session_state[f'invalid_em_{col}'])
-                                 valid_emails = total_records - invalid_emails
-                                 percent_invalid_emails = invalid_emails / total_records * 100
-                                 email_message = f"""
- ## Email Column: {col}\n\n **Valid Emails:** {valid_emails} ({100 - percent_invalid_emails:.2f}%)\n\n---------------------------------------------------------------------------------------\n\n**Invalid Emails:** {invalid_emails} ({percent_invalid_emails:.2f}%)\n\n----------------------------------------------------------------------------------------\n\n**Unique Emails:** {unique_emails}\n\n-------------------------------------------------------------------------------------------------------------------------\n\n**Duplicate Emails:** {duplicate_emails}\n\n----------------------------------------------------------------------------------------------------------------------\n\n**Top 5 Email Domains:** {top_email_domains_str}
- """
-
-                             else:
-                                 invalid_emails = "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
-                                 valid_emails = "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
-                                 percent_invalid_emails = "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
-
-                                 email_message = f"""
- ## Email Column: {col}\n\n **Valid Emails:** {valid_emails} \n\n---------------------------------------------------------------------------------------\n\n**Invalid Emails:** {invalid_emails}\n\n----------------------------------------------------------------------------------------\n\n**Unique Emails:** {unique_emails}\n\n-------------------------------------------------------------------------------------------------------------------------\n\n**Duplicate Emails:** {duplicate_emails}\n\n----------------------------------------------------------------------------------------------------------------------\n\n**Top 5 Email Domains:** {top_email_domains_str}
- """
-
-                             with st.container(border=True):
-                                 st.markdown(str(email_message))
-                             ref_em = st.button('Refresh', key=f'email{col}')
-                             if ref_em:
-                                 pass
-
-                         elif col in phone_cols:
-                             unique_phones = st.session_state.data[col].nunique()
-                             duplicate_phones = total_records - unique_phones
-                             phone_country_codes = st.session_state.data[col].str.extract(r'^\+(\d+)')[0].value_counts()
-                             top_phone_country_codes = list(phone_country_codes.head(5).to_string())
-                             to_remove = ['\n', ' ']
-                             top_phone_country_codes = [item for item in top_phone_country_codes if item not in to_remove]
-                             if f'invalid_ph_{col}' in st.session_state:
-                                 invalid_phones = len(st.session_state[f'invalid_ph_{col}'])
-                                 valid_phones = total_records - invalid_phones
-                                 percent_invalid_phones = invalid_phones / total_records * 100
-                                 phone_message = f"""
-
- ## Phone Number Column: {col}\n\n **Valid Phone Numbers:** {valid_phones} ({100 - percent_invalid_phones:.2f}%)\n\n----------------------------------------------------------------------------------------------------------\n\n**Invalid Phone Numbers:** {invalid_phones} ({percent_invalid_phones:.2f}%)\n\n----------------------------------------------------------------------------------------------------------\n\n**Unique Phone Numbers:** {unique_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n**Duplicate Phone Numbers:** {duplicate_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n**Top 5 Phone Country Codes:** {top_phone_country_codes}
- """
-                             else:
-                                 invalid_phones = "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them."
-                                 valid_phones = "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them."
-                                 percent_invalid_phones = "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them."
-                                 phone_message = f"""
-
- ## Phone Number Column: {col}\n\n **Valid Phone Numbers:** {valid_phones} \n\n----------------------------------------------------------------------------------------------------------\n\n **Invalid Phone Numbers:** {invalid_phones} \n\n----------------------------------------------------------------------------------------------------------\n\n **Unique Phone Numbers:** {unique_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n **Duplicate Phone Numbers:** {duplicate_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n **Top 5 Phone Country Codes:** {top_phone_country_codes}
- """
-
-                             with st.container(border=True):
-                                 st.markdown(str(phone_message))
-                             ref_ph = st.button('Refresh', key=f'phone{col}')
-                             if ref_ph:
-                                 pass
-                     with tabd:
-                         st.session_state.data_encoded = st.session_state.data.copy()
-                         st.session_state.data_encoded[f'Pattern_{col}'] = st.session_state.data_encoded[col].apply(convert_to_special_representation)
-                         patterns = list(st.session_state.data_encoded[f'Pattern_{col}'].unique())
-                         patt_col1, patt_col2 = st.columns([1,4])
-                         with patt_col1:
-                             st.session_state.pattern_list = pd.DataFrame(patterns, columns=['Pattern Name'])
-                             event = st.dataframe(
-                                 st.session_state.pattern_list,
-                                 key=f"pattern_list_data{col}",
-                                 on_select="rerun",
-                                 selection_mode=["multi-row"],
-                                 hide_index=True,
-                                 width=10000,
-                                 height=450
-                             )
-                             if len(event.selection.rows) > 0:
-                                 filter = list(st.session_state.pattern_list.loc[event.selection.rows]['Pattern Name'].values)
-                             else:
-                                 filter = None
-                         if filter is not None:
-                             with patt_col2:
-                                 with st.container(border=True, height=450):
-                                     st.write("#####")
-
-                                     if not st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)].empty:
-                                         st.session_state.data_encoded[col] = st.session_state.data_encoded[col].astype('str')
-                                         try:
-                                             edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)], num_rows="dynamic", column_config={
-                                                 col: st.column_config.TextColumn(
-                                                     col,
-                                                     width="medium",
-                                                 )
-                                             }, height=300, key=f'Valid_pattern_{col}')
-                                         except:
-                                             edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)], num_rows="dynamic", column_config={
-                                                 col: st.column_config.Column(
-                                                     col,
-                                                     width="medium",
-                                                 )
-                                             }, height=300, key=f'Valid_pattern_{col}')
-                                         valid_pattern = st.button("Confirm", key=f"Fix_valid_pattern_{col}")
-                                         if valid_pattern:
-                                             st.session_state.data = st.session_state.data.drop(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)].index)
-                                             st.session_state.data = pd.concat([st.session_state.data, edited_pattern_df])
-                                             st.session_state.data = st.session_state.data.drop([f'Pattern_{col}'], axis=1)
-                                             st.session_state.data = st.session_state.data.sort_index()
-                                             df_preview.write(st.session_state.data)
-                         else:
-                             with patt_col2:
-                                 with stylable_container(
-                                     key=f"container_select_pattern_none{col}",
-                                     css_styles="""
-                                         {
-                                             border: 1px solid white;
-                                             border-radius: 0.5rem;
-                                             padding: calc(1em - 1px);
-                                             width: 100%;
-                                             color: orange;
-                                             size: 100px;
-                                         }
-                                     """
-                                 ):
-                                     st.write('##\n\n##\n\n')
-                                     st.markdown("""
-                                         <style>
-                                         .big-font {
-                                             font-size:15px;
-                                             width: 100%;
-                                             text-align: center;
-                                         }
-                                         </style>
-                                     """, unsafe_allow_html=True)
-                                     st.markdown(f'<p class="big-font">🛈 There are total {len(st.session_state.pattern_list)} Number of Patterns Available. Please Select Pattern(s) for Matching Records</p>', unsafe_allow_html=True)
-                                     st.write('##\n\n##\n\n')
-
-                     with tabb:
-                         try:
-                             edited_df = st.data_editor(st.session_state.data[(st.session_state.data[col].isna()) | (st.session_state.data[col] == '') | (st.session_state.data[col] == None)], num_rows="dynamic", column_config={
-                                 col: st.column_config.TextColumn(
-                                     col,
-                                     width="medium",
-                                 )
-                             }, key=f'miss_{col}')
-                         except:
-                             edited_df = st.data_editor(st.session_state.data[(st.session_state.data[col].isna()) | (st.session_state.data[col] == '') | (st.session_state.data[col] == None)], num_rows="dynamic", column_config={
-                                 col: st.column_config.Column(
-                                     col,
-                                     width="medium",
-                                 )
-                             }, key=f'miss_{col}')
-
-                         incol1, incol2, extra = st.columns([1.1,1.5,8])
-                         with incol1:
-                             #st.write(st.session_state[f'dtype_{col}'])
-                             if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
-                                 def_fill = st.text_input("Default Autofill Value", key=f"def_fill_{col}")
-                             autofill = st.button("Autofill", key=f"autofill_{col}")
-
-                             if autofill:
-                                 if st.session_state[f'dtype_{col}'] not in ['int','float']:
-                                     st.session_state.data[col] = st.session_state.data[col].astype('str').replace('', pd.NA).replace({None: pd.NA}).fillna(def_fill)
-                                 else:
-                                     st.session_state.data[col] = st.session_state.data[col].replace({None: pd.NA}).fillna(method='ffill')
-                                 st.success("Column Autofilled. Please Review the Sidebar for updated status of the Dataframe.")
-                                 df_preview.write(st.session_state.data)
-                         with incol2:
-                             confirm = st.button("Confirm", key=f"Confirm_{col}")
-                             if confirm:
-                                 st.session_state.data[col] = st.session_state.data[col].replace('', np.nan).replace({None: np.nan})
-                                 st.session_state.data = st.session_state.data.dropna(subset=[col])
-                                 st.session_state.data.update(edited_df)
-                                 st.session_state.data = pd.concat([st.session_state.data, edited_df[~edited_df.index.isin(st.session_state.data.index)]])
-                                 st.session_state.data = st.session_state.data.sort_index()
-                                 st.success("State Saved. Please Review the Sidebar for updated status of the Dataframe.")
-                                 df_preview.write(st.session_state.data)
-                     with tabe:
-                         if "overall_invalid_df" not in st.session_state:
-                             st.session_state.overall_invalid_df = pd.DataFrame()
-                         if (st.session_state[f'dtype_{col}'] not in ['email', 'phone_number'] and st.session_state.flag not in ['email', 'phone_number']):
-                             st.dataframe(st.session_state.data)
-                             AI_check = st.button("Check For Anomalies", key=f'AI_CHECK_{col}')
-                             if AI_check:
-                                 with st.spinner("Running Anomaly Detection AI"):
-                                     #my_bar = st.progress(0, text="Progress")
-
-                                     if st.session_state[f'dtype_{col}'] in ['categorical/string']:
-                                         if 'missing@123' not in st.session_state.data[col].cat.categories:
-                                             st.session_state.data[col] = st.session_state.data[col].cat.add_categories(['missing@123'])
-
-                                     st.session_state.data[col] = st.session_state.data[col].fillna('missing@123').astype(str)
-                                     st.session_state.data_encoded = st.session_state.data[col].apply(convert_to_special_representation)
-                                     mixed_transformer = Pipeline(steps=[
-                                         ('vectorizer', CountVectorizer(analyzer='char', lowercase=False))
-                                     ])
-
-                                     df_transformed = mixed_transformer.fit_transform(st.session_state.data_encoded)
-
-                                     input_dim = df_transformed.shape[1]
-                                     encoding_dim = (input_dim // 2) + 1
-
-                                     input_layer = Input(shape=(None, input_dim))
-                                     conv1d_layer = Conv1D(64, 3, activation='relu', padding='same')(input_layer)
-                                     maxpooling_layer = MaxPooling1D(pool_size=2, padding='same')(conv1d_layer)
-                                     encoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=False))(maxpooling_layer)
-
-                                     repeat_vector = RepeatVector(input_dim)(encoder_lstm)
-                                     decoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=True))(repeat_vector)
-                                     conv1d_layer_decoder = Conv1D(64, 3, activation='relu', padding='same')(decoder_lstm)
-                                     upsampling_layer = Conv1D(input_dim, 2, activation='relu', padding='same')(conv1d_layer_decoder)
-
-                                     autoencoder = Model(inputs=input_layer, outputs=upsampling_layer)
-
-                                     autoencoder.compile(optimizer=Adam(), loss=MeanSquaredError())
-                                     #my_bar.progress(40, text='Progress')
-                                     autoencoder.fit(np.expand_dims(df_transformed.toarray(), axis=1), np.expand_dims(df_transformed.toarray(), axis=1),
-                                                     epochs=100, batch_size=2, shuffle=True, validation_split=0.2, verbose=1)
-                                     reconstructions = autoencoder.predict(np.expand_dims(df_transformed.toarray(), axis=1))
-                                     reconstruction_error = np.mean(np.abs(reconstructions - np.expand_dims(df_transformed.toarray(), axis=1)), axis=(1, 2))
-
-                                     threshold = np.percentile(reconstruction_error, 95)  # Adjust the percentile based on desired sensitivity
-                                     #my_bar.progress(90, text='Progress')
-                                     st.session_state.data['Validity'] = ['Invalid' if error > threshold else 'Valid' for error in reconstruction_error]
-                                     st.session_state.data[col] = st.session_state.data[col].replace('missing@123', '')
-                                     st.session_state[f"invalid_ai_data_{col}"] = st.session_state.data[st.session_state.data['Validity'] == 'Invalid']
-                                     #my_bar.progress(100, text='Progress')
-
-                             if f"invalid_ai_data_{col}" in st.session_state:
-                                 st.session_state[f"invalid_ai_data_{col}"]["Invalid Field"] = col
-                                 if 'Validity' in st.session_state[f"invalid_ai_data_{col}"].columns:
-                                     st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"].drop(['Validity'], axis=1)], ignore_index=True)
-                                 else:
-                                     st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"]], ignore_index=True)
-
                                  try:
-                                     edited_valid_df = st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic", column_config={
                                          col: st.column_config.TextColumn(
                                              col,
                                              width="medium",
                                          )
-                                     }, key=f'Valid_{col}')
                                  except:
-                                     edited_valid_df = st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic", column_config={
                                          col: st.column_config.Column(
                                              col,
                                              width="medium",
                                          )
-                                     }, key=f'Valid_{col}')
-                                 valid = st.button("Confirm", key=f"Fix_valid_{col}")
-                                 #my_bar.empty()
-                                 if valid:
-                                     st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity'] == 'Invalid'].index)
                                      st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
-                                     st.session_state.data = st.session_state.data.sort_index()
                                      df_preview.write(st.session_state.data)
-
-                         elif (st.session_state[f'dtype_{col}'] in ['phone_number'] or st.session_state.flag in ['phone_number']):
-                             #st.dataframe(st.session_state.data)
-                             phone_regex = r'^\+?[0-9\s\-\(\)]+$'
-                             # st.write(phone_regex)
-                             st.session_state.data['Validity_phone'] = st.session_state.data[col].apply(lambda xy: 'phone_is_valid' if re.match(phone_regex, str(xy)) else 'phone_is_invalid')
-                             st.session_state[f'invalid_phone_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
-                             if f'invalid_phone_{col}_check' not in st.session_state:
-                                 st.session_state[f'invalid_phone_{col}']["Invalid Field"] = col
-                                 st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_phone_{col}']], ignore_index=True, axis=0)
-                                 st.session_state[f'invalid_phone_{col}_check'] = 'yes'
-                             try:
-                                 edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
-                                     col: st.column_config.TextColumn(
-                                         col,
-                                         width="medium",
-                                     )
-                                 }, num_rows="dynamic", key=f'Valid_phone_{col}')
-                             except:
-                                 edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
                                      col: st.column_config.Column(
                                          col,
                                          width="medium",
                                      )
-                                 }, num_rows="dynamic", key=f'Valid_phone_{col}')
-                             valid_phone = st.button("Confirm", key=f"Fix_valid_phone_{col}")
-                             if valid_phone:
-                                 st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].index)
-                                 st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
-                                 st.session_state[f'invalid_ph_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
-                                 st.session_state.data = st.session_state.data.drop(['Validity_phone'], axis=1)
-
-                                 df_preview.write(st.session_state.data)
-
-                         elif (st.session_state[f'dtype_{col}'] in ['email'] or st.session_state.flag in ['email']):
-                             email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
-                             st.session_state.data['Validity_email'] = st.session_state.data[col].apply(lambda x: 'email_is_valid' if re.match(email_regex, x) else 'email_is_invalid')
-                             if st.session_state.data[col].dtype.name == 'category':
-                                 st.session_state.data[col] = st.session_state.data[col].astype('str')
-                             st.session_state[f'invalid_email_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
-                             if f'invalid_email_{col}_check' not in st.session_state:
-                                 st.session_state[f'invalid_email_{col}']["Invalid Field"] = col
-                                 st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_email_{col}']], ignore_index=True, axis=0)
-                                 st.session_state[f'invalid_email_{col}_check'] = 'yes'
-                             try:
-                                 edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
-                                     col: st.column_config.TextColumn(
-                                         col,
-                                         width="medium",
-                                     )
                                  }, key=f'Valid_email_{col}')
-                             except:
-                                 edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
-                                     col: st.column_config.Column(
-                                         col,
-                                         width="medium",
-                                     )
-                                 }, key=f'Valid_email_{col}')
-                             valid_email = st.button("Confirm", key=f"Fix_valid_email_{col}")
-                             if valid_email:
-                                 st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].index)
-                                 st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
-                                 st.session_state[f'invalid_em_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
-                                 st.session_state.data = st.session_state.data.drop(['Validity_email'], axis=1)
-                                 df_preview.write(st.session_state.data)
-
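Note: the phone and email validators above are plain regex checks, so they can be sanity-checked outside the app; the sample values here are made up:

    import re
    email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    phone_regex = r'^\+?[0-9\s\-\(\)]+$'
    bool(re.match(email_regex, 'jane.doe@example.com'))  # True
    bool(re.match(email_regex, 'jane.doe@example'))      # False: no dot in the domain part
    bool(re.match(phone_regex, '+44 (0)20 7946 0958'))   # True
    bool(re.match(phone_regex, 'call me'))               # False: letters are not allowed

One observable difference between the two branches: the email lambda passes the raw value to re.match, so a NaN in the column would raise a TypeError, whereas the phone branch guards against this by converting with str(xy) first.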
-
-                 with scol2:
-                     st.markdown("**Column Being Processed**")
-                     col_view = st.empty()
-                     try:
-                         col_view.write(st.session_state.data[col])
-                     except:
-                         st.warning("DataFrame Updated. But Could Not Load Preview")
-
-         pkcol1, pkcol2 = st.columns(2)
-         with pkcol1:
-             if primary_key != None:
-                 st.info(f"Primary Key Identified by AI: {primary_key}")
-             else:
-                 st.warning("Could Not Finalize the Primary Key Automatically. Please go through the suggestions and Finalize one.")
-         with pkcol2:
-             st.selectbox("Please Finalize the Primary Key:", sugg_primary_keys, index=default_index)
-
-     with st.expander("Save and Download Data"):
-         name_data = st.text_input("Please Specify Name of the saved/downloaded data")
-         csv = st.session_state.data.to_csv(index=False).encode('utf-8')
-         for col in ['Validity', 'Validity_email', 'Validity_phone']:
-             if col in st.session_state.overall_invalid_df:
-                 st.session_state.overall_invalid_df = st.session_state.overall_invalid_df.drop([col], axis=1)
-         csv2 = st.session_state.overall_invalid_df.to_csv(index=False).encode('utf-8')
-         #st.write(st.session_state.overall_invalid_df)
-         # Create a download button
-         dldcol1, dldcol2 = st.columns([1,4])
-         with dldcol1:
-             st.download_button(
-                 label="Download Cleaned Data as CSV",
-                 data=csv,
-                 file_name=f'{name_data}.csv',
-                 mime='text/csv',
-             )
-         with dldcol2:
-             st.download_button(
-                 label="Download Anomalous Data as CSV",
-                 data=csv2,
-                 file_name=f'Anomaly_{name_data}.csv',
-                 mime='text/csv',
-             )
-         save = st.button("Save Data For Further Processing")
-         if save:
-             connection_string = ('SERVER=sql-ext-dev-uks-001.database.windows.net;'
-                                  'DATABASE=sqldb-ext-dev-uks-001;'
-                                  'UID=dbadmin;'
-                                  'PWD=mYpa$$w0rD'
-                                  )
-             st.session_state.data = st.session_state.data.astype(str)
-             load_dataframe_to_sqlserver(st.session_state.data, f'[dbo].[PROFILED_{name_data}]', connection_string)
+ ######
+ def main():
+     # st.title('PAGE TITLE') # Change this for each page
+     sidebar()
+ ########
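Note: the new main() wrapper only draws the shared sidebar from streamlit_app; the rest of the page still runs at module scope. The call site is not visible in the lines shown, so the placement below is an assumption:

    main()  # hypothetical call, somewhere after the definition in the full file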
 
64
+ def load_dataframe_to_sqlserver(df, table_name, connection_string):
65
+ # Establish a connection to the database
66
+ conn = pyodbc.connect(connection_string)
67
+ cursor = conn.cursor()
68
+
69
+ # Drop table if it exists
70
+ drop_table_sql = f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}"
71
+
72
+ try:
73
+ cursor.execute(drop_table_sql)
74
+ conn.commit()
75
+ except Exception as e:
76
+ st.error(f"Error dropping table. Please try with a different name.")
77
+
78
+ # Create table SQL statement based on DataFrame columns and types
79
+ create_table_sql = f"CREATE TABLE {table_name} ("
80
+ for column in df.columns:
81
+ dtype = str(df[column].dtype)
82
+ sql_dtype = 'NVARCHAR(MAX)'
83
+ create_table_sql += f"{column} {sql_dtype}, "
84
+ create_table_sql = create_table_sql.rstrip(', ') + ')'
85
+
86
+ try:
87
+ # Execute table creation
88
+ cursor.execute(create_table_sql)
89
+ conn.commit()
90
+ except Exception as e:
91
+ st.error(f"Error Creating table. Please try with a different name.")
92
 
93
+ # Insert DataFrame data into the table using bulk insert
94
+ insert_sql = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({', '.join(['?' for _ in df.columns])})"
95
+
96
+ try:
97
+ # Using `fast_executemany` for bulk inserts
98
+ cursor.fast_executemany = True
99
+ cursor.executemany(insert_sql, df.values.tolist())
100
+ conn.commit()
101
+ st.success(f"Data Imported with table name: '{table_name}' successfully.")
102
+ except Exception as e:
103
+ st.error(f"Error Inserting Data. Please try with a different name.")
104
+
105
+ cursor.close()
106
+ conn.close()
107
+
108
 
109
+ def clear_cache():
110
+ keys = list(st.session_state.keys())
111
+ for key in keys:
112
+ st.session_state.pop(key)
113
+
114
+ def set_bg_hack(main_bg):
115
+ '''
116
+ A function to unpack an image from root folder and set as bg.
117
+
118
+ Returns
119
+ -------
120
+ The background.
121
+ '''
122
+ # set bg name
123
+ main_bg_ext = "png"
124
+
125
+ st.markdown(
126
+ f"""
127
+ <style>
128
+ .stApp {{
129
+ background: url(data:image/{main_bg_ext};base64,{base64.b64encode(open(main_bg, "rb").read()).decode()});
130
+ background-size: cover
131
+ }}
132
+ </style>
133
+ """,
134
+ unsafe_allow_html=True
135
+ )
136
+ #set_bg_hack("bg2.png")
137
+ header_style = """
138
+ <style>
139
+ .header {
140
+ color: black; /* Soft dark gray text color for readability */
141
+ width: 103%;
142
+ font-size: 60px; /* Large font size */
143
+ font-weight: bold; /* Bold text */
144
+ line-height: 1.2; /* Improved readability */
145
+ margin-bottom: 30px; /* Add some space below the header */
146
+ padding: 20px; /* Add padding for better spacing */
147
+ background-image:
148
+ linear-gradient(to right, rgba(255, 140, 0, 0.3) 25%, transparent 75%), /* Darker orange with higher opacity */
149
+ linear-gradient(to bottom, rgba(255, 140, 0, 0.3) 15%, transparent 75%),
150
+ linear-gradient(to left, rgba(255, 140, 0, 0.3) 25%, transparent 55%),
151
+ linear-gradient(to top, rgba(255, 140, 0, 0.3) 25%, transparent 95%);
152
+ background-blend-mode: overlay;
153
+ background-size: 250px 350px;
154
+ border-radius: 10px; /* Add border radius for rounded corners */
155
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
156
+ }
157
+ </style>
158
+ """
159
 
 
 
 
 
 
 
 
 
 
160
 
 
 
 
 
 
 
 
 
161
 
 
 
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
+ content_style = """
165
+ <style>
166
+ .content {
167
+ font-size: 40px; /* Larger font size for content */
168
+ line-height: 1.6; /* Improved readability */
169
+ width: 103%;
170
+ padding: 10px; /* Add padding for better spacing */
171
+ margin-bottom: 20px;
172
+ background-color: sky-blue; /* Background color for the header */
173
+ border-radius: 10px; /* Add border radius for rounded corners */
174
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
175
+ }
176
+ </style>
177
+ """
178
+
179
+ small_style = """
180
+ <style>
181
+ .small {
182
+ color: black;
183
+ font-size: 30px; /* Larger font size for content */
184
+ line-height: 1.6; /* Improved readability */
185
+ width: 100%;
186
+ padding: 10px; /* Add padding for better spacing */
187
+ margin-bottom: 10px;
188
+ background-color: white; /* Background color for the header */
189
+ border-radius: 10px; /* Add border radius for rounded corners */
190
+ }
191
+ </style>
192
+ """
 
 
 
193
 
194
+ def update_column_dtype(df, column_name, dtype):
195
+ error_entries = pd.DataFrame()
196
+ flag = None
197
+ if dtype == 'System Detected':
198
+ pass
199
+ elif dtype == 'int64':
200
+ try:
201
+ df[column_name] = df[column_name].astype('int64')
202
+ except ValueError:
203
+ error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
204
+ st.error('Unable to convert some entries to integer. Please Clean the column.')
205
+ elif dtype == 'float64/numeric':
206
+ try:
207
+ df[column_name] = df[column_name].astype('float64')
208
+ except ValueError:
209
+ error_entries = df[pd.to_numeric(df[column_name], errors='coerce').isna()]
210
+ st.error('Unable to convert some entries to float. Please Clean the column.')
211
+ elif dtype == 'id':
212
+ try:
213
+ df[column_name] = df[column_name].astype('int64')
214
+ except ValueError:
215
+ error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
216
+ st.error('Unable to convert some entries to id. Please Clean the column.')
217
+ elif dtype == 'categorical/string':
218
+ df[column_name] = df[column_name].astype('category')
219
+ elif dtype == 'datetime':
220
+ try:
221
+ df[column_name] = pd.to_datetime(df[column_name], errors='raise', infer_datetime_format=True)
222
+ except ValueError:
223
+ error_entries = df[pd.to_datetime(df[column_name], errors='coerce', infer_datetime_format=True).isna()]
224
+ custom_format = st.text_input("Please provide the datetime format (e.g., %Y-%m-%d):")
225
+ if custom_format:
226
+ try:
227
+ df[column_name] = pd.to_datetime(df[column_name], errors='raise', format=custom_format)
228
+ except ValueError:
229
+ error_entries = df[pd.to_datetime(df[column_name], errors='coerce', format=custom_format).isna()]
230
+ st.error('Unable to parse datetime with the provided format. Please Clean the column.')
231
+ elif dtype == 'email':
232
+ df[column_name] = df[column_name].astype('category')
233
+ flag= 'email'
234
+ elif dtype == 'phone_number':
235
+ df[column_name] = df[column_name].astype('category')
236
+ flag= 'phone_number'
237
+
238
+ return df, error_entries, flag
239
 
240
+ def convert_to_special_representation(value):
241
+ value = str(value)
242
+ special_chars = set("!@#$%^&*()_+-=[]{}|;:,.<>?`~")
243
+ result = ''
244
+ for char in value:
245
+ if char.isdigit():
246
+ result += 'N'
247
+ elif char.isalpha():
248
+ result += 'A'
249
+ elif char in special_chars:
250
+ result += char
251
+ else:
252
+ # Handle other characters as needed
253
+ result += char
254
+ return result
 
 
 
 
 
 
 
 
 
 
 
255
  with st.container(border=True):
256
+ st.subheader('SELECT TABLE')
257
+ metadata = SingleTableMetadata()
258
+ conn = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;"
259
+ "Database=sqldb-ext-dev-uks-001;"
260
+ "UID=dbadmin;"
261
+ "PWD=mYpa$$w0rD" )
262
+ query1_1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME in ('TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM') ORDER BY TABLE_NAME ASC"
263
+ query1_2="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME LIKE 'PROFILED%' ORDER BY TABLE_NAME ASC"
264
+ tab_names=list(pd.read_sql_query(query1_1,con=conn)['TABLE_NAME'])
265
+ tab_names_edited= list(pd.read_sql_query(query1_2,con=conn)['TABLE_NAME'])
266
+ sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache)
267
+ mode_selector=st.selectbox("Select How you want to Proceed", ["Start Profiling with Source Data", "Load Previously Profiled Data For Further Processing"], on_change=clear_cache,placeholder='Show Options')
268
+ if mode_selector == "Start Profiling with Source Data":
269
+ table_selector=st.selectbox('SELECT TABLE NAME',tab_names,index=None,on_change=clear_cache,placeholder='Select table name')
270
+
271
+ if mode_selector == "Load Previously Profiled Data For Further Processing":
272
+ table_selector=st.selectbox('SELECT TABLE NAME',tab_names_edited,index=None,on_change=clear_cache,placeholder='Select table name')
273
+
274
+ if table_selector is not None and sample_selector is not None:
275
+ if sample_selector=='100':
276
+ count="top 100"
277
+ elif sample_selector=='10K':
278
+ count="top 10000"
279
+ elif sample_selector=='100K':
280
+ count="top 100000"
281
+ elif sample_selector=='1M':
282
+ count="top 1000000"
283
  else:
284
+ count=""
285
+ query2="select "+count+" * from [dbo].["+table_selector+"]"
286
+ df = pd.read_sql_query(query2,con=conn)
287
+ main_list=df.columns.to_list()
288
+ sub_list=['ID','LOADID','FILE_NAME']
289
+ if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
290
+ df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
291
+ conn.close()
292
+ if 'data' not in st.session_state:
293
+ st.session_state.data= df
294
+ metadata.detect_from_dataframe(st.session_state.data)
295
+ st.sidebar.header("DataFrame Live Preview")
296
+ st.sidebar.markdown("*This Window keeps the live status of the dataframe under processing. You can review this dataframe after all the changes.*")
297
+ df_preview= st.sidebar.empty()
298
+ df_preview.write(st.session_state.data)
299
+ st.markdown(content_style, unsafe_allow_html=True)
300
+ with st.container(border=True):
301
+ cols= df.columns.to_list()
302
+ primary_key= metadata.primary_key
303
+ sugg_primary_keys = [col for col in cols if df[col].is_unique and df[col].dtype != 'float' and not df[col].isnull().any()]
304
+ prob_key= sugg_primary_keys
305
+ if primary_key in sugg_primary_keys:
306
+ default_index = sugg_primary_keys.index(primary_key)
307
+ else:
308
+ sugg_primary_keys.append(primary_key)
309
+ default_index = sugg_primary_keys.index(primary_key)
310
+ no_y_data =[]
311
+ email_cols=[]
312
+ phone_cols=[]
313
+ # cols_select= st.multiselect('Please select column(s) for Profiling and Cleansing', cols, default= cols[:5])
314
+ tabs3= st.tabs(cols)
315
+ for i, tab in enumerate(tabs3):
316
+ with tab:
317
+ col= cols[i]
318
+ scol1,scol2= st.columns([4,1])
319
+ with scol1:
320
+ taba, tabb, tabc, tabd, tabe = st.tabs(["📝 DataType Validation", "🧹 Missing Value Handling", "📈 Statistical Profiling", " ✨ Pattern Exploration", "🤖 AI Assisted Data Cleansing"])
321
+ with taba:
322
+ if st.session_state.data[col].dtype.name == 'category':
323
+ st.session_state.data[col] = st.session_state.data[col].astype('str')
324
+ dtypes= ['System Detected', 'int64', 'float64/numeric', 'id', 'categorical/string','datetime', 'email', 'phone_number']
325
+ no_dtypes= ['int64', 'float64/numeric', 'id', 'categorical/string','datetime', 'email', 'phone_number']
326
+ no_dtype = False
327
+ if metadata.columns[col]['sdtype'] != "unknown":
328
+ datatype= metadata.columns[col]['sdtype']
329
+ st.info(f"System Identified DataType: {datatype}")
330
+ elif str(df[col].dtype) != 'object' and metadata.columns[col]['sdtype'] == "unknown":
331
+ datatype= str(df[col].dtype)
332
+ st.info(f"System Identified DataType: {datatype}")
333
+ else:
334
+ datatype= 'NA'
335
+ #st.warning("System Could Not Understand Datatype. Please Specify the Datatype")
336
+ no_dtype= True
337
+ if datatype in ['int64']:
338
+ def_index=1
339
+ if datatype in ['float64', 'numerical']:
340
+ def_index=2
341
+ if datatype in ['id']:
342
+ def_index=3
343
+ if datatype in ['categorical', 'string']:
344
+ def_index=4
345
+ if datatype in ['datetime']:
346
+ def_index=5
347
+ if datatype in ['email']:
348
+ def_index=6
349
+ if datatype in ['phone_number']:
350
+ def_index=7
351
+
352
+ if col == primary_key:
353
+ st.success("This is System Identified Primary Key")
354
+ elif col in prob_key:
355
+ st.warning("This is System suggested potential Primary Key")
356
+ if f'dtype_{col}' not in st.session_state:
357
+ st.session_state[f'dtype_{col}'] = 'initiate'
358
+ if st.session_state[f'dtype_{col}'] not in ['email', 'phone_number']:
359
+ st.session_state.flag = None
360
+
361
+ if no_dtype == True:
362
+ fin_datatype= st.selectbox(f"Please Change/Define the Datatype of column: {col}:",no_dtypes, index=3, key= f'datatype_{col}')
363
+ else:
364
+ fin_datatype= st.selectbox(f"Please Change/Define the Datatype of column: {col}:",dtypes, index=def_index, key= f'datatype_{col}')
365
+ st.session_state[f'dtype_{col}'] = st.session_state[f'datatype_{col}']
366
+ st.session_state.data, error_df, st.session_state.flag= update_column_dtype(st.session_state.data,col,fin_datatype)
367
+
368
+ if error_df.empty:
369
+ st.success("No Datatype Validation Errors For Current Datatype")
370
+ try:
371
+ df_preview.write(st.session_state.data)
372
+ except:
373
+ st.warning("DataFrame Updated. But Could Not Load Preview")
374
+ else:
375
+ st.subheader("Prepare the Column for Conversion:")
376
+ try:
377
+ edited_error_df= st.data_editor(error_df, num_rows="dynamic",column_config={
378
+ col: st.column_config.TextColumn(
379
+ col,
380
+ width="medium",
381
+ )
382
+ }, key=f'dtype_error_{col}')
383
+ except:
384
+ edited_error_df= st.data_editor(error_df, num_rows="dynamic",column_config={
385
+ col: st.column_config.TextColumn(
386
+ col,
387
+ width="medium",
388
+ )
389
+ }, key=f'dtype_error_{col}')
390
+ check = st.button("Fix Error", key=f"Fix{col}")
391
+ if check:
392
+ st.session_state.data= st.session_state.data.drop(error_df.index)
393
+ st.session_state.data = pd.concat([st.session_state.data, edited_error_df])
394
+ df_preview.write(st.session_state.data)
395
+ if fin_datatype in ['id', 'email', 'phone_number']:
396
+ no_y_data.append(col)
397
+ if fin_datatype in ['email']:
398
+ email_cols.append(col)
399
+ if fin_datatype in ['phone_number']:
400
+ phone_cols.append(col)
401
+ no_y_data.extend(['Validity','Validity_phone','Validity_email'])
402
+ total_records = len(st.session_state.data)
403
+ with tabc:
404
+ if col not in no_y_data:
405
+ y_data_col= st.session_state.data[[col]]
406
+ pr = ProfileReport(y_data_col, dark_mode=True, explorative=False, config_file=r"ydata_config.yml")
407
+ pr.config.html.style.primary_colors = ['#e41a1c']
408
+ with st.container(border=True):
409
+ st_profile_report(pr, navbar=False, key=f'profile{col}')
410
+ elif col in email_cols:
411
+ unique_emails = st.session_state.data[col].nunique()
412
+ duplicate_emails = total_records - unique_emails
413
+ # Extract email domains
414
+ email_domains = st.session_state.data[col].str.extract(r'@(.+)$')[0]
415
+ # Count occurrences of each domain
416
+ email_domain_counts = email_domains.value_counts()
417
+ # Get the top 5 email domains
418
+ top_email_domains = email_domain_counts.head(5)
419
+
420
+
421
+ # Format the top email domains for display
422
+ top_email_domains_str = '\n|\n'.join([f"{domain}: {count}" for domain, count in top_email_domains.items()])
423
+ if f'invalid_em_{col}' in st.session_state:
424
+ invalid_emails= len(st.session_state[f'invalid_em_{col}'])
425
+ valid_emails= total_records - invalid_emails
426
+ percent_invalid_emails = invalid_emails / total_records * 100
427
+ email_message = f"""
428
+ ## Email Column: {col}\n\n **Valid Emails:** {valid_emails} ({100 - percent_invalid_emails:.2f}%)\n\n---------------------------------------------------------------------------------------\n\n**Invalid Emails:** {invalid_emails} ({percent_invalid_emails:.2f}%)\n\n----------------------------------------------------------------------------------------\n\n**Unique Emails:** {unique_emails}\n\n-------------------------------------------------------------------------------------------------------------------------\n\n**Duplicate Emails:** {duplicate_emails}\n\n----------------------------------------------------------------------------------------------------------------------\n\n**Top 5 Email Domains:** {top_email_domains_str}
429
+ """
430
+
431
+ else:
432
+ invalid_emails= "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
433
+ valid_emails= "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
434
+ percent_invalid_emails = "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
435
+
436
+ email_message = f"""
437
+ ## Email Column: {col}\n\n **Valid Emails:** {valid_emails} \n\n---------------------------------------------------------------------------------------\n\n**Invalid Emails:** {invalid_emails}\n\n----------------------------------------------------------------------------------------\n\n**Unique Emails:** {unique_emails}\n\n-------------------------------------------------------------------------------------------------------------------------\n\n**Duplicate Emails:** {duplicate_emails}\n\n----------------------------------------------------------------------------------------------------------------------\n\n**Top 5 Email Domains:** {top_email_domains_str}
438
+ """
439
+
440
+ with st.container(border=True):
441
+ st.markdown(str(email_message))
442
+ ref_em=st.button('Refresh', key=f'email{col}')
443
+ if ref_em:
444
+ pass
445
+
446
+
447
+ elif col in phone_cols:
+ unique_phones = st.session_state.data[col].nunique()
+ duplicate_phones = total_records - unique_phones
+ phone_country_codes = st.session_state.data[col].str.extract(r'^\+(\d+)')[0].value_counts()
+ # Take the five most frequent country codes as a list of code strings
+ # (the previous list(to_string()) approach split the rendered table into single characters).
+ top_phone_country_codes = phone_country_codes.head(5).index.tolist()
+ if f'invalid_ph_{col}' in st.session_state:
+ invalid_phones = len(st.session_state[f'invalid_ph_{col}'])
+ valid_phones = total_records - invalid_phones
+ percent_invalid_phones = invalid_phones / total_records * 100
+ phone_message = f"""
+ ## Phone Number Column: {col}\n\n**Valid Phone Numbers:** {valid_phones} ({100 - percent_invalid_phones:.2f}%)\n\n---\n\n**Invalid Phone Numbers:** {invalid_phones} ({percent_invalid_phones:.2f}%)\n\n---\n\n**Unique Phone Numbers:** {unique_phones}\n\n---\n\n**Duplicate Phone Numbers:** {duplicate_phones}\n\n---\n\n**Top 5 Phone Country Codes:** {top_phone_country_codes}
+ """
+ else:
+ invalid_phones = "Please run AI Assisted Data Validation on phone number columns to include this metric in their profiling report."
+ valid_phones = "Please run AI Assisted Data Validation on phone number columns to include this metric in their profiling report."
+ percent_invalid_phones = "Please run AI Assisted Data Validation on phone number columns to include this metric in their profiling report."
+ phone_message = f"""
+ ## Phone Number Column: {col}\n\n**Valid Phone Numbers:** {valid_phones}\n\n---\n\n**Invalid Phone Numbers:** {invalid_phones}\n\n---\n\n**Unique Phone Numbers:** {unique_phones}\n\n---\n\n**Duplicate Phone Numbers:** {duplicate_phones}\n\n---\n\n**Top 5 Phone Country Codes:** {top_phone_country_codes}
+ """
+
+ with st.container(border=True):
+ st.markdown(str(phone_message))
+ ref_ph = st.button('Refresh', key=f'phone{col}')
+ if ref_ph:
+ pass  # no-op: the click itself triggers a Streamlit rerun, refreshing the stats above
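# Illustrative sketch (toy data; `demo_phone_codes` is a hypothetical helper, not part
# of the app) of why the `str.extract(r'^\+(\d+)')` + `value_counts()` combination above
# surfaces country codes: the regex captures the digits that follow a leading '+'.
import pandas as pd

def demo_phone_codes() -> list:
    s = pd.Series(['+44 7700 900123', '+1-202-555-0100', '+44 7700 900456', '07700 900789'])
    # Rows without a leading '+' become NaN and are ignored by value_counts()
    codes = s.str.extract(r'^\+(\d+)')[0].value_counts()
    return codes.head(5).index.tolist()  # -> ['44', '1']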
+ with tabd:
+ st.session_state.data_encoded = st.session_state.data.copy()
+ st.session_state.data_encoded[f'Pattern_{col}'] = st.session_state.data_encoded[col].apply(convert_to_special_representation)
+ patterns = list(st.session_state.data_encoded[f'Pattern_{col}'].unique())
+ patt_col1, patt_col2 = st.columns([1, 4])
+ with patt_col1:
+ st.session_state.pattern_list = pd.DataFrame(patterns, columns=['Pattern Name'])
+ event = st.dataframe(
+ st.session_state.pattern_list,
+ key=f"pattern_list_data{col}",
+ on_select="rerun",
+ selection_mode=["multi-row"],
+ hide_index=True,
+ width=10000,
+ height=450
+ )
+ # Avoid shadowing the built-in `filter`
+ if len(event.selection.rows) > 0:
+ selected_patterns = list(st.session_state.pattern_list.loc[event.selection.rows]['Pattern Name'].values)
+ else:
+ selected_patterns = None
+ if selected_patterns is not None:
+ with patt_col2:
+ with st.container(border=True, height=450):
+ st.write("#####")
+
+ if not st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)].empty:
+ st.session_state.data_encoded[col] = st.session_state.data_encoded[col].astype('str')
+ try:
+ edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)], num_rows="dynamic", column_config={
+ col: st.column_config.TextColumn(
+ col,
+ width="medium",
+ )
+ }, height=300, key=f'Valid_pattern_{col}')
+ except:
+ edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)], num_rows="dynamic", column_config={
+ col: st.column_config.Column(
+ col,
+ width="medium",
+ )
+ }, height=300, key=f'Valid_pattern_{col}')
+ valid_pattern = st.button("Confirm", key=f"Fix_valid_pattern_{col}")
+ if valid_pattern:
+ st.session_state.data = st.session_state.data.drop(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)].index)
+ st.session_state.data = pd.concat([st.session_state.data, edited_pattern_df])
+ st.session_state.data = st.session_state.data.drop([f'Pattern_{col}'], axis=1)
+ st.session_state.data = st.session_state.data.sort_index()
+ df_preview.write(st.session_state.data)
+ else:
+ with patt_col2:
+ with stylable_container(
+ key=f"container_select_pattern_none{col}",
+ css_styles="""
+ {
+ border: 1px solid white;
+ border-radius: 0.5rem;
+ padding: calc(1em - 1px);
+ width: 100%;
+ color: orange;
+ size: 100px;
+ }
+ """
+ ):
+ st.write('##\n\n##\n\n')
+ st.markdown("""
+ <style>
+ .big-font {
+ font-size:15px;
+ width: 100%;
+ text-align: center;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+ st.markdown(f'<p class="big-font">🛈 {len(st.session_state.pattern_list)} patterns were detected in this column. Please select pattern(s) to view the matching records.</p>', unsafe_allow_html=True)
+ st.write('##\n\n##\n\n')
+
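# Minimal sketch, in the spirit of `convert_to_special_representation` (defined earlier
# in this file): collapse each value into a character-class signature so rows with the
# same shape group under one pattern. The exact mapping below is an assumption for
# illustration, not the app's implementation.
def pattern_signature(value) -> str:
    out = []
    for ch in str(value):
        if ch.isalpha():
            out.append('A')   # any letter
        elif ch.isdigit():
            out.append('9')   # any digit
        else:
            out.append(ch)    # punctuation and spaces kept literally
    return ''.join(out)

# pattern_signature('AB-1234') -> 'AA-9999'; pattern_signature('ab_12@x') -> 'AA_99@A'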
+ with tabb:
  try:
+ # .isna() already covers None values, so only the empty-string case needs an extra check
+ edited_df = st.data_editor(st.session_state.data[(st.session_state.data[col].isna()) | (st.session_state.data[col] == '')], num_rows="dynamic", column_config={
  col: st.column_config.TextColumn(
  col,
  width="medium",
  )
+ }, key=f'miss_{col}')
  except:
+ edited_df = st.data_editor(st.session_state.data[(st.session_state.data[col].isna()) | (st.session_state.data[col] == '')], num_rows="dynamic", column_config={
+ col: st.column_config.Column(
  col,
  width="medium",
  )
+ }, key=f'miss_{col}')
+ incol1, incol2, extra = st.columns([1.1, 1.5, 8])
+ with incol1:
+ if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
+ def_fill = st.text_input("Default Autofill Value", key=f"def_fill_{col}")
+ autofill = st.button("Autofill", key=f"autofill_{col}")
+
+ if autofill:
+ # Compare against the same dtype labels used by the widget above; the original
+ # checked ['int', 'float'], which is inconsistent with those labels.
+ if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
+ st.session_state.data[col] = st.session_state.data[col].astype('str').replace('', pd.NA).replace({None: pd.NA}).fillna(def_fill)
+ else:
+ st.session_state.data[col] = st.session_state.data[col].replace({None: pd.NA}).ffill()
+ st.success("Column autofilled. Please review the sidebar for the updated state of the dataframe.")
+ df_preview.write(st.session_state.data)
+ with incol2:
+ confirm = st.button("Confirm", key=f"Confirm_{col}")
+ if confirm:
+ st.session_state.data[col] = st.session_state.data[col].replace('', np.nan).replace({None: np.nan})
+ st.session_state.data = st.session_state.data.dropna(subset=[col])
+ st.session_state.data.update(edited_df)
+ st.session_state.data = pd.concat([st.session_state.data, edited_df[~edited_df.index.isin(st.session_state.data.index)]])
+ st.session_state.data = st.session_state.data.sort_index()
+ st.success("State saved. Please review the sidebar for the updated state of the dataframe.")
+ df_preview.write(st.session_state.data)
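# Sketch (toy series, not wired to the UI) of the two autofill strategies used above:
# a constant default for string-like columns versus forward-fill for numeric ones.
import pandas as pd

def demo_autofill():
    text_col = pd.Series(['a', '', None, 'd'])
    num_col = pd.Series([1.0, None, None, 4.0])
    filled_text = text_col.replace('', pd.NA).fillna('UNKNOWN')  # -> a, UNKNOWN, UNKNOWN, d
    filled_num = num_col.ffill()                                 # -> 1.0, 1.0, 1.0, 4.0
    return filled_text, filled_num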
+ with tabe:
+ if "overall_invalid_df" not in st.session_state:
+ st.session_state.overall_invalid_df = pd.DataFrame()
+ if (st.session_state[f'dtype_{col}'] not in ['email', 'phone_number'] and st.session_state.flag not in ['email', 'phone_number']):
+ st.dataframe(st.session_state.data)
+ AI_check = st.button("Check For Anomalies", key=f'AI_CHECK_{col}')
+ if AI_check:
+ with st.spinner("Running Anomaly Detection AI"):
+
+ # Categorical columns need the sentinel registered as a category before fillna
+ if st.session_state[f'dtype_{col}'] in ['categorical/string']:
+ if 'missing@123' not in st.session_state.data[col].cat.categories:
+ st.session_state.data[col] = st.session_state.data[col].cat.add_categories(['missing@123'])
+ st.session_state.data[col] = st.session_state.data[col].fillna('missing@123').astype(str)
+ st.session_state.data_encoded = st.session_state.data[col].apply(convert_to_special_representation)
+ # Bag-of-characters encoding of the pattern strings
+ mixed_transformer = Pipeline(steps=[
+ ('vectorizer', CountVectorizer(analyzer='char', lowercase=False))
+ ])
+
+ df_transformed = mixed_transformer.fit_transform(st.session_state.data_encoded)
+
+ input_dim = df_transformed.shape[1]
+ encoding_dim = (input_dim // 2) + 1
+
+ # Convolutional + bidirectional-LSTM autoencoder over the character-count vectors
+ input_layer = Input(shape=(None, input_dim))
+ conv1d_layer = Conv1D(64, 3, activation='relu', padding='same')(input_layer)
+ maxpooling_layer = MaxPooling1D(pool_size=2, padding='same')(conv1d_layer)
+ encoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=False))(maxpooling_layer)
+
+ repeat_vector = RepeatVector(input_dim)(encoder_lstm)
+ decoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=True))(repeat_vector)
+ conv1d_layer_decoder = Conv1D(64, 3, activation='relu', padding='same')(decoder_lstm)
+ upsampling_layer = Conv1D(input_dim, 2, activation='relu', padding='same')(conv1d_layer_decoder)
+
+ autoencoder = Model(inputs=input_layer, outputs=upsampling_layer)
+
+ autoencoder.compile(optimizer=Adam(), loss=MeanSquaredError())
+ autoencoder.fit(np.expand_dims(df_transformed.toarray(), axis=1), np.expand_dims(df_transformed.toarray(), axis=1),
+ epochs=100, batch_size=2, shuffle=True, validation_split=0.2, verbose=1)
+ reconstructions = autoencoder.predict(np.expand_dims(df_transformed.toarray(), axis=1))
+ reconstruction_error = np.mean(np.abs(reconstructions - np.expand_dims(df_transformed.toarray(), axis=1)), axis=(1, 2))
+
+ # Rows whose reconstruction error falls in the top 5% are flagged as anomalous;
+ # adjust the percentile to tune sensitivity.
+ threshold = np.percentile(reconstruction_error, 95)
+ st.session_state.data['Validity'] = ['Invalid' if error > threshold else 'Valid' for error in reconstruction_error]
+ st.session_state.data[col] = st.session_state.data[col].replace('missing@123', '')
+ st.session_state[f"invalid_ai_data_{col}"] = st.session_state.data[st.session_state.data['Validity'] == 'Invalid']
+ if f"invalid_ai_data_{col}" in st.session_state:
645
+ st.session_state[f"invalid_ai_data_{col}"]["Invalid Field"] = col
646
+ if 'Validity' in st.session_state[f"invalid_ai_data_{col}"].columns:
647
+ st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"].drop(['Validity'], axis=1)], ignore_index=True)
648
+ else:
649
+ st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"]], ignore_index=True)
650
+
651
+ try:
652
+ edited_valid_df= st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic",column_config={
653
+ col: st.column_config.TextColumn(
654
+ col,
655
+ width="medium",
656
+ )
657
+ }, key=f'Valid_{col}')
658
+ except:
659
+ edited_valid_df= st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic",column_config={
660
+ col: st.column_config.Column(
661
+ col,
662
+ width="medium",
663
+ )
664
+ }, key=f'Valid_{col}')
665
+ valid = st.button("Confirm", key=f"Fix_valid_{col}")
666
+ #my_bar.empty()
667
+ if valid:
668
+ st.session_state.data= st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity'] == 'Invalid'].index)
669
+ st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
670
+ st.session_state.data= st.session_state.data.sort_index()
671
+ df_preview.write(st.session_state.data)
672
 
673
+
674
+
675
+
676
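# Sketch of the percentile-threshold rule the autoencoder branch applies above: scores
# in the top 5% of reconstruction error are flagged. Toy numbers, independent of the model.
import numpy as np

def demo_threshold():
    reconstruction_error = np.array([0.01, 0.02, 0.015, 0.9, 0.018])
    threshold = np.percentile(reconstruction_error, 95)
    # -> ['Valid', 'Valid', 'Valid', 'Invalid', 'Valid']
    return ['Invalid' if e > threshold else 'Valid' for e in reconstruction_error]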
+ elif (st.session_state[f'dtype_{col}'] in ['phone_number'] or st.session_state.flag in ['phone_number']):
+ # Permissive check: optional leading '+', then digits, spaces, hyphens, parentheses
+ phone_regex = r'^\+?[0-9\s\-\(\)]+$'
+ st.session_state.data['Validity_phone'] = st.session_state.data[col].apply(lambda xy: 'phone_is_valid' if re.match(phone_regex, str(xy)) else 'phone_is_invalid')
+ st.session_state[f'invalid_phone_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
+ if f'invalid_phone_{col}_check' not in st.session_state:
+ st.session_state[f'invalid_phone_{col}']["Invalid Field"] = col
+ st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_phone_{col}']], ignore_index=True, axis=0)
+ st.session_state[f'invalid_phone_{col}_check'] = 'yes'
  try:
+ edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
  col: st.column_config.TextColumn(
  col,
  width="medium",
  )
+ }, num_rows="dynamic", key=f'Valid_phone_{col}')
  except:
+ edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
  col: st.column_config.Column(
  col,
  width="medium",
  )
+ }, num_rows="dynamic", key=f'Valid_phone_{col}')
+ valid_phone = st.button("Confirm", key=f"Fix_valid_phone_{col}")
+ if valid_phone:
+ st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].index)
  st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
+ st.session_state[f'invalid_ph_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
+ st.session_state.data = st.session_state.data.drop(['Validity_phone'], axis=1)
+
  df_preview.write(st.session_state.data)
+
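# Quick illustration (toy values) of what the permissive phone pattern above accepts:
# an optional leading '+', then digits, spaces, hyphens, and parentheses only.
import re

def demo_phone_regex():
    pattern = r'^\+?[0-9\s\-\(\)]+$'
    samples = ['+44 (0) 7700-900123', '020 7946 0958', '+1.202.555.0100', 'N/A']
    # -> True, True, False (dots not allowed), False (letters not allowed)
    return {s: bool(re.match(pattern, s)) for s in samples}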
+ elif (st.session_state[f'dtype_{col}'] in ['email'] or st.session_state.flag in ['email']):
+ email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
+ # Cast to str before matching so NaN/non-string values are treated as invalid
+ st.session_state.data['Validity_email'] = st.session_state.data[col].apply(lambda x: 'email_is_valid' if re.match(email_regex, str(x)) else 'email_is_invalid')
+ if st.session_state.data[col].dtype.name == 'category':
+ st.session_state.data[col] = st.session_state.data[col].astype('str')
+ st.session_state[f'invalid_email_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
+ if f'invalid_email_{col}_check' not in st.session_state:
+ st.session_state[f'invalid_email_{col}']["Invalid Field"] = col
+ st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_email_{col}']], ignore_index=True, axis=0)
+ st.session_state[f'invalid_email_{col}_check'] = 'yes'
+ try:
+ edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
+ col: st.column_config.TextColumn(
+ col,
+ width="medium",
+ )
+ }, key=f'Valid_email_{col}')
+ except:
+ edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
  col: st.column_config.Column(
  col,
  width="medium",
  )
  }, key=f'Valid_email_{col}')
+ valid_email = st.button("Confirm", key=f"Fix_valid_email_{col}")
+ if valid_email:
+ st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].index)
+ st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
+ st.session_state[f'invalid_em_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
+ st.session_state.data = st.session_state.data.drop(['Validity_email'], axis=1)
+ df_preview.write(st.session_state.data)
+
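# Quick illustration (toy values) of the email pattern used above: a local part,
# a single '@', and a dotted domain.
import re

def demo_email_regex():
    pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    samples = ['jane.doe+news@example.co.uk', 'no-at-sign.example.com', 'a@b', 'x@y.io']
    # -> True, False (no '@'), False (no dot after '@'), True
    return {s: bool(re.match(pattern, s)) for s in samples}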
+ with scol2:
+ st.markdown("**Column Being Processed**")
+ col_view = st.empty()
+ try:
+ col_view.write(st.session_state.data[col])
+ except:
+ st.warning("DataFrame updated, but the preview could not be loaded.")
+
+ pkcol1, pkcol2 = st.columns(2)
+ with pkcol1:
+ if primary_key is not None:
+ st.info(f"Primary Key Identified by AI: {primary_key}")
+ else:
+ st.warning("Could not finalize the primary key automatically. Please go through the suggestions and finalize one.")
+ with pkcol2:
+ st.selectbox("Please Finalize the Primary Key:", sugg_primary_keys, index=default_index)
+
+ with st.expander("Save and Download Data"):
762
+ name_data= st.text_input("Please Specify Name of the saved/downloaded data")
763
+ csv = st.session_state.data.to_csv(index=False).encode('utf-8')
764
+ for col in ['Validity', 'Validity_email', 'Validity_phone']:
765
+ if col in st.session_state.overall_invalid_df:
766
+ st.session_state.overall_invalid_df = st.session_state.overall_invalid_df.drop([col], axis=1)
767
+ csv2 = st.session_state.overall_invalid_df.to_csv(index=False).encode('utf-8')
768
+ #st.write(st.session_state.overall_invalid_df)
769
+ # Create a download button
770
+ dldcol1, dldcol2= st.columns([1,4])
771
+ with dldcol1:
772
+ st.download_button(
773
+ label="Download Cleaned Data as CSV",
774
+ data=csv,
775
+ file_name=f'{name_data}.csv',
776
+ mime='text/csv',
777
+ )
778
+ with dldcol2:
779
+ st.download_button(
780
+ label="Download Anomalous Data as CSV",
781
+ data=csv2,
782
+ file_name=f'Anomaly_{name_data}.csv',
783
+ mime='text/csv',
784
+ )
785
+ save = st.button("Save Data For Further Processing")
786
+ if save:
787
+ connection_string = ( 'SERVER=sql-ext-dev-uks-001.database.windows.net;'
788
+ 'DATABASE=sqldb-ext-dev-uks-001;'
789
+ 'UID=dbadmin;'
790
+ 'PWD=mYpa$$w0rD'
791
+ )
792
+ st.session_state.data = st.session_state.data.astype(str)
793
+ load_dataframe_to_sqlserver(st.session_state.data, f'[dbo].[PROFILED_{name_data}]', connection_string)
794
 
+ ######
+ if __name__ == '__main__':
+ main()