import base64
import re
import warnings

import numpy as np
import pandas as pd
import pyodbc
import pymssql
import streamlit as st
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from streamlit_extras.dataframe_explorer import dataframe_explorer
from streamlit_extras.stateful_button import button
from streamlit_extras.stylable_container import stylable_container
from streamlit_pandas_profiling import st_profile_report
from tensorflow.keras.layers import (Input, Dense, LSTM, Bidirectional, Conv1D,
                                     MaxPooling1D, Flatten, Concatenate, Reshape, RepeatVector)
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from ydata_profiling import ProfileReport

from streamlit_app import sidebar

warnings.filterwarnings('ignore')
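
# Profilify: browse a SQL Server table, validate column datatypes, handle
# missing values, explore value patterns, and run AI assisted anomaly
# detection before saving the cleaned data back to the database.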

st.set_page_config(
    page_title='Profilify: Your AI Assisted Data Profiling App',
    layout='wide',
    initial_sidebar_state='collapsed'
)
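
# The .st-emotion-cache-* selectors below are auto-generated Streamlit class
# names; they are version-specific and may need re-inspecting after upgrades.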
st.markdown("""
    <style>
    /* Remove blank space at top and bottom */
    .block-container {
        padding-top: 2.8rem;
        /*padding-bottom: 1rem;*/
    }

    /* Remove blank space at the center canvas */
    .st-emotion-cache-z5fcl4 {
        position: relative;
        top: -62px;
    }

    /* Make the toolbar transparent and the content below it clickable */
    .st-emotion-cache-18ni7ap {
        pointer-events: none;
        background: rgb(255 255 255 / 0%);
    }
    .st-emotion-cache-zq5wmm {
        pointer-events: auto;
        background: rgb(255 255 255);
        border-radius: 5px;
    }
    </style>
""", unsafe_allow_html=True)


def main():
    """Render the shared navigation sidebar; the page body below runs at import time."""
    sidebar()


def load_dataframe_to_sqlserver(df, table_name, connection_string):
    """Replace `table_name` in SQL Server with the contents of `df`."""
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    # Drop any existing table with the same name. table_name is interpolated
    # directly into the SQL, so it must come from a trusted source.
    drop_table_sql = f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}"
    try:
        cursor.execute(drop_table_sql)
        conn.commit()
    except Exception as e:
        st.error(f"Error dropping table: {e}. Please try with a different name.")

    # All columns are created as NVARCHAR(MAX); the dataframe is cast to str
    # before saving, so no per-column type mapping is attempted.
    create_table_sql = f"CREATE TABLE {table_name} ("
    for column in df.columns:
        create_table_sql += f"{column} NVARCHAR(MAX), "
    create_table_sql = create_table_sql.rstrip(', ') + ')'
    try:
        cursor.execute(create_table_sql)
        conn.commit()
    except Exception as e:
        st.error(f"Error creating table: {e}. Please try with a different name.")

    insert_sql = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({', '.join(['?' for _ in df.columns])})"
    try:
        cursor.fast_executemany = True
        cursor.executemany(insert_sql, df.values.tolist())
        conn.commit()
        st.success(f"Data Imported with table name: '{table_name}' successfully.")
    except Exception as e:
        st.error(f"Error inserting data: {e}. Please try with a different name.")

    cursor.close()
    conn.close()
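
# Usage sketch (connection details illustrative; the ODBC driver name depends
# on what is installed on the host):
#   conn_str = ('DRIVER={ODBC Driver 17 for SQL Server};'
#               'SERVER=<server>;DATABASE=<db>;UID=<user>;PWD=<password>')
#   load_dataframe_to_sqlserver(df.astype(str), '[dbo].[PROFILED_demo]', conn_str)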


def clear_cache():
    # Drop every session_state key so all widgets re-initialise after a
    # selection change.
    keys = list(st.session_state.keys())
    for key in keys:
        st.session_state.pop(key)


def set_bg_hack(main_bg):
    '''
    Unpack an image from the root folder and set it as the page background.

    Returns
    -------
    None; the background CSS is injected directly into the page.
    '''
    main_bg_ext = "png"

    st.markdown(
        f"""
        <style>
        .stApp {{
            background: url(data:image/{main_bg_ext};base64,{base64.b64encode(open(main_bg, "rb").read()).decode()});
            background-size: cover
        }}
        </style>
        """,
        unsafe_allow_html=True
    )
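
# Example call (asset path illustrative): set_bg_hack("background.png")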


header_style = """
    <style>
    .header {
        color: black; /* Black text for readability */
        width: 103%;
        font-size: 60px; /* Large font size */
        font-weight: bold; /* Bold text */
        line-height: 1.2; /* Improved readability */
        margin-bottom: 30px; /* Add some space below the header */
        padding: 20px; /* Add padding for better spacing */
        background-image:
            linear-gradient(to right, rgba(255, 140, 0, 0.3) 25%, transparent 75%), /* Darker orange with higher opacity */
            linear-gradient(to bottom, rgba(255, 140, 0, 0.3) 15%, transparent 75%),
            linear-gradient(to left, rgba(255, 140, 0, 0.3) 25%, transparent 55%),
            linear-gradient(to top, rgba(255, 140, 0, 0.3) 25%, transparent 95%);
        background-blend-mode: overlay;
        background-size: 250px 350px;
        border-radius: 10px; /* Add border radius for rounded corners */
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
    }
    </style>
"""

content_style = """
    <style>
    .content {
        font-size: 40px; /* Larger font size for content */
        line-height: 1.6; /* Improved readability */
        width: 103%;
        padding: 10px; /* Add padding for better spacing */
        margin-bottom: 20px;
        background-color: skyblue; /* Background color for the content block */
        border-radius: 10px; /* Add border radius for rounded corners */
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* Add shadow for depth */
    }
    </style>
"""

small_style = """
    <style>
    .small {
        color: black;
        font-size: 30px; /* Larger font size for content */
        line-height: 1.6; /* Improved readability */
        width: 100%;
        padding: 10px; /* Add padding for better spacing */
        margin-bottom: 10px;
        background-color: white; /* Background color for the block */
        border-radius: 10px; /* Add border radius for rounded corners */
    }
    </style>
"""


def update_column_dtype(df, column_name, dtype):
    error_entries = pd.DataFrame()
    flag = None
    if dtype == 'System Detected':
        pass  # keep whatever pandas inferred
    elif dtype == 'int64':
        try:
            df[column_name] = df[column_name].astype('int64')
        except ValueError:
            # isdigit() flags anything that is not a plain unsigned integer string
            error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
            st.error('Unable to convert some entries to integer. Please clean the column.')
    elif dtype == 'float64/numeric':
        try:
            df[column_name] = df[column_name].astype('float64')
        except ValueError:
            error_entries = df[pd.to_numeric(df[column_name], errors='coerce').isna()]
            st.error('Unable to convert some entries to float. Please clean the column.')
    elif dtype == 'id':
        try:
            df[column_name] = df[column_name].astype('int64')
        except ValueError:
            error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())]
            st.error('Unable to convert some entries to id. Please clean the column.')
    elif dtype == 'categorical/string':
        df[column_name] = df[column_name].astype('category')
    elif dtype == 'datetime':
        # infer_datetime_format is deprecated on newer pandas (format inference
        # is the default there) but harmless with warnings suppressed.
        try:
            df[column_name] = pd.to_datetime(df[column_name], errors='raise', infer_datetime_format=True)
        except ValueError:
            error_entries = df[pd.to_datetime(df[column_name], errors='coerce', infer_datetime_format=True).isna()]
            custom_format = st.text_input("Please provide the datetime format (e.g., %Y-%m-%d):")
            if custom_format:
                try:
                    df[column_name] = pd.to_datetime(df[column_name], errors='raise', format=custom_format)
                except ValueError:
                    error_entries = df[pd.to_datetime(df[column_name], errors='coerce', format=custom_format).isna()]
                    st.error('Unable to parse datetime with the provided format. Please clean the column.')
    elif dtype == 'email':
        df[column_name] = df[column_name].astype('category')
        flag = 'email'
    elif dtype == 'phone_number':
        df[column_name] = df[column_name].astype('category')
        flag = 'phone_number'

    return df, error_entries, flag


def convert_to_special_representation(value):
    # Encode a value as a coarse pattern string: digits become 'N', letters
    # become 'A', and special characters pass through unchanged.
    value = str(value)
    special_chars = set("!@#$%^&*()_+-=[]{}|;:,.<>?`~")
    result = ''
    for char in value:
        if char.isdigit():
            result += 'N'
        elif char.isalpha():
            result += 'A'
        elif char in special_chars:
            result += char
        else:
            # Anything else (e.g. whitespace) also passes through unchanged.
            result += char
    return result
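
# Example: convert_to_special_representation("abc-123@x.com") -> "AAA-NNN@A.AAA"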


with st.container(border=True):
    st.subheader('SELECT TABLE')
    metadata = SingleTableMetadata()
    # pymssql.connect takes discrete arguments rather than an ODBC-style
    # connection string. Dev-only credentials are hard-coded here.
    conn = pymssql.connect(server="sql-ext-dev-uks-001.database.windows.net",
                           database="sqldb-ext-dev-uks-001",
                           user="dbadmin",
                           password="mYpa$$w0rD")
    query1_1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME in ('TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM') ORDER BY TABLE_NAME ASC"
    query1_2 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME LIKE 'PROFILED%' ORDER BY TABLE_NAME ASC"
    tab_names = list(pd.read_sql_query(query1_1, con=conn)['TABLE_NAME'])
    tab_names_edited = list(pd.read_sql_query(query1_2, con=conn)['TABLE_NAME'])
    sample_selector = st.selectbox('SELECT SAMPLE SIZE', ['100', '10K', '100K', '1M', 'Full Table'], index=None, placeholder='Select sample size for the table(s)', on_change=clear_cache)
    mode_selector = st.selectbox("Select How you want to Proceed", ["Start Profiling with Source Data", "Load Previously Profiled Data For Further Processing"], on_change=clear_cache, placeholder='Show Options')
    # Initialise so the check below never hits an undefined name.
    table_selector = None
    if mode_selector == "Start Profiling with Source Data":
        table_selector = st.selectbox('SELECT TABLE NAME', tab_names, index=None, on_change=clear_cache, placeholder='Select table name')

    if mode_selector == "Load Previously Profiled Data For Further Processing":
        table_selector = st.selectbox('SELECT TABLE NAME', tab_names_edited, index=None, on_change=clear_cache, placeholder='Select table name')

if table_selector is not None and sample_selector is not None:
    if sample_selector == '100':
        count = "top 100"
    elif sample_selector == '10K':
        count = "top 10000"
    elif sample_selector == '100K':
        count = "top 100000"
    elif sample_selector == '1M':
        count = "top 1000000"
    else:
        count = ""
    query2 = "select " + count + " * from [dbo].[" + table_selector + "]"
    df = pd.read_sql_query(query2, con=conn)
    main_list = df.columns.to_list()
    sub_list = ['ID', 'LOADID', 'FILE_NAME']
    # Source tables carry the technical columns ID/LOADID/FILE_NAME as a
    # contiguous block; drop them when present.
    if any(main_list[i:i + len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
        df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
    conn.close()
    if 'data' not in st.session_state:
        st.session_state.data = df
    metadata.detect_from_dataframe(st.session_state.data)
    st.sidebar.header("DataFrame Live Preview")
    st.sidebar.markdown("*This window keeps the live status of the dataframe under processing. You can review this dataframe after all the changes.*")
    df_preview = st.sidebar.empty()
    df_preview.write(st.session_state.data)
    st.markdown(content_style, unsafe_allow_html=True)
    with st.container(border=True):
        cols = df.columns.to_list()
        primary_key = metadata.primary_key
        sugg_primary_keys = [col for col in cols if df[col].is_unique and df[col].dtype != 'float' and not df[col].isnull().any()]
        prob_key = sugg_primary_keys
        if primary_key in sugg_primary_keys:
            default_index = sugg_primary_keys.index(primary_key)
        else:
            sugg_primary_keys.append(primary_key)
            default_index = sugg_primary_keys.index(primary_key)
        no_y_data = []
        email_cols = []
        phone_cols = []

        tabs3 = st.tabs(cols)
        for i, tab in enumerate(tabs3):
            with tab:
                col = cols[i]
                scol1, scol2 = st.columns([4, 1])
                with scol1:
                    taba, tabb, tabc, tabd, tabe = st.tabs(["📝 DataType Validation", "🧹 Missing Value Handling", "📈 Statistical Profiling", " ✨ Pattern Exploration", "🤖 AI Assisted Data Cleansing"])
                    with taba:
                        if st.session_state.data[col].dtype.name == 'category':
                            st.session_state.data[col] = st.session_state.data[col].astype('str')
                        dtypes = ['System Detected', 'int64', 'float64/numeric', 'id', 'categorical/string', 'datetime', 'email', 'phone_number']
                        no_dtypes = ['int64', 'float64/numeric', 'id', 'categorical/string', 'datetime', 'email', 'phone_number']
                        no_dtype = False
                        if metadata.columns[col]['sdtype'] != "unknown":
                            datatype = metadata.columns[col]['sdtype']
                            st.info(f"System Identified DataType: {datatype}")
                        elif str(df[col].dtype) != 'object' and metadata.columns[col]['sdtype'] == "unknown":
                            datatype = str(df[col].dtype)
                            st.info(f"System Identified DataType: {datatype}")
                        else:
                            datatype = 'NA'
                            no_dtype = True
                        # Default to 'System Detected' so def_index is always
                        # defined, even for sdtypes not covered below.
                        def_index = 0
                        if datatype in ['int64']:
                            def_index = 1
                        if datatype in ['float64', 'numerical']:
                            def_index = 2
                        if datatype in ['id']:
                            def_index = 3
                        if datatype in ['categorical', 'string']:
                            def_index = 4
                        if datatype in ['datetime']:
                            def_index = 5
                        if datatype in ['email']:
                            def_index = 6
                        if datatype in ['phone_number']:
                            def_index = 7

                        if col == primary_key:
                            st.success("This is System Identified Primary Key")
                        elif col in prob_key:
                            st.warning("This is System suggested potential Primary Key")
                        if f'dtype_{col}' not in st.session_state:
                            st.session_state[f'dtype_{col}'] = 'initiate'
                        if st.session_state[f'dtype_{col}'] not in ['email', 'phone_number']:
                            st.session_state.flag = None

                        if no_dtype:
                            fin_datatype = st.selectbox(f"Please Change/Define the Datatype of column: {col}:", no_dtypes, index=3, key=f'datatype_{col}')
                        else:
                            fin_datatype = st.selectbox(f"Please Change/Define the Datatype of column: {col}:", dtypes, index=def_index, key=f'datatype_{col}')
                        st.session_state[f'dtype_{col}'] = st.session_state[f'datatype_{col}']
                        st.session_state.data, error_df, st.session_state.flag = update_column_dtype(st.session_state.data, col, fin_datatype)

                        if error_df.empty:
                            st.success("No Datatype Validation Errors For Current Datatype")
                            try:
                                df_preview.write(st.session_state.data)
                            except Exception:
                                st.warning("DataFrame Updated. But Could Not Load Preview")
                        else:
                            st.subheader("Prepare the Column for Conversion:")
                            # TextColumn works for string-like data; fall back to a
                            # generic Column when the dtype is incompatible.
                            try:
                                edited_error_df = st.data_editor(error_df, num_rows="dynamic", column_config={
                                    col: st.column_config.TextColumn(
                                        col,
                                        width="medium",
                                    )
                                }, key=f'dtype_error_{col}')
                            except Exception:
                                edited_error_df = st.data_editor(error_df, num_rows="dynamic", column_config={
                                    col: st.column_config.Column(
                                        col,
                                        width="medium",
                                    )
                                }, key=f'dtype_error_{col}')
                            check = st.button("Fix Error", key=f"Fix{col}")
                            if check:
                                st.session_state.data = st.session_state.data.drop(error_df.index)
                                st.session_state.data = pd.concat([st.session_state.data, edited_error_df])
                                df_preview.write(st.session_state.data)
                        if fin_datatype in ['id', 'email', 'phone_number']:
                            no_y_data.append(col)
                        if fin_datatype in ['email']:
                            email_cols.append(col)
                        if fin_datatype in ['phone_number']:
                            phone_cols.append(col)
                        no_y_data.extend(['Validity', 'Validity_phone', 'Validity_email'])
                    total_records = len(st.session_state.data)
                    with tabc:
                        if col not in no_y_data:
                            y_data_col = st.session_state.data[[col]]
                            pr = ProfileReport(y_data_col, dark_mode=True, explorative=False, config_file=r"ydata_config.yml")
                            pr.config.html.style.primary_colors = ['#e41a1c']
                            with st.container(border=True):
                                st_profile_report(pr, navbar=False, key=f'profile{col}')
                        elif col in email_cols:
                            unique_emails = st.session_state.data[col].nunique()
                            duplicate_emails = total_records - unique_emails
                            email_domains = st.session_state.data[col].str.extract(r'@(.+)$')[0]
                            email_domain_counts = email_domains.value_counts()
                            top_email_domains = email_domain_counts.head(5)
                            top_email_domains_str = ' | '.join([f"{domain}: {count}" for domain, count in top_email_domains.items()])
                            if f'invalid_em_{col}' in st.session_state:
                                invalid_emails = len(st.session_state[f'invalid_em_{col}'])
                                valid_emails = total_records - invalid_emails
                                percent_invalid_emails = invalid_emails / total_records * 100
                                email_message = (
                                    f"## Email Column: {col}\n\n"
                                    f"**Valid Emails:** {valid_emails} ({100 - percent_invalid_emails:.2f}%)\n\n---\n\n"
                                    f"**Invalid Emails:** {invalid_emails} ({percent_invalid_emails:.2f}%)\n\n---\n\n"
                                    f"**Unique Emails:** {unique_emails}\n\n---\n\n"
                                    f"**Duplicate Emails:** {duplicate_emails}\n\n---\n\n"
                                    f"**Top 5 Email Domains:** {top_email_domains_str}"
                                )
                            else:
                                note = "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them."
                                invalid_emails = valid_emails = note
                                email_message = (
                                    f"## Email Column: {col}\n\n"
                                    f"**Valid Emails:** {valid_emails}\n\n---\n\n"
                                    f"**Invalid Emails:** {invalid_emails}\n\n---\n\n"
                                    f"**Unique Emails:** {unique_emails}\n\n---\n\n"
                                    f"**Duplicate Emails:** {duplicate_emails}\n\n---\n\n"
                                    f"**Top 5 Email Domains:** {top_email_domains_str}"
                                )
                            with st.container(border=True):
                                st.markdown(str(email_message))
                                # The button only triggers a rerun, so the stats above
                                # pick up any recent edits.
                                ref_em = st.button('Refresh', key=f'email{col}')
                                if ref_em:
                                    pass
                        elif col in phone_cols:
                            unique_phones = st.session_state.data[col].nunique()
                            duplicate_phones = total_records - unique_phones
                            phone_country_codes = st.session_state.data[col].str.extract(r'^\+(\d+)')[0].value_counts()
                            # Summarise the five most frequent country codes as "+code: count".
                            top_phone_country_codes = ', '.join(f"+{code}: {count}" for code, count in phone_country_codes.head(5).items())
                            if f'invalid_ph_{col}' in st.session_state:
                                invalid_phones = len(st.session_state[f'invalid_ph_{col}'])
                                valid_phones = total_records - invalid_phones
                                percent_invalid_phones = invalid_phones / total_records * 100
                                phone_message = (
                                    f"## Phone Number Column: {col}\n\n"
                                    f"**Valid Phone Numbers:** {valid_phones} ({100 - percent_invalid_phones:.2f}%)\n\n---\n\n"
                                    f"**Invalid Phone Numbers:** {invalid_phones} ({percent_invalid_phones:.2f}%)\n\n---\n\n"
                                    f"**Unique Phone Numbers:** {unique_phones}\n\n---\n\n"
                                    f"**Duplicate Phone Numbers:** {duplicate_phones}\n\n---\n\n"
                                    f"**Top 5 Phone Country Codes:** {top_phone_country_codes}"
                                )
                            else:
                                note = "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them."
                                invalid_phones = valid_phones = note
                                phone_message = (
                                    f"## Phone Number Column: {col}\n\n"
                                    f"**Valid Phone Numbers:** {valid_phones}\n\n---\n\n"
                                    f"**Invalid Phone Numbers:** {invalid_phones}\n\n---\n\n"
                                    f"**Unique Phone Numbers:** {unique_phones}\n\n---\n\n"
                                    f"**Duplicate Phone Numbers:** {duplicate_phones}\n\n---\n\n"
                                    f"**Top 5 Phone Country Codes:** {top_phone_country_codes}"
                                )
                            with st.container(border=True):
                                st.markdown(str(phone_message))
                                ref_ph = st.button('Refresh', key=f'phone{col}')
                                if ref_ph:
                                    pass
                    with tabd:
                        st.session_state.data_encoded = st.session_state.data.copy()
                        st.session_state.data_encoded[f'Pattern_{col}'] = st.session_state.data_encoded[col].apply(convert_to_special_representation)
                        patterns = list(st.session_state.data_encoded[f'Pattern_{col}'].unique())
                        patt_col1, patt_col2 = st.columns([1, 4])
                        with patt_col1:
                            st.session_state.pattern_list = pd.DataFrame(patterns, columns=['Pattern Name'])
                            event = st.dataframe(
                                st.session_state.pattern_list,
                                key=f"pattern_list_data{col}",
                                on_select="rerun",
                                selection_mode=["multi-row"],
                                hide_index=True,
                                use_container_width=True,
                                height=450
                            )
                            # 'filter' shadowed the builtin, so the selection is
                            # held in selected_patterns instead.
                            if len(event.selection.rows) > 0:
                                selected_patterns = list(st.session_state.pattern_list.loc[event.selection.rows]['Pattern Name'].values)
                            else:
                                selected_patterns = None
                        if selected_patterns is not None:
                            with patt_col2:
                                with st.container(border=True, height=450):
                                    st.write("#####")
                                    if not st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)].empty:
                                        st.session_state.data_encoded[col] = st.session_state.data_encoded[col].astype('str')
                                        try:
                                            edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)], num_rows="dynamic", column_config={
                                                col: st.column_config.TextColumn(
                                                    col,
                                                    width="medium",
                                                )
                                            }, height=300, key=f'Valid_pattern_{col}')
                                        except Exception:
                                            edited_pattern_df = st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)], num_rows="dynamic", column_config={
                                                col: st.column_config.Column(
                                                    col,
                                                    width="medium",
                                                )
                                            }, height=300, key=f'Valid_pattern_{col}')
                                        valid_pattern = st.button("Confirm", key=f"Fix_valid_pattern_{col}")
                                        if valid_pattern:
                                            st.session_state.data = st.session_state.data.drop(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(selected_patterns)].index)
                                            st.session_state.data = pd.concat([st.session_state.data, edited_pattern_df])
                                            st.session_state.data = st.session_state.data.drop([f'Pattern_{col}'], axis=1)
                                            st.session_state.data = st.session_state.data.sort_index()
                                            df_preview.write(st.session_state.data)
                        else:
                            with patt_col2:
                                with stylable_container(
                                    key=f"container_select_pattern_none{col}",
                                    css_styles="""
                                        {
                                            border: 1px solid white;
                                            border-radius: 0.5rem;
                                            padding: calc(1em - 1px);
                                            width: 100%;
                                            color: orange;
                                            size: 100px;
                                        }
                                    """
                                ):
                                    st.write('##\n\n##\n\n')
                                    st.markdown("""
                                        <style>
                                        .big-font {
                                            font-size: 15px;
                                            width: 100%;
                                            text-align: center;
                                        }
                                        </style>
                                    """, unsafe_allow_html=True)
                                    st.markdown(f'<p class="big-font">🛈 There are {len(st.session_state.pattern_list)} patterns available in total. Please select pattern(s) to view the matching records.</p>', unsafe_allow_html=True)
                                    st.write('##\n\n##\n\n')
                    with tabb:
                        # Rows where the column is missing: NaN or the empty string.
                        # (An elementwise "== None" comparison is always False in
                        # pandas, so it added nothing here.)
                        missing_mask = (st.session_state.data[col].isna()) | (st.session_state.data[col] == '')
                        try:
                            edited_df = st.data_editor(st.session_state.data[missing_mask], num_rows="dynamic", column_config={
                                col: st.column_config.TextColumn(
                                    col,
                                    width="medium",
                                )
                            }, key=f'miss_{col}')
                        except Exception:
                            edited_df = st.data_editor(st.session_state.data[missing_mask], num_rows="dynamic", column_config={
                                col: st.column_config.Column(
                                    col,
                                    width="medium",
                                )
                            }, key=f'miss_{col}')

                        incol1, incol2, extra = st.columns([1.1, 1.5, 8])
                        with incol1:
                            if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
                                def_fill = st.text_input("Default Autofill Value", key=f"def_fill_{col}")
                            autofill = st.button("Autofill", key=f"autofill_{col}")

                            if autofill:
                                # The labels must match the selectbox options
                                # ('int64', 'float64/numeric'), not 'int'/'float'.
                                if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']:
                                    st.session_state.data[col] = st.session_state.data[col].astype('str').replace('', pd.NA).replace({None: pd.NA}).fillna(def_fill)
                                else:
                                    # Numeric columns are forward-filled rather than
                                    # filled with a default value.
                                    st.session_state.data[col] = st.session_state.data[col].replace({None: pd.NA}).ffill()
                                st.success("Column Autofilled. Please Review the Sidebar for updated status of the Dataframe.")
                                df_preview.write(st.session_state.data)
                        with incol2:
                            confirm = st.button("Confirm", key=f"Confirm_{col}")
                            if confirm:
                                st.session_state.data[col] = st.session_state.data[col].replace('', np.nan).replace({None: np.nan})
                                st.session_state.data = st.session_state.data.dropna(subset=[col])
                                st.session_state.data.update(edited_df)
                                st.session_state.data = pd.concat([st.session_state.data, edited_df[~edited_df.index.isin(st.session_state.data.index)]])
                                st.session_state.data = st.session_state.data.sort_index()
                                st.success("State Saved. Please Review the Sidebar for updated status of the Dataframe.")
                                df_preview.write(st.session_state.data)
                    with tabe:
                        if "overall_invalid_df" not in st.session_state:
                            st.session_state.overall_invalid_df = pd.DataFrame()
                        if (st.session_state[f'dtype_{col}'] not in ['email', 'phone_number'] and st.session_state.flag not in ['email', 'phone_number']):
                            st.dataframe(st.session_state.data)
                            AI_check = st.button("Check For Anomalies", key=f'AI_CHECK_{col}')
                            if AI_check:
                                with st.spinner("Running Anomaly Detection AI"):
                                    if st.session_state[f'dtype_{col}'] in ['categorical/string']:
                                        # The sentinel 'missing@123' marks missing values
                                        # so they survive the string conversion below.
                                        if 'missing@123' not in st.session_state.data[col].cat.categories:
                                            st.session_state.data[col] = st.session_state.data[col].cat.add_categories(['missing@123'])

                                    st.session_state.data[col] = st.session_state.data[col].fillna('missing@123').astype(str)
                                    st.session_state.data_encoded = st.session_state.data[col].apply(convert_to_special_representation)
                                    mixed_transformer = Pipeline(steps=[
                                        ('vectorizer', CountVectorizer(analyzer='char', lowercase=False))
                                    ])

                                    df_transformed = mixed_transformer.fit_transform(st.session_state.data_encoded)

                                    input_dim = df_transformed.shape[1]
                                    encoding_dim = (input_dim // 2) + 1
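                                    # Anomaly-detection sketch: each value's pattern string
                                    # is vectorised into per-character counts, and a
                                    # convolutional BiLSTM autoencoder is trained to
                                    # reconstruct those vectors. Values whose reconstruction
                                    # error lands above the 95th percentile (roughly the
                                    # worst 5%) are flagged as 'Invalid' below.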
                                    input_layer = Input(shape=(None, input_dim))
                                    conv1d_layer = Conv1D(64, 3, activation='relu', padding='same')(input_layer)
                                    maxpooling_layer = MaxPooling1D(pool_size=2, padding='same')(conv1d_layer)
                                    encoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=False))(maxpooling_layer)

                                    repeat_vector = RepeatVector(input_dim)(encoder_lstm)
                                    decoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=True))(repeat_vector)
                                    conv1d_layer_decoder = Conv1D(64, 3, activation='relu', padding='same')(decoder_lstm)
                                    upsampling_layer = Conv1D(input_dim, 2, activation='relu', padding='same')(conv1d_layer_decoder)

                                    autoencoder = Model(inputs=input_layer, outputs=upsampling_layer)
                                    autoencoder.compile(optimizer=Adam(), loss=MeanSquaredError())

                                    # Each sample is fed as a single "timestep" of length input_dim.
                                    autoencoder.fit(np.expand_dims(df_transformed.toarray(), axis=1), np.expand_dims(df_transformed.toarray(), axis=1),
                                                    epochs=100, batch_size=2, shuffle=True, validation_split=0.2, verbose=1)
                                    reconstructions = autoencoder.predict(np.expand_dims(df_transformed.toarray(), axis=1))
                                    reconstruction_error = np.mean(np.abs(reconstructions - np.expand_dims(df_transformed.toarray(), axis=1)), axis=(1, 2))

                                    threshold = np.percentile(reconstruction_error, 95)

                                    st.session_state.data['Validity'] = ['Invalid' if error > threshold else 'Valid' for error in reconstruction_error]
                                    st.session_state.data[col] = st.session_state.data[col].replace('missing@123', '')
                                    st.session_state[f"invalid_ai_data_{col}"] = st.session_state.data[st.session_state.data['Validity'] == 'Invalid']

                            if f"invalid_ai_data_{col}" in st.session_state:
                                st.session_state[f"invalid_ai_data_{col}"]["Invalid Field"] = col
                                if 'Validity' in st.session_state[f"invalid_ai_data_{col}"].columns:
                                    st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"].drop(['Validity'], axis=1)], ignore_index=True)
                                else:
                                    st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"]], ignore_index=True)
                                try:
                                    edited_valid_df = st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic", column_config={
                                        col: st.column_config.TextColumn(
                                            col,
                                            width="medium",
                                        )
                                    }, key=f'Valid_{col}')
                                except Exception:
                                    edited_valid_df = st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic", column_config={
                                        col: st.column_config.Column(
                                            col,
                                            width="medium",
                                        )
                                    }, key=f'Valid_{col}')
                                valid = st.button("Confirm", key=f"Fix_valid_{col}")

                                if valid:
                                    st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity'] == 'Invalid'].index)
                                    st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
                                    st.session_state.data = st.session_state.data.sort_index()
                                    df_preview.write(st.session_state.data)
                        elif (st.session_state[f'dtype_{col}'] in ['phone_number'] or st.session_state.flag in ['phone_number']):
                            # Accepts an optional leading '+' followed by digits,
                            # spaces, hyphens, and parentheses.
                            phone_regex = r'^\+?[0-9\s\-\(\)]+$'
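                            # Illustrative matches: "+44 20 7946 0958" and "(555) 123-4567"
                            # pass; anything containing letters, e.g. "12a34", is flagged.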
                            st.session_state.data['Validity_phone'] = st.session_state.data[col].apply(lambda xy: 'phone_is_valid' if re.match(phone_regex, str(xy)) else 'phone_is_invalid')
                            st.session_state[f'invalid_phone_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
                            if f'invalid_phone_{col}_check' not in st.session_state:
                                st.session_state[f'invalid_phone_{col}']["Invalid Field"] = col
                                st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_phone_{col}']], ignore_index=True, axis=0)
                                st.session_state[f'invalid_phone_{col}_check'] = 'yes'
                            try:
                                edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
                                    col: st.column_config.TextColumn(
                                        col,
                                        width="medium",
                                    )
                                }, num_rows="dynamic", key=f'Valid_phone_{col}')
                            except Exception:
                                edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={
                                    col: st.column_config.Column(
                                        col,
                                        width="medium",
                                    )
                                }, num_rows="dynamic", key=f'Valid_phone_{col}')
                            valid_phone = st.button("Confirm", key=f"Fix_valid_phone_{col}")
                            if valid_phone:
                                st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].index)
                                st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
                                st.session_state[f'invalid_ph_{col}'] = st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1)
                                st.session_state.data = st.session_state.data.drop(['Validity_phone'], axis=1)
                                df_preview.write(st.session_state.data)
                        elif (st.session_state[f'dtype_{col}'] in ['email'] or st.session_state.flag in ['email']):
                            email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
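                            # Illustrative matches: "a.user+tag@example.co" passes;
                            # "user@@example" or "user@example" (no dot in the domain)
                            # are flagged.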
                            # Cast categorical values to plain strings before validating
                            # so non-string entries don't break re.match.
                            if st.session_state.data[col].dtype.name == 'category':
                                st.session_state.data[col] = st.session_state.data[col].astype('str')
                            st.session_state.data['Validity_email'] = st.session_state.data[col].apply(lambda x: 'email_is_valid' if re.match(email_regex, str(x)) else 'email_is_invalid')
                            st.session_state[f'invalid_email_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
                            if f'invalid_email_{col}_check' not in st.session_state:
                                st.session_state[f'invalid_email_{col}']["Invalid Field"] = col
                                st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_email_{col}']], ignore_index=True, axis=0)
                                st.session_state[f'invalid_email_{col}_check'] = 'yes'
                            try:
                                edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
                                    col: st.column_config.TextColumn(
                                        col,
                                        width="medium",
                                    )
                                }, key=f'Valid_email_{col}')
                            except Exception:
                                edited_valid_df = st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={
                                    col: st.column_config.Column(
                                        col,
                                        width="medium",
                                    )
                                }, key=f'Valid_email_{col}')
                            valid_email = st.button("Confirm", key=f"Fix_valid_email_{col}")
                            if valid_email:
                                st.session_state.data = st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].index)
                                st.session_state.data = pd.concat([st.session_state.data, edited_valid_df])
                                st.session_state[f'invalid_em_{col}'] = st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1)
                                st.session_state.data = st.session_state.data.drop(['Validity_email'], axis=1)
                                df_preview.write(st.session_state.data)
                with scol2:
                    st.markdown("**Column Being Processed**")
                    col_view = st.empty()
                    try:
                        col_view.write(st.session_state.data[col])
                    except Exception:
                        st.warning("DataFrame Updated. But Could Not Load Preview")
        pkcol1, pkcol2 = st.columns(2)
        with pkcol1:
            if primary_key is not None:
                st.info(f"Primary Key Identified by AI: {primary_key}")
            else:
                st.warning("Could Not Finalize the Primary Key Automatically. Please go through the suggestions and Finalize one.")
        with pkcol2:
            st.selectbox("Please Finalize the Primary Key:", sugg_primary_keys, index=default_index)
    with st.expander("Save and Download Data"):
        name_data = st.text_input("Please Specify Name of the saved/downloaded data")
        csv = st.session_state.data.to_csv(index=False).encode('utf-8')
        # Strip helper validity columns from the anomaly report before export.
        for col in ['Validity', 'Validity_email', 'Validity_phone']:
            if col in st.session_state.overall_invalid_df:
                st.session_state.overall_invalid_df = st.session_state.overall_invalid_df.drop([col], axis=1)
        csv2 = st.session_state.overall_invalid_df.to_csv(index=False).encode('utf-8')

        dldcol1, dldcol2 = st.columns([1, 4])
        with dldcol1:
            st.download_button(
                label="Download Cleaned Data as CSV",
                data=csv,
                file_name=f'{name_data}.csv',
                mime='text/csv',
            )
        with dldcol2:
            st.download_button(
                label="Download Anomalous Data as CSV",
                data=csv2,
                file_name=f'Anomaly_{name_data}.csv',
                mime='text/csv',
            )
        save = st.button("Save Data For Further Processing")
        if save:
            # NOTE: pyodbc needs an explicit ODBC driver; 'ODBC Driver 17 for
            # SQL Server' is assumed here and may differ on the host machine.
            connection_string = ('DRIVER={ODBC Driver 17 for SQL Server};'
                                 'SERVER=sql-ext-dev-uks-001.database.windows.net;'
                                 'DATABASE=sqldb-ext-dev-uks-001;'
                                 'UID=dbadmin;'
                                 'PWD=mYpa$$w0rD')
            st.session_state.data = st.session_state.data.astype(str)
            load_dataframe_to_sqlserver(st.session_state.data, f'[dbo].[PROFILED_{name_data}]', connection_string)


if __name__ == '__main__':
    main()