diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/backup/1DATA CATALOG.py b/backup/1DATA CATALOG.py new file mode 100644 index 0000000000000000000000000000000000000000..49c09343f22e621854c9c9db95cb875c0be44372 --- /dev/null +++ b/backup/1DATA CATALOG.py @@ -0,0 +1,221 @@ +import pandas as pd +import numpy as np +import streamlit as st +import sdv +from sdv.datasets.local import load_csvs +from sdv.metadata import MultiTableMetadata +from sdv.multi_table import HMASynthesizer +import time +import os +import gc +import warnings +from PIL import Image +from sdv.metadata import SingleTableMetadata +import pyodbc +import google.generativeai as genai +import textwrap +from streamlit_extras.stylable_container import stylable_container +genai.configure(api_key='AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0') +genai_mod = genai.GenerativeModel( + model_name='models/gemini-1.5-pro-latest' +) + +st.set_page_config(page_title='DATA DISCOVERY') +st.title('AUTOMATED DATA CATALOGUE') +st.subheader('SELECT SOURCE') +select1=st.selectbox('SOURCE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name') +if select1 =='DB_10001': + datasets = load_csvs( + folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE', + read_csv_parameters={ + 'skipinitialspace': True, + 'encoding': 'utf_8' + }) + st.markdown(f"System has found :orange[**{str(len(datasets))} tables**] in the source. Please proceed with selection of mode of discovery.") + select_main = st.selectbox('Please Select Mode of Discovery',('Single Table Discovery','Multi Table Discovery'),key='mainname',index=None,placeholder='Select Mode of Discovery') + if select_main == 'Multi Table Discovery': + with st.spinner('Performing Data Discovery'): + time.sleep(2) + st.success('Data cataloguing complete!') + datasets = load_csvs( + folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE', + read_csv_parameters={ + 'skipinitialspace': True, + 'encoding': 'utf_8' + }) + metadata = MultiTableMetadata() + metadata.detect_from_csvs(folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE') + python_dict = metadata.to_dict() + st.markdown('---') + st.subheader('DATA CATALOG') + # st.json(python_dict) + brands=datasets['brands'] + categories=datasets['categories'] + customers=datasets['CUSTOMER_MASTER_TBL_1'] + orderitems=datasets['order_items'] + orders=datasets['orders'] + products=datasets['products'] + staffs=datasets['staffs'] + stocks=datasets['stocks'] + stores=datasets['stores'] + tables=python_dict['tables'] + table_names=[*tables] + col1, col2, col3 = st.columns([2,2,2]) + + with col1: + def view_callback(): + st.session_state.tdet = False + view= st.button("LIST TABLES",key='view',on_click=view_callback) + with col2: + if 'tdet' not in st.session_state: + st.session_state.tdet = False + tdet1 = st.button("SHOW TABLE DETAILS") + with col3: + rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback) + + if tdet1: + st.session_state.tdet = tdet1 + if view: + #st.write(python_dict) + + st.write(pd.DataFrame(table_names,columns=['TABLE NAME'])) + + if rel: + rlist1=python_dict['relationships'] + 
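        # Shape of the SDV relationship entries consumed by the loop below
        # (illustrative example values only, assumed for the Bike Store schema):
        #   {'parent_table_name': 'brands', 'child_table_name': 'products',
        #    'parent_primary_key': 'brand_id', 'child_foreign_key': 'brand_id'}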
rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY']) + for i in range(len(rlist1)): + rlist=rlist1[i] + nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i]) + rdf=pd.concat([rdf,nrow],ignore_index=True) + st.write(rdf) + if st.session_state.tdet is True: + def tdet_callback(): + st.session_state.tdet=True + st.subheader('Select table name to view') + sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback) + col4, col5 = st.columns([1, 3]) + with col4: + preview= st.button("PREVIEW TABLE",key='preview') + with col5: + cdet = st.button("GET COLUMN DETAILS",key='prof') + if preview: + st.write(datasets[sbox1]) + if cdet: + cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information']) + t_dict=tables[sbox1] + c_dict=t_dict['columns'] + i=0 + for key in c_dict: + e_dict=c_dict[key] + if 'pii' in e_dict: + p='YES' + else: + p='NO' + if e_dict['sdtype']=='datetime': + v=e_dict['sdtype']+': '+e_dict['datetime_format'] + else: + v=e_dict['sdtype'] + new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i]) + cdetails=pd.concat([cdetails, new_row],ignore_index=True) + i=i+1 + if 'primary_key' in t_dict: + st.write('Primary Key:',t_dict['primary_key']) + else: + st.write('Primary Key: No key can be detected') + st.write(cdetails) + + if select_main == 'Single Table Discovery': + metadata = SingleTableMetadata() + conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};" + "Server=ipzilnpxsssp001.database.windows.net;" + "Database=Marcopolo_DB;" + "UID=ssikder004;" + "PWD=Marcopolo@123" ) + query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC" + table1=pd.read_sql_query(query1,con=conn) + table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str') + table_selector=st.selectbox('SOURCE TABLE NAME',['brands','categories','CUSTOMER_MASTER_TBL_1','orders','order_items','products','staffs','stocks','stores'],index=None,placeholder='Select table for automated column mapping') + if table_selector is not None: + st.markdown('---') + query2="select * from [Client].["+table_selector+"]" + df = pd.read_sql_query(query2,con=conn) + main_list=df.columns.to_list() + sub_list=['ID','LOADID','FILE_NAME'] + if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)): + df=df.drop(['ID','LOADID','FILE_NAME'],axis=1) + conn.close() + metadata = SingleTableMetadata() + metadata.detect_from_dataframe(df) + python_dict = metadata.to_dict() + if f'cont_{table_selector}' not in st.session_state: + with st.spinner("Processing Table"): + # Create a GenerativeModel instance + genai_mod = genai.GenerativeModel( + model_name='models/gemini-1.5-pro-latest' + ) + if 'primary_key' in python_dict: + primary_key = python_dict['primary_key'] + else: + primary_key = "Could Not be Identified" + + + story = f""" Details of the table: + table columns: {str(list(df.columns))} + column datatypes: {str(df.dtypes.to_string())} + table sample data: {df.head(10).to_string()} + """ + response = genai_mod.generate_content(textwrap.dedent(""" + You are a SAP Data Migration expert. Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. 
The data is a SAP data, you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only. + Important: Please consider All fields are mandetorily during your analysis. + + Here is the table details: + + """) + story + f"The Primary Key is:{primary_key}" ) + st.write(response.usage_metadata) + st.session_state[f'cont_{table_selector}'] = response.text + with stylable_container( + key=f"container_with_border", + css_styles=""" + { + border: 1px solid white; + border-radius: 0.5rem; + padding: calc(1em - 1px); + width: 110%; /* Set container width to 100% */ + } + """ + ): + st.write(st.session_state[f'cont_{table_selector}']) + col9, col10, col11 = st.columns([2, 3, 9]) + with col9: + preview= st.button("PREVIEW TABLE",key='preview') + # with col10: + # cdet = st.button("GET COLUMN DETAILS",key='prof') + if preview: + st.dataframe(df) + # if cdet: + # cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information']) + # c_dict=python_dict['columns'] + # i=0 + # for key in c_dict: + # e_dict=c_dict[key] + # if 'pii' in e_dict: + # p='YES' + # else: + # p='NO' + # if e_dict['sdtype']=='datetime': + # v=e_dict['sdtype']+': '+e_dict['datetime_format'] + # else: + # v=e_dict['sdtype'] + # new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i]) + # cdetails=pd.concat([cdetails, new_row],ignore_index=True) + # i=i+1 + # if 'primary_key' in python_dict: + # st.write('Primary Key:',python_dict['primary_key']) + # else: + # st.write('Primary Key: No key can be detected') + # st.write(cdetails) + + + + + \ No newline at end of file diff --git a/backup/DATA CATALOG.py b/backup/DATA CATALOG.py new file mode 100644 index 0000000000000000000000000000000000000000..7c732e986f0fc83ad2a39736e43c3726f0363f05 --- /dev/null +++ b/backup/DATA CATALOG.py @@ -0,0 +1,121 @@ +import pandas as pd +import numpy as np +import streamlit as st +import sdv +from sdv.datasets.local import load_csvs +from sdv.metadata import MultiTableMetadata +from sdv.multi_table import HMASynthesizer +import time + +import os +import gc +import warnings +import plotly.express as px +from PIL import Image +from io import BytesIO +from IPython.display import Image as IPImage, display +from PyPDF2 import PdfReader +from fpdf import FPDF +import statsmodels.api as sm +import matplotlib.pyplot as plt +import seaborn as sns +import plotly.express as px +from IPython.display import Markdown +import kaleido + +st.set_page_config(page_title='DATA DISCOVERY') +st.title('AUTOMATED DATA CATALOGUE') +st.subheader('SELECT DATABASE') +select1=st.selectbox('DATABASE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name') +if select1=='DB_10001': + with st.spinner('Performing Data Discovery'): + time.sleep(2) + st.success('Data cataloguing complete!') + datasets = load_csvs( + folder_name='BIKE_STORE_DATABASE/', + read_csv_parameters={ + 'skipinitialspace': True, + 'encoding': 'utf_8' + }) + metadata = MultiTableMetadata() + metadata.detect_from_csvs(folder_name='BIKE_STORE_DATABASE/') + python_dict = metadata.to_dict() + st.markdown('---') + st.subheader('DATA CATALOG') + # st.json(python_dict) + brands=datasets['brands'] + categories=datasets['categories'] + customers=datasets['customers'] + orderitems=datasets['order_items'] + orders=datasets['orders'] + products=datasets['products'] + staffs=datasets['staffs'] + stocks=datasets['stocks'] + 
stores=datasets['stores'] + tables=python_dict['tables'] + table_names=[*tables] + col1, col2, col3 = st.columns([2,2,2]) + + with col1: + def view_callback(): + st.session_state.tdet = False + view= st.button("LIST TABLES",key='view',on_click=view_callback) + with col2: + if 'tdet' not in st.session_state: + st.session_state.tdet = False + tdet1 = st.button("SHOW TABLE DETAILS") + with col3: + rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback) + + if tdet1: + st.session_state.tdet = tdet1 + if view: + #st.write(python_dict) + st.write(pd.DataFrame(table_names,columns=['TABLE NAME'])) + + if rel: + rlist1=python_dict['relationships'] + rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY']) + for i in range(len(rlist1)): + rlist=rlist1[i] + nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i]) + rdf=pd.concat([rdf,nrow],ignore_index=True) + st.write(rdf) + if st.session_state.tdet is True: + def tdet_callback(): + st.session_state.tdet=True + st.subheader('Select table name to view') + sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback) + col4, col5 = st.columns([1, 3]) + with col4: + preview= st.button("PREVIEW TABLE",key='preview') + with col5: + cdet = st.button("GET COLUMN DETAILS",key='prof') + if preview: + st.write(datasets[sbox1]) + if cdet: + cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information']) + t_dict=tables[sbox1] + c_dict=t_dict['columns'] + i=0 + for key in c_dict: + e_dict=c_dict[key] + if 'pii' in e_dict: + p='YES' + else: + p='NO' + if e_dict['sdtype']=='datetime': + v=e_dict['sdtype']+': '+e_dict['datetime_format'] + else: + v=e_dict['sdtype'] + new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i]) + cdetails=pd.concat([cdetails, new_row],ignore_index=True) + i=i+1 + if 'primary_key' in t_dict: + st.write('Primary Key:',t_dict['primary_key']) + else: + st.write('Primary Key: No key can be detected') + st.write(cdetails) + + + \ No newline at end of file diff --git a/data/1714740977.8544624-gemini_messages b/data/1714740977.8544624-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..6ae5236b9cb2c01d066c3e488f17231662d7c52b Binary files /dev/null and b/data/1714740977.8544624-gemini_messages differ diff --git a/data/1714740977.8544624-st_messages b/data/1714740977.8544624-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..b7d702151021e1fb05583ab5967073b746e5ddf3 Binary files /dev/null and b/data/1714740977.8544624-st_messages differ diff --git a/data/1714983684.3830516-gemini_messages b/data/1714983684.3830516-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1714983684.3830516-gemini_messages @@ -0,0 +1 @@ +�]�. 
\ No newline at end of file diff --git a/data/1714983684.3830516-st_messages b/data/1714983684.3830516-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..68284c8b2c256ad9c369e1babe2d559d54cc8563 Binary files /dev/null and b/data/1714983684.3830516-st_messages differ diff --git a/data/1715077374.7732036-gemini_messages b/data/1715077374.7732036-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1715077374.7732036-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1715077374.7732036-st_messages b/data/1715077374.7732036-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..36de5bd5dd5b6aaeb624d8b978101c0a3683b8af Binary files /dev/null and b/data/1715077374.7732036-st_messages differ diff --git a/data/1715077982.0880668-gemini_messages b/data/1715077982.0880668-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1715077982.0880668-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1715077982.0880668-st_messages b/data/1715077982.0880668-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..b3cd4618225fff68acc0845b629e7121f8305abd Binary files /dev/null and b/data/1715077982.0880668-st_messages differ diff --git a/data/1715078619.2998087-gemini_messages b/data/1715078619.2998087-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1715078619.2998087-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1715078619.2998087-st_messages b/data/1715078619.2998087-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..bebaad90006569e1d2e79dd563c786fc2ff58583 Binary files /dev/null and b/data/1715078619.2998087-st_messages differ diff --git a/data/1715082116.767178-gemini_messages b/data/1715082116.767178-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..d88539d63f66aff0422c1b2035bbd3a4e3305e01 Binary files /dev/null and b/data/1715082116.767178-gemini_messages differ diff --git a/data/1715082116.767178-st_messages b/data/1715082116.767178-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..6851f60e4c10f1e911acbd61bad0ed3db4162e1a Binary files /dev/null and b/data/1715082116.767178-st_messages differ diff --git a/data/1715086623.645372-gemini_messages b/data/1715086623.645372-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..abb9a59c3401f0879d2a57363a9581272c0e937e Binary files /dev/null and b/data/1715086623.645372-gemini_messages differ diff --git a/data/1715086623.645372-st_messages b/data/1715086623.645372-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..63a6d52f770516922188f2875efd83aeebe8508b Binary files /dev/null and b/data/1715086623.645372-st_messages differ diff --git a/data/1715103041.8389978-gemini_messages b/data/1715103041.8389978-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..401d4d2ae85a8ed0a7e1662703dcea5a53a13bc2 Binary files /dev/null and b/data/1715103041.8389978-gemini_messages differ diff --git a/data/1715103041.8389978-st_messages b/data/1715103041.8389978-st_messages new file mode 100644 index 
0000000000000000000000000000000000000000..958918a87137dcb4f4ead370e9f2a00b2e077cea Binary files /dev/null and b/data/1715103041.8389978-st_messages differ diff --git a/data/1715249415.4287577-gemini_messages b/data/1715249415.4287577-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..144f31c7de0b7552a1b20aabc98800a9d76beb38 Binary files /dev/null and b/data/1715249415.4287577-gemini_messages differ diff --git a/data/1715249415.4287577-st_messages b/data/1715249415.4287577-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..7dddc23b5ac4e13170c8946ede758c1d649e5865 Binary files /dev/null and b/data/1715249415.4287577-st_messages differ diff --git a/data/1717564238.3270795-gemini_messages b/data/1717564238.3270795-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..cdadfbe8c77c2b291fd4186c7759224ce308d1a0 Binary files /dev/null and b/data/1717564238.3270795-gemini_messages differ diff --git a/data/1717564238.3270795-st_messages b/data/1717564238.3270795-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..96b1b409a6b2a66e8df5c41d1eebcae5311444a6 Binary files /dev/null and b/data/1717564238.3270795-st_messages differ diff --git a/data/1717579231.916155-gemini_messages b/data/1717579231.916155-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..b9443e9883ed23876107bd4ec5d97926ce94762d Binary files /dev/null and b/data/1717579231.916155-gemini_messages differ diff --git a/data/1717579231.916155-st_messages b/data/1717579231.916155-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..6a8af94029b755cffb0d0454723f62d20ca63d4e Binary files /dev/null and b/data/1717579231.916155-st_messages differ diff --git a/data/1717580421.5748234-gemini_messages b/data/1717580421.5748234-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..2116b6a9400297e1d178a195d9b299b9f725ef71 Binary files /dev/null and b/data/1717580421.5748234-gemini_messages differ diff --git a/data/1717580421.5748234-st_messages b/data/1717580421.5748234-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..d86569bb56268f80dc19853c98cfe3a78193a445 Binary files /dev/null and b/data/1717580421.5748234-st_messages differ diff --git a/data/1717672759.119191-gemini_messages b/data/1717672759.119191-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..d3e11e014c63f79858d6e7afcce82d471e76551b Binary files /dev/null and b/data/1717672759.119191-gemini_messages differ diff --git a/data/1717672759.119191-st_messages b/data/1717672759.119191-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..1868f6e0d4757f3bc8d3383f3165dbfc180e31a3 Binary files /dev/null and b/data/1717672759.119191-st_messages differ diff --git a/data/1718020955.6171474-gemini_messages b/data/1718020955.6171474-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92db4d2f12d665f50b5f517a6d4f59fc7dc310ef Binary files /dev/null and b/data/1718020955.6171474-gemini_messages differ diff --git a/data/1718020955.6171474-st_messages b/data/1718020955.6171474-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..adae67f4e2e5866eef8c840e12c7ce6993cd1b9c Binary files /dev/null and b/data/1718020955.6171474-st_messages differ diff --git a/data/1718025984.855813-gemini_messages b/data/1718025984.855813-gemini_messages new file mode 
100644 index 0000000000000000000000000000000000000000..ca341266a623e58028e7d8fd7e99018fb415e97e Binary files /dev/null and b/data/1718025984.855813-gemini_messages differ diff --git a/data/1718025984.855813-st_messages b/data/1718025984.855813-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..47da95d6c6d7da7153f820ff108913d7fa3eab9b Binary files /dev/null and b/data/1718025984.855813-st_messages differ diff --git a/data/1718101291.9514854-gemini_messages b/data/1718101291.9514854-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..2485dba0a142d3b574ac976a64a29413b0fb4873 Binary files /dev/null and b/data/1718101291.9514854-gemini_messages differ diff --git a/data/1718101291.9514854-st_messages b/data/1718101291.9514854-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..7a008f5dcacdd713e172d24c5ac1f45075ab6f0c Binary files /dev/null and b/data/1718101291.9514854-st_messages differ diff --git a/data/1718212497.2504222-gemini_messages b/data/1718212497.2504222-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..c81a7f00147ccb55437bd880c8e7e9fc43d08ade Binary files /dev/null and b/data/1718212497.2504222-gemini_messages differ diff --git a/data/1718212497.2504222-st_messages b/data/1718212497.2504222-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..af9c1d2ecc6c42aa9b0f01917d5197bb1cdc5b91 Binary files /dev/null and b/data/1718212497.2504222-st_messages differ diff --git a/data/1718255487.3908408-gemini_messages b/data/1718255487.3908408-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..b82e3e071f6d6127197d6ba4445d4bf75c81d941 Binary files /dev/null and b/data/1718255487.3908408-gemini_messages differ diff --git a/data/1718255487.3908408-st_messages b/data/1718255487.3908408-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..29717c904819e935ccffb9656a09a08882183ef4 Binary files /dev/null and b/data/1718255487.3908408-st_messages differ diff --git a/data/1718276733.3224595-gemini_messages b/data/1718276733.3224595-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..5f4b553ffe6ea7b64ea89f62754e583287d669b9 Binary files /dev/null and b/data/1718276733.3224595-gemini_messages differ diff --git a/data/1718276733.3224595-st_messages b/data/1718276733.3224595-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..d9411c89fe9a122c3d93166fdf7757f600785a03 Binary files /dev/null and b/data/1718276733.3224595-st_messages differ diff --git a/data/1718278412.7311835-gemini_messages b/data/1718278412.7311835-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..5c8f7eee574cbfc0a936f363d6b70cf02fed1f3f Binary files /dev/null and b/data/1718278412.7311835-gemini_messages differ diff --git a/data/1718278412.7311835-st_messages b/data/1718278412.7311835-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..32e167dbe251843521ac17fc97b301a93620e631 Binary files /dev/null and b/data/1718278412.7311835-st_messages differ diff --git a/data/1718360522.2414901-gemini_messages b/data/1718360522.2414901-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..80d658fae443f5da61ff420ec9c78efe40048a4a Binary files /dev/null and b/data/1718360522.2414901-gemini_messages differ diff --git a/data/1718360522.2414901-st_messages b/data/1718360522.2414901-st_messages 
new file mode 100644 index 0000000000000000000000000000000000000000..3e6308fcf29d798ad73e5f51fba31f720512b345 Binary files /dev/null and b/data/1718360522.2414901-st_messages differ diff --git a/data/1718361823.6470416-gemini_messages b/data/1718361823.6470416-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..4227d64bfe35b8f1c4d686b023d459670b5ae512 Binary files /dev/null and b/data/1718361823.6470416-gemini_messages differ diff --git a/data/1718361823.6470416-st_messages b/data/1718361823.6470416-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..f69cbcd34cca0cd1b4b7da42aae0c1efd3654890 Binary files /dev/null and b/data/1718361823.6470416-st_messages differ diff --git a/data/1718709779.1290016-gemini_messages b/data/1718709779.1290016-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..a19f2577279984959ec444cd700165c1c9f1733e Binary files /dev/null and b/data/1718709779.1290016-gemini_messages differ diff --git a/data/1718709779.1290016-st_messages b/data/1718709779.1290016-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..fd96fa415063d45acbf80264f8448fe38f264ed0 Binary files /dev/null and b/data/1718709779.1290016-st_messages differ diff --git a/data/1718715582.2066972-gemini_messages b/data/1718715582.2066972-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..4bd26f04b417fe022233ff6b339b727713448049 Binary files /dev/null and b/data/1718715582.2066972-gemini_messages differ diff --git a/data/1718715582.2066972-st_messages b/data/1718715582.2066972-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..68373099f43329ab536a854c63fc6390dc350186 Binary files /dev/null and b/data/1718715582.2066972-st_messages differ diff --git a/data/1718784519.314947-gemini_messages b/data/1718784519.314947-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..df09dd0183ee3507e7fdf406dafec5e95d0ac7bc Binary files /dev/null and b/data/1718784519.314947-gemini_messages differ diff --git a/data/1718784519.314947-st_messages b/data/1718784519.314947-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..154a719c17af68b000035572d78b7cbdc2d21dfd Binary files /dev/null and b/data/1718784519.314947-st_messages differ diff --git a/data/1718874158.6962657-gemini_messages b/data/1718874158.6962657-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..6d2ddb390d81d92b07f0f092ad407341e9e251e1 Binary files /dev/null and b/data/1718874158.6962657-gemini_messages differ diff --git a/data/1718874158.6962657-st_messages b/data/1718874158.6962657-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..4ea5c28b52b3032adaa240b4c8c4fa57bfe1681f Binary files /dev/null and b/data/1718874158.6962657-st_messages differ diff --git a/data/1718874782.247091-gemini_messages b/data/1718874782.247091-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..dfdc1619bf06780d88da0b191689f1cedc01995e Binary files /dev/null and b/data/1718874782.247091-gemini_messages differ diff --git a/data/1718874782.247091-st_messages b/data/1718874782.247091-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..54d14acf95748747b239d29174911acb1e674da2 Binary files /dev/null and b/data/1718874782.247091-st_messages differ diff --git a/data/1718875947.408428-gemini_messages 
b/data/1718875947.408428-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..e5f4548146079cfa703f7eee4c874f991399c0ca Binary files /dev/null and b/data/1718875947.408428-gemini_messages differ diff --git a/data/1718875947.408428-st_messages b/data/1718875947.408428-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..a6d6392266a181c427c8215277c3c02ae5e136d1 Binary files /dev/null and b/data/1718875947.408428-st_messages differ diff --git a/data/1718880377.5963228-gemini_messages b/data/1718880377.5963228-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..f2d3c1eea3ac6548e4aec741f75d08058c397a60 Binary files /dev/null and b/data/1718880377.5963228-gemini_messages differ diff --git a/data/1718880377.5963228-st_messages b/data/1718880377.5963228-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..4fa16a9bf1d438b673cc371c57deb32c893be0b8 Binary files /dev/null and b/data/1718880377.5963228-st_messages differ diff --git a/data/1719316847.6221726-gemini_messages b/data/1719316847.6221726-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..6465a9a9b532c2b93f6385803c4eeafdf3086b27 Binary files /dev/null and b/data/1719316847.6221726-gemini_messages differ diff --git a/data/1719316847.6221726-st_messages b/data/1719316847.6221726-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..4578c366ce85f8b47b3dde831d564f065c67a080 Binary files /dev/null and b/data/1719316847.6221726-st_messages differ diff --git a/data/1719481760.292108-gemini_messages b/data/1719481760.292108-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..0ba9dee3bc2fb09c9e5c99c1e1b0365a879f613c Binary files /dev/null and b/data/1719481760.292108-gemini_messages differ diff --git a/data/1719481760.292108-st_messages b/data/1719481760.292108-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..7d1b5319db1988106974d9bdf10f96c5755321e2 Binary files /dev/null and b/data/1719481760.292108-st_messages differ diff --git a/data/1719481934.9261296-gemini_messages b/data/1719481934.9261296-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..7e63905a0b1d78d4b98afd50dc4da4d21eaac6f1 Binary files /dev/null and b/data/1719481934.9261296-gemini_messages differ diff --git a/data/1719481934.9261296-st_messages b/data/1719481934.9261296-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..d90f055bb379545f45a046cd76c63c8afd958716 Binary files /dev/null and b/data/1719481934.9261296-st_messages differ diff --git a/data/1719482278.6042926-gemini_messages b/data/1719482278.6042926-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..f63aae238df68ec6b706907e72dc0656c089f629 Binary files /dev/null and b/data/1719482278.6042926-gemini_messages differ diff --git a/data/1719482278.6042926-st_messages b/data/1719482278.6042926-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..88176cc735131860458567db1e37397b880c6921 Binary files /dev/null and b/data/1719482278.6042926-st_messages differ diff --git a/data/1719483640.3080518-gemini_messages b/data/1719483640.3080518-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1719483640.3080518-gemini_messages @@ -0,0 +1 @@ +�]�. 
\ No newline at end of file diff --git a/data/1719483640.3080518-st_messages b/data/1719483640.3080518-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..5f21f4a9c60fae031eb7fa458c5aed760a925b10 Binary files /dev/null and b/data/1719483640.3080518-st_messages differ diff --git a/data/1719483859.333629-gemini_messages b/data/1719483859.333629-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1719483859.333629-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1719483859.333629-st_messages b/data/1719483859.333629-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..98b8a6db38c95a35caf486eed239a4ed4a43988f Binary files /dev/null and b/data/1719483859.333629-st_messages differ diff --git a/data/1719484087.4569514-gemini_messages b/data/1719484087.4569514-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1719484087.4569514-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1719484087.4569514-st_messages b/data/1719484087.4569514-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..c4a80a6ed1c060712ae175017c4d37e42061847f Binary files /dev/null and b/data/1719484087.4569514-st_messages differ diff --git a/data/1719515418.630771-gemini_messages b/data/1719515418.630771-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..db0d5cd339c24846cd2671d87423481a8c0c4e06 Binary files /dev/null and b/data/1719515418.630771-gemini_messages differ diff --git a/data/1719515418.630771-st_messages b/data/1719515418.630771-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..7d1be5a0dd497dfee727ee2d111584f010b17f6f Binary files /dev/null and b/data/1719515418.630771-st_messages differ diff --git a/data/1719515752.659899-gemini_messages b/data/1719515752.659899-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..e4ddcbb82e1d65a5c242086988766bdad4227f05 Binary files /dev/null and b/data/1719515752.659899-gemini_messages differ diff --git a/data/1719515752.659899-st_messages b/data/1719515752.659899-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..b810c224316edca924aaf08564640498d3957a19 Binary files /dev/null and b/data/1719515752.659899-st_messages differ diff --git a/data/1719515994.0227334-gemini_messages b/data/1719515994.0227334-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..530ed826c4d92c57b5a46d9ab2710102a807c623 Binary files /dev/null and b/data/1719515994.0227334-gemini_messages differ diff --git a/data/1719515994.0227334-st_messages b/data/1719515994.0227334-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..de32a2b580aebdc23e715751bfbc548ae080e072 Binary files /dev/null and b/data/1719515994.0227334-st_messages differ diff --git a/data/1719516146.6394358-gemini_messages b/data/1719516146.6394358-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..f3726e80896073eca17b348b26745a4720ee46cc Binary files /dev/null and b/data/1719516146.6394358-gemini_messages differ diff --git a/data/1719516146.6394358-st_messages b/data/1719516146.6394358-st_messages new file mode 100644 index 
0000000000000000000000000000000000000000..5d72c0c9a981995c3ed83b39d48ac0cbd5739357 Binary files /dev/null and b/data/1719516146.6394358-st_messages differ diff --git a/data/1720429379.7638645-gemini_messages b/data/1720429379.7638645-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..40e4adb0c90403592dff192dbb6ca8390260b645 Binary files /dev/null and b/data/1720429379.7638645-gemini_messages differ diff --git a/data/1720429379.7638645-st_messages b/data/1720429379.7638645-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..299514786c2bf5dc42f1c87bb61c44db4a1d6bbd Binary files /dev/null and b/data/1720429379.7638645-st_messages differ diff --git a/data/1720435822.9303865-gemini_messages b/data/1720435822.9303865-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1720435822.9303865-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1720435822.9303865-st_messages b/data/1720435822.9303865-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..3f3d871cbd797b4884cf856de2cfcfbeaffbebb7 Binary files /dev/null and b/data/1720435822.9303865-st_messages differ diff --git a/data/1720435985.0650601-gemini_messages b/data/1720435985.0650601-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1720435985.0650601-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1720435985.0650601-st_messages b/data/1720435985.0650601-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..b6efe3a6f4ad1c300865a2d7c69b5135205ba766 Binary files /dev/null and b/data/1720435985.0650601-st_messages differ diff --git a/data/1720446843.4735558-gemini_messages b/data/1720446843.4735558-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..6792eeac54a92fcb0f941258769029a455c3aeb9 Binary files /dev/null and b/data/1720446843.4735558-gemini_messages differ diff --git a/data/1720446843.4735558-st_messages b/data/1720446843.4735558-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..da908d6180b18eaae87a97b2b1660bcd3510e427 Binary files /dev/null and b/data/1720446843.4735558-st_messages differ diff --git a/data/1720503814.719314-gemini_messages b/data/1720503814.719314-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1720503814.719314-gemini_messages @@ -0,0 +1 @@ +�]�. 
\ No newline at end of file diff --git a/data/1720503814.719314-st_messages b/data/1720503814.719314-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..1164b028d1f9b6a53d9d833025c27bd45a00be02 Binary files /dev/null and b/data/1720503814.719314-st_messages differ diff --git a/data/1720506478.502565-gemini_messages b/data/1720506478.502565-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..84e5d09d9d686b0adc25b8f64de7dfaf302fe45f Binary files /dev/null and b/data/1720506478.502565-gemini_messages differ diff --git a/data/1720506478.502565-st_messages b/data/1720506478.502565-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..7ece47c162c00c7fefada9273da7cabab2433070 Binary files /dev/null and b/data/1720506478.502565-st_messages differ diff --git a/data/1720511575.635621-gemini_messages b/data/1720511575.635621-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..92c3c883eb886f2267599fa700ed581f787db7f2 --- /dev/null +++ b/data/1720511575.635621-gemini_messages @@ -0,0 +1 @@ +�]�. \ No newline at end of file diff --git a/data/1720511575.635621-st_messages b/data/1720511575.635621-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..428ecd5b9284feceb107cea60fc59fbcc1a91075 Binary files /dev/null and b/data/1720511575.635621-st_messages differ diff --git a/data/1720517493.0581975-gemini_messages b/data/1720517493.0581975-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..2ce215d0f8854bd26ce0b38f9ac2cbaebd8b5398 Binary files /dev/null and b/data/1720517493.0581975-gemini_messages differ diff --git a/data/1720517493.0581975-st_messages b/data/1720517493.0581975-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..5c0d55607385b3f77638daa4ac97d4affa9db80d Binary files /dev/null and b/data/1720517493.0581975-st_messages differ diff --git a/data/1720711937.085012-gemini_messages b/data/1720711937.085012-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..eb3790310983d6b90007ba42d81a20d8ff0ceb8d Binary files /dev/null and b/data/1720711937.085012-gemini_messages differ diff --git a/data/1720711937.085012-st_messages b/data/1720711937.085012-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..bbecc35ffbfe20bf6a337a1c0d40c0bce4a70431 Binary files /dev/null and b/data/1720711937.085012-st_messages differ diff --git a/data/1720713697.9442222-gemini_messages b/data/1720713697.9442222-gemini_messages new file mode 100644 index 0000000000000000000000000000000000000000..a5cb886c8d3a352e5e9ef06ae11084caa4ba7af5 Binary files /dev/null and b/data/1720713697.9442222-gemini_messages differ diff --git a/data/1720713697.9442222-st_messages b/data/1720713697.9442222-st_messages new file mode 100644 index 0000000000000000000000000000000000000000..02e1aecd50720302ed6cb7bd6a27bfe0b0acfb38 Binary files /dev/null and b/data/1720713697.9442222-st_messages differ diff --git a/pages/2DATA PROFILER.py b/pages/2DATA PROFILER.py new file mode 100644 index 0000000000000000000000000000000000000000..53604127a30a203152301c27ee1abc767984619d --- /dev/null +++ b/pages/2DATA PROFILER.py @@ -0,0 +1,789 @@ +import streamlit as st +import numpy as np +import pandas as pd +import re +from streamlit_extras.dataframe_explorer import dataframe_explorer +import warnings +from sdv.metadata import SingleTableMetadata +from streamlit_extras.stateful_button import button 
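# This page samples a selected dbo table from Azure SQL over pyodbc, then walks every column
# through the tabs defined further down: datatype validation, missing-value handling,
# ydata-profiling statistics, pattern exploration and AI-assisted cleansing.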
+from sklearn.feature_extraction.text import CountVectorizer +from sklearn.pipeline import Pipeline +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Input, Dense, LSTM, Bidirectional, Conv1D, MaxPooling1D, Flatten, Concatenate, Reshape, RepeatVector +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.losses import MeanSquaredError +from streamlit_extras.stylable_container import stylable_container +from ydata_profiling import ProfileReport +from streamlit_pandas_profiling import st_profile_report +import base64 +from sdv.datasets.local import load_csvs +import pyodbc + + +warnings.filterwarnings('ignore') +st.set_page_config( + page_title='Profilify: Your AI Assisted Data Profiling App', + layout='wide', + initial_sidebar_state='collapsed' +) +st.markdown(""" + + """, unsafe_allow_html=True) + +def load_dataframe_to_sqlserver(df, table_name, connection_string): + # Establish a connection to the database + conn = pyodbc.connect(connection_string) + cursor = conn.cursor() + + # Drop table if it exists + drop_table_sql = f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}" + + try: + cursor.execute(drop_table_sql) + conn.commit() + except Exception as e: + st.error(f"Error dropping table. Please try with a different name.") + + # Create table SQL statement based on DataFrame columns and types + create_table_sql = f"CREATE TABLE {table_name} (" + for column in df.columns: + dtype = str(df[column].dtype) + sql_dtype = 'NVARCHAR(MAX)' + create_table_sql += f"{column} {sql_dtype}, " + create_table_sql = create_table_sql.rstrip(', ') + ')' + + try: + # Execute table creation + cursor.execute(create_table_sql) + conn.commit() + except Exception as e: + st.error(f"Error Creating table. Please try with a different name.") + + # Insert DataFrame data into the table using bulk insert + insert_sql = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({', '.join(['?' for _ in df.columns])})" + + try: + # Using `fast_executemany` for bulk inserts + cursor.fast_executemany = True + cursor.executemany(insert_sql, df.values.tolist()) + conn.commit() + st.success(f"Data Imported with table name: '{table_name}' successfully.") + except Exception as e: + st.error(f"Error Inserting Data. Please try with a different name.") + + cursor.close() + conn.close() + + +def clear_cache(): + keys = list(st.session_state.keys()) + for key in keys: + st.session_state.pop(key) + +def set_bg_hack(main_bg): + ''' + A function to unpack an image from root folder and set as bg. + + Returns + ------- + The background. + ''' + # set bg name + main_bg_ext = "png" + + st.markdown( + f""" + + """, + unsafe_allow_html=True + ) +#set_bg_hack("bg2.png") +header_style = """ + +""" + + + + + +content_style = """ + +""" + +small_style = """ + +""" + +def update_column_dtype(df, column_name, dtype): + error_entries = pd.DataFrame() + flag = None + if dtype == 'System Detected': + pass + elif dtype == 'int64': + try: + df[column_name] = df[column_name].astype('int64') + except ValueError: + error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())] + st.error('Unable to convert some entries to integer. Please Clean the column.') + elif dtype == 'float64/numeric': + try: + df[column_name] = df[column_name].astype('float64') + except ValueError: + error_entries = df[pd.to_numeric(df[column_name], errors='coerce').isna()] + st.error('Unable to convert some entries to float. 
Please Clean the column.') + elif dtype == 'id': + try: + df[column_name] = df[column_name].astype('int64') + except ValueError: + error_entries = df[~df[column_name].apply(lambda x: str(x).isdigit())] + st.error('Unable to convert some entries to id. Please Clean the column.') + elif dtype == 'categorical/string': + df[column_name] = df[column_name].astype('category') + elif dtype == 'datetime': + try: + df[column_name] = pd.to_datetime(df[column_name], errors='raise', infer_datetime_format=True) + except ValueError: + error_entries = df[pd.to_datetime(df[column_name], errors='coerce', infer_datetime_format=True).isna()] + custom_format = st.text_input("Please provide the datetime format (e.g., %Y-%m-%d):") + if custom_format: + try: + df[column_name] = pd.to_datetime(df[column_name], errors='raise', format=custom_format) + except ValueError: + error_entries = df[pd.to_datetime(df[column_name], errors='coerce', format=custom_format).isna()] + st.error('Unable to parse datetime with the provided format. Please Clean the column.') + elif dtype == 'email': + df[column_name] = df[column_name].astype('category') + flag= 'email' + elif dtype == 'phone_number': + df[column_name] = df[column_name].astype('category') + flag= 'phone_number' + + return df, error_entries, flag + +def convert_to_special_representation(value): + value = str(value) + special_chars = set("!@#$%^&*()_+-=[]{}|;:,.<>?`~") + result = '' + for char in value: + if char.isdigit(): + result += 'N' + elif char.isalpha(): + result += 'A' + elif char in special_chars: + result += char + else: + # Handle other characters as needed + result += char + return result +with st.container(border=True): + st.subheader('SELECT TABLE') + metadata = SingleTableMetadata() + conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};" + "Server=sql-ext-dev-uks-001.database.windows.net;" + "Database=sqldb-ext-dev-uks-001;" + "UID=dbadmin;" + "PWD=mYpa$$w0rD" ) + query1_1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME in ('TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM') ORDER BY TABLE_NAME ASC" + query1_2="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' and TABLE_NAME LIKE 'PROFILED%' ORDER BY TABLE_NAME ASC" + tab_names=list(pd.read_sql_query(query1_1,con=conn)['TABLE_NAME']) + tab_names_edited= list(pd.read_sql_query(query1_2,con=conn)['TABLE_NAME']) + sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache) + mode_selector=st.selectbox("Select How you want to Proceed", ["Start Profiling with Source Data", "Load Previously Profiled Data For Further Processing"], on_change=clear_cache,placeholder='Show Options') + if mode_selector == "Start Profiling with Source Data": + table_selector=st.selectbox('SELECT TABLE NAME',tab_names,index=None,on_change=clear_cache,placeholder='Select table name') + + if mode_selector == "Load Previously Profiled Data For Further Processing": + table_selector=st.selectbox('SELECT TABLE NAME',tab_names_edited,index=None,on_change=clear_cache,placeholder='Select table name') + +if table_selector is not None and sample_selector is not None: + if sample_selector=='100': + count="top 100" + elif sample_selector=='10K': + count="top 10000" + elif sample_selector=='100K': + count="top 100000" + elif sample_selector=='1M': + count="top 1000000" + else: + count="" + query2="select "+count+" * from [dbo].["+table_selector+"]" + df = 
pd.read_sql_query(query2,con=conn) + main_list=df.columns.to_list() + sub_list=['ID','LOADID','FILE_NAME'] + if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)): + df=df.drop(['ID','LOADID','FILE_NAME'],axis=1) + conn.close() + if 'data' not in st.session_state: + st.session_state.data= df + metadata.detect_from_dataframe(st.session_state.data) + st.sidebar.header("DataFrame Live Preview") + st.sidebar.markdown("*This Window keeps the live status of the dataframe under processing. You can review this dataframe after all the changes.*") + df_preview= st.sidebar.empty() + df_preview.write(st.session_state.data) + st.markdown(content_style, unsafe_allow_html=True) + with st.container(border=True): + cols= df.columns.to_list() + primary_key= metadata.primary_key + sugg_primary_keys = [col for col in cols if df[col].is_unique and df[col].dtype != 'float' and not df[col].isnull().any()] + prob_key= sugg_primary_keys + if primary_key in sugg_primary_keys: + default_index = sugg_primary_keys.index(primary_key) + else: + sugg_primary_keys.append(primary_key) + default_index = sugg_primary_keys.index(primary_key) + no_y_data =[] + email_cols=[] + phone_cols=[] + # cols_select= st.multiselect('Please select column(s) for Profiling and Cleansing', cols, default= cols[:5]) + tabs3= st.tabs(cols) + for i, tab in enumerate(tabs3): + with tab: + col= cols[i] + scol1,scol2= st.columns([4,1]) + with scol1: + taba, tabb, tabc, tabd, tabe = st.tabs(["📝 DataType Validation", "🧹 Missing Value Handling", "📈 Statistical Profiling", " ✨ Pattern Exploration", "🤖 AI Assisted Data Cleansing"]) + with taba: + if st.session_state.data[col].dtype.name == 'category': + st.session_state.data[col] = st.session_state.data[col].astype('str') + dtypes= ['System Detected', 'int64', 'float64/numeric', 'id', 'categorical/string','datetime', 'email', 'phone_number'] + no_dtypes= ['int64', 'float64/numeric', 'id', 'categorical/string','datetime', 'email', 'phone_number'] + no_dtype = False + if metadata.columns[col]['sdtype'] != "unknown": + datatype= metadata.columns[col]['sdtype'] + st.info(f"System Identified DataType: {datatype}") + elif str(df[col].dtype) != 'object' and metadata.columns[col]['sdtype'] == "unknown": + datatype= str(df[col].dtype) + st.info(f"System Identified DataType: {datatype}") + else: + datatype= 'NA' + #st.warning("System Could Not Understand Datatype. 
Please Specify the Datatype") + no_dtype= True + if datatype in ['int64']: + def_index=1 + if datatype in ['float64', 'numerical']: + def_index=2 + if datatype in ['id']: + def_index=3 + if datatype in ['categorical', 'string']: + def_index=4 + if datatype in ['datetime']: + def_index=5 + if datatype in ['email']: + def_index=6 + if datatype in ['phone_number']: + def_index=7 + + if col == primary_key: + st.success("This is System Identified Primary Key") + elif col in prob_key: + st.warning("This is System suggested potential Primary Key") + if f'dtype_{col}' not in st.session_state: + st.session_state[f'dtype_{col}'] = 'initiate' + if st.session_state[f'dtype_{col}'] not in ['email', 'phone_number']: + st.session_state.flag = None + + if no_dtype == True: + fin_datatype= st.selectbox(f"Please Change/Define the Datatype of column: {col}:",no_dtypes, index=3, key= f'datatype_{col}') + else: + fin_datatype= st.selectbox(f"Please Change/Define the Datatype of column: {col}:",dtypes, index=def_index, key= f'datatype_{col}') + st.session_state[f'dtype_{col}'] = st.session_state[f'datatype_{col}'] + st.session_state.data, error_df, st.session_state.flag= update_column_dtype(st.session_state.data,col,fin_datatype) + + if error_df.empty: + st.success("No Datatype Validation Errors For Current Datatype") + try: + df_preview.write(st.session_state.data) + except: + st.warning("DataFrame Updated. But Could Not Load Preview") + else: + st.subheader("Prepare the Column for Conversion:") + try: + edited_error_df= st.data_editor(error_df, num_rows="dynamic",column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, key=f'dtype_error_{col}') + except: + edited_error_df= st.data_editor(error_df, num_rows="dynamic",column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, key=f'dtype_error_{col}') + check = st.button("Fix Error", key=f"Fix{col}") + if check: + st.session_state.data= st.session_state.data.drop(error_df.index) + st.session_state.data = pd.concat([st.session_state.data, edited_error_df]) + df_preview.write(st.session_state.data) + if fin_datatype in ['id', 'email', 'phone_number']: + no_y_data.append(col) + if fin_datatype in ['email']: + email_cols.append(col) + if fin_datatype in ['phone_number']: + phone_cols.append(col) + no_y_data.extend(['Validity','Validity_phone','Validity_email']) + total_records = len(st.session_state.data) + with tabc: + if col not in no_y_data: + y_data_col= st.session_state.data[[col]] + pr = ProfileReport(y_data_col, dark_mode=True, explorative=False, config_file=r"ydata_config.yml") + pr.config.html.style.primary_colors = ['#e41a1c'] + with st.container(border=True): + st_profile_report(pr, navbar=False, key=f'profile{col}') + elif col in email_cols: + unique_emails = st.session_state.data[col].nunique() + duplicate_emails = total_records - unique_emails + # Extract email domains + email_domains = st.session_state.data[col].str.extract(r'@(.+)$')[0] + # Count occurrences of each domain + email_domain_counts = email_domains.value_counts() + # Get the top 5 email domains + top_email_domains = email_domain_counts.head(5) + + + # Format the top email domains for display + top_email_domains_str = '\n|\n'.join([f"{domain}: {count}" for domain, count in top_email_domains.items()]) + if f'invalid_em_{col}' in st.session_state: + invalid_emails= len(st.session_state[f'invalid_em_{col}']) + valid_emails= total_records - invalid_emails + percent_invalid_emails = invalid_emails / total_records * 100 + email_message = 
f""" + ## Email Column: {col}\n\n **Valid Emails:** {valid_emails} ({100 - percent_invalid_emails:.2f}%)\n\n---------------------------------------------------------------------------------------\n\n**Invalid Emails:** {invalid_emails} ({percent_invalid_emails:.2f}%)\n\n----------------------------------------------------------------------------------------\n\n**Unique Emails:** {unique_emails}\n\n-------------------------------------------------------------------------------------------------------------------------\n\n**Duplicate Emails:** {duplicate_emails}\n\n----------------------------------------------------------------------------------------------------------------------\n\n**Top 5 Email Domains:** {top_email_domains_str} + """ + + else: + invalid_emails= "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them." + valid_emails= "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them." + percent_invalid_emails = "Please Execute AI Assisted Data Validation on Email Columns for Profiling Report of them." + + email_message = f""" + ## Email Column: {col}\n\n **Valid Emails:** {valid_emails} \n\n---------------------------------------------------------------------------------------\n\n**Invalid Emails:** {invalid_emails}\n\n----------------------------------------------------------------------------------------\n\n**Unique Emails:** {unique_emails}\n\n-------------------------------------------------------------------------------------------------------------------------\n\n**Duplicate Emails:** {duplicate_emails}\n\n----------------------------------------------------------------------------------------------------------------------\n\n**Top 5 Email Domains:** {top_email_domains_str} + """ + + with st.container(border=True): + st.markdown(str(email_message)) + ref_em=st.button('Refresh', key=f'email{col}') + if ref_em: + pass + + + elif col in phone_cols: + unique_phones = st.session_state.data[col].nunique() + duplicate_phones = total_records - unique_phones + phone_country_codes = st.session_state.data[col].str.extract(r'^\+(\d+)')[0].value_counts() + top_phone_country_codes = list(phone_country_codes.head(5).to_string()) + to_remove = ['\n', ' '] + top_phone_country_codes = [item for item in top_phone_country_codes if item not in to_remove] + if f'invalid_ph_{col}' in st.session_state: + invalid_phones= len(st.session_state[f'invalid_ph_{col}']) + valid_phones= total_records - invalid_phones + percent_invalid_phones = invalid_phones / total_records * 100 + phone_message= f""" + + ## Phone Number Column: {col}\n\n **Valid Phone Numbers:** {valid_phones} ({100 - percent_invalid_phones:.2f}%)\n\n----------------------------------------------------------------------------------------------------------\n\n**Invalid Phone Numbers:** {invalid_phones} ({percent_invalid_phones:.2f}%)\n\n----------------------------------------------------------------------------------------------------------\n\n**Unique Phone Numbers:** {unique_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n**Duplicate Phone Numbers:** {duplicate_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n**Top 5 Phone Country Codes:** {top_phone_country_codes} + """ + else: + invalid_phones= "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them." 
+ valid_phones= "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them." + percent_invalid_phones = "Please Execute AI Assisted Data Validation on Phone Number Columns for Profiling Report of them." + phone_message=f""" + + ## Phone Number Column: {col}\n\n **Valid Phone Numbers:** {valid_phones} \n\n----------------------------------------------------------------------------------------------------------\n\n **Invalid Phone Numbers:** {invalid_phones} \n\n----------------------------------------------------------------------------------------------------------\n\n **Unique Phone Numbers:** {unique_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n **Duplicate Phone Numbers:** {duplicate_phones}\n\n----------------------------------------------------------------------------------------------------------\n\n **Top 5 Phone Country Codes:** {top_phone_country_codes} + """ + + + with st.container(border=True): + st.markdown(str(phone_message)) + ref_ph=st.button('Refresh', key=f'phone{col}') + if ref_ph: + pass + with tabd: + st.session_state.data_encoded = st.session_state.data.copy() + st.session_state.data_encoded[f'Pattern_{col}'] = st.session_state.data_encoded[col].apply(convert_to_special_representation) + patterns= list(st.session_state.data_encoded[f'Pattern_{col}'].unique()) + patt_col1, patt_col2 = st.columns([1,4]) + with patt_col1: + st.session_state.pattern_list= pd.DataFrame(patterns,columns=['Pattern Name']) + event = st.dataframe( + st.session_state.pattern_list, + key=f"pattern_list_data{col}", + on_select="rerun", + selection_mode=["multi-row"], + hide_index=True, + width= 10000, + height= 450 + ) + if len(event.selection.rows) > 0: + filter= list(st.session_state.pattern_list.loc[event.selection.rows]['Pattern Name'].values) + else: + filter = None + if filter is not None: + with patt_col2: + with st.container(border= True, height= 450): + st.write("#####") + + if not st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)].empty: + st.session_state.data_encoded[col] = st.session_state.data_encoded[col].astype('str') + try: + edited_pattern_df= st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)], num_rows="dynamic",column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, height=300, key=f'Valid_pattern_{col}') + except: + edited_pattern_df= st.data_editor(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)], num_rows="dynamic",column_config={ + col: st.column_config.Column( + col, + width="medium", + ) + }, height=300, key=f'Valid_pattern_{col}') + valid_pattern = st.button("Confirm", key=f"Fix_valid_pattern_{col}") + if valid_pattern: + st.session_state.data= st.session_state.data.drop(st.session_state.data_encoded[st.session_state.data_encoded[f'Pattern_{col}'].isin(filter)].index) + st.session_state.data = pd.concat([st.session_state.data, edited_pattern_df]) + st.session_state.data=st.session_state.data.drop([f'Pattern_{col}'], axis=1) + st.session_state.data= st.session_state.data.sort_index() + df_preview.write(st.session_state.data) + else: + with patt_col2: + with stylable_container( + key=f"container_select_pattern_none{col}", + css_styles=""" + { + border: 1px solid white; + border-radius: 0.5rem; + padding: calc(1em - 1px); + width: 100%; + color: orange; + size: 100px; + } + """ + ): + 
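# The pattern view above buckets values by the output of convert_to_special_representation, which is
# defined elsewhere in this module. Purely as an illustrative assumption (not the actual implementation),
# such an encoder could map every character to a class symbol so that values sharing a structure share
# a pattern string:
def _pattern_sketch(value) -> str:
    out = []
    for ch in str(value):
        if ch.isdigit():
            out.append('N')   # any digit
        elif ch.isalpha():
            out.append('A')   # any letter
        else:
            out.append(ch)    # separators and punctuation kept as-is
    return ''.join(out)
# _pattern_sketch('AB-1234') returns 'AA-NNNN', so 'CD-9876' lands in the same pattern bucket.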
st.write('##\n\n##\n\n') + st.markdown(""" + + """, unsafe_allow_html=True) + st.markdown(f'
🛈 There are a total of {len(st.session_state.pattern_list)} Patterns Available. Please Select Pattern(s) to View the Matching Records
', unsafe_allow_html=True) + st.write('##\n\n##\n\n') + + with tabb: + try: + edited_df= st.data_editor(st.session_state.data[(st.session_state.data[col].isna()) | (st.session_state.data[col] == '') | (st.session_state.data[col] == None)], num_rows="dynamic", column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, key=f'miss_{col}') + except: + edited_df= st.data_editor(st.session_state.data[(st.session_state.data[col].isna()) | (st.session_state.data[col] == '') | (st.session_state.data[col] == None)], num_rows="dynamic", column_config={ + col: st.column_config.Column( + col, + width="medium", + ) + }, key=f'miss_{col}') + + incol1,incol2, extra= st.columns([1.1,1.5,8]) + with incol1: + #st.write(st.session_state[f'dtype_{col}']) + if st.session_state[f'dtype_{col}'] not in ['int64', 'float64/numeric']: + def_fill = st.text_input("Default Autofill Value",key=f"def_fill_{col}") + autofill= st.button("Autofill", key=f"autofill_{col}") + + if autofill: + if st.session_state[f'dtype_{col}'] not in ['int','float']: + st.session_state.data[col] = st.session_state.data[col].astype('str').replace('', pd.NA).replace({None: pd.NA}).fillna(def_fill) + else: + st.session_state.data[col] = st.session_state.data[col].replace({None: pd.NA}).fillna(method='ffill') + st.success("Column Autofilled. Please Review the Sidebar for updated status of the Dataframe.") + df_preview.write(st.session_state.data) + with incol2: + confirm= st.button("Confirm", key=f"Confirm_{col}") + if confirm: + st.session_state.data[col] = st.session_state.data[col].replace('', np.nan).replace({None: np.nan}) + st.session_state.data = st.session_state.data.dropna(subset=[col]) + st.session_state.data.update(edited_df) + st.session_state.data = pd.concat([st.session_state.data, edited_df[~edited_df.index.isin(st.session_state.data.index)]]) + st.session_state.data= st.session_state.data.sort_index() + st.success("State Saved. 
Please Review the Sidebar for updated status of the Dataframe.") + df_preview.write(st.session_state.data) + with tabe: + if "overall_invalid_df" not in st.session_state: + st.session_state.overall_invalid_df = pd.DataFrame() + if (st.session_state[f'dtype_{col}'] not in ['email', 'phone_number'] and st.session_state.flag not in ['email', 'phone_number']): + st.dataframe(st.session_state.data) + AI_check= st.button("Check For Anomalies", key= f'AI_CHECK_{col}') + if AI_check: + with st.spinner("Running Anomaly Detection AI"): + #my_bar = st.progress(0, text="Progress") + + if st.session_state[f'dtype_{col}'] in ['categorical/string']: + if 'missing@123' not in st.session_state.data[col].cat.categories: + st.session_state.data[col] = st.session_state.data[col].cat.add_categories(['missing@123']) + + st.session_state.data[col] = st.session_state.data[col].fillna('missing@123').astype(str) + st.session_state.data_encoded = st.session_state.data[col].apply(convert_to_special_representation) + mixed_transformer = Pipeline(steps=[ + ('vectorizer', CountVectorizer(analyzer='char', lowercase=False)) + ]) + + df_transformed = mixed_transformer.fit_transform(st.session_state.data_encoded) + + input_dim = df_transformed.shape[1] + encoding_dim = (input_dim // 2) + 1 + + input_layer = Input(shape=(None, input_dim)) + conv1d_layer = Conv1D(64, 3, activation='relu', padding='same')(input_layer) + maxpooling_layer = MaxPooling1D(pool_size=2, padding='same')(conv1d_layer) + encoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=False))(maxpooling_layer) + + repeat_vector = RepeatVector(input_dim)(encoder_lstm) + decoder_lstm = Bidirectional(LSTM(encoding_dim, activation='relu', return_sequences=True))(repeat_vector) + conv1d_layer_decoder = Conv1D(64, 3, activation='relu', padding='same')(decoder_lstm) + upsampling_layer = Conv1D(input_dim, 2, activation='relu', padding='same')(conv1d_layer_decoder) + + autoencoder = Model(inputs=input_layer, outputs=upsampling_layer) + + autoencoder.compile(optimizer=Adam(), loss=MeanSquaredError()) + #my_bar.progress(40, text='Progress') + autoencoder.fit(np.expand_dims(df_transformed.toarray(), axis=1), np.expand_dims(df_transformed.toarray(), axis=1), + epochs=100, batch_size=2, shuffle=True, validation_split=0.2, verbose=1) + reconstructions = autoencoder.predict(np.expand_dims(df_transformed.toarray(), axis=1)) + reconstruction_error = np.mean(np.abs(reconstructions - np.expand_dims(df_transformed.toarray(), axis=1)), axis=(1, 2)) + + threshold = np.percentile(reconstruction_error, 95) # Adjust the percentile based on desired sensitivity + #my_bar.progress(90, text='Progress') + st.session_state.data['Validity'] = ['Invalid' if error > threshold else 'Valid' for error in reconstruction_error] + st.session_state.data[col] = st.session_state.data[col].replace('missing@123', '') + st.session_state[f"invalid_ai_data_{col}"]= st.session_state.data[st.session_state.data['Validity']== 'Invalid'] + #my_bar.progress(100, text='Progress') + + if f"invalid_ai_data_{col}" in st.session_state: + st.session_state[f"invalid_ai_data_{col}"]["Invalid Field"] = col + if 'Validity' in st.session_state[f"invalid_ai_data_{col}"].columns: + st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"].drop(['Validity'], axis=1)], ignore_index=True) + else: + st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f"invalid_ai_data_{col}"]], 
ignore_index=True) + + try: + edited_valid_df= st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic",column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, key=f'Valid_{col}') + except: + edited_valid_df= st.data_editor(st.session_state[f"invalid_ai_data_{col}"], num_rows="dynamic",column_config={ + col: st.column_config.Column( + col, + width="medium", + ) + }, key=f'Valid_{col}') + valid = st.button("Confirm", key=f"Fix_valid_{col}") + #my_bar.empty() + if valid: + st.session_state.data= st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity'] == 'Invalid'].index) + st.session_state.data = pd.concat([st.session_state.data, edited_valid_df]) + st.session_state.data= st.session_state.data.sort_index() + df_preview.write(st.session_state.data) + + + + + elif (st.session_state[f'dtype_{col}'] in ['phone_number'] or st.session_state.flag in ['phone_number'] ): + #st.dataframe(st.session_state.data) + phone_regex = r'^\+?[0-9\s\-\(\)]+$' + # st.write(phone_regex) + st.session_state.data['Validity_phone'] = st.session_state.data[col].apply(lambda xy: 'phone_is_valid' if re.match(phone_regex,str(xy)) else 'phone_is_invalid') + st.session_state[f'invalid_phone_{col}']= st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1) + if f'invalid_phone_{col}_check' not in st.session_state: + st.session_state[f'invalid_phone_{col}']["Invalid Field"] = col + st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_phone_{col}']], ignore_index=True, axis=0) + st.session_state[f'invalid_phone_{col}_check'] = 'yes' + try: + edited_valid_df= st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, num_rows="dynamic", key=f'Valid_phone_{col}') + except: + edited_valid_df= st.data_editor(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'], column_config={ + col: st.column_config.Column( + col, + width="medium", + ) + }, num_rows="dynamic", key=f'Valid_phone_{col}') + valid_phone = st.button("Confirm", key=f"Fix_valid_phone_{col}") + if valid_phone: + st.session_state.data= st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].index) + st.session_state.data = pd.concat([st.session_state.data, edited_valid_df]) + st.session_state[f'invalid_ph_{col}']= st.session_state.data[st.session_state.data['Validity_phone'] == 'phone_is_invalid'].drop(['Validity_phone'], axis=1) + st.session_state.data = st.session_state.data.drop(['Validity_phone'], axis=1) + + df_preview.write(st.session_state.data) + + elif (st.session_state[f'dtype_{col}'] in ['email'] or st.session_state.flag in ['email']): + email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$' + st.session_state.data['Validity_email'] = st.session_state.data[col].apply(lambda x: 'email_is_valid' if re.match(email_regex, x) else 'email_is_invalid') + if st.session_state.data[col].dtype.name == 'category': + st.session_state.data[col] = st.session_state.data[col].astype('str') + st.session_state[f'invalid_email_{col}']= st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1) + if f'invalid_email_{col}_check' not in st.session_state: + st.session_state[f'invalid_email_{col}']["Invalid 
Field"] = col + st.session_state.overall_invalid_df = pd.concat([st.session_state.overall_invalid_df, st.session_state[f'invalid_email_{col}']], ignore_index=True, axis=0) + st.session_state[f'invalid_email_{col}_check'] = 'yes' + try: + edited_valid_df= st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={ + col: st.column_config.TextColumn( + col, + width="medium", + ) + }, key=f'Valid_email_{col}') + except: + edited_valid_df= st.data_editor(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'], num_rows="dynamic", column_config={ + col: st.column_config.Column( + col, + width="medium", + ) + }, key=f'Valid_email_{col}') + valid_email = st.button("Confirm", key=f"Fix_valid_email_{col}") + if valid_email: + st.session_state.data= st.session_state.data.drop(st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].index) + st.session_state.data = pd.concat([st.session_state.data, edited_valid_df]) + st.session_state[f'invalid_em_{col}']= st.session_state.data[st.session_state.data['Validity_email'] == 'email_is_invalid'].drop(['Validity_email'], axis=1) + st.session_state.data = st.session_state.data.drop(['Validity_email'], axis=1) + df_preview.write(st.session_state.data) + + + + + with scol2: + st.markdown("**Column Being Processed**") + col_view= st.empty() + try: + col_view.write(st.session_state.data[col]) + except: + st.warning("DataFrame Updated. But Could Not Load Preview") + + pkcol1, pkcol2=st.columns(2) + with pkcol1: + if primary_key != None: + st.info(f"Primary Key Identified by AI: {primary_key}") + else: + st.warning("Could Not Finalize the Primary Key Automatically. Please go through the suggestions and Finalize one.") + with pkcol2: + st.selectbox("Please Finalize the Primary Key:", sugg_primary_keys, index= default_index) + + with st.expander("Save and Download Data"): + name_data= st.text_input("Please Specify Name of the saved/downloaded data") + csv = st.session_state.data.to_csv(index=False).encode('utf-8') + for col in ['Validity', 'Validity_email', 'Validity_phone']: + if col in st.session_state.overall_invalid_df: + st.session_state.overall_invalid_df = st.session_state.overall_invalid_df.drop([col], axis=1) + csv2 = st.session_state.overall_invalid_df.to_csv(index=False).encode('utf-8') + #st.write(st.session_state.overall_invalid_df) + # Create a download button + dldcol1, dldcol2= st.columns([1,4]) + with dldcol1: + st.download_button( + label="Download Cleaned Data as CSV", + data=csv, + file_name=f'{name_data}.csv', + mime='text/csv', + ) + with dldcol2: + st.download_button( + label="Download Anomalous Data as CSV", + data=csv2, + file_name=f'Anomaly_{name_data}.csv', + mime='text/csv', + ) + save = st.button("Save Data For Further Processing") + if save: + connection_string = ( + 'DRIVER={ODBC Driver 17 for SQL Server};' + 'SERVER=sql-ext-dev-uks-001.database.windows.net;' + 'DATABASE=sqldb-ext-dev-uks-001;' + 'UID=dbadmin;' + 'PWD=mYpa$$w0rD' + ) + st.session_state.data = st.session_state.data.astype(str) + load_dataframe_to_sqlserver(st.session_state.data, f'[dbo].[PROFILED_{name_data}]', connection_string) \ No newline at end of file diff --git a/pages/4DEDUPLICATION.py b/pages/4DEDUPLICATION.py new file mode 100644 index 0000000000000000000000000000000000000000..710a733f50847dc3b69939dd454e0561bf181a98 --- /dev/null +++ b/pages/4DEDUPLICATION.py @@ -0,0 +1,150 @@ +from logging import PlaceHolder +import pandas as 
pd +from fuzzywuzzy import fuzz +import numpy as np +import streamlit as st +import pyodbc +from streamlit_extras.stateful_button import button + +st.set_page_config(page_title='DUPLICATE RECORDS DETECTION', layout= 'wide') +st.title('Detect Duplicate Records') +st.subheader('SELECT TABLE') +conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};" + "Server=sql-ext-dev-uks-001.database.windows.net;" + "Database=sqldb-ext-dev-uks-001;" + "UID=dbadmin;" + "PWD=mYpa$$w0rD" ) +query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC" +table1=pd.read_sql_query(query1,con=conn) +table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str') +table_selector=st.selectbox('SOURCE TABLE NAME',['TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],index=None,placeholder='Select table for automated column mapping') +btn11=button('RUN',key='run11') +if table_selector is not None and btn11: + st.markdown('---') + query2="select * from [dbo].["+table_selector+"]" + df = pd.read_sql_query(query2,con=conn) + conn.close() + st.subheader('Data Preview') + data1=df.copy() + if set(['ID','LOADID','FILE_NAME']).issubset(df.columns): + df=df.drop(['ID','LOADID','FILE_NAME'],axis=1) + + df = df.replace(r'^\s*$', np.nan, regex=True) + if 'SORTL' in df.columns.values.tolist(): + df.drop('SORTL',axis=1,inplace=True) + main_col=st.multiselect('PLEASE PROVIDE CONTEXT FOR DEDUPLICATION',df.columns.values.tolist(),placeholder='Select entity for deduplication') + if main_col: + mp = df.isnull().mean()*100 ## Missing Percentage + col = mp[mp<20].index.tolist() + print(col) + up = df[col].apply(lambda x: len(x.unique())/len(x)*100) ## Unique Percentage + up.sort_values(ascending=False,inplace=True) + col = up[(up>=25)&(up<=75)].index.tolist() + df=df.replace(np.nan,'') + if len(main_col)>1: + if bool(set(col)&set(main_col)): + col=list(set(col)-set(main_col)) + df['main_column']='' + df['main_column']=df['main_column'].astype(str) + st.markdown('---') + st.write('Note: Main_column comprises of concatenated data of above selected context columns') + for i,val in enumerate(main_col): + df[main_col[i]]=df[main_col[i]].astype(str) + df['main_column']=df['main_column']+'_'+df[main_col[i]] + col.insert(0,'main_column') + rem_col=list(set(df.columns.values.tolist())-set(col)) + else: + if main_col[0] in col: + col.remove(main_col[0]) + col.insert(0,main_col[0]) + rem_col=list(set(df.columns.values.tolist())-set(col)) + + st.write('COLUMNS SUGGESTED BY AI FOR DETERMINING DUPLICATES:\n',pd.DataFrame(col,columns=['Column Name'])) + more_col=st.multiselect('DO YOU WANT TO INCLUDE ANY MORE COLUMN(s)',rem_col,placeholder='Select optional columns to check for potential duplicates') + button1=button('CHECK DUPLICATES', key='btn12') + if button1: + if more_col: + col=col+more_col + grp_col = [x+'_based_group' for x in col] + sort_col=[] + last_row_index = len(df)-1 + print(col) + threshold=80 ## Threshold is set to 80 + for j in range(len(col)): + df[col[j]]=df[col[j]].astype(str) + df[col[j]]=df[col[j]].str.upper() + df[col[j]] = df[col[j]].replace(np.nan, '', regex=True) + sort_col = sort_col+[col[j]] + df.sort_values(sort_col, inplace=True) + df = df.reset_index(drop=True) + fuzz_col = col[j]+'_fuzzy_ratio' + df.at[0,fuzz_col]=100 + df.at[last_row_index,fuzz_col]=100 + for i in range(1,last_row_index): + current = df[col[j]].iloc[i] + previous = df[col[j]].iloc[i-1] + fuzzy_ratio = fuzz.ratio(previous,current) + df.at[i,fuzz_col] = fuzzy_ratio + df[fuzz_col] = 
pd.to_numeric(df[fuzz_col], errors='coerce') + group_counter = 1 + fuzz_group = col[j]+'_based_group' + df.at[0,fuzz_group] = group_counter + group = df.at[0,fuzz_group] + for i in range (1, len(df)): + if df.at[i,fuzz_col] > threshold: + df.at[i,fuzz_group] = df.at[i-1,fuzz_group] + else: + if j>=1: + if df.at[i,col[j-1]+'_fuzzy_ratio'] != group: + group_counter = 1 + group = df.at[i,col[j-1]+'_based_group'] + else: + group_counter +=1 + else: + group_counter += 1 + df.at[i,fuzz_group] = group_counter + #threshold=threshold*0.9 + df['Potential_Duplicate_Cluster'] = df[grp_col].astype(int).astype(str).agg('_'.join, axis=1) + df['DUPLICATES?']=df.duplicated(subset='Potential_Duplicate_Cluster',keep=False).map({True:'Y',False:'N'}) + df = df[df['DUPLICATES?']=='Y'] + # Define a function to apply the style + def highlight_y(val): + color = 'lightcoral' if val=='Y' else 'white' + return f'background-color: {color}' + + # Apply styles + styled_df = df.style.applymap(highlight_y,subset=['DUPLICATES?']) + + st.markdown('---') + st.subheader('Results') + #st.write(styled_df) + edited_df=st.data_editor(styled_df,num_rows='dynamic') + + #out=df.to_csv() + #st.download_button(label='DOWNLOAD DUPLICATE CLUSTER DATA',data=out, file_name='Duplicate_Clusters.csv',mime='text/csv') + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pages/5SOURCE TO TARGET MAPPING.py b/pages/5SOURCE TO TARGET MAPPING.py new file mode 100644 index 0000000000000000000000000000000000000000..8e729196d06a901428d0f7ad985e3635737504dd --- /dev/null +++ b/pages/5SOURCE TO TARGET MAPPING.py @@ -0,0 +1,1352 @@ +# -*- coding: utf-8 -*- + +import pandas as pd +import numpy as np +from sklearn.feature_extraction.text import CountVectorizer +import regex as re +import streamlit as st +import pyodbc +import datetime +import google.generativeai as genai +import textwrap +import json +from streamlit_extras.stateful_button import button +from streamlit_extras.stylable_container import stylable_container +import sdv +from sdv.metadata import MultiTableMetadata +from collections import defaultdict +genai.configure(api_key='AIzaSyCeY8jSHKW6t0OSDRjc2VAfBvMunVrff2w') +# Create a GenerativeModel instance +model = genai.GenerativeModel( + model_name='models/gemini-1.5-flash' +) + +def read_excel(path, sheet): + df = pd.read_excel(path, sheet_name = sheet, dtype = 'str') + return df + +def split_join_condition(join_condition): + conditions = [] + condition = '' + bracket_count = 0 + + for char in join_condition: + if char == '(': + bracket_count += 1 + elif char == ')': + bracket_count -+ 1 + if char == ',' and bracket_count == 0: + conditions.append(condition.strip()) + condition = '' + else: + condition += char + if condition: + conditions.append(condition.strip()) + + return conditions + +def join_incr(join_conditions): + join = [] + join_pattern = re.compile(r'(\w+\.\w+)\s*=\s*(\w+\w.\w+)', re.IGNORECASE) + for join_condition in join_conditions: + parts = re.split(r'\sAND\s|\sOR\s', join_condition, flags = re.IGNORECASE) + temp = [x.strip() for x in parts if join_pattern.match(x.strip())] + join.append(' AND '.join(temp)) + return join + +def generate_sql(temp_table): + proc_query = [] + base_table = None + + source_table_schema = 'MAIN.GOLD' + temp_table_schema = 'MAIN.GOLD' + base_pk = [] + + join_fields = set() + + for _,row in df.iterrows(): + source_table = row['Source Table'] + primary_key = row['Primary Key'] + source_column = row['Source Column'] + alias = row['Alias'] + joining_keys = row['Joining Keys'] + + if not 
base_table: + if primary_key == 'Y': + base_table = source_table + base_pk.append(joining_keys) + + if pd.notna(joining_keys): + keys = [x.strip() for x in joining_keys.split(',')] + for x in keys: + if x not in join_fields: + join_fields.add(x) + + unique_cols = ['Source Table', 'Joining Keys', 'Primary Key', 'Join Type','Join Tables','Join Condition'] + unique_df = df.drop_duplicates(subset = unique_cols) + + incremantal_mapping = {} + incr_joins = {} + + for _,row in unique_df.iterrows(): + + source_table = row['Source Table'] + source_column = row['Source Column'] + joining_keys = row['Joining Keys'] + primary_key = row['Primary Key'] + direct_derived = row['Direct/Derived'] + join_type = row['Join Type'] + join_tables = row['Join Tables'] + join_condition = row['Join Condition'] + + if source_table == base_table: + if primary_key == 'Y': + key = (source_table, joining_keys, join_type, join_tables, join_condition) + key1 = source_table + else: + continue + else: + key = (source_table, joining_keys, join_type, join_tables, join_condition) + key1 = source_table + if pd.notna(direct_derived) and pd.notna(source_table) and pd.notna(source_column): + if key not in incremantal_mapping: + incremantal_mapping[key] = { + 'source_table': source_table, + 'joining_keys':joining_keys, + 'join_type': join_type, + 'join_tables': join_tables, + 'join_condition': join_condition + } + if key1 not in incr_joins: + if pd.notna(direct_derived) and direct_derived == 'DERIVED': + incr_joins[key1] = { + 'join_type': join_type, + 'join_tables': ', '.join([x.strip() for x in join_tables.split(',') if x != base_table]), + 'join_condition': join_condition + } + incremental_df = pd.DataFrame(incremantal_mapping.values()) + incr_join_grps = incremental_df.groupby(['source_table']) + proc_query.append(f'TRUNCATE TABLE {temp_table_schema}.{temp_table}_INCR;') + + incr_table_join_info = {} + for _,row in incremental_df.iterrows(): + source_table = row['source_table'] + + if source_table != base_table: + joining_keys = row['joining_keys'] + join_type = row['join_type'] + join_tables = [x.strip() for x in row['join_tables'].split(',')] + index = join_tables.index(source_table) + join_condition = [x.strip() for x in row['join_condition'].split(',')][0:index] + incr_table_join_info[source_table] = ', '.join(join_condition) + + incr_query = [] + incr_cols = '' + incr_tables = [] + incr_join = {} + + for _, group in incr_join_grps: + + for table in _.split(): + if base_table != table: + + join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')] + join_keys = [t.strip() for t in ','.join(base_pk).split(',')] + join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')] + join_cond = split_join_condition(incr_table_join_info[table]) + join_condition = join_incr(join_cond) + source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')] + + join_key_list = [] + for x in join_keys: + join_key_list.append(f'{base_table}.{x}') + join_key = ', '.join(join_key_list) + + for y in source_table: + sql = f""" + INSERT INTO {temp_table_schema}.{temp_table}_INCR + ( + SELECT {join_key}, {table_details_mapping[y][0]}, {table_details_mapping[y][1]}, '{y}', 1, CURRENT_TIMESTAMP + FROM {source_table_schema}.{base_table} {base_table}""" + + incr_join_text = '' + for i in range(len(join_condition)): + sql += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}' + incr_join_text += f'\n\t{join_type[i]} JOIN {source_table_schema}.{join_tables[i+1]} 
{join_tables[i+1]} ON {join_condition[i]}' + incr_join[y] = incr_join_text + + sql += f""" + WHERE COALESCE({join_tables[i+1]}.operation,'NA') <> 'D' + AND TO_TIMESTAMP( CAST(SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),1,4) || '-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),5,2) ||'-' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),7,2) || ' ' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),9,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),11,2) ||':' || SUBSTRING(({join_tables[i+1]}._hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{y}') + );""" + + incr_query.append(sql) + incr_tables.append(y) + + else: + source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')] + join_keys = [t.strip() for t in group['joining_keys'].iloc[0].split(',')] + + join_key_list = [] + for x in join_keys: + join_key_list.append(f'{base_table}.{x}') + join_key = ', '.join(join_key_list) + + incr_cols = join_key + sql = f""" + INSERT INTO {temp_table_schema}.{temp_table}_INCR + ( + SELECT {join_key}, {table_details_mapping[base_table][0]}, {table_details_mapping[base_table][1]}, '{base_table}', 1, CURRENT_TIMESTAMP + FROM {source_table_schema}.{base_table} {base_table} + WHERE COALESCE(operation,'NA') <> 'D' + AND TO_TIMESTAMP( CAST(SUBSTRING((_hoodie_commit_time),1,4) || '-' || SUBSTRING((_hoodie_commit_time),5,2) ||'-' || SUBSTRING((_hoodie_commit_time),7,2) || ' ' || SUBSTRING((_hoodie_commit_time),9,2) ||':' || SUBSTRING((_hoodie_commit_time),11,2) ||':' || SUBSTRING((_hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS') > (SELECT MAX(max_update_date) FROM audit.reportingdb_audit_tbl_{temp_table} WHERE mart_table_name='{temp_table}' and src_table_name='{base_table}') + );""" + proc_query.append(sql) + incr_tables.append(base_table) + + proc_query.append('\n'.join(incr_query)) + proc_query.append(f'TRUNCATE TABLE {temp_table_schema}.INCR1_{temp_table};') + + sql = f""" + INSERT INTO {temp_table_schema}.INCR1_{temp_table} + ( + SELECT DISTINCT {incr_cols.replace(f'{base_table}.', '')} + FROM {temp_table_schema}.{temp_table}_INCR + );""" + + proc_query.append(sql) + + incr_table_dict = {} + for table in incr_tables: + if table == base_table: + incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}' + else: + p = [x for x in incr_join[table].split('\n\t') if len(x) > 1] + if len(p) == 1: + incr_table_dict[table] = f'{temp_table_schema}.INCR2_{table}' + else: + incr_table_dict[table] = f'{source_table_schema}.{table}' + + s = [] + for table in incr_tables: + incr2_sql_list = [] + + if table == base_table: + for key in incr_cols.replace(f'{base_table}.', '').split(','): + incr2_sql_list.append(f"{base_table}.{key} = A.{key}") + incr2_sql_join = ' AND '.join(incr2_sql_list) + + sql = f""" + CREATE TABLE {temp_table_schema}.INCR2_{table} + AS + SELECT + {table}.* + FROM + {source_table_schema}.{table} {table} + INNER JOIN + {temp_table_schema}.INCR1_{temp_table} A ON {incr2_sql_join}; """ + proc_query.append(f'DROP TABLE IF EXISTS {temp_table_schema}.INCR2_{table};') + proc_query.append(sql) + + else: + + p = [x for x in incr_join[table].split('\n\t') if len(x) > 1] + if len(p) == 1: + sql = f""" + CREATE TABLE {temp_table_schema}.INCR2_{table} + AS + SELECT + {table}.* + FROM + {temp_table_schema}.INCR2_{base_table} {base_table} {incr_join[table]};""" + s.append(f'DROP TABLE IF EXISTS 
{temp_table_schema}.INCR2_{table};') + s.append(sql) + + for x in s: + proc_query.append(x) + + select_clause = [] + from_clause = [] + where_clause = [] + + for _,row in df.iterrows(): + field_name = row['Field_Name'] + source_table = row['Source Table'] + source_column = row['Source Column'] + joining_keys = row['Joining Keys'] + primary_key = row['Primary Key'] + direct_derived = row['Direct/Derived'] + join_type = row['Join Type'] + join_tables = row['Join Tables'] + join_condition = row['Join Condition'] + column_operation = row['Column Operations'] + alias = row['Alias'] + granularity = row['Granularity'] + filter_condition = row['Filter Condition'] + clauses = row['Clauses'] + ordering = row['Ordering'] + + if pd.notna(direct_derived): + if pd.notna(column_operation): + if len(column_operation.split()) == 1: + select_expr = f'{column_operation.upper()}({source_table}.{source_column})' + else: + select_expr = column_operation + else: + if pd.notna(source_table): + select_expr = f'{source_table}.{source_column}' + else: + select_expr = source_column + + if source_column not in join_fields: + if pd.notna(alias): + select_expr += f' AS {alias}' + else: + if pd.notna(column_operation) and pd.notna(source_column): + select_expr += f' AS {source_column}' + + if direct_derived.upper() == 'DIRECT': + select_clause.append(select_expr) + elif direct_derived.upper() == 'DERIVED_BASE': + select_clause.append(select_expr) + + if pd.notna(filter_condition): + where_clause.append(filter_condition) + + select_query = ',\n\t'.join(select_clause) + sql_query = f"CREATE TABLE {temp_table_schema}.{base_table}_BASE\nAS \n\tSELECT \n\t{select_query} \nFROM\n\t{incr_table_dict[base_table]} {base_table}" + if where_clause: + sql_query += f"\nWHERE {' AND'.join(where_clause)}" + sql_query += ';' + proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{base_table}_BASE;") + proc_query.append(sql_query) + + df['Clauses'].fillna('', inplace = True) + df['Ordering'].fillna('', inplace = True) + c = 1 + temp_base_table = f'{base_table}_BASE' + grp_cols = ['Join Condition', 'Clauses', 'Ordering'] + join_grps = df[df['Direct/Derived'] == 'DERIVED'].groupby(['Join Condition', 'Clauses', 'Ordering']) + temp_tables_sql = [] + for (join_condition,clauses,ordering), group in join_grps: + if pd.notna(group['Direct/Derived'].iloc[0]): + if group['Direct/Derived'].iloc[0].upper() == 'DERIVED': + join_tables = [t.strip() for t in group['Join Tables'].iloc[0].split(',')] + join_keys = [t.strip() for t in group['Joining Keys'].iloc[0].split(',')] + join_type = [t.strip() for t in group['Join Type'].iloc[0].split(',')] + join_condition = split_join_condition(group['Join Condition'].iloc[0]) + temp_table_name = f"TEMP_{group['Source Table'].iloc[0]}" + source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')] + alias = [t.strip() for t in (','.join(group['Alias'])).split(',')] + source_table = [t.strip() for t in (','.join(group['Source Table'])).split(',')] + + base_cols = [] + for join_key in join_keys: + base_cols.append(f'{join_tables[0]}.{join_key}') + + for s_table,col,alias in zip(source_table,source_column,alias): + if pd.notna(group['Column Operations'].iloc[0]): + if len(group['Column Operations'].iloc[0].split()) == 1: + select_expr = f"{group['Column Operations'].iloc[0].upper()}({s_table}.{col})" + else: + select_expr = group['Column Operations'].iloc[0] + else: + if pd.notna(s_table): + select_expr = f"{s_table}.{col}" + else: + select_expr = col + + if alias: + select_expr += f" AS 
{alias}" + base_cols.append(select_expr) + + if ordering: + base_cols.append(f"{ordering} AS RN") + + sql = ',\n\t\t'.join(base_cols) + + join_sql = f"SELECT \n\t\t{sql} \nFROM\n\t{incr_table_dict[base_table]} {join_tables[0]}" + for i in range(len(join_type)): + join_sql += f'\n\t{join_type[i]} JOIN {incr_table_dict[join_tables[i+1]]} {join_tables[i+1]} ON {join_condition[i]}' + if clauses: + join_sql += f'\n\t{clauses}' + join_sql += ';' + + proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.{temp_table_name};") + proc_query.append(f"CREATE TABLE {temp_table_schema}.{temp_table_name}\nAS \n\t{join_sql}") + + granularity = [t.strip() for t in group['Granularity'].iloc[0].split(',')] + + sql = [] + for key in join_keys: + sql.append(f"A.{key} = B.{key}") + + temp_cols = [] + temp_cols.append('A.*') + + source_column = [t.strip() for t in (','.join(group['Source Column'])).split(',')] + alias = [t.strip() for t in (','.join(group['Alias'])).split(',')] + + for col,alias in zip(source_column,alias): + select_expr = f"B.{col}" + if alias: + select_expr = f"B.{alias}" + else: + select_expr = f"B.{col}" + temp_cols.append(select_expr) + + temp_select_query = ',\n\t\t'.join(temp_cols) + + proc_query.append(f"DROP TABLE IF EXISTS {temp_table_schema}.TEMP_{temp_table}_{c};") + + base_sql = f"CREATE TABLE {temp_table_schema}.TEMP_{temp_table}_{c}\nAS \n\tSELECT \n\t\t{temp_select_query} \nFROM\n\t{temp_table_schema}.{temp_base_table} AS A" + base_sql += f"\n\tLEFT OUTER JOIN {temp_table_schema}.{temp_table_name} B ON {' AND '.join(sql)}" + + if '1:1' in granularity and len(ordering) > 1: + base_sql += f" AND B.RN = 1" + base_sql += ';' + + temp_base_table = f'TEMP_{temp_table}_{c}' + c += 1 + proc_query.append(base_sql) + + fin_table_name = temp_table + fin_table_cols = [] + + for _,row in df.iterrows(): + field_name = row['Field_Name'] + source_table = row['Source Table'] + source_column = row['Source Column'] + alias = row['Alias'] + + if pd.notna(row['Direct/Derived']): + if (source_column in join_fields): + fin_table_cols.append(f'{source_column} AS "{field_name}"') + else: + fin_table_cols.append(f'"{field_name}"') + + fin_table_cols = ',\n\t\t'.join(fin_table_cols) + fin_sql = f"INSERT INTO {temp_table_schema}.{fin_table_name}\n\tSELECT \n\t\t{fin_table_cols} \nFROM\n\t{temp_table_schema}.TEMP_{temp_table}_{c-1};" + + + condition_col = '_'.join(incr_cols.replace(f'{base_table}.', '').split(',')) + proc_query.append(f"DELETE FROM {temp_table_schema}.{fin_table_name}\nWHERE {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} IN (SELECT {'_'.join(incr_cols.replace(f'{base_table}.', '').split(','))} FROM {temp_table_schema}.INCR1_{temp_table});") + proc_query.append(fin_sql) + + for table in incr_tables: + sql = f""" + INSERT INTO audit.reportingdb_audit_tbl_{temp_table} + ( + SELECT + '{temp_table}' as mart_table_name, + '{table}' as src_table_name, + coalesce( max(TO_TIMESTAMP( CAST(SUBSTRING((_hoodie_commit_time),1,4) || '-' || SUBSTRING((_hoodie_commit_time),5,2) ||'-' || SUBSTRING((_hoodie_commit_time),7,2) || ' ' || SUBSTRING((_hoodie_commit_time),9,2) ||':' || SUBSTRING((_hoodie_commit_time),11,2) ||':' || SUBSTRING((_hoodie_commit_time),13,2) AS VARCHAR(30)), 'YYYY-MM-DD HH:MI:SS')),(select max(max_update_date) from audit.reportingdb_audit_tbl_{temp_table} where Mart_Table_Name='{temp_table}' and Src_Table_Name= '{table}')) max_update_date, + CURRENT_TIMESTAMP as load_timestamp, + coalesce(max(prev_updt_ts),(select max(source_reference_date) from 
audit.reportingdb_audit_tbl_{temp_table} where Mart_Table_Name='{temp_table}' and Src_Table_Name= '{table}')) AS source_reference_date, + max(nvl(batch_number,0))+1 + FROM {temp_table_schema}.{temp_table}_INCR where table_name = '{table}' + );""" + proc_query.append(sql) + + return base_table, base_pk, proc_query, incr_join_grps, incr_table_join_info, incr_join, temp_table_schema + +def create_df(query, table_df_mapping, table_usage_count): + script = [] + query = ' '.join(query.split()).strip() + match = re.match(r'CREATE TABLE (\w+\.\w+\.\w+) AS (SELECT .+)', query, re.IGNORECASE) + source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE) + source_tables = [table for pair in source_tables for table in pair if table] + + if not match: + raise ValueError('Invalid SQL') + table_name = match.group(1).split('.')[2] + select_statement = match.group(2) + create_script = f'{table_name} = spark.sql(""" {select_statement} """)' + persist_script = f'{table_name} = {table_name}.persist()' + view_script = f'{table_name}.createOrReplaceTempView("{table_name}")' + + for table in source_tables: + create_script = create_script.replace(table, table_df_mapping[table]) + + script.append(f"\n\t\t######################---------Creating table {create_script.split('=')[0].strip()}-------############################") + script.append(create_script) + script.append(persist_script) + script.append(view_script) + script.append(f'''print("{create_script.split('=')[0].strip()} count: ", {create_script.split('=')[0].strip()}.count()''') + + if 'INCR2_' in table_name: + x = table_name.split('INCR2_')[1] + if x in table_details_mapping.keys(): + script.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################") + script.append(f"{x}_max_update_date = INCR2_{x}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]") + script.append(f"{x}_max_source_reference_date = INCR2_{x}.agg(max(to_timestamp('{table_details_mapping[x][1].replace(x+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]") + script.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{x}',{x}_max_update_date,{x}_max_source_reference_date, max_batch_id, config)") + script.append('\n') + + for table in source_tables: + table_usage_count[table.split('.')[2]] -= 1 + + for table in source_tables: + if table_usage_count[table.split('.')[2]] == 0 and 'INCR1_' not in table: + unpersist_script = f"{table.split('.')[2]}.unpersist()" + script.append(unpersist_script) + + return '\n\t\t'.join(script) + +def generate_spark(proc_query, incr_join_grps, base_table, base_pk, incr_table_join_info, incr_join, temp_table_schema): + table_usage_count = defaultdict(int) + table_df_mapping = {} + + for query in proc_query: + if 'CREATE TABLE' or 'DELETE' in query: + source_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE) + source_tables = [table for pair in source_tables for table in pair if table] + for table in source_tables: + table_usage_count[table.split('.')[2]] += 1 + if 'DELETE' not in query: + table_df_mapping[table] = table.split('.')[2] + + script = [] + for query in proc_query: + if 'CREATE TABLE' in query: + script.append(create_df(query, table_df_mapping,table_usage_count)) + + spark_query = [] + spark_query.append("\t\t######################---------Reading source data -------############################") + for table in table_details_mapping.keys(): + spark_query.append(f'{table} = 
read_file(spark, config, \"{table}\").filter("{table_details_mapping[table][2]}")') + spark_query.append(f'{table} = {table}.persist()') + spark_query.append(f'{table}.createOrReplaceTempView("{table}")') + spark_query.append(f'print("{table} count: ", {table}.count()') + spark_query.append('\n') + + spark_query.append("\n\t\t######################---------Reading records-------############################") + for table in table_details_mapping.keys(): + spark_query.append(f"{table}_max_update_date = read_max_update_date(redshift_conn, config['application_name'],'{table}', config)") + spark_query.append(f'{table}_max_update_date = {table}_max_update_date[0][0]') + spark_query.append('\n') + + incr1_spark = [] + temp_incr1 = [] + for _, group in incr_join_grps: + for table in _.split(): + if base_table != table: + join_tables = [t.strip() for t in group['join_tables'].iloc[0].split(',')] + join_keys = [t.strip() for t in ','.join(base_pk).split(',')] + join_type = [t.strip() for t in group['join_type'].iloc[0].split(',')] + join_cond = split_join_condition(incr_table_join_info[table]) + join_condition = join_incr(join_cond) + source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')] + + join_key_list = [] + for x in join_keys: + join_key_list.append(f'{base_table}.{x}') + join_key = ', '.join(join_key_list) + + for y in source_table: + sql = f"""SELECT {join_key} FROM {base_table} {base_table}""" + + incr_join_text = '' + i=0 + for i in range(len(join_condition)): + sql += f' {join_type[i]} JOIN {join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}' + incr_join_text += f' {join_type[i]} JOIN {join_tables[i+1]} {join_tables[i+1]} ON {join_condition[i]}' + + sql += f''' WHERE {join_tables[i+1]}._hoodie_commit_time > cast('"""+str({join_tables[i+1]}_max_update_date)+"""' as timestamp)''' + temp_incr1.append(sql) + + else: + source_table = [t.strip() for t in group['source_table'].iloc[0].split(',')] + join_keys = [t.strip() for t in group['joining_keys'].iloc[0].split(',')] + + join_key_list = [] + for x in join_keys: + join_key_list.append(f'{base_table}.{x}') + join_key = ', '.join(join_key_list) + + sql = f'''SELECT {join_key} FROM {base_table} {base_table} WHERE {base_table}._hoodie_commit_time > cast('"""+str({base_table}_max_update_date)+"""' as timestamp)''' + incr1_spark.append(sql) + for i in temp_incr1: + incr1_spark.append(i) + incr1_spark = '\nUNION\n'.join(incr1_spark) + spark_query.append("\n\t\t######################---------Creating INCR1-------############################") + spark_query.append(f'INCR1_{temp_table} = spark.sql(""" {incr1_spark} """)') + spark_query.append(f'\n\t\tINCR1_{temp_table} = INCR1_{temp_table}.dropDuplicates()') + spark_query.append(f'INCR1_{temp_table} = INCR1_{temp_table}.persist()') + spark_query.append(f'INCR1_{temp_table}.createOrReplaceTempView("INCR1_{temp_table}")') + spark_query.append(f'print("INCR1_{temp_table} count: ", INCR1_{temp_table}.count())') + + spark_query.append("\n\t\t######################---------Creating INCR2-------############################") + for table in table_details_mapping.keys(): + if table in incr_join.keys(): + p = [x for x in incr_join[table].split('\n\t') if len(x) > 1] + if len(p) > 1: + spark_query.append(f"\n\t\t######################---------Updating the max_update_date in audit-------############################") + spark_query.append(f"{table}_max_update_date = {table}.agg({{'_hoodie_commit_time' : 'max'}}).first()[0]") + spark_query.append(f"{table}_max_source_reference_date = 
{table}.agg(max(to_timestamp('{table_details_mapping[table][1].replace(table+'.','')}','yyyy-MM-dd-HH.mm.ss.SSSSSS'))).first()[0]") + spark_query.append(f"insert_max_update_date(spark,redshift_conn, config['application_name'],'{table}',{table}_max_update_date,{table}_max_source_reference_date, max_batch_id, config)") + spark_query.append('\n') + + for query in script: + spark_query.append(query) + spark_query.append('\n') + + spark_query1 = [] + spark_query1.append('\n') + for query in proc_query: + if f'{temp_table_schema}.{temp_table}\n' in query: + final_tables = re.findall(r'\bFROM\s+(\w+\.\w+\.\w+)|\bJOIN\s+(\w+\.\w+\.\w+)', query, re.IGNORECASE) + final_tables = [table.split('.')[2].strip() for pair in final_tables for table in pair if table and table.split('.')[2].strip() != temp_table][0] + if 'INCR1_' in final_tables: + spark_query.append(f"{final_tables}.write.mode('overwrite').parquet(config['incr2df_path'])") + else: + spark_query.append(f"{final_tables}.write.mode('overwrite').parquet(config['resultdf_path'])") + spark_query1.append(f'''cur.execute(""" {query} """)''') + spark_query1.append('\n') + + with open('template.txt') as file: + template = file.read() + + result = template.replace('INSERT_CODE_1', '\n\t\t'.join(spark_query)) + result = result.replace('INSERT_CODE_2', '\t\t'.join(spark_query1)) + + return result + + + +st.set_page_config(page_title='AUTOMATED SOURCE TO TARGET MAPPING', layout= 'wide') +st.markdown(""" + + """, unsafe_allow_html=True) +st.subheader('AUTOMATED SOURCE TO TARGET MAPPING') +mode= st.selectbox('Select Mode of Mapping',('Supervised Mapping(You Have Sufficient Sample Data in Target Template)', 'Unsupervised Mapping(You Do Not Have Sufficient Sample Data in Target Template)'), index=None,placeholder='Select category of table') +if mode == 'Supervised Mapping(You Have Sufficient Sample Data in Target Template)': + conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};" + "Server=sql-ext-dev-uks-001.database.windows.net;" + "Database=sqldb-ext-dev-uks-001;" + "UID=dbadmin;" + "PWD=mYpa$$w0rD" ) + query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='dbo' ORDER BY TABLE_NAME ASC" + table1=pd.read_sql_query(query1,con=conn) + st.session_state.table1_un= table1 + table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str') + colsel1, colsel2= st.columns(2) + with colsel1: + table_selector=st.selectbox('SOURCE TABLE NAME',['TCM', 'TCVM','TEM', 'TPM', 'TPP', 'TPT', 'TRM', 'TSCM', 'TSM'],index=None,placeholder='Select table for automated column mapping') + with colsel2: + target_selector=st.selectbox('TARGET TABLE NAME',['POLICY_MAPPINGTARGET_TBL','FINANCE_MAAPINGTARGET_TBL','CUSTOMER_MASTER_TARGET'],index=None,placeholder='Select target table') + st.session_state.target_selector_un = target_selector + #migrate_opt=st.toggle('DO YOU ALSO WANT TO MIGRATE DATA TO TARGET TABLE') + if table_selector is not None and target_selector is not None: + btn=button('RUN',key='RUN_GENAI_UN') + if target_selector is not None and btn and f'{table_selector}_{target_selector}_map_un' not in st.session_state: + query2="select * from ["+ table1['TABLE_SCHEMA'][0]+"].["+table_selector+"]" + i_df = pd.read_sql_query(query2,con=conn) + # conn.close() + i_df=i_df.drop(['ID','LOADID','FILE_NAME'],axis=1) + st.session_state['source_data_un'] = i_df + #st.markdown('---') + # st.subheader('Souce Data Preview') + # st.dataframe(i_df) + query3="select * from ["+ table1['TABLE_SCHEMA'][0]+"].["+target_selector+"]" + 
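# The audit-column guard applied just below checks whether ['ID','LOADID','FILE_NAME'] occurs as a
# contiguous run inside the target table's column list before dropping those columns. A minimal
# sketch of that idiom on toy data (the other column names here are illustrative):
main_list = ['ID', 'LOADID', 'FILE_NAME', 'POLICY_NO', 'CUSTOMER_NAME']
sub_list = ['ID', 'LOADID', 'FILE_NAME']
has_audit_block = any(
    main_list[i:i + len(sub_list)] == sub_list
    for i in range(len(main_list) - len(sub_list) + 1)
)
# has_audit_block is True here; if the three columns were absent or not adjacent it would be False.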
tgt_df=pd.read_sql_query(query3,con=conn).reset_index(drop=True) + main_list=tgt_df.columns.to_list() + sub_list=['ID','LOADID','FILE_NAME'] + if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)): + tgt_df=tgt_df.drop(['ID','LOADID','FILE_NAME'],axis=1) + st.session_state.opt_un= list(tgt_df.columns) + st.session_state['target_data_un'] = tgt_df.head(20).reset_index() + # if tgt: + # # st.subheader('Target Table Preview') + # # st.write(tgt_df.sample(20).reset_index(drop=True)) + # # st.markdown('---') + + with st.spinner('Running data on neural network...'): + df=pd.read_csv('C:\\Applications\\MARCO POLO O AIML\\DATA CATALOG\\pages\\CUSTOMER_MASTER_TRAIN_1306.csv') #POLICY + cols=df.columns.tolist() + data=pd.DataFrame(columns=['DATA','LABEL']) + temp=pd.DataFrame(columns=['DATA','LABEL']) + for x in cols: + temp['DATA']=df[x] + temp['LABEL']=x + data=pd.concat([data,temp],ignore_index=True) + data['DATA']=data['DATA'].astype('string') + data['LABEL']=data['LABEL'].astype('string') + data=data.dropna() + data=data.reset_index(drop=True) + + + + + #FEATURE_EXTRACTION BAG OF CHARACTERS + vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 3), min_df=1) + X=vectorizer.fit_transform(data['DATA']) + feature=pd.DataFrame(data=X.toarray(),columns=vectorizer.get_feature_names_out()) + data1=pd.concat([data,feature],axis=1) + + #FEATURE_SELECTION + from sklearn.feature_selection import chi2 + chi_x=data1.drop(['DATA','LABEL'],axis=1) + chi_y=data1['LABEL'] + chi_scores=chi2(chi_x,chi_y) + p_values=pd.Series(chi_scores[1],index=chi_x.columns) + p_values=p_values.sort_values(ascending=True).reset_index() + feature_chi=p_values['index'][:1000] + data2=data1[feature_chi.to_list()] + data2=pd.concat([data,data2],axis=1) + + #FEATURE EXTRACTION GENERAL + def count_digits(str1): + return len("".join(re.findall("\d+", str1))) + + def count_vowels(string): + vowels = "aeiouAEIOU" + count = 0 + for char in string: + if char in vowels: + count += 1 + return count + + def count_special_character(string): + special_characters = "!@#$%^&*()-+?_=,<>/" + special_char = 0 + for i in range(0, len(string)): + if (string[i] in special_characters): + special_char += 1 + return special_char + + def count_spaces(string): + spaces = 0 + for char in string: + if char == " ": + spaces += 1 + return spaces + + data2['LENGTH']=data2['DATA'].apply(lambda x:len(x)) + data2['digit_c']=data2['DATA'].apply(lambda x:count_digits(x)) + data2['vowel_c']=data2['DATA'].apply(lambda x:count_vowels(x)) + data2['spchar_c']=data2['DATA'].apply(lambda x:count_special_character(x)) + data2['space_c']=data2['DATA'].apply(lambda x:count_spaces(x)) + + chi_scores1=chi2(data2[['LENGTH','digit_c','vowel_c','spchar_c','space_c']],data2['LABEL']) + p_values1=pd.Series(chi_scores1[1],index=data2[['LENGTH','digit_c','vowel_c','spchar_c','space_c']].columns).sort_values(ascending=True).reset_index() + + #MODEL + import tensorflow as tf + from tensorflow.keras import layers + from tensorflow import keras + + from sklearn.model_selection import train_test_split + from ast import literal_eval + + train_df, test_df = train_test_split(data2,test_size=.1,stratify=data2['LABEL'].values) + val_df = test_df.sample(frac=0.5) + test_df.drop(val_df.index, inplace=True) + + terms = tf.ragged.constant(data2['LABEL'].values) + lookup = tf.keras.layers.StringLookup(output_mode="one_hot") + lookup.adapt(terms) + vocab = lookup.get_vocabulary() + + def invert_multi_hot(encoded_labels): + hot_indices = 
np.argwhere(encoded_labels == 1.0)[..., 0] + return np.take(vocab, hot_indices) + + max_seqlen = 150 + batch_size = 128 + padding_token = "