import pandas as pd import numpy as np import streamlit as st import sdv from sdv.datasets.local import load_csvs from sdv.metadata import MultiTableMetadata from sdv.multi_table import HMASynthesizer import time import os import gc import warnings import plotly.express as px from PIL import Image from io import BytesIO from IPython.display import Image as IPImage, display from PyPDF2 import PdfReader from fpdf import FPDF import statsmodels.api as sm import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px from IPython.display import Markdown import kaleido st.set_page_config(page_title='DATA DISCOVERY') st.title('AUTOMATED DATA CATALOGUE') st.subheader('SELECT DATABASE') select1=st.selectbox('DATABASE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name') if select1=='DB_10001': with st.spinner('Performing Data Discovery'): time.sleep(2) st.success('Data cataloguing complete!') datasets = load_csvs( folder_name='BIKE_STORE_DATABASE/', read_csv_parameters={ 'skipinitialspace': True, 'encoding': 'utf_8' }) metadata = MultiTableMetadata() metadata.detect_from_csvs(folder_name='BIKE_STORE_DATABASE/') python_dict = metadata.to_dict() st.markdown('---') st.subheader('DATA CATALOG') # st.json(python_dict) brands=datasets['brands'] categories=datasets['categories'] customers=datasets['customers'] orderitems=datasets['order_items'] orders=datasets['orders'] products=datasets['products'] staffs=datasets['staffs'] stocks=datasets['stocks'] stores=datasets['stores'] tables=python_dict['tables'] table_names=[*tables] col1, col2, col3 = st.columns([2,2,2]) with col1: def view_callback(): st.session_state.tdet = False view= st.button("LIST TABLES",key='view',on_click=view_callback) with col2: if 'tdet' not in st.session_state: st.session_state.tdet = False tdet1 = st.button("SHOW TABLE DETAILS") with col3: rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback) if tdet1: st.session_state.tdet = tdet1 if view: #st.write(python_dict) st.write(pd.DataFrame(table_names,columns=['TABLE NAME'])) if rel: rlist1=python_dict['relationships'] rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY']) for i in range(len(rlist1)): rlist=rlist1[i] nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i]) rdf=pd.concat([rdf,nrow],ignore_index=True) st.write(rdf) if st.session_state.tdet is True: def tdet_callback(): st.session_state.tdet=True st.subheader('Select table name to view') sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback) col4, col5 = st.columns([1, 3]) with col4: preview= st.button("PREVIEW TABLE",key='preview') with col5: cdet = st.button("GET COLUMN DETAILS",key='prof') if preview: st.write(datasets[sbox1]) if cdet: cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information']) t_dict=tables[sbox1] c_dict=t_dict['columns'] i=0 for key in c_dict: e_dict=c_dict[key] if 'pii' in e_dict: p='YES' else: p='NO' if e_dict['sdtype']=='datetime': v=e_dict['sdtype']+': '+e_dict['datetime_format'] else: v=e_dict['sdtype'] new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i]) cdetails=pd.concat([cdetails, new_row],ignore_index=True) i=i+1 if 'primary_key' in t_dict: st.write('Primary Key:',t_dict['primary_key']) else: st.write('Primary Key: No key can be detected') st.write(cdetails)