|
import pandas as pd
|
|
import numpy as np
|
|
import streamlit as st
|
|
import sdv
|
|
from sdv.datasets.local import load_csvs
|
|
from sdv.metadata import MultiTableMetadata
|
|
from sdv.multi_table import HMASynthesizer
|
|
import time
|
|
|
|
import os
|
|
import gc
|
|
import warnings
|
|
import plotly.express as px
|
|
from PIL import Image
|
|
from io import BytesIO
|
|
from IPython.display import Image as IPImage, display
|
|
from PyPDF2 import PdfReader
|
|
from fpdf import FPDF
|
|
import statsmodels.api as sm
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
import plotly.express as px
|
|
from IPython.display import Markdown
|
|
import kaleido
|
|
|
|
st.set_page_config(page_title='DATA DISCOVERY')
|
|
st.title('AUTOMATED DATA CATALOGUE')
|
|
st.subheader('SELECT DATABASE')
|
|
select1=st.selectbox('DATABASE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
|
|
if select1=='DB_10001':
|
|
with st.spinner('Performing Data Discovery'):
|
|
time.sleep(2)
|
|
st.success('Data cataloguing complete!')
|
|
datasets = load_csvs(
|
|
folder_name='BIKE_STORE_DATABASE/',
|
|
read_csv_parameters={
|
|
'skipinitialspace': True,
|
|
'encoding': 'utf_8'
|
|
})
|
|
metadata = MultiTableMetadata()
|
|
metadata.detect_from_csvs(folder_name='BIKE_STORE_DATABASE/')
|
|
python_dict = metadata.to_dict()
|
|
st.markdown('---')
|
|
st.subheader('DATA CATALOG')
|
|
|
|
brands=datasets['brands']
|
|
categories=datasets['categories']
|
|
customers=datasets['customers']
|
|
orderitems=datasets['order_items']
|
|
orders=datasets['orders']
|
|
products=datasets['products']
|
|
staffs=datasets['staffs']
|
|
stocks=datasets['stocks']
|
|
stores=datasets['stores']
|
|
tables=python_dict['tables']
|
|
table_names=[*tables]
|
|
col1, col2, col3 = st.columns([2,2,2])
|
|
|
|
with col1:
|
|
def view_callback():
|
|
st.session_state.tdet = False
|
|
view= st.button("LIST TABLES",key='view',on_click=view_callback)
|
|
with col2:
|
|
if 'tdet' not in st.session_state:
|
|
st.session_state.tdet = False
|
|
tdet1 = st.button("SHOW TABLE DETAILS")
|
|
with col3:
|
|
rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)
|
|
|
|
if tdet1:
|
|
st.session_state.tdet = tdet1
|
|
if view:
|
|
|
|
st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))
|
|
|
|
if rel:
|
|
rlist1=python_dict['relationships']
|
|
rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
|
|
for i in range(len(rlist1)):
|
|
rlist=rlist1[i]
|
|
nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
|
|
rdf=pd.concat([rdf,nrow],ignore_index=True)
|
|
st.write(rdf)
|
|
if st.session_state.tdet is True:
|
|
def tdet_callback():
|
|
st.session_state.tdet=True
|
|
st.subheader('Select table name to view')
|
|
sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
|
|
col4, col5 = st.columns([1, 3])
|
|
with col4:
|
|
preview= st.button("PREVIEW TABLE",key='preview')
|
|
with col5:
|
|
cdet = st.button("GET COLUMN DETAILS",key='prof')
|
|
if preview:
|
|
st.write(datasets[sbox1])
|
|
if cdet:
|
|
cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
|
|
t_dict=tables[sbox1]
|
|
c_dict=t_dict['columns']
|
|
i=0
|
|
for key in c_dict:
|
|
e_dict=c_dict[key]
|
|
if 'pii' in e_dict:
|
|
p='YES'
|
|
else:
|
|
p='NO'
|
|
if e_dict['sdtype']=='datetime':
|
|
v=e_dict['sdtype']+': '+e_dict['datetime_format']
|
|
else:
|
|
v=e_dict['sdtype']
|
|
new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
|
|
cdetails=pd.concat([cdetails, new_row],ignore_index=True)
|
|
i=i+1
|
|
if 'primary_key' in t_dict:
|
|
st.write('Primary Key:',t_dict['primary_key'])
|
|
else:
|
|
st.write('Primary Key: No key can be detected')
|
|
st.write(cdetails)
|
|
|
|
|
|
|