File size: 4,523 Bytes
41c4cf9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
import plotly.express as px
from PIL import Image
from io import BytesIO
from IPython.display import Image as IPImage, display
from PyPDF2 import PdfReader
from fpdf import FPDF
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import Markdown
import kaleido
st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT DATABASE')
select1=st.selectbox('DATABASE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
if select1=='DB_10001':
with st.spinner('Performing Data Discovery'):
time.sleep(2)
st.success('Data cataloguing complete!')
datasets = load_csvs(
folder_name='BIKE_STORE_DATABASE/',
read_csv_parameters={
'skipinitialspace': True,
'encoding': 'utf_8'
})
metadata = MultiTableMetadata()
metadata.detect_from_csvs(folder_name='BIKE_STORE_DATABASE/')
python_dict = metadata.to_dict()
st.markdown('---')
st.subheader('DATA CATALOG')
# st.json(python_dict)
brands=datasets['brands']
categories=datasets['categories']
customers=datasets['customers']
orderitems=datasets['order_items']
orders=datasets['orders']
products=datasets['products']
staffs=datasets['staffs']
stocks=datasets['stocks']
stores=datasets['stores']
tables=python_dict['tables']
table_names=[*tables]
col1, col2, col3 = st.columns([2,2,2])
with col1:
def view_callback():
st.session_state.tdet = False
view= st.button("LIST TABLES",key='view',on_click=view_callback)
with col2:
if 'tdet' not in st.session_state:
st.session_state.tdet = False
tdet1 = st.button("SHOW TABLE DETAILS")
with col3:
rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)
if tdet1:
st.session_state.tdet = tdet1
if view:
#st.write(python_dict)
st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))
if rel:
rlist1=python_dict['relationships']
rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
for i in range(len(rlist1)):
rlist=rlist1[i]
nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
rdf=pd.concat([rdf,nrow],ignore_index=True)
st.write(rdf)
if st.session_state.tdet is True:
def tdet_callback():
st.session_state.tdet=True
st.subheader('Select table name to view')
sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
col4, col5 = st.columns([1, 3])
with col4:
preview= st.button("PREVIEW TABLE",key='preview')
with col5:
cdet = st.button("GET COLUMN DETAILS",key='prof')
if preview:
st.write(datasets[sbox1])
if cdet:
cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
t_dict=tables[sbox1]
c_dict=t_dict['columns']
i=0
for key in c_dict:
e_dict=c_dict[key]
if 'pii' in e_dict:
p='YES'
else:
p='NO'
if e_dict['sdtype']=='datetime':
v=e_dict['sdtype']+': '+e_dict['datetime_format']
else:
v=e_dict['sdtype']
new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
cdetails=pd.concat([cdetails, new_row],ignore_index=True)
i=i+1
if 'primary_key' in t_dict:
st.write('Primary Key:',t_dict['primary_key'])
else:
st.write('Primary Key: No key can be detected')
st.write(cdetails)
|