File size: 4,523 Bytes
41c4cf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time

import os
import gc
import warnings
import plotly.express as px
from PIL import Image
from io import BytesIO
from IPython.display import Image as IPImage, display
from PyPDF2 import PdfReader
from fpdf import FPDF
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import Markdown
import kaleido

st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT DATABASE')
select1=st.selectbox('DATABASE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
if select1=='DB_10001':
    with st.spinner('Performing Data Discovery'):
        time.sleep(2)
    st.success('Data cataloguing complete!')
    datasets = load_csvs(
    folder_name='BIKE_STORE_DATABASE/',
    read_csv_parameters={
        'skipinitialspace': True,
        'encoding': 'utf_8'
    })
    metadata = MultiTableMetadata()
    metadata.detect_from_csvs(folder_name='BIKE_STORE_DATABASE/')
    python_dict = metadata.to_dict()
    st.markdown('---')
    st.subheader('DATA CATALOG')
    # st.json(python_dict)
    brands=datasets['brands']
    categories=datasets['categories']
    customers=datasets['customers']
    orderitems=datasets['order_items']
    orders=datasets['orders']
    products=datasets['products']
    staffs=datasets['staffs']
    stocks=datasets['stocks']
    stores=datasets['stores']
    tables=python_dict['tables']
    table_names=[*tables]
    col1, col2, col3 = st.columns([2,2,2])

    with col1:
        def view_callback():
            st.session_state.tdet = False
        view= st.button("LIST TABLES",key='view',on_click=view_callback)
    with col2:
        if 'tdet' not in st.session_state:
            st.session_state.tdet = False
        tdet1 = st.button("SHOW TABLE DETAILS")
    with col3:
        rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)

    if tdet1:
            st.session_state.tdet = tdet1
    if view:
        #st.write(python_dict)
        st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))
        
    if rel:
        rlist1=python_dict['relationships']
        rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
        for i in range(len(rlist1)):
            rlist=rlist1[i]
            nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
            rdf=pd.concat([rdf,nrow],ignore_index=True)
        st.write(rdf)
    if st.session_state.tdet is True:
        def tdet_callback():
            st.session_state.tdet=True
        st.subheader('Select table name to view')
        sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
        col4, col5 = st.columns([1, 3])
        with col4:
            preview= st.button("PREVIEW TABLE",key='preview')
        with col5:
            cdet = st.button("GET COLUMN DETAILS",key='prof')
        if preview:
            st.write(datasets[sbox1])
        if cdet:
            cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
            t_dict=tables[sbox1]
            c_dict=t_dict['columns']
            i=0
            for key in c_dict:
                e_dict=c_dict[key]
                if 'pii' in e_dict:
                    p='YES'
                else:
                    p='NO'
                if e_dict['sdtype']=='datetime':
                    v=e_dict['sdtype']+': '+e_dict['datetime_format']
                else:
                    v=e_dict['sdtype']
                new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
                cdetails=pd.concat([cdetails, new_row],ignore_index=True)
                i=i+1
            if 'primary_key' in t_dict:
                st.write('Primary Key:',t_dict['primary_key'])
            else:
                st.write('Primary Key: No key can be detected')
            st.write(cdetails)