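"""Streamlit app for automated data discovery and cataloguing.

Sources: a local Bike Store CSV folder (multi-table discovery via SDV
metadata detection) and a Marcopolo_DB SQL Server schema (single-table
discovery, with a Gemini-generated narration of the table).
"""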
import os
import textwrap
import time

import pandas as pd
import pyodbc
import streamlit as st
import google.generativeai as genai
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata, SingleTableMetadata
from streamlit_extras.stylable_container import stylable_container
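# Third-party packages used above (PyPI names, as an assumption):
# streamlit, pandas, sdv, pyodbc, google-generativeai, streamlit-extras.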
# The Gemini API key is read from the environment rather than hardcoded, so
# it is not committed to source control (GOOGLE_API_KEY is an assumed name).
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
genai_mod = genai.GenerativeModel(
    model_name='models/gemini-1.5-pro-latest'
)
st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
select1 = st.selectbox('SOURCE NAME', ('DB_10001', 'Marcopolo_db'), key='dbname',
                       index=None, placeholder='Select database name')
# Folder holding the Bike Store sample CSVs. A raw string keeps the Windows
# backslashes from being read as escape sequences.
DATA_FOLDER = r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE'

if select1 == 'DB_10001':
    datasets = load_csvs(
        folder_name=DATA_FOLDER,
        read_csv_parameters={
            'skipinitialspace': True,
            'encoding': 'utf_8',
        },
    )
    st.markdown(f"System has found :orange[**{len(datasets)} tables**] in the source. "
                "Please proceed by selecting a mode of discovery.")
    select_main = st.selectbox('Please Select Mode of Discovery',
                               ('Single Table Discovery', 'Multi Table Discovery'),
                               key='mainname', index=None,
                               placeholder='Select Mode of Discovery')
    if select_main == 'Multi Table Discovery':
        with st.spinner('Performing Data Discovery'):
            time.sleep(2)
        st.success('Data cataloguing complete!')
        # `datasets` is already loaded above; detect the cross-table metadata
        # (column types, keys, relationships) from the same folder.
        metadata = MultiTableMetadata()
        metadata.detect_from_csvs(folder_name=DATA_FOLDER)
        python_dict = metadata.to_dict()
        st.markdown('---')
        st.subheader('DATA CATALOG')
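        # For reference, `metadata.to_dict()` returns roughly this shape (keys
        # as accessed below; exact contents depend on the SDV version):
        # {
        #     'tables': {
        #         '<table name>': {
        #             'columns': {'<column>': {'sdtype': ..., ...}, ...},
        #             'primary_key': '<column>',
        #         },
        #         ...
        #     },
        #     'relationships': [
        #         {'parent_table_name': ..., 'child_table_name': ...,
        #          'parent_primary_key': ..., 'child_foreign_key': ...},
        #         ...
        #     ],
        # }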
        # Convenience handles to the individual source tables.
        brands = datasets['brands']
        categories = datasets['categories']
        customers = datasets['CUSTOMER_MASTER_TBL_1']
        orderitems = datasets['order_items']
        orders = datasets['orders']
        products = datasets['products']
        staffs = datasets['staffs']
        stocks = datasets['stocks']
        stores = datasets['stores']
        tables = python_dict['tables']
        table_names = list(tables)
        col1, col2, col3 = st.columns([2, 2, 2])

        def view_callback():
            # Collapse the table-details panel when another view is requested.
            st.session_state.tdet = False

        with col1:
            view = st.button("LIST TABLES", key='view', on_click=view_callback)
        with col2:
            if 'tdet' not in st.session_state:
                st.session_state.tdet = False
            tdet1 = st.button("SHOW TABLE DETAILS")
        with col3:
            rel = st.button('SHOW RELATIONSHIPS', key='rel', on_click=view_callback)
        if tdet1:
            st.session_state.tdet = True
        if view:
            st.write(pd.DataFrame(table_names, columns=['TABLE NAME']))
        if rel:
            # Flatten the detected relationships into one row per link.
            rdf = pd.DataFrame(
                [
                    {
                        'PARENT TABLE': r['parent_table_name'],
                        'CHILD TABLE': r['child_table_name'],
                        'PARENT PRIMARY KEY': r['parent_primary_key'],
                        'CHILD FOREIGN KEY': r['child_foreign_key'],
                    }
                    for r in python_dict['relationships']
                ]
            )
            st.write(rdf)
        if st.session_state.tdet is True:
            def tdet_callback():
                # Keep the details panel open across the rerun that the
                # selectbox triggers.
                st.session_state.tdet = True

            st.subheader('Select table name to view')
            sbox1 = st.selectbox('TABLE NAME', table_names, index=None,
                                 placeholder='Select table name',
                                 on_change=tdet_callback)
            col4, col5 = st.columns([1, 3])
            with col4:
                preview = st.button("PREVIEW TABLE", key='preview')
            with col5:
                cdet = st.button("GET COLUMN DETAILS", key='prof')
            # Guard against clicks made before a table has been chosen.
            if preview and sbox1 is not None:
                st.write(datasets[sbox1])
            if cdet and sbox1 is not None:
                t_dict = tables[sbox1]
                rows = []
                for name, col in t_dict['columns'].items():
                    pii = 'YES' if 'pii' in col else 'NO'
                    dtype = col['sdtype']
                    if dtype == 'datetime':
                        # Datetime columns also carry the detected format string.
                        dtype += ': ' + col['datetime_format']
                    rows.append({
                        'Column Name': name,
                        'Data Type': dtype,
                        'Personally Identifiable Information': pii,
                    })
                cdetails = pd.DataFrame(rows)
                if 'primary_key' in t_dict:
                    st.write('Primary Key:', t_dict['primary_key'])
                else:
                    st.write('Primary Key: No key could be detected')
                st.write(cdetails)
    if select_main == 'Single Table Discovery':
        # Credentials are read from the environment rather than hardcoded, so
        # they are not committed to source control (MARCOPOLO_DB_UID and
        # MARCOPOLO_DB_PWD are assumed names).
        conn = pyodbc.connect(
            "Driver={ODBC Driver 17 for SQL Server};"
            "Server=ipzilnpxsssp001.database.windows.net;"
            "Database=Marcopolo_DB;"
            f"UID={os.environ['MARCOPOLO_DB_UID']};"
            f"PWD={os.environ['MARCOPOLO_DB_PWD']};"
        )
        query1 = ("select * from INFORMATION_SCHEMA.TABLES "
                  "where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC")
        table1 = pd.read_sql_query(query1, con=conn)
        table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
        table_selector = st.selectbox(
            'SOURCE TABLE NAME',
            ['brands', 'categories', 'CUSTOMER_MASTER_TBL_1', 'orders',
             'order_items', 'products', 'staffs', 'stocks', 'stores'],
            index=None,
            placeholder='Select table for automated column mapping',
        )
        if table_selector is not None:
            st.markdown('---')
            query2 = f"select * from [Client].[{table_selector}]"
            df = pd.read_sql_query(query2, con=conn)
            # Drop ETL bookkeeping columns when they appear as a contiguous block.
            main_list = df.columns.to_list()
            sub_list = ['ID', 'LOADID', 'FILE_NAME']
            if any(main_list[i:i + len(sub_list)] == sub_list
                   for i in range(len(main_list) - len(sub_list) + 1)):
                df = df.drop(sub_list, axis=1)
            conn.close()
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(df)
            python_dict = metadata.to_dict()
            # Generate the narration once per table and cache it in session
            # state so Streamlit reruns don't re-call the model.
            if f'cont_{table_selector}' not in st.session_state:
                with st.spinner("Processing Table"):
                    genai_mod = genai.GenerativeModel(
                        model_name='models/gemini-1.5-pro-latest'
                    )
                    primary_key = python_dict.get('primary_key',
                                                  'Could Not be Identified')
                    story = f"""Details of the table:
                    table columns: {list(df.columns)}
                    column datatypes: {df.dtypes.to_string()}
                    table sample data: {df.head(10).to_string()}
                    """
                    prompt = textwrap.dedent("""
                        You are an SAP Data Migration expert. Return a narration
                        about the data. The narration should include the primary
                        key name (if any) and an informed guess about the table
                        schema. Since this is SAP data, guess the object name,
                        class name, schema name, etc. Don't add unnecessary
                        details; strictly stick to the information provided.
                        Important: treat all fields as mandatory during your
                        analysis.
                        Here are the table details:
                    """) + story + f"The Primary Key is: {primary_key}"
                    response = genai_mod.generate_content(prompt)
                    st.write(response.usage_metadata)
                    st.session_state[f'cont_{table_selector}'] = response.text
            with stylable_container(
                key="container_with_border",
                css_styles="""
                    {
                        border: 1px solid white;
                        border-radius: 0.5rem;
                        padding: calc(1em - 1px);
                        width: 110%;
                    }
                """,
            ):
                st.write(st.session_state[f'cont_{table_selector}'])
            col9, col10, col11 = st.columns([2, 3, 9])
            with col9:
                preview = st.button("PREVIEW TABLE", key='preview')
            if preview:
                st.dataframe(df)
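# A minimal sketch of how to launch the app, assuming this file is saved as
# data_catalogue.py and the credentials are exported first:
#   export GOOGLE_API_KEY=...
#   export MARCOPOLO_DB_UID=...
#   export MARCOPOLO_DB_PWD=...
#   streamlit run data_catalogue.py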