import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
from sdv.metadata import SingleTableMetadata
import pyodbc
import google.generativeai as genai
import textwrap
from streamlit_extras.stylable_container import stylable_container
# Read the Gemini API key from the environment (GOOGLE_API_KEY is an assumed variable name) instead of hard-coding it
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY'))
genai_mod = genai.GenerativeModel(
model_name='models/gemini-1.5-pro-latest'
)
st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
select1=st.selectbox('SOURCE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
if select1 =='DB_10001':
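# Load every CSV in the Bike Store folder into a dict of DataFrames keyed by table name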
datasets = load_csvs(
folder_name=r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
read_csv_parameters={
'skipinitialspace': True,
'encoding': 'utf_8'
})
st.markdown(f"System has found :orange[**{str(len(datasets))} tables**] in the source. Please proceed with selection of mode of discovery.")
select_main = st.selectbox('Please Select Mode of Discovery',('Single Table Discovery','Multi Table Discovery'),key='mainname',index=None,placeholder='Select Mode of Discovery')
if select_main == 'Multi Table Discovery':
with st.spinner('Performing Data Discovery'):
time.sleep(2)
st.success('Data cataloguing complete!')
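# Re-read the CSVs and auto-detect multi-table metadata (column types, primary keys, relationships)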
datasets = load_csvs(
folder_name=r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
read_csv_parameters={
'skipinitialspace': True,
'encoding': 'utf_8'
})
metadata = MultiTableMetadata()
metadata.detect_from_csvs(folder_name=r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE')
python_dict = metadata.to_dict()
st.markdown('---')
st.subheader('DATA CATALOG')
# st.json(python_dict)
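# Unpack the individual source tables for quick access by name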
brands=datasets['brands']
categories=datasets['categories']
customers=datasets['CUSTOMER_MASTER_TBL_1']
orderitems=datasets['order_items']
orders=datasets['orders']
products=datasets['products']
staffs=datasets['staffs']
stocks=datasets['stocks']
stores=datasets['stores']
tables=python_dict['tables']
table_names=[*tables]
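# Three actions: list the detected tables, drill into one table's columns, or view table relationships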
col1, col2, col3 = st.columns([2,2,2])
with col1:
def view_callback():
st.session_state.tdet = False
view= st.button("LIST TABLES",key='view',on_click=view_callback)
with col2:
if 'tdet' not in st.session_state:
st.session_state.tdet = False
tdet1 = st.button("SHOW TABLE DETAILS")
with col3:
rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)
if tdet1:
st.session_state.tdet = tdet1
if view:
#st.write(python_dict)
st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))
if rel:
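# Flatten every detected relationship into one row of parent/child key mappings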
rlist1=python_dict['relationships']
rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
for i in range(len(rlist1)):
rlist=rlist1[i]
nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
rdf=pd.concat([rdf,nrow],ignore_index=True)
st.write(rdf)
if st.session_state.tdet is True:
def tdet_callback():
st.session_state.tdet=True
st.subheader('Select table name to view')
sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
col4, col5 = st.columns([1, 3])
with col4:
preview= st.button("PREVIEW TABLE",key='preview')
with col5:
cdet = st.button("GET COLUMN DETAILS",key='prof')
if preview and sbox1 is not None:
st.write(datasets[sbox1])
if cdet and sbox1 is not None:
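# Build a per-column summary (data type, datetime format, PII flag) from the detected metadata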
cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
t_dict=tables[sbox1]
c_dict=t_dict['columns']
i=0
for key in c_dict:
e_dict=c_dict[key]
if 'pii' in e_dict:
p='YES'
else:
p='NO'
if e_dict['sdtype']=='datetime':
v=e_dict['sdtype']+': '+e_dict['datetime_format']
else:
v=e_dict['sdtype']
new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
cdetails=pd.concat([cdetails, new_row],ignore_index=True)
i=i+1
if 'primary_key' in t_dict:
st.write('Primary Key:',t_dict['primary_key'])
else:
st.write('Primary Key: No key can be detected')
st.write(cdetails)
if select_main == 'Single Table Discovery':
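# Single-table mode profiles tables served from the Azure SQL 'Client' schema rather than the local CSV extracts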
metadata = SingleTableMetadata()
# Read the SQL credentials from the environment (MSSQL_UID / MSSQL_PWD are assumed variable names) instead of hard-coding them
conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
"Server=ipzilnpxsssp001.database.windows.net;"
"Database=Marcopolo_DB;"
f"UID={os.environ.get('MSSQL_UID')};"
f"PWD={os.environ.get('MSSQL_PWD')}")
query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC"
table1=pd.read_sql_query(query1,con=conn)
table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str')
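# Let the user pick one Client-schema table for automated profiling and narration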
table_selector=st.selectbox('SOURCE TABLE NAME',['brands','categories','CUSTOMER_MASTER_TBL_1','orders','order_items','products','staffs','stocks','stores'],index=None,placeholder='Select table for automated column mapping')
if table_selector is not None:
st.markdown('---')
query2="select * from [Client].["+table_selector+"]"
df = pd.read_sql_query(query2,con=conn)
main_list=df.columns.to_list()
sub_list=['ID','LOADID','FILE_NAME']
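# If the technical columns ID, LOADID and FILE_NAME appear as a contiguous block, drop them before profiling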
if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
conn.close()
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)
python_dict = metadata.to_dict()
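# Generate the narration once per table and cache it in session_state so Streamlit reruns do not re-call the model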
if f'cont_{table_selector}' not in st.session_state:
with st.spinner("Processing Table"):
# Create a GenerativeModel instance
genai_mod = genai.GenerativeModel(
model_name='models/gemini-1.5-pro-latest'
)
if 'primary_key' in python_dict:
primary_key = python_dict['primary_key']
else:
primary_key = "Could Not be Identified"
story = f""" Details of the table:
table columns: {str(list(df.columns))}
column datatypes: {str(df.dtypes.to_string())}
table sample data: {df.head(10).to_string()}
"""
response = genai_mod.generate_content(textwrap.dedent("""
You are an SAP Data Migration expert. Please return a narration about the data. The narration should include the primary key name (if any) and an educated guess about the table schema. The data is SAP data, so you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the information provided.
Important: Please treat all fields as mandatory during your analysis.
Here are the table details:
""") + story + f"The Primary Key is: {primary_key}")
st.write(response.usage_metadata)
st.session_state[f'cont_{table_selector}'] = response.text
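# Show the cached narration inside a bordered, slightly widened container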
with stylable_container(
key=f"container_with_border",
css_styles="""
{
border: 1px solid white;
border-radius: 0.5rem;
padding: calc(1em - 1px);
width: 110%; /* Make the container slightly wider than its parent column */
}
"""
):
st.write(st.session_state[f'cont_{table_selector}'])
col9, col10, col11 = st.columns([2, 3, 9])
with col9:
preview= st.button("PREVIEW TABLE",key='preview')
# with col10:
# cdet = st.button("GET COLUMN DETAILS",key='prof')
if preview:
st.dataframe(df)
# if cdet:
# cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
# c_dict=python_dict['columns']
# i=0
# for key in c_dict:
# e_dict=c_dict[key]
# if 'pii' in e_dict:
# p='YES'
# else:
# p='NO'
# if e_dict['sdtype']=='datetime':
# v=e_dict['sdtype']+': '+e_dict['datetime_format']
# else:
# v=e_dict['sdtype']
# new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
# cdetails=pd.concat([cdetails, new_row],ignore_index=True)
# i=i+1
# if 'primary_key' in python_dict:
# st.write('Primary Key:',python_dict['primary_key'])
# else:
# st.write('Primary Key: No key can be detected')
# st.write(cdetails)