import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata, SingleTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
import pyodbc
import google.generativeai as genai
import textwrap
from streamlit_extras.stylable_container import stylable_container

genai.configure(api_key='AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0')
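# NOTE (suggestion, not part of the original flow): avoid committing API keys to source.
# A safer sketch, assuming a GOOGLE_API_KEY environment variable is set, would be:
# genai.configure(api_key=os.environ['GOOGLE_API_KEY'])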

genai_mod = genai.GenerativeModel(
    model_name='models/gemini-1.5-pro-latest'
)

st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')

select1 = st.selectbox('SOURCE NAME', ('DB_10001', 'Marcopolo_db'), key='dbname', index=None, placeholder='Select database name')

if select1 == 'DB_10001':
    # Load every CSV in the Bike Store folder as a dict of {table_name: DataFrame}
    datasets = load_csvs(
        folder_name=r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
        read_csv_parameters={
            'skipinitialspace': True,
            'encoding': 'utf_8'
        })
    st.markdown(f"System has found :orange[**{len(datasets)} tables**] in the source. Please proceed with selection of mode of discovery.")
    select_main = st.selectbox('Please Select Mode of Discovery', ('Single Table Discovery', 'Multi Table Discovery'), key='mainname', index=None, placeholder='Select Mode of Discovery')

    if select_main == 'Multi Table Discovery':
        with st.spinner('Performing Data Discovery'):
            time.sleep(2)
        st.success('Data cataloguing complete!')
        datasets = load_csvs(
            folder_name=r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
            read_csv_parameters={
                'skipinitialspace': True,
                'encoding': 'utf_8'
            })
        # Auto-detect multi-table metadata (column types, keys, relationships) from the CSV folder
        metadata = MultiTableMetadata()
        metadata.detect_from_csvs(folder_name=r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE')
        python_dict = metadata.to_dict()
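        # metadata.to_dict() yields a plain dict; the catalogue below relies on its
        # 'tables' (per-column metadata) and 'relationships' (parent/child key links) entries.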
        st.markdown('---')
        st.subheader('DATA CATALOG')

        brands = datasets['brands']
        categories = datasets['categories']
        customers = datasets['CUSTOMER_MASTER_TBL_1']
        orderitems = datasets['order_items']
        orders = datasets['orders']
        products = datasets['products']
        staffs = datasets['staffs']
        stocks = datasets['stocks']
        stores = datasets['stores']

        tables = python_dict['tables']
        table_names = [*tables]

        col1, col2, col3 = st.columns([2, 2, 2])

        with col1:
            def view_callback():
                # Collapse the table-details view when another action is chosen
                st.session_state.tdet = False
            view = st.button("LIST TABLES", key='view', on_click=view_callback)
        with col2:
            if 'tdet' not in st.session_state:
                st.session_state.tdet = False
            tdet1 = st.button("SHOW TABLE DETAILS")
        with col3:
            rel = st.button('SHOW RELATIONSHIPS', key='rel', on_click=view_callback)

        if tdet1:
            st.session_state.tdet = tdet1
        if view:
            st.write(pd.DataFrame(table_names, columns=['TABLE NAME']))

        if rel:
            rlist1 = python_dict['relationships']
            rdf = pd.DataFrame(columns=['PARENT TABLE', 'CHILD TABLE', 'PARENT PRIMARY KEY', 'CHILD FOREIGN KEY'])
            for i in range(len(rlist1)):
                rlist = rlist1[i]
                nrow = pd.DataFrame({'PARENT TABLE': rlist['parent_table_name'],
                                     'CHILD TABLE': rlist['child_table_name'],
                                     'PARENT PRIMARY KEY': rlist['parent_primary_key'],
                                     'CHILD FOREIGN KEY': rlist['child_foreign_key']}, index=[i])
                rdf = pd.concat([rdf, nrow], ignore_index=True)
            st.write(rdf)
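            # Sketch of a lighter alternative (not used above): concatenating inside the loop
            # copies the frame on every pass, so the same table could be built in one call:
            # rdf = pd.DataFrame([{'PARENT TABLE': r['parent_table_name'],
            #                      'CHILD TABLE': r['child_table_name'],
            #                      'PARENT PRIMARY KEY': r['parent_primary_key'],
            #                      'CHILD FOREIGN KEY': r['child_foreign_key']} for r in rlist1])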

        if st.session_state.tdet is True:
            def tdet_callback():
                st.session_state.tdet = True
            st.subheader('Select table name to view')
            sbox1 = st.selectbox('TABLE NAME', table_names, index=None, placeholder='Select table name', on_change=tdet_callback)
            col4, col5 = st.columns([1, 3])
            with col4:
                preview = st.button("PREVIEW TABLE", key='preview')
            with col5:
                cdet = st.button("GET COLUMN DETAILS", key='prof')
            if preview:
                st.write(datasets[sbox1])
            if cdet:
                cdetails = pd.DataFrame(columns=['Column Name', 'Data Type', 'Personally Identifiable Information'])
                t_dict = tables[sbox1]
                c_dict = t_dict['columns']
                i = 0
                for key in c_dict:
                    e_dict = c_dict[key]
                    # Flag PII columns and show the detected format for datetime columns
                    if 'pii' in e_dict:
                        p = 'YES'
                    else:
                        p = 'NO'
                    if e_dict['sdtype'] == 'datetime':
                        v = e_dict['sdtype'] + ': ' + e_dict['datetime_format']
                    else:
                        v = e_dict['sdtype']
                    new_row = pd.DataFrame({'Column Name': key, 'Data Type': v, 'Personally Identifiable Information': p}, index=[i])
                    cdetails = pd.concat([cdetails, new_row], ignore_index=True)
                    i = i + 1
                if 'primary_key' in t_dict:
                    st.write('Primary Key:', t_dict['primary_key'])
                else:
                    st.write('Primary Key: No key can be detected')
                st.write(cdetails)

    if select_main == 'Single Table Discovery':
        metadata = SingleTableMetadata()
        conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                              "Server=ipzilnpxsssp001.database.windows.net;"
                              "Database=Marcopolo_DB;"
                              "UID=ssikder004;"
                              "PWD=Marcopolo@123")
        query1 = "select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC"
        table1 = pd.read_sql_query(query1, con=conn)
        table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
        table_selector = st.selectbox('SOURCE TABLE NAME', ['brands', 'categories', 'CUSTOMER_MASTER_TBL_1', 'orders', 'order_items', 'products', 'staffs', 'stocks', 'stores'], index=None, placeholder='Select table for automated column mapping')
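        # Note: the pd.read_sql_query calls here use a raw pyodbc connection; recent pandas releases
        # warn that only SQLAlchemy connectables are officially supported, so wrapping the DSN in a
        # SQLAlchemy engine is an option if those warnings need to be silenced.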

        if table_selector is not None:
            st.markdown('---')
            query2 = "select * from [Client].[" + table_selector + "]"
            df = pd.read_sql_query(query2, con=conn)
            # Drop the ID/LOADID/FILE_NAME housekeeping columns if they appear as a contiguous block
            main_list = df.columns.to_list()
            sub_list = ['ID', 'LOADID', 'FILE_NAME']
            if any(main_list[i:i + len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
                df = df.drop(['ID', 'LOADID', 'FILE_NAME'], axis=1)
            conn.close()
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(df)
            python_dict = metadata.to_dict()
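            # Detection is a best-effort guess; calling metadata.validate() here (a sketch, not part
            # of the original flow) would surface inconsistencies before the schema is narrated.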
            # Generate the LLM narration only once per table and cache it in session state
            if f'cont_{table_selector}' not in st.session_state:
                with st.spinner("Processing Table"):
                    genai_mod = genai.GenerativeModel(
                        model_name='models/gemini-1.5-pro-latest'
                    )
                    if 'primary_key' in python_dict:
                        primary_key = python_dict['primary_key']
                    else:
                        primary_key = "Could Not be Identified"

                    # Compact profile of the table passed to the model as context
                    story = f"""Details of the table:
                    table columns: {str(list(df.columns))}
                    column datatypes: {df.dtypes.to_string()}
                    table sample data: {df.head(10).to_string()}
                    """
                    response = genai_mod.generate_content(textwrap.dedent("""
                        You are an SAP Data Migration expert. Please return a narration about the data. The narration should include the primary key name (if any) and an educated guess about the table schema. The data is SAP data, so you have to guess the object name/class name/schema name, etc. of that data. Don't add unnecessary details. Strictly stick to the information provided.
                        Important: Please consider all fields to be mandatory during your analysis.

                        Here are the table details:

                    """) + story + f"The Primary Key is: {primary_key}")
                    st.write(response.usage_metadata)
                    st.session_state[f'cont_{table_selector}'] = response.text

            # Show the cached narration inside a bordered container
            with stylable_container(
                key="container_with_border",
                css_styles="""
                    {
                        border: 1px solid white;
                        border-radius: 0.5rem;
                        padding: calc(1em - 1px);
                        width: 110%;
                    }
                    """
            ):
                st.write(st.session_state[f'cont_{table_selector}'])

            col9, col10, col11 = st.columns([2, 3, 9])
            with col9:
                preview = st.button("PREVIEW TABLE", key='preview')

            if preview:
                st.dataframe(df)