"""Streamlit app: automated data catalogue / discovery.

Two sources are supported:
  * DB_10001  — a local folder of bike-store CSVs, profiled with SDV
    metadata detection (multi-table relationships or per-table details).
  * Single Table Discovery — reads a table from an Azure SQL database and
    asks Gemini for a narrated description of the table.

NOTE(review): this file was recovered from a whitespace-mangled source.
Nesting of both discovery branches under the DB_10001 selection is the
most plausible reconstruction (`select_main` is only defined there) —
confirm against the original layout.
"""
import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.metadata import SingleTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
import pyodbc
import google.generativeai as genai
import textwrap
from streamlit_extras.stylable_container import stylable_container

# SECURITY: a hard-coded API key was committed here. It is kept only as a
# backward-compatible fallback — set GOOGLE_API_KEY in the environment (or
# st.secrets) and delete the literal before sharing this file.
genai.configure(api_key=os.environ.get(
    'GOOGLE_API_KEY', 'AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0'))
# One shared model instance; the single-table branch reuses it instead of
# constructing a second identical model.
genai_mod = genai.GenerativeModel(model_name='models/gemini-1.5-pro-latest')

# Folder holding the bike-store CSVs; raw string keeps backslashes literal.
BIKE_STORE_DIR = r'C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE'


def _column_details(columns_meta):
    """Return a catalogue DataFrame for an SDV ``columns`` metadata dict.

    columns_meta maps column name -> SDV column metadata (``sdtype``,
    optional ``pii`` flag, optional ``datetime_format``).
    """
    rows = []
    for col_name, meta in columns_meta.items():
        dtype = meta['sdtype']
        if dtype == 'datetime':
            # Datetime columns carry their detected format alongside the type.
            dtype = dtype + ': ' + meta['datetime_format']
        rows.append({
            'Column Name': col_name,
            'Data Type': dtype,
            'Personal Identifiable Information': 'YES' if 'pii' in meta else 'NO',
        })
    return pd.DataFrame(
        rows,
        columns=['Column Name', 'Data Type', 'Personal Identifiable Information'])


st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
select1 = st.selectbox('SOURCE NAME', ('DB_10001', 'Marcopolo_db'),
                       key='dbname', index=None,
                       placeholder='Select database name')

if select1 == 'DB_10001':
    # Load the CSVs once; the multi-table branch reuses this dict instead of
    # re-reading the folder a second time (the original loaded it twice).
    datasets = load_csvs(
        folder_name=BIKE_STORE_DIR,
        read_csv_parameters={'skipinitialspace': True, 'encoding': 'utf_8'})
    st.markdown(f"System has found :orange[**{str(len(datasets))} tables**] "
                "in the source. Please proceed with selection of mode of discovery.")
    select_main = st.selectbox(
        'Please Select Mode of Discovery',
        ('Single Table Discovery', 'Multi Table Discovery'),
        key='mainname', index=None, placeholder='Select Mode of Discovery')

    if select_main == 'Multi Table Discovery':
        with st.spinner('Performing Data Discovery'):
            time.sleep(2)
        st.success('Data cataloguing complete!')
        metadata = MultiTableMetadata()
        metadata.detect_from_csvs(folder_name=BIKE_STORE_DIR)
        python_dict = metadata.to_dict()
        st.markdown('---')
        st.subheader('DATA CATALOG')
        tables = python_dict['tables']
        table_names = [*tables]

        def view_callback():
            # Collapse the table-details pane when another action is chosen.
            st.session_state.tdet = False

        col1, col2, col3 = st.columns([2, 2, 2])
        with col1:
            view = st.button("LIST TABLES", key='view', on_click=view_callback)
        with col2:
            if 'tdet' not in st.session_state:
                st.session_state.tdet = False
            tdet1 = st.button("SHOW TABLE DETAILS")
        with col3:
            rel = st.button('SHOW RELATIONSHIPS', key='rel', on_click=view_callback)

        if tdet1:
            # Persist the details pane across Streamlit reruns.
            st.session_state.tdet = True
        if view:
            st.write(pd.DataFrame(table_names, columns=['TABLE NAME']))
        if rel:
            # One row per detected parent/child key relationship.
            rdf = pd.DataFrame(
                [{'PARENT TABLE': r['parent_table_name'],
                  'CHILD TABLE': r['child_table_name'],
                  'PARENT PRIMARY KEY': r['parent_primary_key'],
                  'CHILD FOREIGN KEY': r['child_foreign_key']}
                 for r in python_dict['relationships']],
                columns=['PARENT TABLE', 'CHILD TABLE',
                         'PARENT PRIMARY KEY', 'CHILD FOREIGN KEY'])
            st.write(rdf)

        if st.session_state.tdet:
            def tdet_callback():
                # Keep the details pane open when the selectbox changes.
                st.session_state.tdet = True

            st.subheader('Select table name to view')
            sbox1 = st.selectbox('TABLE NAME', table_names, index=None,
                                 placeholder='Select table name',
                                 on_change=tdet_callback)
            col4, col5 = st.columns([1, 3])
            with col4:
                preview = st.button("PREVIEW TABLE", key='preview')
            with col5:
                cdet = st.button("GET COLUMN DETAILS", key='prof')
            # Guard against clicks before a table is selected — the original
            # raised KeyError on datasets[None].
            if preview and sbox1 is not None:
                st.write(datasets[sbox1])
            if cdet and sbox1 is not None:
                t_dict = tables[sbox1]
                if 'primary_key' in t_dict:
                    st.write('Primary Key:', t_dict['primary_key'])
                else:
                    st.write('Primary Key: No key can be detected')
                st.write(_column_details(t_dict['columns']))

    if select_main == 'Single Table Discovery':
        # SECURITY: connection credentials belong in st.secrets or the
        # environment, not in source control.
        conn = pyodbc.connect(
            "Driver={ODBC Driver 17 for SQL Server};"
            "Server=ipzilnpxsssp001.database.windows.net;"
            "Database=Marcopolo_DB;"
            "UID=ssikder004;"
            "PWD=Marcopolo@123")
        query1 = ("select * from INFORMATION_SCHEMA.TABLES "
                  "where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC")
        table1 = pd.read_sql_query(query1, con=conn)
        table1['TABLE_NAME'] = table1['TABLE_NAME'].astype('str')
        table_selector = st.selectbox(
            'SOURCE TABLE NAME',
            ['brands', 'categories', 'CUSTOMER_MASTER_TBL_1', 'orders',
             'order_items', 'products', 'staffs', 'stocks', 'stores'],
            index=None,
            placeholder='Select table for automated column mapping')
        if table_selector is not None:
            st.markdown('---')
            # Safe to interpolate: table_selector comes from the fixed list
            # above, never free text.
            query2 = "select * from [Client].[" + table_selector + "]"
            df = pd.read_sql_query(query2, con=conn)
            main_list = df.columns.to_list()
            sub_list = ['ID', 'LOADID', 'FILE_NAME']
            # Drop ETL bookkeeping columns, but only when they appear as a
            # contiguous run in this exact order (how the loader emits them).
            if any(main_list[i:i + len(sub_list)] == sub_list
                   for i in range(len(main_list) - len(sub_list) + 1)):
                df = df.drop(sub_list, axis=1)
            conn.close()

            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(df)
            python_dict = metadata.to_dict()

            # Generate the narration once per table, cached in session state.
            if f'cont_{table_selector}' not in st.session_state:
                with st.spinner("Processing Table"):
                    primary_key = python_dict.get('primary_key',
                                                  "Could Not be Identified")
                    story = f"""
                    Details of the table:
                    table columns: {str(list(df.columns))}
                    column datatypes: {str(df.dtypes.to_string())}
                    table sample data: {df.head(10).to_string()}
                    """
                    response = genai_mod.generate_content(textwrap.dedent("""
                        You are a SAP Data Migration expert. Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data is a SAP data, you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only. Important: Please consider All fields are mandetorily during your analysis. Here is the table details:
                        """) + story + f"The Primary Key is:{primary_key}")
                    st.write(response.usage_metadata)
                    st.session_state[f'cont_{table_selector}'] = response.text

            with stylable_container(
                    key="container_with_border",
                    css_styles="""
                    {
                        border: 1px solid white;
                        border-radius: 0.5rem;
                        padding: calc(1em - 1px);
                        width: 110%; /* Set container width to 100% */
                    }
                    """):
                st.write(st.session_state[f'cont_{table_selector}'])

            col9, col10, col11 = st.columns([2, 3, 9])
            with col9:
                preview = st.button("PREVIEW TABLE", key='preview')
            if preview:
                st.dataframe(df)