Spaces:

pwc-india
/

data_discovery_source_target_mapping

Sleeping

File size: 11,511 Bytes

41c4cf9

import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
from sdv.metadata import SingleTableMetadata
import pyodbc
import google.generativeai as genai
import textwrap
from streamlit_extras.stylable_container import stylable_container
genai.configure(api_key='AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0')
genai_mod = genai.GenerativeModel(
    model_name='models/gemini-1.5-pro-latest'
)

st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
select1=st.selectbox('SOURCE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
if select1 =='DB_10001':
    datasets = load_csvs(
            folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
            read_csv_parameters={
                'skipinitialspace': True,
                'encoding': 'utf_8'
            })
    st.markdown(f"System has found :orange[**{str(len(datasets))} tables**] in the source. Please proceed with selection of mode of discovery.")
    select_main = st.selectbox('Please Select Mode of Discovery',('Single Table Discovery','Multi Table Discovery'),key='mainname',index=None,placeholder='Select Mode of Discovery')
    if select_main == 'Multi Table Discovery':
            with st.spinner('Performing Data Discovery'):
                time.sleep(2)
            st.success('Data cataloguing complete!')
            datasets = load_csvs(
            folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
            read_csv_parameters={
                'skipinitialspace': True,
                'encoding': 'utf_8'
            })
            metadata = MultiTableMetadata()
            metadata.detect_from_csvs(folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE')
            python_dict = metadata.to_dict()
            st.markdown('---')
            st.subheader('DATA CATALOG')
            # st.json(python_dict)
            brands=datasets['brands']
            categories=datasets['categories']
            customers=datasets['CUSTOMER_MASTER_TBL_1']
            orderitems=datasets['order_items']
            orders=datasets['orders']
            products=datasets['products']
            staffs=datasets['staffs']
            stocks=datasets['stocks']
            stores=datasets['stores']
            tables=python_dict['tables']
            table_names=[*tables]
            col1, col2, col3 = st.columns([2,2,2])
        
            with col1:
                def view_callback():
                    st.session_state.tdet = False
                view= st.button("LIST TABLES",key='view',on_click=view_callback)
            with col2:
                if 'tdet' not in st.session_state:
                    st.session_state.tdet = False
                tdet1 = st.button("SHOW TABLE DETAILS")
            with col3:
                rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)
        
            if tdet1:
                    st.session_state.tdet = tdet1
            if view:
                #st.write(python_dict)
                
                st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))
                
            if rel:
                rlist1=python_dict['relationships']
                rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
                for i in range(len(rlist1)):
                    rlist=rlist1[i]
                    nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
                    rdf=pd.concat([rdf,nrow],ignore_index=True)
                st.write(rdf)
            if st.session_state.tdet is True:
                def tdet_callback():
                    st.session_state.tdet=True
                st.subheader('Select table name to view')
                sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
                col4, col5 = st.columns([1, 3])
                with col4:
                    preview= st.button("PREVIEW TABLE",key='preview')
                with col5:
                    cdet = st.button("GET COLUMN DETAILS",key='prof')
                if preview:
                    st.write(datasets[sbox1])
                if cdet:
                    cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
                    t_dict=tables[sbox1]
                    c_dict=t_dict['columns']
                    i=0
                    for key in c_dict:
                        e_dict=c_dict[key]
                        if 'pii' in e_dict:
                            p='YES'
                        else:
                            p='NO'
                        if e_dict['sdtype']=='datetime':
                            v=e_dict['sdtype']+': '+e_dict['datetime_format']
                        else:
                            v=e_dict['sdtype']
                        new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
                        cdetails=pd.concat([cdetails, new_row],ignore_index=True)
                        i=i+1
                    if 'primary_key' in t_dict:
                        st.write('Primary Key:',t_dict['primary_key'])
                    else:
                        st.write('Primary Key: No key can be detected')
                    st.write(cdetails)
    
    if select_main == 'Single Table Discovery':
            metadata = SingleTableMetadata()
            conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                                                "Server=ipzilnpxsssp001.database.windows.net;"
                                                "Database=Marcopolo_DB;"
                                                "UID=ssikder004;"
                                                "PWD=Marcopolo@123" )
            query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC"
            table1=pd.read_sql_query(query1,con=conn)
            table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str')
            table_selector=st.selectbox('SOURCE TABLE NAME',['brands','categories','CUSTOMER_MASTER_TBL_1','orders','order_items','products','staffs','stocks','stores'],index=None,placeholder='Select table for automated column mapping')
            if table_selector is not None:
                st.markdown('---')
                query2="select * from [Client].["+table_selector+"]"
                df = pd.read_sql_query(query2,con=conn)
                main_list=df.columns.to_list()
                sub_list=['ID','LOADID','FILE_NAME']
                if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
                    df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
                conn.close()
                metadata = SingleTableMetadata()
                metadata.detect_from_dataframe(df)
                python_dict = metadata.to_dict()
                if f'cont_{table_selector}' not in st.session_state:
                    with st.spinner("Processing Table"):
                        # Create a GenerativeModel instance
                        genai_mod = genai.GenerativeModel(
                            model_name='models/gemini-1.5-pro-latest'
                        )
                        if 'primary_key' in python_dict:
                            primary_key = python_dict['primary_key']
                        else:
                            primary_key = "Could Not be Identified"
                        
                        
                        story =  f""" Details of the table: 

                        table columns: {str(list(df.columns))}

                        column datatypes: {str(df.dtypes.to_string())}

                        table sample data: {df.head(10).to_string()}

                        """
                        response = genai_mod.generate_content(textwrap.dedent("""

                                You are a SAP Data Migration expert. Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data is a SAP data, you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.

        Important: Please consider All fields are mandetorily during your analysis.

                            

                                    Here is the table details:

                            

                                    """) + story + f"The Primary Key is:{primary_key}" )
                        st.write(response.usage_metadata)
                        st.session_state[f'cont_{table_selector}'] = response.text
                with stylable_container(
                key=f"container_with_border",
                css_styles="""

                {

                border: 1px solid white;

                border-radius: 0.5rem;

                padding: calc(1em - 1px);

                width: 110%; /* Set container width to 100% */

                }

                """
                ):
                    st.write(st.session_state[f'cont_{table_selector}'])
                col9, col10, col11 = st.columns([2, 3, 9])
                with col9:
                    preview= st.button("PREVIEW TABLE",key='preview')
                # with col10:
                #     cdet = st.button("GET COLUMN DETAILS",key='prof')
                if preview:
                    st.dataframe(df)
                # if cdet:
                #     cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
                #     c_dict=python_dict['columns']
                #     i=0
                #     for key in c_dict:
                #         e_dict=c_dict[key]
                #         if 'pii' in e_dict:
                #             p='YES'
                #         else:
                #             p='NO'
                #         if e_dict['sdtype']=='datetime':
                #             v=e_dict['sdtype']+': '+e_dict['datetime_format']
                #         else:
                #             v=e_dict['sdtype']
                #         new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
                #         cdetails=pd.concat([cdetails, new_row],ignore_index=True)
                #         i=i+1
                #     if 'primary_key' in python_dict:
                #         st.write('Primary Key:',python_dict['primary_key'])
                #     else:
                #         st.write('Primary Key: No key can be detected')
                #     st.write(cdetails)