File size: 11,511 Bytes
41c4cf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import pandas as pd
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
from sdv.metadata import SingleTableMetadata
import pyodbc
import google.generativeai as genai
import textwrap
from streamlit_extras.stylable_container import stylable_container
genai.configure(api_key='AIzaSyDgS-r-wKmJJ6g2SawaV8ULa-DpTvRjBa0')
genai_mod = genai.GenerativeModel(
    model_name='models/gemini-1.5-pro-latest'
)

st.set_page_config(page_title='DATA DISCOVERY')
st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
select1=st.selectbox('SOURCE NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name')
if select1 =='DB_10001':
    datasets = load_csvs(
            folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
            read_csv_parameters={
                'skipinitialspace': True,
                'encoding': 'utf_8'
            })
    st.markdown(f"System has found :orange[**{str(len(datasets))} tables**] in the source. Please proceed with selection of mode of discovery.")
    select_main = st.selectbox('Please Select Mode of Discovery',('Single Table Discovery','Multi Table Discovery'),key='mainname',index=None,placeholder='Select Mode of Discovery')
    if select_main == 'Multi Table Discovery':
            with st.spinner('Performing Data Discovery'):
                time.sleep(2)
            st.success('Data cataloguing complete!')
            datasets = load_csvs(
            folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE',
            read_csv_parameters={
                'skipinitialspace': True,
                'encoding': 'utf_8'
            })
            metadata = MultiTableMetadata()
            metadata.detect_from_csvs(folder_name='C:\Applications\MARCO POLO O AIML\DATA CATALOG\BIKE_STORE_DATABASE')
            python_dict = metadata.to_dict()
            st.markdown('---')
            st.subheader('DATA CATALOG')
            # st.json(python_dict)
            brands=datasets['brands']
            categories=datasets['categories']
            customers=datasets['CUSTOMER_MASTER_TBL_1']
            orderitems=datasets['order_items']
            orders=datasets['orders']
            products=datasets['products']
            staffs=datasets['staffs']
            stocks=datasets['stocks']
            stores=datasets['stores']
            tables=python_dict['tables']
            table_names=[*tables]
            col1, col2, col3 = st.columns([2,2,2])
        
            with col1:
                def view_callback():
                    st.session_state.tdet = False
                view= st.button("LIST TABLES",key='view',on_click=view_callback)
            with col2:
                if 'tdet' not in st.session_state:
                    st.session_state.tdet = False
                tdet1 = st.button("SHOW TABLE DETAILS")
            with col3:
                rel=st.button('SHOW RELATIONSHIPS',key='rel',on_click=view_callback)
        
            if tdet1:
                    st.session_state.tdet = tdet1
            if view:
                #st.write(python_dict)
                
                st.write(pd.DataFrame(table_names,columns=['TABLE NAME']))
                
            if rel:
                rlist1=python_dict['relationships']
                rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT PRIMARY KEY','CHILD FOREIGN KEY'])
                for i in range(len(rlist1)):
                    rlist=rlist1[i]
                    nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT PRIMARY KEY':rlist['parent_primary_key'],'CHILD FOREIGN KEY':rlist['child_foreign_key']},index=[i])
                    rdf=pd.concat([rdf,nrow],ignore_index=True)
                st.write(rdf)
            if st.session_state.tdet is True:
                def tdet_callback():
                    st.session_state.tdet=True
                st.subheader('Select table name to view')
                sbox1=st.selectbox('TABLE NAME',table_names,index=None,placeholder='Select table name',on_change=tdet_callback)
                col4, col5 = st.columns([1, 3])
                with col4:
                    preview= st.button("PREVIEW TABLE",key='preview')
                with col5:
                    cdet = st.button("GET COLUMN DETAILS",key='prof')
                if preview:
                    st.write(datasets[sbox1])
                if cdet:
                    cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
                    t_dict=tables[sbox1]
                    c_dict=t_dict['columns']
                    i=0
                    for key in c_dict:
                        e_dict=c_dict[key]
                        if 'pii' in e_dict:
                            p='YES'
                        else:
                            p='NO'
                        if e_dict['sdtype']=='datetime':
                            v=e_dict['sdtype']+': '+e_dict['datetime_format']
                        else:
                            v=e_dict['sdtype']
                        new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
                        cdetails=pd.concat([cdetails, new_row],ignore_index=True)
                        i=i+1
                    if 'primary_key' in t_dict:
                        st.write('Primary Key:',t_dict['primary_key'])
                    else:
                        st.write('Primary Key: No key can be detected')
                    st.write(cdetails)
    
    if select_main == 'Single Table Discovery':
            metadata = SingleTableMetadata()
            conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                                                "Server=ipzilnpxsssp001.database.windows.net;"
                                                "Database=Marcopolo_DB;"
                                                "UID=ssikder004;"
                                                "PWD=Marcopolo@123" )
            query1="select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='Client' ORDER BY TABLE_NAME ASC"
            table1=pd.read_sql_query(query1,con=conn)
            table1['TABLE_NAME']=table1['TABLE_NAME'].astype('str')
            table_selector=st.selectbox('SOURCE TABLE NAME',['brands','categories','CUSTOMER_MASTER_TBL_1','orders','order_items','products','staffs','stocks','stores'],index=None,placeholder='Select table for automated column mapping')
            if table_selector is not None:
                st.markdown('---')
                query2="select * from [Client].["+table_selector+"]"
                df = pd.read_sql_query(query2,con=conn)
                main_list=df.columns.to_list()
                sub_list=['ID','LOADID','FILE_NAME']
                if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
                    df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
                conn.close()
                metadata = SingleTableMetadata()
                metadata.detect_from_dataframe(df)
                python_dict = metadata.to_dict()
                if f'cont_{table_selector}' not in st.session_state:
                    with st.spinner("Processing Table"):
                        # Create a GenerativeModel instance
                        genai_mod = genai.GenerativeModel(
                            model_name='models/gemini-1.5-pro-latest'
                        )
                        if 'primary_key' in python_dict:
                            primary_key = python_dict['primary_key']
                        else:
                            primary_key = "Could Not be Identified"
                        
                        
                        story =  f""" Details of the table: 

                        table columns: {str(list(df.columns))}

                        column datatypes: {str(df.dtypes.to_string())}

                        table sample data: {df.head(10).to_string()}

                        """
                        response = genai_mod.generate_content(textwrap.dedent("""

                                You are a SAP Data Migration expert. Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data is a SAP data, you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.

        Important: Please consider All fields are mandetorily during your analysis.

                            

                                    Here is the table details:

                            

                                    """) + story + f"The Primary Key is:{primary_key}" )
                        st.write(response.usage_metadata)
                        st.session_state[f'cont_{table_selector}'] = response.text
                with stylable_container(
                key=f"container_with_border",
                css_styles="""

                {

                border: 1px solid white;

                border-radius: 0.5rem;

                padding: calc(1em - 1px);

                width: 110%; /* Set container width to 100% */

                }

                """
                ):
                    st.write(st.session_state[f'cont_{table_selector}'])
                col9, col10, col11 = st.columns([2, 3, 9])
                with col9:
                    preview= st.button("PREVIEW TABLE",key='preview')
                # with col10:
                #     cdet = st.button("GET COLUMN DETAILS",key='prof')
                if preview:
                    st.dataframe(df)
                # if cdet:
                #     cdetails=pd.DataFrame(columns=['Column Name','Data Type','Personal Identifiable Information'])
                #     c_dict=python_dict['columns']
                #     i=0
                #     for key in c_dict:
                #         e_dict=c_dict[key]
                #         if 'pii' in e_dict:
                #             p='YES'
                #         else:
                #             p='NO'
                #         if e_dict['sdtype']=='datetime':
                #             v=e_dict['sdtype']+': '+e_dict['datetime_format']
                #         else:
                #             v=e_dict['sdtype']
                #         new_row=pd.DataFrame({'Column Name':key,'Data Type':v,'Personal Identifiable Information':p},index=[i])
                #         cdetails=pd.concat([cdetails, new_row],ignore_index=True)
                #         i=i+1
                #     if 'primary_key' in python_dict:
                #         st.write('Primary Key:',python_dict['primary_key'])
                #     else:
                #         st.write('Primary Key: No key can be detected')
                #     st.write(cdetails)