pwc-india commited on
Commit
e27ea91
1 Parent(s): c6566d3

Create DATA CATALOG

Browse files
Files changed (1) hide show
  1. pages/DATA CATALOG +425 -0
pages/DATA CATALOG ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import networkx as nx
4
+ import numpy as np
5
+ import streamlit as st
6
+ import sdv
7
+ from sdv.datasets.local import load_csvs
8
+ from sdv.metadata import MultiTableMetadata
9
+ from sdv.multi_table import HMASynthesizer
10
+ import time
11
+ import os
12
+ import gc
13
+ import warnings
14
+ from PIL import Image
15
+ from sdv.metadata import SingleTableMetadata
16
+ import pyodbc
17
+ import google.generativeai as genai
18
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
19
+ import textwrap
20
+ from streamlit_extras.stylable_container import stylable_container
21
+ from streamlit_extras.stateful_button import button
22
+ import json
23
+ from io import BytesIO
24
+ import pymssql
25
+
26
# --- Gemini + Streamlit page setup -----------------------------------------
# FIX(security): the API key was previously hard-coded in this file (and thus
# committed to source control). Read it from the environment instead; the
# exposed key must be rotated regardless.
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY', ''))
genai_mod = genai.GenerativeModel(
    model_name='models/gemini-pro'
)

st.set_page_config(page_title='DATA DISCOVERY', layout='wide')

# Page-wide CSS tweaks. NOTE(review): the st-emotion-cache-* class names are
# generated by a specific Streamlit build and will silently stop matching
# after a Streamlit upgrade — confirm on version bumps.
st.markdown("""
        <style>

               /* Remove blank space at top and bottom */
               .block-container {
                   padding-top: 2rem;
                }

               /* Remove blank space at the center canvas */
               .st-emotion-cache-z5fcl4 {
                   position: relative;
                   top: -62px;
                   }

               /* Make the toolbar transparent and the content below it clickable */
               .st-emotion-cache-18ni7ap {
                   pointer-events: none;
                   background: rgb(255 255 255 / 0%)
                   }
               .st-emotion-cache-zq5wmm {
                   pointer-events: auto;
                   background: rgb(255 255 255);
                   border-radius: 5px;
                   }
        </style>
        """, unsafe_allow_html=True)
58
def clear_cache():
    """Forget the cached relationship dataframe so it is re-detected on the next run."""
    # pop with a default is a no-op when the key is absent, matching the
    # original membership-checked pop.
    st.session_state.pop('rdf', None)
61
+
62
def create_er_diagram(df):
    """Build a directed relationship graph from the mapping dataframe.

    Each row of *df* describes one parent->child link via the columns
    'PARENT TABLE', 'CHILD TABLE', 'PARENT TABLE RELATIONSHIP COLUMN',
    'CHILD TABLE RELATIONSHIP COLUMN' and (optionally) 'CARDINALITY'.

    Returns a tuple ``(G, table_columns)`` where *G* is a ``nx.DiGraph`` whose
    edges carry a ``label`` attribute ("pk -> fk\\ncardinality") and
    *table_columns* maps each table name to the list of relationship columns
    collected for it (one entry per row, duplicates included).
    """
    graph = nx.DiGraph()  # directed: parent table points at child table
    table_columns = {}

    for _, rel in df.iterrows():
        parent = rel['PARENT TABLE']
        child = rel['CHILD TABLE']
        parent_key = rel['PARENT TABLE RELATIONSHIP COLUMN']
        child_key = rel['CHILD TABLE RELATIONSHIP COLUMN']
        card = rel.get('CARDINALITY', '1:N')

        # Record the join columns under their owning tables.
        table_columns.setdefault(parent, []).append(parent_key)
        table_columns.setdefault(child, []).append(child_key)

        # add_node/add_edge are idempotent, so repeated tables are fine.
        graph.add_node(parent)
        graph.add_node(child)
        graph.add_edge(parent, child, label=f'{parent_key} -> {child_key}\n{card}')

    return graph, table_columns
91
+
92
def draw_er_diagram(G, table_columns):
    """Render the ER graph, display it in Streamlit, and return the PNG bytes.

    G             -- nx.DiGraph produced by create_er_diagram (edges carry 'label').
    table_columns -- dict of table name -> list of relationship column names.

    Returns a BytesIO positioned at 0 containing the PNG, suitable for
    st.download_button.
    """
    pos = nx.spring_layout(G, k=1.5, iterations=50)  # layout that spreads out nodes

    plt.figure(figsize=(8, 8))
    nx.draw(G, pos, with_labels=False, node_size=2500, node_color='lightblue', edge_color='gray', font_size=8, font_weight='bold', arrows=True)

    # Table names in bold, slightly above each node circle.
    for node, (x, y) in pos.items():
        plt.text(x, y + 0.13, node, fontsize=7, fontweight='bold', ha='center', va='center')

    # Relationship column names inside each node.
    for node, columns in table_columns.items():
        x, y = pos[node]
        column_text = '\n'.join(columns)
        plt.text(x, y, column_text, fontsize=6, ha='center', va='center')

    # Edge labels: "pk -> fk" plus cardinality.
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)
    st.subheader("Schematic Representation")
    with st.container(border=True, height=350):
        st.pyplot(plt)
    img_bytes = BytesIO()
    plt.savefig(img_bytes, format='png')
    img_bytes.seek(0)
    # FIX: close the current figure. Previously every Streamlit rerun left a
    # figure open, leaking memory and triggering matplotlib's "more than 20
    # figures" warning; st.pyplot has already rendered it and the PNG is
    # captured, so closing here is safe.
    plt.close()
    return img_bytes
118
+
119
def cardinality(parent_df, child_df, parent_column, child_column):
    """Classify the join between a parent and a child column.

    The classification looks only at uniqueness of each column within its own
    dataframe: a unique side contributes '1', a non-unique side contributes 'N'.

    Returns one of '1:1', '1:N', 'N:1' or 'N:N'.
    """
    parent_unique = parent_df[parent_column].is_unique
    child_unique = child_df[child_column].is_unique

    if parent_unique:
        return '1:1' if child_unique else '1:N'
    return 'N:1' if child_unique else 'N:N'
135
+
136
# ---------------------------------------------------------------------------
# Main page flow (top-level Streamlit script).
# 1) Pick a source DB + schema, load the table list from INFORMATION_SCHEMA.
# 2) Sample the selected tables into st.session_state.dataframes.
# 3) Tab "Explain Tables": per-table Gemini-generated narrative + preview.
# 4) Tab "Show Relationships": SDV-detected relationships, editable grid,
#    cardinality classification, and a downloadable ER diagram.
# NOTE(review): the indentation below was reconstructed from a mangled diff
# view of this file — verify block nesting against the original source.
# ---------------------------------------------------------------------------
#st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
selectcol11, selectcol12 = st.columns(2)
with selectcol11:
    # Changing either selector invalidates the cached relationship dataframe.
    select1=st.selectbox('SOURCE DB NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name', on_change=clear_cache)
with selectcol12:
    select2=st.selectbox('SOURCE SCHEMA NAME',('DBO','CLIENT'),key='SCHname',index=None,placeholder='Select schema name', on_change=clear_cache)
if select1 =='DB_10001' and select2 is not None:
    with st.spinner("Loading Tables:"):
        # NOTE(review): credentials are hard-coded here, and the connection is
        # opened with a single DSN-style string — pymssql.connect normally takes
        # separate server/user/password/database arguments; confirm this call
        # actually works against the deployed driver.
        conn1 = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;"
                                "Database=sqldb-ext-dev-uks-001;"
                                "UID=dbadmin;"
                                "PWD=mYpa$$w0rD" )

        # NOTE(review): schema name is interpolated straight into the SQL text —
        # an injection risk if the selectbox options ever become free text.
        query0_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' ORDER BY TABLE_NAME ASC"
        st.session_state.tab_names_init=list(pd.read_sql_query(query0_1,con=conn1)['TABLE_NAME'])

    table_selector=st.multiselect('SOURCE TABLE NAME',st.session_state.tab_names_init,default=None,placeholder='Select table(s) for automated data cataloging', on_change= clear_cache)
    sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache)

    # Stateful button: stays "pressed" across reruns (streamlit_extras).
    discover= button("Discover", key='discover')

    if discover:
        # Translate the chosen sample size into a T-SQL TOP clause
        # ("" means pull the full table).
        if sample_selector=='100':
            count="top 100"
        elif sample_selector=='10K':
            count="top 10000"
        elif sample_selector=='100K':
            count="top 100000"
        elif sample_selector=='1M':
            count="top 1000000"
        else:
            count=""

        # Re-query INFORMATION_SCHEMA restricted to the chosen tables so the
        # resulting list is alphabetically ordered like the initial one.
        query1_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' and TABLE_NAME in ("+(', '.join(f"'{table}'" for table in table_selector))+") ORDER BY TABLE_NAME ASC"
        st.session_state.tab_names=list(pd.read_sql_query(query1_1,con=conn1)['TABLE_NAME'])
        st.session_state.dataframes = {}
        st.session_state.col_names = []
        # Sample every selected table and accumulate all column names
        # (col_names may contain duplicates across tables).
        for tab in st.session_state.tab_names:
            query2_2= "select "+count+" * from ["+select2+"].["+tab+"]"
            st.session_state.dataframes[f'{tab}'] = pd.read_sql_query(query2_2,con=conn1)
            st.session_state.col_names = st.session_state.col_names + list(st.session_state.dataframes[f'{tab}'].columns)
        #st.session_state.data_load = "Yes"

        tab_names = st.session_state.tab_names
        dataframes = st.session_state.dataframes
        col_names = st.session_state.col_names
        # SDV auto-detection over all sampled tables (keys + relationships).
        metadata = MultiTableMetadata()
        metadata.detect_from_dataframes(
            data= st.session_state.dataframes
        )
        multi_python_dict = metadata.to_dict()

        st.markdown(f"System has ingested :orange[**{str(len(tab_names))} tables**] from the source. Please proceed with the discovery.")
        #st.subheader("DATA CATALOGUE")
        tab1, tab2= st.tabs(["Explain Tables", "Show Relationships"])
        def view_callback():
            # Widget callback: reset the table-details flag in session state.
            st.session_state.tdet = False
        with tab1:
            #st.write(python_dict)
            st.session_state.table_list= pd.DataFrame(tab_names,columns=['TABLE NAME'])
            # 35px per row plus header — presumably a UI height heuristic.
            containter_length = (len(st.session_state.table_list) + 1)*35
            tab_names_shown= list(st.session_state.table_list['TABLE NAME'].values)
            tabs2= st.tabs(tab_names_shown)
            # One sub-tab per table: narrative (left) and data preview (right).
            for i, tab in enumerate(tabs2):
                with tab:
                    with st.container(height= 400, border=True):
                        cole1,cole2=st.columns([1,1.5])
                        with cole1:
                            # NOTE(review): this per-tab connection mixes an ODBC
                            # driver clause into a pymssql connect string and is
                            # only ever closed on the "run" path below — confirm
                            # whether it is still needed at all (the read below
                            # is commented out in favour of the cached sample).
                            conn = pymssql.connect("Driver={ODBC Driver 17 for SQL Server};"
                                                   "Server=sql-ext-dev-uks-001.database.windows.net;"
                                                   "Database=sqldb-ext-dev-uks-001;"
                                                   "UID=dbadmin;"
                                                   "PWD=mYpa$$w0rD" )

                            # Rebinds the name table_selector (previously the
                            # multiselect list) to this tab's single table name.
                            table_selector= tab_names_shown[i]
                            if table_selector is not None:
                                # query2 is currently unused — data comes from the
                                # cached sample in session state instead.
                                query2="select "+count+" * from [dbo].["+table_selector+"]"
                                #df = pd.read_sql_query(query2,con=conn)
                                df = st.session_state.dataframes[table_selector]
                                selected_df = pd.DataFrame()
                                # Build a fixed 10-row preview frame: the first 10
                                # non-null/non-blank values per column, padded
                                # with "" when fewer exist.
                                for col in df.columns:
                                    # Filter non-null and non-blank values
                                    non_null_values = df[col][df[col] != ''].dropna().astype(str).str.strip()

                                    # Select up to 10 values (or fewer if less than 10 non-null values)
                                    selected_values = list(non_null_values[:10])
                                    selected_values = selected_values + [""] * (10 - len(selected_values))
                                    # Add selected values to the new dataframe
                                    selected_df[col] = selected_values
                                #st.dataframe(selected_df)
                                # A column whose preview mixes real values and ""
                                # padding had fewer than 10 usable records —
                                # flag it as potentially redundant.
                                null_columns = [col for col in selected_df.columns if selected_df.apply(lambda x: x == '')[col].nunique() > 1]
                                null_mes= "**The Following columns have very few records(less than 10). You might exclude them (if they are redundant) for better table discovery:** \n\n"
                                # Comma-join all but the last flagged column, then
                                # append the last with "and" when there are several.
                                for col in null_columns[:-1]:
                                    null_mes += f":orange[**{col}**]" + ', '
                                for collast in null_columns[-1:]:
                                    if len(null_columns)> 1:
                                        null_mes += '**and** ' + f":orange[**{collast}**]"
                                    else:
                                        null_mes += f":orange[**{collast}**]"

                                if len(null_columns) != 0:
                                    with st.expander("🛈 Potential redundant Columns Found in Terms of Data Completeness:", expanded= True):
                                        st.markdown(null_mes)
                                        inf_filter= st.multiselect('Select Incomplete and Insignificant Columns to exclude:', list(null_columns))
                                        run = st.button('Check', key= f"{tab_names_shown[i]}")
                                else:
                                    st.success("No redundant Columns Found in Terms of Data Completeness")
                                    inf_filter= None
                                    run = False

                                if inf_filter is not None:
                                    # Drop the user-excluded columns in place; this
                                    # mutates the session-state dataframe too.
                                    df.drop(columns=inf_filter, inplace=True)
                                    selected_df.drop(columns=inf_filter, inplace=True)

                                if run or len(null_columns) == 0:
                                    main_list=df.columns.to_list()
                                    sub_list=['ID','LOADID','FILE_NAME']
                                    # Drop the technical audit columns only when they
                                    # appear as a contiguous run in the column order.
                                    # NOTE(review): `i` here shadows the enclosing
                                    # tab-loop index — harmless only because it is
                                    # re-read from the generator scope; verify.
                                    if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
                                        df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
                                    conn.close()
                                    sin_metadata = SingleTableMetadata()
                                    sin_metadata.detect_from_dataframe(df)
                                    python_dict = sin_metadata.to_dict()
                                    # Generate the narrative once per table and cache
                                    # it in session state under 'cont_<table>'.
                                    if f'cont_{table_selector}' not in st.session_state:
                                        with st.spinner("Processing Table"):
                                            # Create a GenerativeModel instance
                                            genai_mod = genai.GenerativeModel(
                                                model_name='models/gemini-pro'
                                            )
                                            if 'primary_key' in python_dict:
                                                primary_key = python_dict['primary_key']
                                            else:
                                                primary_key = "Could Not be Identified"


                                            # Table facts handed to the model: schema,
                                            # dtypes and the 10-row cleaned preview.
                                            story = f""" Details of the table:
                                                        table columns: {str(list(df.columns))}
                                                        column datatypes: {str(df.dtypes.to_string())}
                                                        table sample data: {selected_df.head(10).to_string()}
                                                    """
                                            # All safety categories are disabled so the
                                            # model is not blocked on arbitrary data.
                                            response = genai_mod.generate_content(textwrap.dedent("""
                                            You are a Data Migration expert. You can analyze and understand any table/data/ Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data can be any kind of generic data. you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.
                                            Important: Please consider All fields are mandetorily during your analysis. Explain all fields precisely without unnecessary and irrelevant information. NO NEED TO PROVIDE THE SAMPLE DATA AGAIN.

                                            Here is the table details:

                                            """) + story + f"The Primary Key is:{primary_key}" ,
                                            safety_settings={
                                                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                                            })
                                            st.session_state[f'cont_{table_selector}'] = response.text

                                    st.markdown(st.session_state[f'cont_{table_selector}'])
                        with cole2:
                            st.markdown("**DATA PREVIEW**")
                            st.dataframe(df, use_container_width= True)

        with tab2:
            # Re-detect relationships so column drops from tab1 are reflected.
            metadata1 = MultiTableMetadata()
            metadata1.detect_from_dataframes(
                data= st.session_state.dataframes
            )
            multi_python_dict1 = metadata1.to_dict()
            rlist1=multi_python_dict1['relationships']
            rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT TABLE RELATIONSHIP COLUMN','CHILD TABLE RELATIONSHIP COLUMN','CARDINALITY'])
            # One grid row per SDV-detected relationship.
            for i in range(len(rlist1)):
                rlist=rlist1[i]
                nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT TABLE RELATIONSHIP COLUMN':rlist['parent_primary_key'],'CHILD TABLE RELATIONSHIP COLUMN':rlist['child_foreign_key']},index=[i])
                rdf=pd.concat([rdf,nrow],ignore_index=True)

            # Classify each detected relationship via column uniqueness.
            rdf['CARDINALITY'] = rdf.apply(
                lambda row: cardinality(
                    st.session_state.dataframes[str(row['PARENT TABLE'])],
                    st.session_state.dataframes[str(row['CHILD TABLE'])],
                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)


            # Seed the editable grid once; afterwards user edits win.
            if 'rdf' not in st.session_state:
                st.session_state.rdf = rdf

            # Editable relationship grid with dropdowns constrained to the
            # ingested tables/columns; rows may be added or deleted.
            edited_map_df = st.data_editor(
                st.session_state.rdf,
                column_config={
                    "PARENT TABLE": st.column_config.SelectboxColumn(
                        "Available Parent Table",
                        width="medium",
                        options=tab_names,
                        required=True,
                    ),
                    "CHILD TABLE": st.column_config.SelectboxColumn(
                        "Available Child Table",
                        width="medium",
                        options=tab_names,
                        required=True,
                    ),
                    "PARENT TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
                        "Available Parent Table Relationship Column",
                        width="medium",
                        options=col_names,
                        required=True,
                    ),
                    "CHILD TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
                        "Available Child Table Relationship Column",
                        width="medium",
                        options=col_names,
                        required=True,
                    ),
                    "CARDINALITY": st.column_config.SelectboxColumn(
                        "Cardinality",
                        width="medium",
                        options=['1:1','1:N','N:1','N:N'],
                        required=True,
                    )
                },
                hide_index=True,
                num_rows = 'dynamic',
                use_container_width = True
            )

            # Validate every edited row: the chosen columns must belong to their
            # tables and the join must produce at least one matching value.
            # NOTE(review): pvals/cvals/match are computed twice per row, and
            # `match` uses a list scan (O(n*m)) — fine for samples, slow at scale.
            for i,row in edited_map_df.iterrows():
                pcolchecklist = st.session_state.dataframes[str(row['PARENT TABLE'])].columns
                ccolchecklist = st.session_state.dataframes[str(row['CHILD TABLE'])].columns
                pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
                cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
                match = [val for val in pvals if val in cvals]
                #st.write(match)
                if row['PARENT TABLE RELATIONSHIP COLUMN'] not in pcolchecklist:
                    st.error(f"{row['PARENT TABLE RELATIONSHIP COLUMN']} does not belong to {row['PARENT TABLE']}")
                else:
                    pass
                if row['CHILD TABLE RELATIONSHIP COLUMN'] not in ccolchecklist:
                    st.error(f"{row['CHILD TABLE RELATIONSHIP COLUMN']} does not belong to {row['CHILD TABLE']}")
                else:
                    pass
                if (row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist):
                    pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
                    cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
                    match = [val for val in pvals if val in cvals]
                    if match == []:
                        st.error(f"The Joining Condition Between column: {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} and column: {row['CHILD TABLE RELATIONSHIP COLUMN']} from Table: {row['CHILD TABLE']} does not yield any record. ")
                    if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and (match != []):
                        # primary_check = len(list(dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)) == dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].nunique()
                        # if primary_check:
                        #     pass
                        # else:
                        #     st.error(f"The Column {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} has duplicate records and hence can not be considered as Primary Key.")
                        pass

            # NOTE(review): after the loop above, `row` and `match` refer only to
            # the LAST grid row — so "Add Relationship" validates just that row.
            add = st.button("Add Relationship", key='add')
            if add:
                if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and ((match != [])):
                    add_df = edited_map_df
                else:
                    add_df = st.session_state.rdf
            else:
                add_df = st.session_state.rdf

            # Recompute cardinality for whichever frame was accepted.
            add_df['CARDINALITY'] = add_df.apply(
                lambda row: cardinality(
                    st.session_state.dataframes[str(row['PARENT TABLE'])],
                    st.session_state.dataframes[str(row['CHILD TABLE'])],
                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)

            st.session_state.add_df = add_df
            edited_map_df = st.session_state.add_df

            # Tables that never appear on either side of a relationship.
            rel_tabs = list(add_df['PARENT TABLE'].values) + list(add_df['CHILD TABLE'].values)
            unrel_tabs = [tab for tab in tab_names if tab not in rel_tabs]
            st.info(f"""Unrelated tables due to undetected pattern: {str(unrel_tabs).replace("[","").replace("]","")}""")

            # Render the ER diagram and offer the PNG for download.
            G, table_columns = create_er_diagram(st.session_state.add_df)
            img_bytes= draw_er_diagram(G, table_columns)
            col21, col22= st.columns([1,8])
            with col21:
                if st.button("Regenerate"):
                    st.rerun()
            with col22:
                st.download_button(
                    label="Download ER Diagram",
                    data=img_bytes,
                    file_name="er_diagram.png",
                    mime="image/png"
                )