pwc-india committed
Commit edabb35
1 Parent(s): ddb383f

Update pages/DATA CATALOG.py

Files changed (1)
  1. pages/DATA CATALOG.py +397 -385
pages/DATA CATALOG.py CHANGED
@@ -22,404 +22,416 @@ from streamlit_extras.stateful_button import button
 import json
 from io import BytesIO
 import pymssql

-genai.configure(api_key='AIzaSyCeY8jSHKW6t0OSDRjc2VAfBvMunVrff2w')
-genai_mod = genai.GenerativeModel(
-    model_name='models/gemini-pro'
-)
-
-st.set_page_config(page_title='DATA DISCOVERY', layout= 'wide')
-st.markdown("""
-<style>
-
-/* Remove blank space at top and bottom */
-.block-container {
-    padding-top: 2rem;
-}
-
-/* Remove blank space at the center canvas */
-.st-emotion-cache-z5fcl4 {
-    position: relative;
-    top: -62px;
-}
-
-/* Make the toolbar transparent and the content below it clickable */
-.st-emotion-cache-18ni7ap {
-    pointer-events: none;
-    background: rgb(255 255 255 / 0%)
-}
-.st-emotion-cache-zq5wmm {
-    pointer-events: auto;
-    background: rgb(255 255 255);
-    border-radius: 5px;
-}
-</style>
-""", unsafe_allow_html=True)
-def clear_cache():
-    if 'rdf' in st.session_state:
-        st.session_state.pop('rdf')
-
-def create_er_diagram(df):
-    G = nx.DiGraph() # Directed graph
-
-    # Dictionary to hold table columns
-    table_columns = {}
-
-    # Add nodes and edges to the graph
-    for _, row in df.iterrows():
-        parent_table = row['PARENT TABLE']
-        child_table = row['CHILD TABLE']
-        parent_pk = row['PARENT TABLE RELATIONSHIP COLUMN']
-        child_fk = row['CHILD TABLE RELATIONSHIP COLUMN']
-        cardinality = row.get('CARDINALITY', '1:N')
-
-        # Add columns to tables
-        if parent_table not in table_columns:
-            table_columns[parent_table] = []
-        table_columns[parent_table].append(parent_pk)
-
-        if child_table not in table_columns:
-            table_columns[child_table] = []
-        table_columns[child_table].append(child_fk)
-
-        # Add nodes and edges
-        G.add_node(parent_table)
-        G.add_node(child_table)
-        G.add_edge(parent_table, child_table, label=f'{parent_pk} -> {child_fk}\n{cardinality}')
-
-    return G, table_columns
-
-def draw_er_diagram(G, table_columns):
-    pos = nx.spring_layout(G, k=1.5, iterations=50) # Use a layout that spreads out nodes
-
-    plt.figure(figsize=(8, 8))
-    nx.draw(G, pos, with_labels=False, node_size=2500, node_color='lightblue', edge_color='gray', font_size=8, font_weight='bold', arrows=True)
-
-    # Draw node labels (table names in bold)
-    for node, (x, y) in pos.items():
-        plt.text(x, y + 0.13, node, fontsize=7, fontweight='bold', ha='center', va='center')
-
-    # Draw column names
-    for node, columns in table_columns.items():
-        x, y = pos[node]
-        column_text = '\n'.join(columns)
-        plt.text(x, y, column_text, fontsize=6, ha='center', va='center')
-
-    # Draw edge labels
-    edge_labels = nx.get_edge_attributes(G, 'label')
-    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)
-    st.subheader("Schematic Representation")
-    with st.container(border=True, height= 350):
-        st.pyplot(plt)
-    img_bytes = BytesIO()
-    plt.savefig(img_bytes, format='png')
-    img_bytes.seek(0)
-    return img_bytes
-
-def cardinality(parent_df, child_df, parent_column, child_column):
-    # Check uniqueness of parent primary key
-    is_parent_unique = parent_df[parent_column].is_unique
-
-    # Check uniqueness of child foreign key
-    is_child_unique = child_df[child_column].is_unique
-
-    # Determine cardinality
-    if is_parent_unique and is_child_unique:
-        return '1:1'
-    elif is_parent_unique and not is_child_unique:
-        return '1:N'
-    elif not is_parent_unique and is_child_unique:
-        return 'N:1'
-    else:
-        return 'N:N'
-
-#st.title('AUTOMATED DATA CATALOGUE')
-st.subheader('SELECT SOURCE')
-selectcol11, selectcol12 = st.columns(2)
-with selectcol11:
-    select1=st.selectbox('SOURCE DB NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name', on_change=clear_cache)
-with selectcol12:
-    select2=st.selectbox('SOURCE SCHEMA NAME',('DBO','CLIENT'),key='SCHname',index=None,placeholder='Select schema name', on_change=clear_cache)
-if select1 =='DB_10001' and select2 is not None:
-    with st.spinner("Loading Tables:"):
-
-        conn1 = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;"
-                                "Database=sqldb-ext-dev-uks-001;"
-                                "UID=dbadmin;"
-                                "PWD=mYpa$$w0rD" )
-
-        query0_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' ORDER BY TABLE_NAME ASC"
-        st.session_state.tab_names_init=list(pd.read_sql_query(query0_1,con=conn1)['TABLE_NAME'])
-
-    table_selector=st.multiselect('SOURCE TABLE NAME',st.session_state.tab_names_init,default=None,placeholder='Select table(s) for automated data cataloging', on_change= clear_cache)
-    sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache)
-
-    discover= button("Discover", key='discover')
-
-    if discover:
-        if sample_selector=='100':
-            count="top 100"
-        elif sample_selector=='10K':
-            count="top 10000"
-        elif sample_selector=='100K':
-            count="top 100000"
-        elif sample_selector=='1M':
-            count="top 1000000"
         else:
-            count=""
-
-        query1_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' and TABLE_NAME in ("+(', '.join(f"'{table}'" for table in table_selector))+") ORDER BY TABLE_NAME ASC"
-        st.session_state.tab_names=list(pd.read_sql_query(query1_1,con=conn1)['TABLE_NAME'])
-        st.session_state.dataframes = {}
-        st.session_state.col_names = []
-        for tab in st.session_state.tab_names:
-            query2_2= "select "+count+" * from ["+select2+"].["+tab+"]"
-            st.session_state.dataframes[f'{tab}'] = pd.read_sql_query(query2_2,con=conn1)
-            st.session_state.col_names = st.session_state.col_names + list(st.session_state.dataframes[f'{tab}'].columns)
-        #st.session_state.data_load = "Yes"
-
-        tab_names = st.session_state.tab_names
-        dataframes = st.session_state.dataframes
-        col_names = st.session_state.col_names
-        metadata = MultiTableMetadata()
-        metadata.detect_from_dataframes(
-            data= st.session_state.dataframes
-        )
-        multi_python_dict = metadata.to_dict()
-
-        st.markdown(f"System has ingested :orange[**{str(len(tab_names))} tables**] from the source. Please proceed with the discovery.")
-        #st.subheader("DATA CATALOGUE")
-        tab1, tab2= st.tabs(["Explain Tables", "Show Relationships"])
-        def view_callback():
-            st.session_state.tdet = False
-        with tab1:
-            #st.write(python_dict)
-            st.session_state.table_list= pd.DataFrame(tab_names,columns=['TABLE NAME'])
-            containter_length = (len(st.session_state.table_list) + 1)*35
-            tab_names_shown= list(st.session_state.table_list['TABLE NAME'].values)
-            tabs2= st.tabs(tab_names_shown)
-            for i, tab in enumerate(tabs2):
-                with tab:
-                    with st.container(height= 400, border=True):
-                        cole1,cole2=st.columns([1,1.5])
-                        with cole1:
-                            conn = pymssql.connect("Driver={ODBC Driver 17 for SQL Server};"
-                                                   "Server=sql-ext-dev-uks-001.database.windows.net;"
-                                                   "Database=sqldb-ext-dev-uks-001;"
-                                                   "UID=dbadmin;"
-                                                   "PWD=mYpa$$w0rD" )
-
-                            table_selector= tab_names_shown[i]
-                            if table_selector is not None:
-                                query2="select "+count+" * from [dbo].["+table_selector+"]"
-                                #df = pd.read_sql_query(query2,con=conn)
-                                df = st.session_state.dataframes[table_selector]
-                                selected_df = pd.DataFrame()
-                                for col in df.columns:
-                                    # Filter non-null and non-blank values
-                                    non_null_values = df[col][df[col] != ''].dropna().astype(str).str.strip()
-
-                                    # Select up to 10 values (or fewer if less than 10 non-null values)
-                                    selected_values = list(non_null_values[:10])
-                                    selected_values = selected_values + [""] * (10 - len(selected_values))
-                                    # Add selected values to the new dataframe
-                                    selected_df[col] = selected_values
-                                #st.dataframe(selected_df)
-                                null_columns = [col for col in selected_df.columns if selected_df.apply(lambda x: x == '')[col].nunique() > 1]
-                                null_mes= "**The Following columns have very few records(less than 10). You might exclude them (if they are redundant) for better table discovery:** \n\n"
-                                for col in null_columns[:-1]:
-                                    null_mes += f":orange[**{col}**]" + ', '
-                                for collast in null_columns[-1:]:
-                                    if len(null_columns)> 1:
-                                        null_mes += '**and** ' + f":orange[**{collast}**]"
-                                    else:
-                                        null_mes += f":orange[**{collast}**]"
-
-                                if len(null_columns) != 0:
-                                    with st.expander("🛈 Potential redundant Columns Found in Terms of Data Completeness:", expanded= True):
-                                        st.markdown(null_mes)
-                                        inf_filter= st.multiselect('Select Incomplete and Insignificant Columns to exclude:', list(null_columns))
-                                        run = st.button('Check', key= f"{tab_names_shown[i]}")
-                                else:
-                                    st.success("No redundant Columns Found in Terms of Data Completeness")
-                                    inf_filter= None
-                                    run = False
-
-                                if inf_filter is not None:
-                                    df.drop(columns=inf_filter, inplace=True)
-                                    selected_df.drop(columns=inf_filter, inplace=True)
-
-                                if run or len(null_columns) == 0:
-                                    main_list=df.columns.to_list()
-                                    sub_list=['ID','LOADID','FILE_NAME']
-                                    if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
-                                        df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
-                                    conn.close()
-                                    sin_metadata = SingleTableMetadata()
-                                    sin_metadata.detect_from_dataframe(df)
-                                    python_dict = sin_metadata.to_dict()
-                                    if f'cont_{table_selector}' not in st.session_state:
-                                        with st.spinner("Processing Table"):
-                                            # Create a GenerativeModel instance
-                                            genai_mod = genai.GenerativeModel(
-                                                model_name='models/gemini-pro'
-                                            )
-                                            if 'primary_key' in python_dict:
-                                                primary_key = python_dict['primary_key']
-                                            else:
-                                                primary_key = "Could Not be Identified"
-
-                                            story = f""" Details of the table:
-                                            table columns: {str(list(df.columns))}
-                                            column datatypes: {str(df.dtypes.to_string())}
-                                            table sample data: {selected_df.head(10).to_string()}
-                                            """
-                                            response = genai_mod.generate_content(textwrap.dedent("""
-                                            You are a Data Migration expert. You can analyze and understand any table/data/ Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data can be any kind of generic data. you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.
-                                            Important: Please consider All fields are mandetorily during your analysis. Explain all fields precisely without unnecessary and irrelevant information. NO NEED TO PROVIDE THE SAMPLE DATA AGAIN.
-
-                                            Here is the table details:
-
-                                            """) + story + f"The Primary Key is:{primary_key}" ,
-                                            safety_settings={
-                                                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
-                                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
-                                                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
-                                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
-                                            })
-                                            st.session_state[f'cont_{table_selector}'] = response.text
-
-                                    st.markdown(st.session_state[f'cont_{table_selector}'])
-                        with cole2:
-                            st.markdown("**DATA PREVIEW**")
-                            st.dataframe(df, use_container_width= True)
-
-        with tab2:
-            metadata1 = MultiTableMetadata()
-            metadata1.detect_from_dataframes(
-                data= st.session_state.dataframes
-            )
-            multi_python_dict1 = metadata1.to_dict()
-            rlist1=multi_python_dict1['relationships']
-            rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT TABLE RELATIONSHIP COLUMN','CHILD TABLE RELATIONSHIP COLUMN','CARDINALITY'])
-            for i in range(len(rlist1)):
-                rlist=rlist1[i]
-                nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT TABLE RELATIONSHIP COLUMN':rlist['parent_primary_key'],'CHILD TABLE RELATIONSHIP COLUMN':rlist['child_foreign_key']},index=[i])
-                rdf=pd.concat([rdf,nrow],ignore_index=True)
-
-            rdf['CARDINALITY'] = rdf.apply(
-                lambda row: cardinality(
-                    st.session_state.dataframes[str(row['PARENT TABLE'])],
-                    st.session_state.dataframes[str(row['CHILD TABLE'])],
-                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
-                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)
-
-
-            if 'rdf' not in st.session_state:
-                st.session_state.rdf = rdf
-
-            edited_map_df = st.data_editor(
-                st.session_state.rdf,
-                column_config={
-                    "PARENT TABLE": st.column_config.SelectboxColumn(
-                        "Available Parent Table",
-                        width="medium",
-                        options=tab_names,
-                        required=True,
-                    ),
-                    "CHILD TABLE": st.column_config.SelectboxColumn(
-                        "Available Child Table",
-                        width="medium",
-                        options=tab_names,
-                        required=True,
-                    ),
-                    "PARENT TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
-                        "Available Parent Table Relationship Column",
-                        width="medium",
-                        options=col_names,
-                        required=True,
-                    ),
-                    "CHILD TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
-                        "Available Child Table Relationship Column",
-                        width="medium",
-                        options=col_names,
-                        required=True,
-                    ),
-                    "CARDINALITY": st.column_config.SelectboxColumn(
-                        "Cardinality",
-                        width="medium",
-                        options=['1:1','1:N','N:1','N:N'],
-                        required=True,
-                    )
-                },
-                hide_index=True,
-                num_rows = 'dynamic',
-                use_container_width = True
-            )
-
-            for i,row in edited_map_df.iterrows():
-                pcolchecklist = st.session_state.dataframes[str(row['PARENT TABLE'])].columns
-                ccolchecklist = st.session_state.dataframes[str(row['CHILD TABLE'])].columns
-                pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
-                cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
-                match = [val for val in pvals if val in cvals]
-                #st.write(match)
-                if row['PARENT TABLE RELATIONSHIP COLUMN'] not in pcolchecklist:
-                    st.error(f"{row['PARENT TABLE RELATIONSHIP COLUMN']} does not belong to {row['PARENT TABLE']}")
-                else:
-                    pass
-                if row['CHILD TABLE RELATIONSHIP COLUMN'] not in ccolchecklist:
-                    st.error(f"{row['CHILD TABLE RELATIONSHIP COLUMN']} does not belong to {row['CHILD TABLE']}")
-                else:
-                    pass
-                if (row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist):
                     pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
                     cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
                     match = [val for val in pvals if val in cvals]
-                    if match == []:
-                        st.error(f"The Joining Condition Between column: {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} and column: {row['CHILD TABLE RELATIONSHIP COLUMN']} from Table: {row['CHILD TABLE']} does not yield any record. ")
-                    if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and (match != []):
-                        # primary_check = len(list(dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)) == dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].nunique()
-                        # if primary_check:
-                        # pass
-                        # else:
-                        # st.error(f"The Column {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} has duplicate records and hence can not be considered as Primary Key.")
-                        pass
-
-            add = st.button("Add Relationship", key='add')
-            if add:
-                if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and ((match != [])):
-                    add_df = edited_map_df
                 else:
-                    add_df = st.session_state.rdf
-            else:
-                add_df = st.session_state.rdf
-
-            add_df['CARDINALITY'] = add_df.apply(
-                lambda row: cardinality(
-                    st.session_state.dataframes[str(row['PARENT TABLE'])],
-                    st.session_state.dataframes[str(row['CHILD TABLE'])],
-                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
-                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)
-
-            st.session_state.add_df = add_df
-            edited_map_df = st.session_state.add_df
-
-            rel_tabs = list(add_df['PARENT TABLE'].values) + list(add_df['CHILD TABLE'].values)
-            unrel_tabs = [tab for tab in tab_names if tab not in rel_tabs]
-            st.info(f"""Unrelated tables due to undetected pattern: {str(unrel_tabs).replace("[","").replace("]","")}""")
-
-            G, table_columns = create_er_diagram(st.session_state.add_df)
-            img_bytes= draw_er_diagram(G, table_columns)
-            col21, col22= st.columns([1,8])
-            with col21:
-                if st.button("Regenerate"):
-                    st.rerun()
-            with col22:
-                st.download_button(
-                    label="Download ER Diagram",
-                    data=img_bytes,
-                    file_name="er_diagram.png",
-                    mime="image/png"
-                )
 
 import json
 from io import BytesIO
 import pymssql
+############
+from streamlit_app import sidebar

+def main():
+    st.title('PAGE TITLE') # Change this for each page
+    sidebar()
+    # Rest of your page content here

+
+genai.configure(api_key='AIzaSyCeY8jSHKW6t0OSDRjc2VAfBvMunVrff2w')
+genai_mod = genai.GenerativeModel(
+    model_name='models/gemini-pro'
+)
+
+st.set_page_config(page_title='DATA DISCOVERY', layout= 'wide')
+st.markdown("""
+<style>

+/* Remove blank space at top and bottom */
+.block-container {
+    padding-top: 2rem;
+}
+
+/* Remove blank space at the center canvas */
+.st-emotion-cache-z5fcl4 {
+    position: relative;
+    top: -62px;
+}
+
+/* Make the toolbar transparent and the content below it clickable */
+.st-emotion-cache-18ni7ap {
+    pointer-events: none;
+    background: rgb(255 255 255 / 0%)
+}
+.st-emotion-cache-zq5wmm {
+    pointer-events: auto;
+    background: rgb(255 255 255);
+    border-radius: 5px;
+}
+</style>
+""", unsafe_allow_html=True)
+def clear_cache():
+    if 'rdf' in st.session_state:
+        st.session_state.pop('rdf')

+def create_er_diagram(df):
+    G = nx.DiGraph() # Directed graph
+
+    # Dictionary to hold table columns
+    table_columns = {}
+
+    # Add nodes and edges to the graph
+    for _, row in df.iterrows():
+        parent_table = row['PARENT TABLE']
+        child_table = row['CHILD TABLE']
+        parent_pk = row['PARENT TABLE RELATIONSHIP COLUMN']
+        child_fk = row['CHILD TABLE RELATIONSHIP COLUMN']
+        cardinality = row.get('CARDINALITY', '1:N')
+
+        # Add columns to tables
+        if parent_table not in table_columns:
+            table_columns[parent_table] = []
+        table_columns[parent_table].append(parent_pk)
+
+        if child_table not in table_columns:
+            table_columns[child_table] = []
+        table_columns[child_table].append(child_fk)
+
+        # Add nodes and edges
+        G.add_node(parent_table)
+        G.add_node(child_table)
+        G.add_edge(parent_table, child_table, label=f'{parent_pk} -> {child_fk}\n{cardinality}')
+
+    return G, table_columns
+
+def draw_er_diagram(G, table_columns):
+    pos = nx.spring_layout(G, k=1.5, iterations=50) # Use a layout that spreads out nodes
+
+    plt.figure(figsize=(8, 8))
+    nx.draw(G, pos, with_labels=False, node_size=2500, node_color='lightblue', edge_color='gray', font_size=8, font_weight='bold', arrows=True)
+
+    # Draw node labels (table names in bold)
+    for node, (x, y) in pos.items():
+        plt.text(x, y + 0.13, node, fontsize=7, fontweight='bold', ha='center', va='center')
+
+    # Draw column names
+    for node, columns in table_columns.items():
+        x, y = pos[node]
+        column_text = '\n'.join(columns)
+        plt.text(x, y, column_text, fontsize=6, ha='center', va='center')
+
+    # Draw edge labels
+    edge_labels = nx.get_edge_attributes(G, 'label')
+    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)
+    st.subheader("Schematic Representation")
+    with st.container(border=True, height= 350):
+        st.pyplot(plt)
+    img_bytes = BytesIO()
+    plt.savefig(img_bytes, format='png')
+    img_bytes.seek(0)
+    return img_bytes
+
+def cardinality(parent_df, child_df, parent_column, child_column):
+    # Check uniqueness of parent primary key
+    is_parent_unique = parent_df[parent_column].is_unique
+
+    # Check uniqueness of child foreign key
+    is_child_unique = child_df[child_column].is_unique
+
+    # Determine cardinality
+    if is_parent_unique and is_child_unique:
+        return '1:1'
+    elif is_parent_unique and not is_child_unique:
+        return '1:N'
+    elif not is_parent_unique and is_child_unique:
+        return 'N:1'
     else:
+        return 'N:N'
+
+#st.title('AUTOMATED DATA CATALOGUE')
+st.subheader('SELECT SOURCE')
+selectcol11, selectcol12 = st.columns(2)
+with selectcol11:
+    select1=st.selectbox('SOURCE DB NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name', on_change=clear_cache)
+with selectcol12:
+    select2=st.selectbox('SOURCE SCHEMA NAME',('DBO','CLIENT'),key='SCHname',index=None,placeholder='Select schema name', on_change=clear_cache)
+if select1 =='DB_10001' and select2 is not None:
+    with st.spinner("Loading Tables:"):
+
+        conn1 = pymssql.connect("Server=sql-ext-dev-uks-001.database.windows.net;"
+                                "Database=sqldb-ext-dev-uks-001;"
+                                "UID=dbadmin;"
+                                "PWD=mYpa$$w0rD" )
+
+        query0_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' ORDER BY TABLE_NAME ASC"
+        st.session_state.tab_names_init=list(pd.read_sql_query(query0_1,con=conn1)['TABLE_NAME'])

+    table_selector=st.multiselect('SOURCE TABLE NAME',st.session_state.tab_names_init,default=None,placeholder='Select table(s) for automated data cataloging', on_change= clear_cache)
+    sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache)
+
+    discover= button("Discover", key='discover')
+
+    if discover:
+        if sample_selector=='100':
+            count="top 100"
+        elif sample_selector=='10K':
+            count="top 10000"
+        elif sample_selector=='100K':
+            count="top 100000"
+        elif sample_selector=='1M':
+            count="top 1000000"
+        else:
+            count=""
+
+        query1_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' and TABLE_NAME in ("+(', '.join(f"'{table}'" for table in table_selector))+") ORDER BY TABLE_NAME ASC"
+        st.session_state.tab_names=list(pd.read_sql_query(query1_1,con=conn1)['TABLE_NAME'])
+        st.session_state.dataframes = {}
+        st.session_state.col_names = []
+        for tab in st.session_state.tab_names:
+            query2_2= "select "+count+" * from ["+select2+"].["+tab+"]"
+            st.session_state.dataframes[f'{tab}'] = pd.read_sql_query(query2_2,con=conn1)
+            st.session_state.col_names = st.session_state.col_names + list(st.session_state.dataframes[f'{tab}'].columns)
+        #st.session_state.data_load = "Yes"
+
+        tab_names = st.session_state.tab_names
+        dataframes = st.session_state.dataframes
+        col_names = st.session_state.col_names
+        metadata = MultiTableMetadata()
+        metadata.detect_from_dataframes(
+            data= st.session_state.dataframes
+        )
+        multi_python_dict = metadata.to_dict()
+
+        st.markdown(f"System has ingested :orange[**{str(len(tab_names))} tables**] from the source. Please proceed with the discovery.")
+        #st.subheader("DATA CATALOGUE")
+        tab1, tab2= st.tabs(["Explain Tables", "Show Relationships"])
+        def view_callback():
+            st.session_state.tdet = False
+        with tab1:
+            #st.write(python_dict)
+            st.session_state.table_list= pd.DataFrame(tab_names,columns=['TABLE NAME'])
+            containter_length = (len(st.session_state.table_list) + 1)*35
+            tab_names_shown= list(st.session_state.table_list['TABLE NAME'].values)
+            tabs2= st.tabs(tab_names_shown)
+            for i, tab in enumerate(tabs2):
+                with tab:
+                    with st.container(height= 400, border=True):
+                        cole1,cole2=st.columns([1,1.5])
+                        with cole1:
+                            conn = pymssql.connect("Driver={ODBC Driver 17 for SQL Server};"
+                                                   "Server=sql-ext-dev-uks-001.database.windows.net;"
+                                                   "Database=sqldb-ext-dev-uks-001;"
+                                                   "UID=dbadmin;"
+                                                   "PWD=mYpa$$w0rD" )
+
+                            table_selector= tab_names_shown[i]
+                            if table_selector is not None:
+                                query2="select "+count+" * from [dbo].["+table_selector+"]"
+                                #df = pd.read_sql_query(query2,con=conn)
+                                df = st.session_state.dataframes[table_selector]
+                                selected_df = pd.DataFrame()
+                                for col in df.columns:
+                                    # Filter non-null and non-blank values
+                                    non_null_values = df[col][df[col] != ''].dropna().astype(str).str.strip()
+
+                                    # Select up to 10 values (or fewer if less than 10 non-null values)
+                                    selected_values = list(non_null_values[:10])
+                                    selected_values = selected_values + [""] * (10 - len(selected_values))
+                                    # Add selected values to the new dataframe
+                                    selected_df[col] = selected_values
+                                #st.dataframe(selected_df)
+                                null_columns = [col for col in selected_df.columns if selected_df.apply(lambda x: x == '')[col].nunique() > 1]
+                                null_mes= "**The Following columns have very few records(less than 10). You might exclude them (if they are redundant) for better table discovery:** \n\n"
+                                for col in null_columns[:-1]:
+                                    null_mes += f":orange[**{col}**]" + ', '
+                                for collast in null_columns[-1:]:
+                                    if len(null_columns)> 1:
+                                        null_mes += '**and** ' + f":orange[**{collast}**]"
+                                    else:
+                                        null_mes += f":orange[**{collast}**]"
+
+                                if len(null_columns) != 0:
+                                    with st.expander("🛈 Potential redundant Columns Found in Terms of Data Completeness:", expanded= True):
+                                        st.markdown(null_mes)
+                                        inf_filter= st.multiselect('Select Incomplete and Insignificant Columns to exclude:', list(null_columns))
+                                        run = st.button('Check', key= f"{tab_names_shown[i]}")
+                                else:
+                                    st.success("No redundant Columns Found in Terms of Data Completeness")
+                                    inf_filter= None
+                                    run = False
+
+                                if inf_filter is not None:
+                                    df.drop(columns=inf_filter, inplace=True)
+                                    selected_df.drop(columns=inf_filter, inplace=True)
+
+                                if run or len(null_columns) == 0:
+                                    main_list=df.columns.to_list()
+                                    sub_list=['ID','LOADID','FILE_NAME']
+                                    if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
+                                        df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
+                                    conn.close()
+                                    sin_metadata = SingleTableMetadata()
+                                    sin_metadata.detect_from_dataframe(df)
+                                    python_dict = sin_metadata.to_dict()
+                                    if f'cont_{table_selector}' not in st.session_state:
+                                        with st.spinner("Processing Table"):
+                                            # Create a GenerativeModel instance
+                                            genai_mod = genai.GenerativeModel(
+                                                model_name='models/gemini-pro'
+                                            )
+                                            if 'primary_key' in python_dict:
+                                                primary_key = python_dict['primary_key']
+                                            else:
+                                                primary_key = "Could Not be Identified"
+
+
+                                            story = f""" Details of the table:
+                                            table columns: {str(list(df.columns))}
+                                            column datatypes: {str(df.dtypes.to_string())}
+                                            table sample data: {selected_df.head(10).to_string()}
+                                            """
+                                            response = genai_mod.generate_content(textwrap.dedent("""
+                                            You are a Data Migration expert. You can analyze and understand any table/data/ Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data can be any kind of generic data. you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.
+                                            Important: Please consider All fields are mandetorily during your analysis. Explain all fields precisely without unnecessary and irrelevant information. NO NEED TO PROVIDE THE SAMPLE DATA AGAIN.
+
+                                            Here is the table details:
+
+                                            """) + story + f"The Primary Key is:{primary_key}" ,
+                                            safety_settings={
+                                                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+                                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+                                                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+                                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+                                            })
+                                            st.session_state[f'cont_{table_selector}'] = response.text
+
+                                    st.markdown(st.session_state[f'cont_{table_selector}'])
+                        with cole2:
+                            st.markdown("**DATA PREVIEW**")
+                            st.dataframe(df, use_container_width= True)
+
+        with tab2:
+            metadata1 = MultiTableMetadata()
+            metadata1.detect_from_dataframes(
+                data= st.session_state.dataframes
+            )
+            multi_python_dict1 = metadata1.to_dict()
+            rlist1=multi_python_dict1['relationships']
+            rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT TABLE RELATIONSHIP COLUMN','CHILD TABLE RELATIONSHIP COLUMN','CARDINALITY'])
+            for i in range(len(rlist1)):
+                rlist=rlist1[i]
+                nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT TABLE RELATIONSHIP COLUMN':rlist['parent_primary_key'],'CHILD TABLE RELATIONSHIP COLUMN':rlist['child_foreign_key']},index=[i])
+                rdf=pd.concat([rdf,nrow],ignore_index=True)

+            rdf['CARDINALITY'] = rdf.apply(
+                lambda row: cardinality(
+                    st.session_state.dataframes[str(row['PARENT TABLE'])],
+                    st.session_state.dataframes[str(row['CHILD TABLE'])],
+                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
+                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)
+
+
+            if 'rdf' not in st.session_state:
+                st.session_state.rdf = rdf
+
+            edited_map_df = st.data_editor(
+                st.session_state.rdf,
+                column_config={
+                    "PARENT TABLE": st.column_config.SelectboxColumn(
+                        "Available Parent Table",
+                        width="medium",
+                        options=tab_names,
+                        required=True,
+                    ),
+                    "CHILD TABLE": st.column_config.SelectboxColumn(
+                        "Available Child Table",
+                        width="medium",
+                        options=tab_names,
+                        required=True,
+                    ),
+                    "PARENT TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
+                        "Available Parent Table Relationship Column",
+                        width="medium",
+                        options=col_names,
+                        required=True,
+                    ),
+                    "CHILD TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
+                        "Available Child Table Relationship Column",
+                        width="medium",
+                        options=col_names,
+                        required=True,
+                    ),
+                    "CARDINALITY": st.column_config.SelectboxColumn(
+                        "Cardinality",
+                        width="medium",
+                        options=['1:1','1:N','N:1','N:N'],
+                        required=True,
+                    )
+                },
+                hide_index=True,
+                num_rows = 'dynamic',
+                use_container_width = True
+            )
+
+            for i,row in edited_map_df.iterrows():
+                pcolchecklist = st.session_state.dataframes[str(row['PARENT TABLE'])].columns
+                ccolchecklist = st.session_state.dataframes[str(row['CHILD TABLE'])].columns
                 pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
                 cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
                 match = [val for val in pvals if val in cvals]
+                #st.write(match)
+                if row['PARENT TABLE RELATIONSHIP COLUMN'] not in pcolchecklist:
+                    st.error(f"{row['PARENT TABLE RELATIONSHIP COLUMN']} does not belong to {row['PARENT TABLE']}")
+                else:
+                    pass
+                if row['CHILD TABLE RELATIONSHIP COLUMN'] not in ccolchecklist:
+                    st.error(f"{row['CHILD TABLE RELATIONSHIP COLUMN']} does not belong to {row['CHILD TABLE']}")
+                else:
+                    pass
+                if (row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist):
+                    pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
+                    cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
+                    match = [val for val in pvals if val in cvals]
+                    if match == []:
+                        st.error(f"The Joining Condition Between column: {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} and column: {row['CHILD TABLE RELATIONSHIP COLUMN']} from Table: {row['CHILD TABLE']} does not yield any record. ")
+                    if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and (match != []):
+                        # primary_check = len(list(dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)) == dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].nunique()
+                        # if primary_check:
+                        # pass
+                        # else:
+                        # st.error(f"The Column {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} has duplicate records and hence can not be considered as Primary Key.")
+                        pass
+
+            add = st.button("Add Relationship", key='add')
+            if add:
+                if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and ((match != [])):
+                    add_df = edited_map_df
+                else:
+                    add_df = st.session_state.rdf
             else:
+                add_df = st.session_state.rdf
+
+            add_df['CARDINALITY'] = add_df.apply(
+                lambda row: cardinality(
+                    st.session_state.dataframes[str(row['PARENT TABLE'])],
+                    st.session_state.dataframes[str(row['CHILD TABLE'])],
+                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
+                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)
+
+            st.session_state.add_df = add_df
+            edited_map_df = st.session_state.add_df
+
+            rel_tabs = list(add_df['PARENT TABLE'].values) + list(add_df['CHILD TABLE'].values)
+            unrel_tabs = [tab for tab in tab_names if tab not in rel_tabs]
+            st.info(f"""Unrelated tables due to undetected pattern: {str(unrel_tabs).replace("[","").replace("]","")}""")
+
+            G, table_columns = create_er_diagram(st.session_state.add_df)
+            img_bytes= draw_er_diagram(G, table_columns)
+            col21, col22= st.columns([1,8])
+            with col21:
+                if st.button("Regenerate"):
+                    st.rerun()
+            with col22:
+                st.download_button(
+                    label="Download ER Diagram",
+                    data=img_bytes,
+                    file_name="er_diagram.png",
+                    mime="image/png"
+                )
+######
+if __name__ == '__main__':
+    main()
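
Note: the `cardinality()` helper carried through this diff infers a relationship type purely from key uniqueness on each side of a join. A minimal standalone sketch of the same logic, using toy `customers`/`orders` DataFrames invented here purely for illustration:

# Standalone sketch of the cardinality() helper above; the customers/orders
# DataFrames and their column names are hypothetical, for illustration only.
import pandas as pd

def cardinality(parent_df, child_df, parent_column, child_column):
    is_parent_unique = parent_df[parent_column].is_unique  # unique on parent side?
    is_child_unique = child_df[child_column].is_unique     # unique on child side?
    if is_parent_unique and is_child_unique:
        return '1:1'
    elif is_parent_unique and not is_child_unique:
        return '1:N'
    elif not is_parent_unique and is_child_unique:
        return 'N:1'
    else:
        return 'N:N'

customers = pd.DataFrame({'CUSTOMER_ID': [1, 2, 3]})     # candidate primary key
orders = pd.DataFrame({'CUSTOMER_ID': [1, 1, 2, 3, 3]})  # repeating foreign key

print(cardinality(customers, orders, 'CUSTOMER_ID', 'CUSTOMER_ID'))  # prints: 1:N

Because the check looks only at uniqueness, not at whether values actually appear on both sides, the page separately computes the `match` overlap list before accepting a relationship, raising an error when the join would yield no records.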